<a href="https://colab.research.google.com/github/Djensonsan/Spotify-Sequential-Skip-Prediction-Challenge/blob/main/similarity_measures/mahalanobis_distance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a href="https://colab.research.google.com/github/Djensonsan/Spotify-Sequential-Skip-Prediction-Challenge/blob/main/similarity_measures/mahalanobis_distance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mahalanobis Distance
By Jens Leysen


## Imports & Constants

In [159]:
# Install your required packages here
!pip install pandas numpy matplotlib sklearn fsspec gcsfs tqdm



In [160]:
# Path to credentials for cloud bucket:
%env GOOGLE_APPLICATION_CREDENTIALS=/content/drive/My Drive/CS/AI/Credentials/ai-project-2020-f4dfbc25326c.json

env: GOOGLE_APPLICATION_CREDENTIALS=/content/drive/My Drive/CS/AI/Credentials/ai-project-2020-f4dfbc25326c.json


In [161]:
from google.cloud import storage

import numpy as np
import time
import pandas as pd
from scipy.stats import chi2
import sklearn
from glob import glob
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [162]:
pd.set_option('display.max_rows', 800)
pd.set_option('display.max_columns', 800)

In [163]:
from functools import partial
from tqdm import tqdm
tqdm = partial(tqdm, position=0, leave=True)

In [164]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [165]:
# define constants
bucket_name = "ai-project-2020-spotify"
client = storage.Client()
bucket = client.get_bucket(bucket_name)

## Import Session Logs

In [166]:
# Cloud bucket contains larger datasets:
train_files = list(bucket.list_blobs(prefix='training_set/'))
for blob in [blob for blob in train_files if '20180715' in blob.name]:
  print(blob.name)

training_set/log_0_20180715_000000000000.csv.gz
training_set/log_1_20180715_000000000000.csv.gz
training_set/log_2_20180715_000000000000.csv.gz
training_set/log_3_20180715_000000000000.csv.gz
training_set/log_4_20180715_000000000000.csv.gz
training_set/log_5_20180715_000000000000.csv.gz
training_set/log_6_20180715_000000000000.csv.gz
training_set/log_7_20180715_000000000000.csv.gz


In [168]:
#Cloud bucket contains larger datasets:
logs = pd.read_csv(f"gs://{bucket_name}/training_set/log_0_20180715_000000000000.csv.gz")
# Path to mini logs:
# logs = pd.read_csv('/content/drive/My Drive/CS/AI/Data/mini/log_mini.csv')
# logs.columns

In [169]:
def categorical_to_dummies(df, categorical_cols):
    """ Create dummies (one hot encoding) for each categorical variables """
    dummies = pd.get_dummies(df[categorical_cols], prefix=categorical_cols)
    return df.drop(columns=categorical_cols).join(dummies)

In [170]:
# remove date for convenience (could encode this as well)
logs.drop(columns=['date'], inplace=True)

# Create dummies (one hot encoding) for each categorical variable in logs
categorical_cols = ['context_type', 'hist_user_behavior_reason_start', 'hist_user_behavior_reason_end']
logs = categorical_to_dummies(logs, categorical_cols)
print(logs.shape)

(2990609, 44)


## Import Track Features

In [171]:
track_features_1 = pd.read_csv('/content/drive/My Drive/CS/AI/Data/track_features/tf_000000000000.csv').set_index('track_id')
track_features_2 = pd.read_csv('/content/drive/My Drive/CS/AI/Data/track_features/tf_000000000001.csv').set_index('track_id')
track_features = track_features_1.append(track_features_2)

In [172]:
# Create dummies (one hot encoding) for each categorical variable in track_features
track_features = categorical_to_dummies(track_features, ['mode'])

def normalize(df):
    result = df.copy()
    for feature_name in df.columns:
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()
        result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    return result
track_features = normalize(track_features)

track_features.head(n=3)

Unnamed: 0_level_0,duration,release_year,us_popularity_estimate,acousticness,beat_strength,bounciness,danceability,dyn_range_mean,energy,flatness,instrumentalness,key,liveness,loudness,mechanism,organism,speechiness,tempo,time_signature,valence,acoustic_vector_0,acoustic_vector_1,acoustic_vector_2,acoustic_vector_3,acoustic_vector_4,acoustic_vector_5,acoustic_vector_6,acoustic_vector_7,mode_major,mode_minor
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
t_2e8f4b71-8a0b-4b9c-b7d8-fb5208e87f9f,0.167239,0.304348,0.958289,0.719233,0.366513,0.337232,0.440801,0.110348,0.238851,0.865473,0.6533884,0.0,0.769258,0.647393,0.19917,0.775905,0.034969,0.401501,0.8,0.223398,0.607061,0.233876,0.745271,0.656751,0.633754,0.653592,0.288896,0.399172,1.0,0.0
t_dae2ec0e-ec7b-4b3e-b60c-4a884d0eccb0,0.066561,0.188406,0.727204,0.843003,0.362229,0.395253,0.508696,0.130109,0.420482,0.856651,3.941564e-09,0.0,0.085844,0.734892,0.357639,0.763381,0.051367,0.565365,0.8,0.484707,0.558104,0.315577,0.644835,0.757328,0.627808,0.403121,0.510257,0.171852,1.0,0.0
t_cf0164dd-1531-4399-bfa6-dec19cd1fedc,0.045423,0.347826,0.962039,0.054904,0.495025,0.597579,0.553525,0.17794,0.842951,0.820145,0.1041599,0.0,0.407325,0.764843,0.304721,0.503674,0.073928,0.555589,0.8,0.818449,0.57848,0.460787,0.567144,0.47372,0.719832,0.685979,0.315397,0.422179,1.0,0.0


## Data Joining

In [173]:
# Join track features and logs
data = logs.join(track_features, on='track_id_clean', how='left')
data['session_id'].nunique()

178342

In [174]:
data.head()

Unnamed: 0,session_id,session_position,session_length,track_id_clean,skip_1,skip_2,skip_3,not_skipped,context_switch,no_pause_before_play,short_pause_before_play,long_pause_before_play,hist_user_behavior_n_seekfwd,hist_user_behavior_n_seekback,hist_user_behavior_is_shuffle,hour_of_day,premium,context_type_catalog,context_type_charts,context_type_editorial_playlist,context_type_personalized_playlist,context_type_radio,context_type_user_collection,hist_user_behavior_reason_start_appload,hist_user_behavior_reason_start_backbtn,hist_user_behavior_reason_start_clickrow,hist_user_behavior_reason_start_endplay,hist_user_behavior_reason_start_fwdbtn,hist_user_behavior_reason_start_playbtn,hist_user_behavior_reason_start_popup,hist_user_behavior_reason_start_remote,hist_user_behavior_reason_start_trackdone,hist_user_behavior_reason_start_trackerror,hist_user_behavior_reason_start_uriopen,hist_user_behavior_reason_end_appload,hist_user_behavior_reason_end_backbtn,hist_user_behavior_reason_end_clickrow,hist_user_behavior_reason_end_endplay,hist_user_behavior_reason_end_fwdbtn,hist_user_behavior_reason_end_logout,hist_user_behavior_reason_end_popup,hist_user_behavior_reason_end_remote,hist_user_behavior_reason_end_trackdone,hist_user_behavior_reason_end_uriopen,duration,release_year,us_popularity_estimate,acousticness,beat_strength,bounciness,danceability,dyn_range_mean,energy,flatness,instrumentalness,key,liveness,loudness,mechanism,organism,speechiness,tempo,time_signature,valence,acoustic_vector_0,acoustic_vector_1,acoustic_vector_2,acoustic_vector_3,acoustic_vector_4,acoustic_vector_5,acoustic_vector_6,acoustic_vector_7,mode_major,mode_minor
0,0_00006f66-33e5-4de7-a324-2d18e439fc1e,1,20,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,False,False,False,True,0,0,0,0,0,0,True,16,True,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0.084783,0.985507,0.996813,0.015915,0.438572,0.480043,0.654555,0.145592,0.553473,0.886287,0.003484481,0.090909,0.678553,0.806081,0.546784,0.327509,0.07183,0.536128,0.8,0.152256,0.164759,0.767624,0.726183,0.441198,0.359116,0.730704,0.236833,0.580984,1.0,0.0
1,0_00006f66-33e5-4de7-a324-2d18e439fc1e,2,20,t_9099cd7b-c238-47b7-9381-f23f2c1d1043,False,False,False,True,0,1,0,0,0,0,True,16,True,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0.116834,0.985507,0.989673,0.062072,0.654835,0.745897,0.879322,0.214942,0.72684,0.878262,1.031319e-07,0.636364,0.104322,0.825062,0.824766,0.134194,0.063012,0.520179,0.8,0.337156,0.211725,0.756546,0.778338,0.40778,0.335201,0.765185,0.230647,0.560879,0.0,1.0
2,0_00006f66-33e5-4de7-a324-2d18e439fc1e,3,20,t_fc5df5ba-5396-49a7-8b29-35d0d28249e0,False,False,False,True,0,1,0,0,0,0,True,16,True,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0.11371,0.985507,0.999998,0.355611,0.53218,0.54793,0.681213,0.153304,0.563018,0.881542,2.659045e-08,0.909091,0.135776,0.817156,0.774327,0.303257,0.046729,0.580142,0.8,0.373866,0.198437,0.762347,0.740823,0.404552,0.375268,0.763143,0.166574,0.570818,1.0,0.0
3,0_00006f66-33e5-4de7-a324-2d18e439fc1e,4,20,t_23cff8d6-d874-4b20-83dc-94e450e8aa20,False,False,False,True,0,1,0,0,0,0,True,16,True,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0.078998,0.985507,0.999504,0.772472,0.641786,0.73937,0.866782,0.214539,0.529492,0.850761,6.598412e-06,0.090909,0.103722,0.788291,0.630996,0.61614,0.236906,0.447951,0.8,0.649426,0.215648,0.734013,0.76456,0.40935,0.330305,0.783042,0.231117,0.614465,1.0,0.0
4,0_00006f66-33e5-4de7-a324-2d18e439fc1e,5,20,t_64f3743c-f624-46bb-a579-0f3f9a07a123,False,False,False,True,0,1,0,0,0,0,True,16,True,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0.102003,0.985507,0.99985,0.00663,0.732462,0.80594,0.859664,0.231531,0.650067,0.856799,2.066657e-06,0.727273,0.120842,0.831098,0.759465,0.173777,0.248284,0.588155,0.8,0.652927,0.140517,0.74071,0.713375,0.468176,0.359157,0.702825,0.154084,0.571117,1.0,0.0


In [175]:
len(data)

2990609

## Session Heterogeneity

In [176]:
def mahalanobis(x=None, data=None, cov=None, session_id=None):
    """
    Compute the Mahalanobis Distance between each row of x and the data  
    x    : vector or matrix of data with, say, p columns.
    data : ndarray of the distribution from which Mahalanobis distance of each observation of x is to be computed.
    cov  : covariance matrix (p x p) of the distribution. If None, will be computed from data.

    Note: the function first checks whether the length of the data isn't smaller than the feature dimensionality since this will lead to a singular matrix.
    """
    # if len(data) <= len(data.columns):
    #   print("Session is a singular matrix: "+session_id) 
    #   return
    x_mu = x  - np.mean(data)
    if cov is None:
        cov = np.cov(data.values.T)
    try:
      if np.isfinite(np.linalg.cond(cov)):
        inv_covmat = np.linalg.inv(cov)
        left = np.dot(x_mu, inv_covmat)
        mahal = np.dot(left, x_mu.T)
        return mahal.diagonal()
      else:
        print("Ill conditioned matrix detected: "+session_id)
    except np.linalg.LinAlgError:
      print("Session is a singular matrix: "+session_id) 

### Ill-conditioned Matrices 

In [177]:
ill_data = data[data['session_id'] == "0_0081d804-4dba-495a-b045-f5bac84a1034"]

In [178]:
ill_data["track_id_clean"].nunique()

10

In [179]:
ill_data.head(n=20)

Unnamed: 0,session_id,session_position,session_length,track_id_clean,skip_1,skip_2,skip_3,not_skipped,context_switch,no_pause_before_play,short_pause_before_play,long_pause_before_play,hist_user_behavior_n_seekfwd,hist_user_behavior_n_seekback,hist_user_behavior_is_shuffle,hour_of_day,premium,context_type_catalog,context_type_charts,context_type_editorial_playlist,context_type_personalized_playlist,context_type_radio,context_type_user_collection,hist_user_behavior_reason_start_appload,hist_user_behavior_reason_start_backbtn,hist_user_behavior_reason_start_clickrow,hist_user_behavior_reason_start_endplay,hist_user_behavior_reason_start_fwdbtn,hist_user_behavior_reason_start_playbtn,hist_user_behavior_reason_start_popup,hist_user_behavior_reason_start_remote,hist_user_behavior_reason_start_trackdone,hist_user_behavior_reason_start_trackerror,hist_user_behavior_reason_start_uriopen,hist_user_behavior_reason_end_appload,hist_user_behavior_reason_end_backbtn,hist_user_behavior_reason_end_clickrow,hist_user_behavior_reason_end_endplay,hist_user_behavior_reason_end_fwdbtn,hist_user_behavior_reason_end_logout,hist_user_behavior_reason_end_popup,hist_user_behavior_reason_end_remote,hist_user_behavior_reason_end_trackdone,hist_user_behavior_reason_end_uriopen,duration,release_year,us_popularity_estimate,acousticness,beat_strength,bounciness,danceability,dyn_range_mean,energy,flatness,instrumentalness,key,liveness,loudness,mechanism,organism,speechiness,tempo,time_signature,valence,acoustic_vector_0,acoustic_vector_1,acoustic_vector_2,acoustic_vector_3,acoustic_vector_4,acoustic_vector_5,acoustic_vector_6,acoustic_vector_7,mode_major,mode_minor
6225,0_0081d804-4dba-495a-b045-f5bac84a1034,1,10,t_de860a61-bb57-4386-9d5a-6ba111c4c266,False,False,False,True,0,0,0,0,0,1,False,10,True,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.093627,0.985507,0.999863,0.161263,0.499818,0.656422,0.530174,0.202161,0.690109,0.842302,3.439734e-12,0.181818,0.497821,0.786571,0.339545,0.490872,0.762337,0.682195,0.8,0.311037,0.148548,0.708984,0.680546,0.507687,0.384879,0.640295,0.151015,0.531164,1.0,0.0
6226,0_0081d804-4dba-495a-b045-f5bac84a1034,2,10,t_0e3dec82-10b4-49f1-8c2e-cd19249f7d2c,False,False,False,True,0,1,0,0,0,0,False,10,True,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0.098953,0.985507,0.999986,0.04581,0.811033,0.845228,0.756607,0.232764,0.574926,0.865003,0.001236856,0.363636,0.276028,0.77868,0.767442,0.171152,0.069828,0.343966,0.8,0.490338,0.152825,0.708801,0.728128,0.502372,0.353319,0.712038,0.206122,0.576608,0.0,1.0
6227,0_0081d804-4dba-495a-b045-f5bac84a1034,3,10,t_eb0063cc-948f-41d4-9dab-bf4b0e70d21e,False,False,False,True,0,1,0,0,0,0,False,10,True,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0.075691,0.985507,0.999949,0.128416,0.661626,0.754333,0.762671,0.218245,0.598394,0.886083,2.686992e-09,0.909091,0.083905,0.795941,0.72796,0.217088,0.187033,0.695624,0.8,0.447219,0.132826,0.723623,0.725775,0.488026,0.360454,0.703543,0.120504,0.573679,0.0,1.0
6228,0_0081d804-4dba-495a-b045-f5bac84a1034,4,10,t_4319026b-d2b5-478d-bf37-8f2873645c5c,False,True,True,False,0,1,0,0,0,0,False,10,True,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0.105273,0.985507,0.999836,0.048298,0.430963,0.552477,0.487613,0.17106,0.848814,0.838851,6.20966e-13,0.090909,0.142508,0.823357,0.301518,0.505632,0.253322,0.608404,0.8,0.519909,0.067672,0.74437,0.750766,0.51522,0.340839,0.673965,0.107608,0.552522,1.0,0.0
6229,0_0081d804-4dba-495a-b045-f5bac84a1034,5,10,t_e8fa3463-7ef7-4ca5-8e65-2df979012c34,True,True,True,False,0,1,0,0,0,0,False,10,True,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.109175,0.985507,0.999851,0.003608,0.612289,0.665231,0.759574,0.184044,0.490411,0.895969,6.614249e-06,1.0,0.292313,0.756288,0.638528,0.261065,0.143536,0.560493,0.8,0.334536,0.10749,0.73415,0.718526,0.4876,0.361572,0.686729,0.115337,0.575707,1.0,0.0
6230,0_0081d804-4dba-495a-b045-f5bac84a1034,6,10,t_9a03f300-6504-4c7a-92b1-4cd88354ee74,True,True,True,False,0,1,0,0,0,0,False,10,True,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.111246,0.985507,0.999733,0.729601,0.639643,0.739044,0.699465,0.214914,0.547037,0.863502,1.570476e-09,0.090909,0.159421,0.790539,0.296029,0.730602,0.324981,0.335221,0.8,0.547744,0.145339,0.711911,0.749319,0.503215,0.356832,0.683814,0.164933,0.552446,1.0,0.0
6231,0_0081d804-4dba-495a-b045-f5bac84a1034,7,10,t_297efa33-b070-4e5d-bb42-111237295b6c,True,True,True,False,0,1,0,0,0,0,False,10,True,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.09945,0.985507,0.999784,0.85335,0.597117,0.575265,0.687967,0.150999,0.215248,0.891501,0.000122629,0.818182,0.105838,0.701018,0.708812,0.648722,0.039837,0.368011,0.8,0.273083,0.206712,0.689242,0.767506,0.467126,0.346695,0.72993,0.195445,0.566124,0.0,1.0
6232,0_0081d804-4dba-495a-b045-f5bac84a1034,8,10,t_e9901b0e-dc9d-429c-a8a9-dde6ab877d54,True,True,True,False,0,1,0,0,0,0,False,10,True,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.123103,0.985507,0.999855,0.4787,0.672385,0.735058,0.777183,0.204157,0.48159,0.895897,0.04556551,0.090909,0.109985,0.735285,0.787234,0.376995,0.036742,0.544092,0.8,0.13383,0.377018,0.648566,0.603508,0.381117,0.352551,0.746767,0.252259,0.626588,1.0,0.0
6233,0_0081d804-4dba-495a-b045-f5bac84a1034,9,10,t_6dc02fe6-4d78-414a-8439-31240ee02270,True,True,True,False,0,1,0,0,0,0,False,10,True,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.131329,0.985507,0.999918,0.037167,0.537019,0.593854,0.767937,0.169352,0.271398,0.915052,2.121508e-07,0.0,0.174854,0.819359,0.585202,0.300754,0.042419,0.464104,0.8,0.037118,0.195193,0.744543,0.769769,0.423441,0.385263,0.739196,0.163656,0.550579,1.0,0.0
6234,0_0081d804-4dba-495a-b045-f5bac84a1034,10,10,t_9a31436c-a57a-4d7a-9bbc-50f0deca33de,False,False,True,False,0,0,0,1,0,0,False,11,True,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0.102116,0.985507,0.999993,0.089489,0.574496,0.597544,0.587175,0.163907,0.908604,0.837779,0.0001089541,0.727273,0.118732,0.807635,0.643636,0.265287,0.072638,0.373595,0.8,0.756898,0.149583,0.744786,0.745554,0.44811,0.307055,0.734422,0.149044,0.593393,1.0,0.0


### Mahalanobis 

In [180]:
track_features.columns

Index(['duration', 'release_year', 'us_popularity_estimate', 'acousticness',
       'beat_strength', 'bounciness', 'danceability', 'dyn_range_mean',
       'energy', 'flatness', 'instrumentalness', 'key', 'liveness', 'loudness',
       'mechanism', 'organism', 'speechiness', 'tempo', 'time_signature',
       'valence', 'acoustic_vector_0', 'acoustic_vector_1',
       'acoustic_vector_2', 'acoustic_vector_3', 'acoustic_vector_4',
       'acoustic_vector_5', 'acoustic_vector_6', 'acoustic_vector_7',
       'mode_major', 'mode_minor'],
      dtype='object')

In [181]:
# Columns to use for mahalanobis distance:
mahalanobis_features_columns = ['acoustic_vector_1',
       'acoustic_vector_2', 'acoustic_vector_3', 'acoustic_vector_4',
       'acoustic_vector_5', 'acoustic_vector_6', 'acoustic_vector_7']
mahalanobis_features_columns_track_id = ['acoustic_vector_1',
       'acoustic_vector_2', 'acoustic_vector_3', 'acoustic_vector_4',
       'acoustic_vector_5', 'acoustic_vector_6', 'acoustic_vector_7']
mahalanobis_features_columns_track_id.append('track_id_clean')

In [182]:
print(mahalanobis_features_columns)

['acoustic_vector_1', 'acoustic_vector_2', 'acoustic_vector_3', 'acoustic_vector_4', 'acoustic_vector_5', 'acoustic_vector_6', 'acoustic_vector_7']


In [183]:
print(mahalanobis_features_columns_track_id)

['acoustic_vector_1', 'acoustic_vector_2', 'acoustic_vector_3', 'acoustic_vector_4', 'acoustic_vector_5', 'acoustic_vector_6', 'acoustic_vector_7', 'track_id_clean']


In [184]:
# Covariance Matrix Estimation
cov = data[mahalanobis_features_columns].cov()

In [None]:
cov.shape

In [None]:
type(cov)

In [None]:
print(cov)

In [None]:
def extend_matrix(vec_matrix, row, mahalanobis_features_columns):
   dummy_vector = [] 
   for column in mahalanobis_features_columns:
        dummy_vector.extend([row[column]])
   df_length = len(vec_matrix)
   # vec_matrix: rows = tracks, columns = features
   vec_matrix.loc[df_length] = dummy_vector

In [None]:
def calculate_heterogeneity(df, mahalanobis_features_columns):
  """
  Will calculate the mahalanobis distance of a dataframe in unstacked form.
  The function will go through all rows in the provided dataframe, 
  it will create a vec_matrix which is a stacked matrix holding all the track features for each track in the session.
  This vec_matric holds only unique tracks (duplicates are dropped before calulating mahalanobis distance).
  Next, it will calculate the mahanalobis distance between each vector in vec_matrix (so each track) and the mean representation of tracks in that session.
  At the end of each session, the mahalanobis distance is joined to the df.
  """
  current_session = df['session_id'].iloc[0]
  df['mahalanobis'] = np.nan
  vec_matrix = pd.DataFrame(columns=mahalanobis_features_columns_track_id)
  for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    if current_session != row['session_id']:
      vec_matrix = vec_matrix.drop_duplicates()
      vec_matrix['mahalanobis'] = mahalanobis(x = vec_matrix[mahalanobis_features_columns], data = vec_matrix[mahalanobis_features_columns], cov=cov, session_id = current_session)
      for index, vec in vec_matrix.iterrows():
        df.loc[(df['session_id'] == current_session) & (df['track_id_clean'] == vec['track_id_clean']), 'mahalanobis'] = vec['mahalanobis']
      vec_matrix = pd.DataFrame(columns=mahalanobis_features_columns_track_id)
      current_session = row['session_id']
      extend_matrix(vec_matrix, row, mahalanobis_features_columns_track_id)
    else:
      extend_matrix(vec_matrix, row, mahalanobis_features_columns_track_id)
  end_time = time.time()
  return df

In [None]:
# Note: there seem to be multiple ways to get a singular matrix. I haven't looked into it that much but would be interesting to know what causes these singular matrices mathematically. 
new_data = calculate_heterogeneity(data[0:40], mahalanobis_features_columns)

In [None]:
# Save new data to csv
new_data.to_csv('/content/drive/My Drive/CS/AI/Data/log_0_20180715_data_mahalanobis.csv')

In [None]:
new_data.head()

In [185]:
### Sanity Check

(7, 7)

In [186]:
type(cov)

pandas.core.frame.DataFrame

In [187]:
print(cov)

                   acoustic_vector_1  acoustic_vector_2  acoustic_vector_3  \
acoustic_vector_1           0.008803           0.003621          -0.002990   
acoustic_vector_2           0.003621           0.011816           0.005682   
acoustic_vector_3          -0.002990           0.005682           0.016072   
acoustic_vector_4          -0.004936          -0.005950          -0.000809   
acoustic_vector_5           0.002526           0.000317          -0.007844   
acoustic_vector_6          -0.001644           0.003899           0.005388   
acoustic_vector_7           0.004259           0.001087           0.002316   

                   acoustic_vector_4  acoustic_vector_5  acoustic_vector_6  \
acoustic_vector_1          -0.004936           0.002526          -0.001644   
acoustic_vector_2          -0.005950           0.000317           0.003899   
acoustic_vector_3          -0.000809          -0.007844           0.005388   
acoustic_vector_4           0.022274          -0.000326        

In [188]:
def extend_matrix(vec_matrix, row, mahalanobis_features_columns):
   dummy_vector = [] 
   for column in mahalanobis_features_columns:
        dummy_vector.extend([row[column]])
   df_length = len(vec_matrix)
   # vec_matrix: rows = tracks, columns = features
   vec_matrix.loc[df_length] = dummy_vector

In [202]:
def calculate_heterogeneity(df, mahalanobis_features_columns):
  """
  Will calculate the mahalanobis distance of a dataframe in unstacked form.
  The function will go through all rows in the provided dataframe, 
  it will create a vec_matrix which is a stacked matrix holding all the track features for each track in the session.
  This vec_matric holds only unique tracks (duplicates are dropped before calulating mahalanobis distance).
  Next, it will calculate the mahanalobis distance between each vector in vec_matrix (so each track) and the mean representation of tracks in that session.
  At the end of each session, the mahalanobis distance is joined to the df.
  """
  current_session = df['session_id'].iloc[0]
  df['mahalanobis'] = np.nan
  vec_matrix = pd.DataFrame(columns=mahalanobis_features_columns_track_id)
  for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    if current_session != row['session_id']:
      vec_matrix = vec_matrix.drop_duplicates()
      vec_matrix['mahalanobis'] = mahalanobis(x = vec_matrix[mahalanobis_features_columns], data = vec_matrix[mahalanobis_features_columns], cov=cov, session_id = current_session)
      for index, vec in vec_matrix.iterrows():
        df.loc[(df['session_id'] == current_session) & (df['track_id_clean'] == vec['track_id_clean']), 'mahalanobis'] = vec['mahalanobis']
      vec_matrix = pd.DataFrame(columns=mahalanobis_features_columns_track_id)
      current_session = row['session_id']
      extend_matrix(vec_matrix, row, mahalanobis_features_columns_track_id)
    else:
      extend_matrix(vec_matrix, row, mahalanobis_features_columns_track_id)
  end_time = time.time()
  return df

In [206]:
# Note: there seem to be multiple ways to get a singular matrix. I haven't looked into it that much but would be interesting to know what causes these singular matrices mathematically. 
new_data = calculate_heterogeneity(data[0:40], mahalanobis_features_columns)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
  0%|          | 0/40 [00:00<?, ?it/s]


ValueError: ignored

In [None]:
# Save new data to csv
new_data.to_csv('/content/drive/My Drive/CS/AI/Data/log_0_20180715_data_mahalanobis.csv')

In [None]:
new_data.head()

### Sanity Check

In [None]:
new_data[new_data['mahalanobis']<0]

In [None]:
new_data['mahalanobis'].max()

In [None]:
new_data['mahalanobis'].min()

In [None]:
# Todo: review ill-conditioned part and amount of features
degrees_of_freedom = len(mahalanobis_features_columns)-1
print(chi2.ppf((1-0.05), df=degrees_of_freedom))

# Compute the P-Values
# new_data['p_value'] = 1 - chi2.cdf(new_data['mahalanobis'], degrees_of_freedom)

# Extreme values with a significance level of 0.05
# new_data.loc[new_data.p_value < 0.05]