<a href="https://colab.research.google.com/github/Dev-ika/ML-models-learning/blob/main/mahalanobis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import scipy as sp
from scipy.spatial.distance import mahalanobis
from ipykernel import kernelapp as app

In [None]:
# Raw input table, stored column-wise: a 'country' label column followed by
# six numeric columns d1..d6 (one value per country, nine countries in total).
datadict = {
    'country': [
        'Argentina', 'Bolivia', 'Brazil', 'Chile', 'Ecuador',
        'Colombia', 'Paraguay', 'Peru', 'Venezuela',
    ],
    'd1': [0.34, -0.19, 0.37, 1.17, -0.31, -0.3, -0.48, -0.15, -0.61],
    'd2': [-0.57, -0.69, -0.28, 0.68, -2.19, -0.83, -0.53, -1, -1.39],
    'd3': [-0.02, -0.55, 0.07, 1.2, -0.14, -0.85, -0.9, -0.47, -1.02],
    'd4': [-0.69, -0.18, 0.05, 1.43, -0.02, -0.7, -0.72, 0.23, -1.08],
    'd5': [-0.83, -0.69, -0.39, 1.31, -0.7, -0.75, -1.04, -0.52, -1.22],
    'd6': [-0.45, -0.77, 0.05, 1.37, -0.1, -0.67, -1.4, -0.35, -0.89],
}

In [None]:
# Country pairs to compare: row i pairs country1[i] with country2[i].
# The last pair compares Peru with itself.
pairsdict = {
    'country1': ['Argentina', 'Chile', 'Ecuador', 'Peru'],
    'country2': ['Bolivia', 'Venezuela', 'Colombia', 'Peru'],
}

In [None]:
# Per-country table built from `datadict`: one row per country,
# with the 'country' label column plus the numeric columns d1..d6.
df = pd.DataFrame(data=datadict)
print(df)

     country    d1    d2    d3    d4    d5    d6
0  Argentina  0.34 -0.57 -0.02 -0.69 -0.83 -0.45
1    Bolivia -0.19 -0.69 -0.55 -0.18 -0.69 -0.77
2     Brazil  0.37 -0.28  0.07  0.05 -0.39  0.05
3      Chile  1.17  0.68  1.20  1.43  1.31  1.37
4    Ecuador -0.31 -2.19 -0.14 -0.02 -0.70 -0.10
5   Colombia -0.30 -0.83 -0.85 -0.70 -0.75 -0.67
6   Paraguay -0.48 -0.53 -0.90 -0.72 -1.04 -1.40
7       Peru -0.15 -1.00 -0.47  0.23 -0.52 -0.35
8  Venezuela -0.61 -1.39 -1.02 -1.08 -1.22 -0.89


In [None]:
# Table of the country pairs whose Mahalanobis distance will be computed
# (columns: country1, country2).
pairs = pd.DataFrame(data=pairsdict)
print(pairs)

    country1   country2
0  Argentina    Bolivia
1      Chile  Venezuela
2    Ecuador   Colombia
3       Peru       Peru


In [None]:
# Attach each country's d1..d6 values to both sides of every pair.
# Start from only the two key columns so this cell can be re-run safely:
# merging onto an already-merged `pairs` would accumulate extra or
# colliding columns.
# Both merges bring in the same column names from `df`, so pandas suffixes
# the country1 side with `_x` and the country2 side with `_y`.
pairs = (
    pairs[['country1', 'country2']]
    .merge(df, how='left', left_on=['country1'], right_on=['country'])
    .merge(df, how='left', left_on=['country2'], right_on=['country'])
)
pairs

Unnamed: 0,country1,country2,country_x,d1_x,d2_x,d3_x,d4_x,d5_x,d6_x,country_y,d1_y,d2_y,d3_y,d4_y,d5_y,d6_y
0,Argentina,Bolivia,Argentina,0.34,-0.57,-0.02,-0.69,-0.83,-0.45,Bolivia,-0.19,-0.69,-0.55,-0.18,-0.69,-0.77
1,Chile,Venezuela,Chile,1.17,0.68,1.2,1.43,1.31,1.37,Venezuela,-0.61,-1.39,-1.02,-1.08,-1.22,-0.89
2,Ecuador,Colombia,Ecuador,-0.31,-2.19,-0.14,-0.02,-0.7,-0.1,Colombia,-0.3,-0.83,-0.85,-0.7,-0.75,-0.67
3,Peru,Peru,Peru,-0.15,-1.0,-0.47,0.23,-0.52,-0.35,Peru,-0.15,-1.0,-0.47,0.23,-0.52,-0.35


In [None]:
# Collapse the six merged value columns of each side into a single list-valued
# cell per row: vector1 from the `_x` columns, vector2 from the `_y` columns.
pairs['vector1'] = (
    pairs.loc[:, ['d1_x','d2_x','d3_x','d4_x','d5_x','d6_x']]
    .values
    .tolist()
)
pairs['vector2'] = (
    pairs.loc[:, ['d1_y','d2_y','d3_y','d4_y','d5_y','d6_y']]
    .values
    .tolist()
)
print(pairs)

    country1   country2  country_x  d1_x  d2_x  d3_x  d4_x  d5_x  d6_x  \
0  Argentina    Bolivia  Argentina  0.34 -0.57 -0.02 -0.69 -0.83 -0.45   
1      Chile  Venezuela      Chile  1.17  0.68  1.20  1.43  1.31  1.37   
2    Ecuador   Colombia    Ecuador -0.31 -2.19 -0.14 -0.02 -0.70 -0.10   
3       Peru       Peru       Peru -0.15 -1.00 -0.47  0.23 -0.52 -0.35   

   country_y  d1_y  d2_y  d3_y  d4_y  d5_y  d6_y  \
0    Bolivia -0.19 -0.69 -0.55 -0.18 -0.69 -0.77   
1  Venezuela -0.61 -1.39 -1.02 -1.08 -1.22 -0.89   
2   Colombia -0.30 -0.83 -0.85 -0.70 -0.75 -0.67   
3       Peru -0.15 -1.00 -0.47  0.23 -0.52 -0.35   

                                     vector1  \
0  [0.34, -0.57, -0.02, -0.69, -0.83, -0.45]   
1        [1.17, 0.68, 1.2, 1.43, 1.31, 1.37]   
2   [-0.31, -2.19, -0.14, -0.02, -0.7, -0.1]   
3   [-0.15, -1.0, -0.47, 0.23, -0.52, -0.35]   

                                      vector2  
0  [-0.19, -0.69, -0.55, -0.18, -0.69, -0.77]  
1  [-0.61, -1.39, -1.02, -1.08,

In [None]:
# Working table for the distance computation: pair labels plus the two vectors.
# Take an explicit copy so that adding columns to `mahala` later does not
# write into a slice of `pairs` (which triggers SettingWithCopyWarning).
mahala = pairs[['country1', 'country2', 'vector1', 'vector2']].copy()

In [None]:
# Covariance matrix of the six value columns, estimated over every row of `df`,
# and its inverse (needed by scipy's mahalanobis()).
# The numeric columns are selected explicitly because `df` also holds the
# string column 'country'; DataFrame.cov() no longer drops non-numeric columns
# silently on pandas >= 2.0 and raises instead.
# The column order matches the order used to build vector1/vector2, so the
# inverse matrix lines up with those vectors.
covmx = df[['d1', 'd2', 'd3', 'd4', 'd5', 'd6']].cov()
invcovmx = sp.linalg.inv(covmx)

In [None]:
# Mahalanobis distance between vector1 and vector2 of every pair, using the
# inverse covariance matrix `invcovmx`.
# assign() returns a new DataFrame instead of setting a column on a frame that
# may be a slice of `pairs`, which avoids SettingWithCopyWarning.
mahala = mahala.assign(
    mahala_dist=mahala.apply(
        lambda row: mahalanobis(row['vector1'], row['vector2'], invcovmx),
        axis=1,
    )
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [None]:
# Keep only the pair labels and the computed distance; the list-valued
# vector columns are dropped from the result table.
mahala = mahala.loc[:, ['country1', 'country2', 'mahala_dist']]

In [None]:
# Show the result table: one row per country pair with its Mahalanobis distance.
mahala

Unnamed: 0,country1,country2,mahala_dist
0,Argentina,Bolivia,3.003186
1,Chile,Venezuela,3.82902
2,Ecuador,Colombia,3.915868
3,Peru,Peru,0.0
