In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC,LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import ADASYN
from sklearn.metrics import hamming_loss,accuracy_score
import warnings
from sklearn.cluster import KMeans
from scipy.spatial import distance
from scipy.spatial.distance import hamming
import matplotlib.pyplot as plt 
from sklearn.metrics import silhouette_score,hamming_loss
from sklearn.multiclass import OneVsRestClassifier
warnings.filterwarnings('ignore')



In [3]:
MFCC_data=pd.read_csv("https://raw.githubusercontent.com/71sgupta/HW5_ML/master/Frogs_MFCCs.csv")
#MFCC_data

# 1 a) Train and Test Split

In [4]:
MFCC_train_data,MFCC_test_data=train_test_split(MFCC_data, test_size=0.3)
print("Train Data : ")
print(len(MFCC_train_data))
print(MFCC_train_data.head())

print("Test Data : ")
print(MFCC_test_data.head())

Train Data : 
5036
      MFCCs_ 1  MFCCs_ 2  MFCCs_ 3  MFCCs_ 4  MFCCs_ 5  MFCCs_ 6  MFCCs_ 7  \
4519       1.0  0.133752  0.193796  0.556891  0.184403  0.071990 -0.129180   
6395       1.0  0.493082  0.504096  0.246327  0.023502  0.175972  0.091741   
6295       1.0  0.212997  0.303120  0.302054  0.152182  0.202291  0.125354   
3463       1.0  0.167891  0.185879  0.546299  0.184312  0.058415 -0.098446   
5736       1.0  0.514452  0.449875  0.272170  0.036705  0.123338  0.027606   

      MFCCs_ 8  MFCCs_ 9  MFCCs_10  ...  MFCCs_17  MFCCs_18  MFCCs_19  \
4519 -0.017578  0.193294  0.082975  ...  0.211924 -0.079567 -0.191343   
6395 -0.064137  0.066681  0.137811  ...  0.024912 -0.012978  0.025401   
6295 -0.154379 -0.054431  0.105831  ...  0.121852  0.034841 -0.002391   
3463 -0.032581  0.097713 -0.016290  ...  0.069935 -0.129639 -0.117241   
5736 -0.030304  0.077557  0.062399  ...  0.057378 -0.001126  0.026482   

      MFCCs_20  MFCCs_21  MFCCs_22           Family      Genus  \
4519 -0

# 1 b) i) Research exact match and hamming score/ loss methods for evaluating multi-label classification and use them in evaluating the classifiers in this problem.

##### Exact Match : This method ignores partially correct(considering them incorrect) and extend the accuracy of single label case for multi-label prediction. This is called Exact Match Ratio

##### Hamming Loss : It gives how many times relevance of an example to a class label is incorrectly predicted It takes into account the prediction error(incorrect label predicted) and the missing error(a relevant label is not predicted),normalized over total number of classes and total number of examples

# 1 b(ii)

In [109]:
X=MFCC_train_data.iloc[:,:-4]
y=pd.DataFrame(MFCC_train_data.iloc[:,-4])
params_dict = {"C": np.logspace(-3, 6, 20), "gamma": np.linspace(0.0001, 10, 10)}
svm = SVC(kernel="rbf",tol=0.1)
search = GridSearchCV(estimator=svm, param_grid=params_dict,cv=10)
search.fit(X, np.ravel(y))
res=search.best_params_

In [111]:
print("The weight for svm penalty is : "+str(round(res['C'],5)))
print("The width of guassian is : "+str(round(1/np.sqrt(2*res['gamma']),5)))

X=MFCC_train_data.iloc[:,:-4]
X_test=MFCC_test_data.iloc[:,:-4]
y_test=MFCC_test_data.iloc[:,-4]
y=pd.DataFrame(MFCC_train_data.iloc[:,-4])
#svm = SVC(kernel="rbf",decision_function_shape='ovr',C=res['C'],gamma=res['gamma'])
#svm.fit(X,np.ravel(y))
pred=search.predict(X_test)
hm_loss=hamming_loss(y_test,pred)
acc_score=accuracy_score(y_test,pred)
print("Hamming Loss for Family using guassian kernel and one vs rest classifier: "+str(round(hm_loss,5)))
print("Exact Match score for Family using guassian kernel and one vs rest classifier: "+str(round(acc_score,5)))

The weight for svm penalty is : 54.55595
The width of guassian is : 0.38729
Hamming Loss for Family using guassian kernel and one vs rest classifier: 0.01065
Exact Match score for Family using guassian kernel and one vs rest classifier: 0.98935


In [112]:
X=MFCC_train_data.iloc[:,:-4]
y=pd.DataFrame(MFCC_train_data.iloc[:,-3])
params_dict = {"C": np.logspace(-3, 6, 20), "gamma": np.linspace(0.0001, 10, 10)}
svm = SVC(kernel="rbf",decision_function_shape='ovr')
search = GridSearchCV(estimator=svm, param_grid=params_dict,cv=10)
search.fit(X, np.ravel(y))
res1=search.best_params_

In [113]:
print("The weight for svm penalty is : "+str(round(res1['C'],5)))
print("The width of guassian is : "+str(round(1/np.sqrt(2*res1['gamma']),5)))

X=MFCC_train_data.iloc[:,:-4]
X_test=MFCC_test_data.iloc[:,:-4]
y_test=MFCC_test_data.iloc[:,-3]
y=pd.DataFrame(MFCC_train_data.iloc[:,-3])
svm = SVC(kernel="rbf",decision_function_shape='ovr',C=res1['C'],gamma=res1['gamma'])
svm.fit(X,np.ravel(y))
pred=svm.predict(X_test)
hm_loss1=hamming_loss(y_test,pred)
acc_score1=accuracy_score(y_test,pred)
print("Hamming Loss for Genus using guassian kernel and one vs rest classifier: "+str(round(hm_loss1,5)))
print("Exact Match score for Genus using guassian kernel and one vs rest classifier: "+str(round(acc_score1,5)))

The weight for svm penalty is : 54.55595
The width of guassian is : 0.38729
Hamming Loss for Genus using guassian kernel and one vs rest classifier: 0.01112
Exact Match score for Genus using guassian kernel and one vs rest classifier: 0.98888


In [114]:
X=MFCC_train_data.iloc[:,:-4]
y=pd.DataFrame(MFCC_train_data.iloc[:,-2])
params_dict = {"C": np.logspace(-3, 6, 20), "gamma": np.linspace(0.0001, 10, 10)}
svm = SVC(kernel="rbf",decision_function_shape='ovr')
search = GridSearchCV(estimator=svm, param_grid=params_dict,cv=10)
search.fit(X, np.ravel(y))
res2=search.best_params_

In [115]:
print("The weight for svm penalty is : "+str(round(res2['C'],5)))
print("The width of guassian is : "+str(round(1/np.sqrt(2*res2['gamma']),5)))

X=MFCC_train_data.iloc[:,:-4]
X_test=MFCC_test_data.iloc[:,:-4]
y_test=MFCC_test_data.iloc[:,-2]
y=pd.DataFrame(MFCC_train_data.iloc[:,-2])
svm = SVC(kernel="rbf",decision_function_shape='ovr',C=res2['C'],gamma=res2['gamma'])
svm.fit(X,np.ravel(y))
pred=svm.predict(X_test)
hm_loss2=hamming_loss(y_test,pred)
acc_score2=accuracy_score(y_test,pred)
print("Hamming Loss for Species using guassian kernel and one vs rest classifier: "+str(round(hm_loss2,5)))
print("Exact Match score for Species using guassian kernel and one vs rest classifier: "+str(round(acc_score2,5)))

The weight for svm penalty is : 6.15848
The width of guassian is : 0.38729
Hamming Loss for Species using guassian kernel and one vs rest classifier: 0.01112
Exact Match score for Species using guassian kernel and one vs rest classifier: 0.98888


In [116]:
avg_res=(hm_loss+hm_loss1+hm_loss2)/3
avg_em=(acc_score+acc_score1+acc_score2)/3

print("The average hamming loss using guassian kernel and one vs rest classifier is : "+str(round(avg_res,5)))
print("The average exact match score using guassian kernel and one vs rest classifier is : "+str(round(avg_em,5)))

The average hamming loss using guassian kernel and one vs rest classifier is : 0.01096
The average exact match score using guassian kernel and one vs rest classifier is : 0.98904


# 1 b(ii) After Standardization

In [None]:
X=MFCC_train_data.iloc[:,:-4]
y=pd.DataFrame(MFCC_train_data.iloc[:,-4])
scaler=StandardScaler()
X_scaled=scaler.fit_transform(X)
params_dict = {"C": np.logspace(-3, 6, 20), "gamma": np.linspace(0.0001, 10, 10)}
svm = SVC(kernel="rbf",decision_function_shape='ovr')

search = GridSearchCV(estimator=svm, param_grid=params_dict,cv=10)

search.fit(X_scaled, np.ravel(y))
res3=search.best_params_

In [136]:
print("The weight for svm penalty is : "+str(round(res3['C'],5)))
print("The width of guassian is : "+str(round(1/np.sqrt(2*res3['gamma']),5)))

X=MFCC_train_data.iloc[:,:-4]
X_test=MFCC_test_data.iloc[:,:-4]
scaler=StandardScaler()
X=scaler.fit_transform(X)
X_test=scaler.transform(X_test)
y_test=MFCC_test_data.iloc[:,-4]
y=pd.DataFrame(MFCC_train_data.iloc[:,-4])
svm = SVC(kernel="rbf",decision_function_shape='ovr',C=res3['C'],gamma=res3['gamma'])
svm.fit(X,np.ravel(y))
pred=svm.predict(X_test)
hm_loss3=hamming_loss(y_test,pred)
acc_score3=accuracy_score(y_test,pred)
print("Hamming Loss for Family using guassian kernel and one vs rest classifier: "+str(round(hm_loss3,5)))
print("Exact Match score for Family using guassian kernel and one vs rest classifier: "+str(round(acc_score3,5)))

The weight for svm penalty is : 112883.78917
The width of guassian is : 70.71068
Hamming Loss for Family using guassian kernel and one vs rest classifier: 0.01621
Exact Match score for Family using guassian kernel and one vs rest classifier: 0.98379


In [23]:
X=MFCC_train_data.iloc[:,:-4]
y=pd.DataFrame(MFCC_train_data.iloc[:,-3])
scaler=StandardScaler()
X_scaled=scaler.fit_transform(X)
params_dict = {"C": np.logspace(-3, 6, 20), "gamma": np.linspace(0.0001, 10, 10)}
svm = SVC(kernel="rbf",decision_function_shape='ovr')

search = GridSearchCV(estimator=svm, param_grid=params_dict,cv=10)

search.fit(X_scaled, np.ravel(y))
res4=search.best_params_

In [137]:
print("The weight for svm penalty is : "+str(round(res4['C'],5)))
print("The width of guassian is : "+str(round(1/np.sqrt(2*res4['gamma']),5)))

X=MFCC_train_data.iloc[:,:-4]
X_test=MFCC_test_data.iloc[:,:-4]
scaler=StandardScaler()
X=scaler.fit_transform(X)
X_test=scaler.transform(X_test)
y_test=MFCC_test_data.iloc[:,-3]
y=pd.DataFrame(MFCC_train_data.iloc[:,-3])
svm = SVC(kernel="rbf",decision_function_shape='ovr',C=res4['C'],gamma=res4['gamma'])
svm.fit(X,np.ravel(y))
pred=svm.predict(X_test)
hm_loss4=hamming_loss(y_test,pred)
acc_score4=accuracy_score(y_test,pred)
print("Hamming Loss for Genus using guassian kernel and one vs rest classifier: "+str(round(hm_loss4,5)))
print("Exact Match score for Genus using guassian kernel and one vs rest classifier: "+str(round(acc_score4,5)))

The weight for svm penalty is : 335981.82863
The width of guassian is : 70.71068
Hamming Loss for Genus using guassian kernel and one vs rest classifier: 0.01482
Exact Match score for Genus using guassian kernel and one vs rest classifier: 0.98518


In [None]:
X=MFCC_train_data.iloc[:,:-4]
y=pd.DataFrame(MFCC_train_data.iloc[:,-2])
scaler=StandardScaler()
X_scaled=scaler.fit_transform(X)
params_dict = {"C": np.logspace(-3, 6, 20), "gamma": np.linspace(0.0001, 10, 10)}
svm = SVC(kernel="rbf",decision_function_shape='ovr')

search = GridSearchCV(estimator=svm, param_grid=params_dict,cv=10)

search.fit(X_scaled, np.ravel(y))
res5=search.best_params_

In [5]:
print("The weight for svm penalty is : "+str(round(res5['C'],5)))
print("The width of guassian is : "+str(round(1/np.sqrt(2*res5['gamma']),5)))

X=MFCC_train_data.iloc[:,:-4]
X_test=MFCC_test_data.iloc[:,:-4]
scaler=StandardScaler()
X=scaler.fit_transform(X)
X_test=scaler.transform(X_test)
y_test=MFCC_test_data.iloc[:,-2]
y=pd.DataFrame(MFCC_train_data.iloc[:,-2])
svm = SVC(kernel="rbf",decision_function_shape='ovr',C=res5['C'],gamma=res5['gamma'])
svm.fit(X,np.ravel(y))
pred=svm.predict(X_test)
hm_loss5=hamming_loss(y_test,pred)
acc_score5=accuracy_score(y_test,pred)
print("Hamming Loss for Species using guassian kernel and one vs rest classifier: "+str(round(hm_loss5,5)))
print("Exact Match score for Species using guassian kernel and one vs rest classifier: "+str(round(acc_score5,5)))

The weight for svm penalty is : 112883.78917
The width of guassian is : 70.71068
Hamming Loss for Species using guassian kernel and one vs rest classifier: 0.02084
Exact Match score for Species using guassian kernel and one vs rest classifier: 0.97916


In [7]:
avg_res=(hm_loss3+hm_loss4+hm_loss5)/3
avg_em=(acc_score3+acc_score4+acc_score5)/3

print("The average hamming loss using guassian kernel and one vs rest classifier after standardization is : "+str(round(avg_res,5)))
print("The average exact match score using guassian kernel and one vs rest classifier after standardization is : "+str(round(avg_em,5)))


The average hamming loss using guassian kernel and one vs rest classifier after standardization is : 0.01729
The average exact match score using guassian kernel and one vs rest classifier after standardization is : 0.98271


# 1 b(iii)

In [117]:
X=MFCC_train_data.iloc[:,:-4]
y=pd.DataFrame(MFCC_train_data.iloc[:,-4])
scaler=StandardScaler()
X_scaled=scaler.fit_transform(X)
params_dict = {"C": np.logspace(-3, 6,20)}
svm = LinearSVC(penalty='l1',dual=False,max_iter=20000)

search = GridSearchCV(estimator=svm, param_grid=params_dict,cv=10)

search.fit(X_scaled, np.ravel(y))
res6=search.best_params_

In [118]:
print("The weight for svm penalty is : "+str(round(res6['C'],5)))

X=MFCC_train_data.iloc[:,:-4]
X_test=MFCC_test_data.iloc[:,:-4]
scaler=StandardScaler()
X=scaler.fit_transform(X)
X_test=scaler.transform(X_test)
y_test=MFCC_test_data.iloc[:,-4]
y=pd.DataFrame(MFCC_train_data.iloc[:,-4])
svm = LinearSVC(penalty='l1',C=res6['C'],dual=False,max_iter=20000)
svm.fit(X,np.ravel(y))
pred=svm.predict(X_test)
hm_loss6=hamming_loss(y_test,pred)
acc_score6=accuracy_score(y_test,pred)
print("Hamming Loss for Family using l1 penalty: "+str(round(hm_loss6,5)))
print("Exact Match score for Family using l1 penalty: "+str(round(acc_score6,5)))

The weight for svm penalty is : 54.55595
Hamming Loss for Family using l1 penalty: 0.06623
Exact Match score for Family using l1 penalty: 0.93377


In [119]:
X=MFCC_train_data.iloc[:,:-4]
y=pd.DataFrame(MFCC_train_data.iloc[:,-3])
scaler=StandardScaler()
X_scaled=scaler.fit_transform(X)
params_dict = {"C": np.logspace(-3, 6, 20)}
svm = LinearSVC(penalty='l1',dual=False,max_iter=20000)

search = GridSearchCV(estimator=svm, param_grid=params_dict,cv=10)

search.fit(X_scaled, np.ravel(y))
res7=search.best_params_

In [120]:
print("The weight for svm penalty is : "+str(round(res7['C'],5)))

X=MFCC_train_data.iloc[:,:-4]
X_test=MFCC_test_data.iloc[:,:-4]
scaler=StandardScaler()
X=scaler.fit_transform(X)
X_test=scaler.transform(X_test)
y_test=MFCC_test_data.iloc[:,-3]
y=pd.DataFrame(MFCC_train_data.iloc[:,-3])
svm = LinearSVC(penalty='l1',C=res7['C'],dual=False,max_iter=20000)
svm.fit(X,np.ravel(y))
pred=svm.predict(X_test)
hm_loss7=hamming_loss(y_test,pred)
acc_score7=accuracy_score(y_test,pred)
print("Hamming Loss for Genus using l1 penalty: "+str(round(hm_loss7,5)))
print("Exact Match score for Genus using l1 penalty: "+str(round(acc_score7,5)))

The weight for svm penalty is : 6.15848
Hamming Loss for Genus using l1 penalty: 0.05373
Exact Match score for Genus using l1 penalty: 0.94627


In [121]:
X=MFCC_train_data.iloc[:,:-4]
y=pd.DataFrame(MFCC_train_data.iloc[:,-2])
scaler=StandardScaler()
X_scaled=scaler.fit_transform(X)
params_dict = {"C": np.logspace(-3, 6, 20)}
svm = LinearSVC(penalty='l1',dual=False,max_iter=20000)

search = GridSearchCV(estimator=svm, param_grid=params_dict,cv=10)

search.fit(X_scaled, np.ravel(y))
res8=search.best_params_

In [122]:
print("The weight for svm penalty is : "+str(round(res8['C'],5)))

X=MFCC_train_data.iloc[:,:-4]
X_test=MFCC_test_data.iloc[:,:-4]
scaler=StandardScaler()
X=scaler.fit_transform(X)
X_test=scaler.transform(X_test)
y_test=MFCC_test_data.iloc[:,-2]
y=pd.DataFrame(MFCC_train_data.iloc[:,-2])
svm = LinearSVC(penalty='l1',C=res8['C'],dual=False,max_iter=20000)
svm.fit(X,np.ravel(y))
pred=svm.predict(X_test)
hm_loss8=hamming_loss(y_test,pred)
acc_score8=accuracy_score(y_test,pred)
print("Hamming Loss for Species using l1 penalty: "+str(round(hm_loss8,5)))
print("Exact Match score for Species using l1 penalty: "+str(round(acc_score8,5)))

The weight for svm penalty is : 2.06914
Hamming Loss for Species using l1 penalty: 0.04169
Exact Match score for Species using l1 penalty: 0.95831


In [123]:
avg_res=(hm_loss6+hm_loss7+hm_loss8)/3
avg_em=(acc_score6+acc_score7+acc_score8)/3

print("The average hamming loss using l1 penalty is : "+str(round(avg_res,5)))
print("The average exact match score using l1 penalty is : "+str(round(avg_em,5)))

The average hamming loss using l1 penalty is : 0.05388
The average exact match score using l1 penalty is : 0.94612


# 1 b(iv)

In [124]:
X=MFCC_train_data.iloc[:,:-4]
y=pd.DataFrame(MFCC_train_data.iloc[:,-4])
scaler=StandardScaler()
X_scaled=scaler.fit_transform(X)
params_dict = {"C": np.logspace(-3,6,20)}
svm = LinearSVC(penalty='l1',class_weight='balanced',dual=False,max_iter=20000)
search = GridSearchCV(estimator=svm, param_grid=params_dict,cv=10)
search.fit(X_scaled, np.ravel(y))
res9=search.best_params_

In [125]:
print("The weight for svm penalty is : "+str(round(res9['C'],5)))

X=MFCC_train_data.iloc[:,:-4]
X_test=MFCC_test_data.iloc[:,:-4]
scaler=StandardScaler()
X=scaler.fit_transform(X)
X_test=scaler.transform(X_test)
y_test=MFCC_test_data.iloc[:,-4]
y=pd.DataFrame(MFCC_train_data.iloc[:,-4])
svm = LinearSVC(penalty='l1',C=res9['C'],class_weight='balanced',dual=False,max_iter=20000)
svm.fit(X,np.ravel(y))
pred=svm.predict(X_test)
hm_loss9=hamming_loss(y_test,pred)
acc_score9=accuracy_score(y_test,pred)
print("Hamming Loss for Family using l1 penalty: "+str(round(hm_loss9,5)))
print("Exact Match score for Family using l1 penalty: "+str(round(acc_score9,5)))

The weight for svm penalty is : 2.06914
Hamming Loss for Family using l1 penalty: 0.07272
Exact Match score for Family using l1 penalty: 0.92728


In [126]:
X=MFCC_train_data.iloc[:,:-4]
y=pd.DataFrame(MFCC_train_data.iloc[:,-3])
scaler=StandardScaler()
X_scaled=scaler.fit_transform(X)
params_dict = {"C": np.logspace(-3,6,20)}
svm = LinearSVC(penalty='l1',class_weight='balanced',dual=False,max_iter=20000)
search = GridSearchCV(estimator=svm, param_grid=params_dict,cv=10)
search.fit(X_scaled, np.ravel(y))
res10=search.best_params_

In [127]:
print("The weight for svm penalty is : "+str(round(res10['C'],5)))

X=MFCC_train_data.iloc[:,:-4]
X_test=MFCC_test_data.iloc[:,:-4]
scaler=StandardScaler()
X=scaler.fit_transform(X)
X_test=scaler.transform(X_test)
y_test=MFCC_test_data.iloc[:,-3]
y=pd.DataFrame(MFCC_train_data.iloc[:,-3])
svm = LinearSVC(penalty='l1',C=res10['C'],class_weight='balanced',dual=False,max_iter=20000)
svm.fit(X,np.ravel(y))
pred=svm.predict(X_test)
hm_loss10=hamming_loss(y_test,pred)
acc_score10=accuracy_score(y_test,pred)
print("Hamming Loss for Genus using l1 penalty: "+str(round(hm_loss10,5)))
print("Exact Match score for Genus using l1 penalty: "+str(round(acc_score10,5)))

The weight for svm penalty is : 0.69519
Hamming Loss for Genus using l1 penalty: 0.05604
Exact Match score for Genus using l1 penalty: 0.94396


In [128]:
X=MFCC_train_data.iloc[:,:-4]
y=pd.DataFrame(MFCC_train_data.iloc[:,-2])
scaler=StandardScaler()
X_scaled=scaler.fit_transform(X)
params_dict = {"C": np.logspace(-3,6,20)}
svm = LinearSVC(penalty='l1',class_weight='balanced',dual=False,max_iter=20000)
search = GridSearchCV(estimator=svm, param_grid=params_dict,cv=10)
search.fit(X_scaled, np.ravel(y))
res11=search.best_params_


In [129]:
print("The weight for svm penalty is : "+str(round(res11['C'],5)))

X=MFCC_train_data.iloc[:,:-4]
X_test=MFCC_test_data.iloc[:,:-4]
scaler=StandardScaler()
X=scaler.fit_transform(X)
X_test=scaler.transform(X_test)
y_test=MFCC_test_data.iloc[:,-2]
y=pd.DataFrame(MFCC_train_data.iloc[:,-2])
svm = LinearSVC(penalty='l1',C=res11['C'],class_weight='balanced',dual=False,max_iter=20000)
svm.fit(X,np.ravel(y))
pred=svm.predict(X_test)
hm_loss11=hamming_loss(y_test,pred)
acc_score11=accuracy_score(y_test,pred)
print("Hamming Loss for Species using l1 penalty: "+str(round(hm_loss11,5)))
print("Exact Match score for Species using l1 penalty: "+str(round(acc_score11,5)))

The weight for svm penalty is : 0.69519
Hamming Loss for Species using l1 penalty: 0.04076
Exact Match score for Species using l1 penalty: 0.95924


In [132]:
avg_res=(hm_loss9+hm_loss10+hm_loss11)/3
avg_em=(acc_score9+acc_score10+acc_score11)/3

print("The average hamming loss using l1 penalty is : "+str(round(avg_res,5)))
print("The average exact match score using l1 penalty is : "+str(round(avg_em,5)))


The average hamming loss using l1 penalty is : 0.05651
The average exact match score using l1 penalty is : 0.94349


### After comparing all the trained classifiers we found that classifier with guassian kernel gave better results i.e high accuracy and low hamming loss as compared to the l1 penalized SVM .So the data is not linearly separable as we got higher accuracy with guassian kernels classifiers.

# 2. K-Means Clustering on a Multi-Class and Multi-Label Data Set

In [131]:
MFCC_data=pd.read_csv("https://raw.githubusercontent.com/71sgupta/HW5_ML/master/Frogs_MFCCs.csv")
print(MFCC_data.head())

   MFCCs_ 1  MFCCs_ 2  MFCCs_ 3  MFCCs_ 4  MFCCs_ 5  MFCCs_ 6  MFCCs_ 7  \
0       1.0  0.152936 -0.105586  0.200722  0.317201  0.260764  0.100945   
1       1.0  0.171534 -0.098975  0.268425  0.338672  0.268353  0.060835   
2       1.0  0.152317 -0.082973  0.287128  0.276014  0.189867  0.008714   
3       1.0  0.224392  0.118985  0.329432  0.372088  0.361005  0.015501   
4       1.0  0.087817 -0.068345  0.306967  0.330923  0.249144  0.006884   

   MFCCs_ 8  MFCCs_ 9  MFCCs_10  ...  MFCCs_17  MFCCs_18  MFCCs_19  MFCCs_20  \
0 -0.150063 -0.171128  0.124676  ... -0.108351 -0.077623 -0.009568  0.057684   
1 -0.222475 -0.207693  0.170883  ... -0.090974 -0.056510 -0.035303  0.020140   
2 -0.242234 -0.219153  0.232538  ... -0.050691 -0.023590 -0.066722 -0.025083   
3 -0.194347 -0.098181  0.270375  ... -0.136009 -0.177037 -0.130498 -0.054766   
4 -0.265423 -0.172700  0.266434  ... -0.048885 -0.053074 -0.088550 -0.031346   

   MFCCs_21  MFCCs_22           Family      Genus         Species  R

# 2 a)

In [73]:
X=MFCC_data.iloc[:,:-4]
best_k={}
for i in range(2,51):
    model=KMeans(n_clusters=i)
    model.fit(X)
    pred=model.predict(X)
    sum=0
    X_arr=np.array(X)
    centroids=model.cluster_centers_

    score=silhouette_score(X,pred)
    best_k.update({i:score})
values=list(best_k.values())
max_val=max(values)
max_ind=values.index(max_val)
keys=list(best_k.keys())
bestk=keys[max_ind]
print("Best Value of K in k-means clustering using Silhouette is : " +str(bestk))

Best Value of K in k-means clustering using Silhouette is : 4


# 2 b)

In [74]:
def most_frequent(List): 
    return max(set(List), key = List.count) 
  
model=KMeans(n_clusters=bestk)
model.fit(X)
pred=model.predict(X)
samples_in_cluster={}
for i in range(0,bestk):
    indexes=np.where(pred==i)[0]
    samples_in_cluster.update({i:list(indexes)})
family=MFCC_data.iloc[:,-4]
genus=MFCC_data.iloc[:,-3]
species=MFCC_data.iloc[:,-2]
cluster_labels={}
for i in range(0,bestk):
    temp=[]
    p=family[samples_in_cluster[i]]
    q=genus[samples_in_cluster[i]]
    r=species[samples_in_cluster[i]]
    #print(len(p))
    #print(samples_in_cluster[i])
    p_freq=most_frequent(list(p))
    q_freq=most_frequent(list(q))
    r_freq=most_frequent(list(r))
    temp.append(p_freq)
    temp.append(q_freq)
    temp.append(r_freq)
  
    cluster_labels.update({i:temp})
#print(cluster_labels)

for i in range(0,len(cluster_labels)):
    print(" ")
    print("The labels for the "+str(i+1)+" cluster are : ")
    print("Family : "+str(cluster_labels[i][0]))
    print("Genus : "+str(cluster_labels[i][1]))
    print("Species : "+str(cluster_labels[i][2]))



 
The labels for the 1 cluster are : 
Family : Hylidae
Genus : Hypsiboas
Species : HypsiboasCordobae
 
The labels for the 2 cluster are : 
Family : Leptodactylidae
Genus : Adenomera
Species : AdenomeraHylaedactylus
 
The labels for the 3 cluster are : 
Family : Hylidae
Genus : Hypsiboas
Species : HypsiboasCinerascens
 
The labels for the 4 cluster are : 
Family : Dendrobatidae
Genus : Ameerega
Species : Ameeregatrivittata


# 2 c)  

In [153]:
#hamming loss
labels=MFCC_data.iloc[:,-4:-1]
#print(labels)
hm_loss=0
hm_sc=0
for i in range(0,bestk):
    lb=(labels.iloc)[samples_in_cluster[i],:]
    for j in samples_in_cluster[i]:
        hm_loss=hm_loss+hamming_loss(labels.iloc[j,:],cluster_labels[i])
        hm_sc=hm_sc+accuracy_score(labels.iloc[j,:],cluster_labels[i])
print("The average hamming loss/hamming distance is : "+str(hm_loss/len(labels)))
print("The average hamming score is : "+str(hm_sc/len(labels)))


The average hamming loss/hamming distance is : 0.2224229789205482
The average hamming score is : 0.7775770210794481


# Monte carlo- simulation

In [None]:
monte_carlo=[]
monte_carlo_score=[]
for mc in range(1,51):
    X=MFCC_data.iloc[:,:-4]
    best_k={}
    for i in range(2,51):
        model=KMeans(n_clusters=i,random_state=5)
        model.fit(X)
        pred=model.predict(X)
        sum=0
        X_arr=np.array(X)
        centroids=model.cluster_centers_

        score=silhouette_score(X,pred)
        best_k.update({i:score})
    values=list(best_k.values())
    max_val=max(values)
    max_ind=values.index(max_val)
    keys=list(best_k.keys())
    bestk=keys[max_ind]


    def most_frequent(List): 
        return max(set(List), key = List.count) 

    model=KMeans(n_clusters=bestk)
    model.fit(X)
    pred=model.predict(X)
    samples_in_cluster={}
    for i in range(0,bestk):
        indexes=np.where(pred==i)[0]
        samples_in_cluster.update({i:list(indexes)})
    family=MFCC_data.iloc[:,-4]
    genus=MFCC_data.iloc[:,-3]
    species=MFCC_data.iloc[:,-2]
    cluster_labels={}
    for i in range(0,bestk):
        temp=[]
        p=family[samples_in_cluster[i]]
        q=genus[samples_in_cluster[i]]
        r=species[samples_in_cluster[i]]
        #print(len(p))
        #print(samples_in_cluster[i])
        p_freq=most_frequent(list(p))
        q_freq=most_frequent(list(q))
        r_freq=most_frequent(list(r))
        temp.append(p_freq)
        temp.append(q_freq)
        temp.append(r_freq)

        cluster_labels.update({i:temp})
    #print(cluster_labels)

    #hamming loss
    labels=MFCC_data.iloc[:,-4:-1]
    #print(labels)
    hm_loss=0
    hm_score=0
    for i in range(0,bestk):
        lb=(labels.iloc)[samples_in_cluster[i],:]
        for j in samples_in_cluster[i]:
            hm_loss=hm_loss+hamming_loss(labels.iloc[j,:],cluster_labels[i])
            
            hm_score=hm_score+accuracy_score(labels.iloc[j,:],cluster_labels[i])
    #print("The average hamming loss is : "+str(hm_loss/len(labels)))
    ls=(hm_loss/len(labels))
    monte_carlo.append(ls)
    monte_carlo_score.append(hm_score/len(labels))
  
  

In [151]:
m_carlo_df=pd.concat([pd.DataFrame(monte_carlo),pd.DataFrame(monte_carlo_score)],axis=1)
m_carlo_df.columns=['Hamming Distances/Hamming Loss','Hamming Score']
print("Hamming distances ,loss and scores after running 50 times :")
print(m_carlo_df)
stats=m_carlo_df.describe()
print("The average of 50 Hamming Distances/hamming losses : "+str(list(stats.iloc[1:2,0])[0]))
print("The standard deviation of 50 Hamming Distances/hamming losses : "+str(list(stats.iloc[2:3,0])[0]))
print("")
print("The average of 50 Hamming Scores : "+str(list(stats.iloc[1:2,1])[0]))
print("The standard deviation of 50 Hamming Scores : "+str(list(stats.iloc[2:3,1])[0]))

Hamming distances ,loss and scores after running 50 times :
    Hamming Distances/Hamming Loss  Hamming Score
0                         0.222423       0.777577
1                         0.222423       0.777577
2                         0.222423       0.777577
3                         0.222423       0.777577
4                         0.233634       0.766366
5                         0.233356       0.766644
6                         0.222423       0.777577
7                         0.222423       0.777577
8                         0.222423       0.777577
9                         0.222423       0.777577
10                        0.245263       0.754737
11                        0.233356       0.766644
12                        0.222423       0.777577
13                        0.245263       0.754737
14                        0.222284       0.777716
15                        0.222284       0.777716
16                        0.222423       0.777577
17                        0.222423      