# Anuran Calls (MFCCs) Data Set 

Classification using SVM with different kernel functions. 

**Shiyu Mou**

**shiyumou@usc.edu**

In [85]:
import numpy as np
import scipy as sp
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn import preprocessing
from sklearn.metrics import hamming_loss


In [42]:
def load_data():   
    filename = "Anuran Calls (MFCCs)/Frogs_MFCCs.csv"
    df = pd.read_csv(filename)
    X = df.iloc[:, :22]
    y = df.iloc[:, 22:-1]
    y_fam = df['Family']
    y_gen = df['Genus']
    y_spe = df['Species']
    return df, X, y_fam, y_gen, y_spe, y

In [43]:
df, X, y_fam, y_gen, y_spe, y = load_data()
X_train, X_test, y_train, y_test = train_test_split(
         X, y, test_size=0.3, random_state=42)
# print(y)
print(df.shape)
print(y_train.shape)

(7195, 26)
(5036, 3)


## Gaussian Kernel SVM

In [46]:
def GaussianSVM(X_train, y_train):
    params_dict = {"C": np.logspace(-3, 6, 5), "gamma": np.linspace(0.1, 2, 5)}
    svm = SVC(kernel="rbf")
    # grid search
    search = GridSearchCV(estimator=svm, param_grid=params_dict, cv=10)
    search.fit(X_train, y_train)   
    print("Best parameter values:", search.best_params_)
    print("CV Score with best parameter values:", search.best_score_)
    return search.best_params_, search.best_score_, search

In [47]:
# Family
print("The 'Family' Label: ")
fam_params, fam_score, fam_model = GaussianSVM(X_train, y_train['Family'])

The 'Family' Label: 
Best parameter values: {'C': 31.622776601683793, 'gamma': 2.0}
CV Score with best parameter values: 0.9932486100079428


In [48]:
# Genus
print("The 'Genus' Label: ")
Gen_params, Gen_score, Gen_model = GaussianSVM(X_train, y_train['Genus'])

The 'Genus' Label: 
Best parameter values: {'C': 31.622776601683793, 'gamma': 1.525}
CV Score with best parameter values: 0.9910643367752184


In [49]:
# Species
print("The 'Species' Label: ")
Spe_params, Spe_score, Spe_model = GaussianSVM(X_train, y_train['Species'])

The 'Species' Label: 
Best parameter values: {'C': 31.622776601683793, 'gamma': 1.525}
CV Score with best parameter values: 0.9906671961874504


### Train and test with the best parameters.

In [102]:
def GaussianSVM_best(X_train, y_train, C_, gamma_, X_test):
    svm = SVC(kernel="rbf", C=C_, gamma=gamma_)
    svm.fit(X_train, y_train)   
    y_predict = svm.predict(X_test)
    return y_predict

In [178]:
# Family
y_p_fam = GaussianSVM_best(X_train, y_train['Family'], fam_params['C'], fam_params['gamma'], X_test)
y_p_fam = pd.DataFrame(y_p_fam)
# Genus
y_p_gen = GaussianSVM_best(X_train, y_train['Genus'], Gen_params['C'], Gen_params['gamma'], X_test)
y_p_gen = pd.DataFrame(y_p_gen)
# Species
y_p_spe = GaussianSVM_best(X_train, y_train['Species'], Spe_params['C'], Spe_params['gamma'], X_test)
y_p_spe = pd.DataFrame(y_p_spe)
# combine
y_predict = pd.concat([y_p_fam, y_p_gen, y_p_spe], axis=1, sort=False)

In [123]:
def hammingLoss(y_predict, y_test):
    loss_list = list()
    for i in range(len(y_test)):
        loss_list.append(hamming_loss(y_predict.iloc[i,:], y_test.iloc[i,:]))
#     print(loss_list)
    return sum(loss_list) / len(loss_list)  

In [164]:
def exactMatch(y_predict, y_test):
    loss_list = list()
    for i in range(len(y_test)):
        if set(y_predict.values[i,:]) == set(y_test.iloc[i,:]):
            loss_list.append(1)
        else:
            loss_list.append(0)
#     print(loss_list)
    return sum(loss_list) / len(loss_list)     

In [179]:
h_loss = hammingLoss(y_predict, y_test)
print('Hamming loss for Gaussian Kernel: ', h_loss)       
e_score = exactMatch(y_predict, y_test)
print('Exact Match Score for Gaussian Kernel: ', e_score)  

Hamming loss for Gaussian Kernel:  0.007410838351088467
Exact Match Score for Gaussian Kernel:  0.9898100972672533


## L1 penalization SVM

In [62]:
def normalize(df):
    x = df.values #returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    df = pd.DataFrame(x_scaled)
    return df

In [64]:
X_train_norm = normalize(X_train)
X_test_norm = normalize(X_test)
# y_train_norm = normalize(y_train)
# y_test_norm = normalize(y_test)

In [56]:
def L1PSVM(X_train, y_train):
    params_dict = {"C": np.logspace(-3, 6, 10)}
#     svm = SVC(kernel="rbf")
    svm = LinearSVC(penalty='l1', dual=False)
    # grid search
    search = GridSearchCV(estimator=svm, param_grid=params_dict, cv=10)
    search.fit(X_train, y_train)   
    print("Best parameter values:", search.best_params_)
    print("CV Score with best parameter values:", search.best_score_)
    return search.best_params_, search.best_score_, search

In [65]:
# Family
print("The 'Family' Label: ")
fam_params_l1, fam_score_l1, fam_model_l1 = L1PSVM(X_train_norm, y_train['Family'])

The 'Family' Label: 
Best parameter values: {'C': 10.0}
CV Score with best parameter values: 0.937251787132645


In [66]:
# Genus
print("The 'Genus' Label: ")
Gen_params_l1, Gen_score_l1, Gen_model_l1 = L1PSVM(X_train_norm, y_train['Genus'])

The 'Genus' Label: 
Best parameter values: {'C': 100.0}
CV Score with best parameter values: 0.9479745830023828


In [67]:
# Species
print("The 'Species' Label: ")
Spe_params_l1, Spe_score_l1, Spe_model_l1 = L1PSVM(X_train_norm, y_train['Species'])

The 'Species' Label: 
Best parameter values: {'C': 1000000.0}
CV Score with best parameter values: 0.9577045274027005


In [125]:
def L1_SVM_best(X_train, y_train, C_, X_test):
    svm = LinearSVC(penalty='l1', dual=False, C=C_)
    svm.fit(X_train, y_train)   
    y_predict = svm.predict(X_test)
    return y_predict

In [195]:
# Family
y_p_fam_l1 = L1_SVM_best(X_train, y_train['Family'], fam_params_l1['C'], X_test)
y_p_fam_l1 = pd.DataFrame(y_p_fam_l1)
# Genus
y_p_gen_l1 = L1_SVM_best(X_train, y_train['Genus'], Gen_params_l1['C'], X_test)
y_p_gen_l1 = pd.DataFrame(y_p_gen_l1)
# Species
y_p_spe_l1 = L1_SVM_best(X_train, y_train['Species'], Spe_params_l1['C'], X_test)
y_p_spe_l1 = pd.DataFrame(y_p_spe_l1)
# combine
y_predict_l1 = pd.concat([y_p_fam_l1, y_p_gen_l1, y_p_spe_l1], axis=1, sort=False)

In [196]:
h_loss_l1 = hammingLoss(y_predict_l1, y_test)
print('Hamming loss for L1 Penalty: ', h_loss_l1)     
e_score_l1 = exactMatch(y_predict_l1, y_test)
print('Exact Match Score for L1 Penalty: ', e_score_l1)  

Hamming loss for L1 Penalty:  0.05727960475528792
Exact Match Score for L1 Penalty:  0.9101435849930524


## Resampling

In [80]:
def L1PSVM_Resample(X_train, y_train):
    print('Original dataset shape {}'.format(Counter(y_train))) 
    sm = SMOTE(random_state=42)
    X_res, y_res = sm.fit_sample(X_train, y_train)    
    print('Resampled dataset shape {}'.format(Counter(y_res)))
    params_dict = {"C": np.logspace(-3, 6, 10)}
#     svm = SVC(kernel="rbf")
    svm = LinearSVC(penalty='l1', dual=False)
    # grid search
    search = GridSearchCV(estimator=svm, param_grid=params_dict, cv=10)
    search.fit(X_res, y_res)   
    print("Best parameter values:", search.best_params_)
    print("CV Score with best parameter values:", search.best_score_)
    return search.best_params_, search.best_score_, search

In [81]:
# Family
print("The 'Family' Label: ")
fam_params_l1_rs, fam_score_l1_rs, fam_model_l1_rs = L1PSVM_Resample(X_train_norm, y_train['Family'])

The 'Family' Label: 
Original dataset shape Counter({'Leptodactylidae': 3073, 'Hylidae': 1542, 'Dendrobatidae': 380, 'Bufonidae': 41})
Resampled dataset shape Counter({'Leptodactylidae': 3073, 'Dendrobatidae': 3073, 'Hylidae': 3073, 'Bufonidae': 3073})
Best parameter values: {'C': 10.0}
CV Score with best parameter values: 0.9450862349495607


In [82]:
# Genus
print("The 'Genus' Label: ")
Gen_params_l1_rs, Gen_score_l1_rs, Gen_model_l1_rs = L1PSVM_Resample(X_train_norm, y_train['Genus'])

The 'Genus' Label: 
Original dataset shape Counter({'Adenomera': 2899, 'Hypsiboas': 1125, 'Ameerega': 380, 'Dendropsophus': 226, 'Leptodactylus': 174, 'Scinax': 111, 'Osteocephalus': 80, 'Rhinella': 41})
Resampled dataset shape Counter({'Adenomera': 2899, 'Ameerega': 2899, 'Hypsiboas': 2899, 'Rhinella': 2899, 'Scinax': 2899, 'Dendropsophus': 2899, 'Leptodactylus': 2899, 'Osteocephalus': 2899})
Best parameter values: {'C': 10.0}
CV Score with best parameter values: 0.950327699206623


In [83]:
# Species
print("The 'Species' Label: ")
Spe_params_l1_rs, Spe_score_l1_rs, Spe_model_l1_rs = L1PSVM_Resample(X_train_norm, y_train['Species'])

The 'Species' Label: 
Original dataset shape Counter({'AdenomeraHylaedactylus': 2447, 'HypsiboasCordobae': 788, 'AdenomeraAndre': 452, 'Ameeregatrivittata': 380, 'HypsiboasCinerascens': 337, 'HylaMinuta': 226, 'LeptodactylusFuscus': 174, 'ScinaxRuber': 111, 'OsteocephalusOophagus': 80, 'Rhinellagranulosa': 41})
Resampled dataset shape Counter({'AdenomeraHylaedactylus': 2447, 'Ameeregatrivittata': 2447, 'HypsiboasCinerascens': 2447, 'AdenomeraAndre': 2447, 'Rhinellagranulosa': 2447, 'ScinaxRuber': 2447, 'HylaMinuta': 2447, 'HypsiboasCordobae': 2447, 'LeptodactylusFuscus': 2447, 'OsteocephalusOophagus': 2447})
Best parameter values: {'C': 1000000.0}
CV Score with best parameter values: 0.953330608908868


In [168]:
def res_SVM_best(X_train, y_train, C_, X_test):
    sm = SMOTE(random_state=42)
    X_res, y_res = sm.fit_sample(X_train, y_train)   
    svm = LinearSVC(penalty='l1', dual=False, C=C_)
    svm.fit(X_train, y_train)   
    y_predict = svm.predict(X_test)
    return y_predict

In [197]:
# Family
y_p_fam_l1_rs = res_SVM_best(X_train, y_train['Family'], fam_params_l1_rs['C'], X_test)
y_p_fam_l1_rs = pd.DataFrame(y_p_fam_l1_rs)
# Genus
y_p_gen_l1_rs = res_SVM_best(X_train, y_train['Genus'], Gen_params_l1_rs['C'], X_test)
y_p_gen_l1_rs = pd.DataFrame(y_p_gen_l1_rs)
# Species
y_p_spe_l1_rs = res_SVM_best(X_train, y_train['Species'], Spe_params_l1_rs['C'], X_test)
y_p_spe_l1_rs = pd.DataFrame(y_p_spe_l1_rs)
# combine
y_predict_l1_rs = pd.concat([y_p_fam_l1_rs, y_p_gen_l1_rs, y_p_spe_l1_rs], axis=1, sort=False)

In [198]:
h_loss_ls_rs = hammingLoss(y_predict_l1_rs, y_test)
print('Hamming loss for resampled L1 Penalty: ', h_loss_ls_rs)     
e_score_l1_rs = exactMatch(y_predict_l1_rs, y_test)
print('Exact Match Score for resampled L1 Penalty: ', e_score_l1_rs)  

Hamming loss for resampled L1 Penalty:  0.05697081982399256
Exact Match Score for resampled L1 Penalty:  0.9106067623899954


## Conclusion
In the experiment, Gaussian Kernel gives the best performance. 

In [185]:
score_report = pd.DataFrame()
score_report['Hamming Loss'] = [h_loss, h_loss_l1, h_loss_ls_rs]
score_report['Exact Match Score'] = [e_score, e_score_l1, e_score_l1_rs]
score_report.index = ['Gaussian', "l1 penalty", 'resampled l1']
score_report

Unnamed: 0,Hamming Loss,Exact Match Score
Gaussian,0.007411,0.98981
l1 penalty,0.056971,0.91107
resampled l1,0.056662,0.911533
