In [307]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import classification_report

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

from sklearn import preprocessing

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from random import randint
from random import random
from math import ceil


import warnings
warnings.filterwarnings("ignore")

In [272]:
df_train = pd.read_csv("df_train_processed.csv")

In [273]:
df_train = df_train.drop(columns=['actor'])

In [274]:
df_train_num = df_train.select_dtypes(include="number")

## Standardization

In [275]:
scaler = MinMaxScaler()
df_train_num_scal = scaler.fit_transform(df_train_num)
df_train_num = pd.DataFrame(df_train_num_scal, columns=df_train_num.columns)

In [276]:
def add_target_column(df1, df2, target):
    
    df2[target] = df1[target]
    
    return df2

In [277]:
def lablel_encoder(data, target):
    
    label_encoder = preprocessing.LabelEncoder()
    
    data[target]= label_encoder.fit_transform(data[target])
    
    return data

In [278]:
df_vocal_channel_train = add_target_column(df_train, df_train_num, 'vocal_channel')

In [279]:
label_encoder = lablel_encoder(df_vocal_channel_train, 'vocal_channel')

## Rendere training dataset imbalanced

In [282]:
def dip_indp_variables(data,target):
    
    X = np.array(data.loc[:, data.columns != target])
    
    y = np.array(data.loc[:, data.columns == target])
    
    result = np.unique(y, return_counts=True)
    
    return  {'X': X,
             'y': y,
             'result': { result[0][0] : result[1][0],
                       result[0][1] : result[1][1]}
           }

In [283]:
dip_indp_variables_train = dip_indp_variables(df_vocal_channel_train,"vocal_channel")

In [284]:
dip_indp_variables_train['result']

{0: 748, 1: 1080}

In [285]:
def rows_to_remove(data,target,num, dataset):
    
    y = dataset['y']
    
    majority_class = list(dataset['result'].keys())[1]
    
    total_majority_class = dataset['result'][majority_class]
    
    np.random.seed(42)
    
    rows_remove = np.random.choice((data[y == majority_class]).index, total_majority_class-num, replace=False)
    
    df_target = data.drop(index=rows_remove, axis=0)
    
    return {'df_target': df_target,
            'rows_remove': rows_remove}

In [286]:
rows_remove_train = rows_to_remove(df_vocal_channel_train,"vocal_channel", 31, dip_indp_variables_train)

In [287]:
def drop_rows(data, rows_dataset):
    
    df_after_remove = data.drop(index=rows_dataset['rows_remove'], axis=0)
    
    return {'df_after_remove': df_after_remove}

In [288]:
drop_rows_train = drop_rows(df_vocal_channel_train, rows_remove_train)

In [289]:
def dip_indip_after_remove(data, target, rows, dip_indp_variables_dataset):
    
    X2 = data.loc[:, data.columns != target]
    
    y2 = np.array([dip_indp_variables_dataset['y'][i] for i in range(len(dip_indp_variables_dataset['y'])) if i not in rows])
    
    result = np.unique(y2, return_counts=True)
    
    return {'X2': X2,
            'y2': y2,
            'result': { result[0][0] : result[1][0],
                      result[0][1] : result[1][1]}
           }

In [290]:
dip_indip_after_remove_train = dip_indip_after_remove(drop_rows_train['df_after_remove'], 
                                                'vocal_channel',
                                                rows_remove_train['rows_remove'], dip_indp_variables_train)

In [291]:
X2 = np.array(dip_indip_after_remove_train['X2'])
y2 = np.array(dip_indip_after_remove_train['y2'].ravel())

In [292]:
np.unique(y2, return_counts=True)

(array([0, 1]), array([748,  31]))

In [293]:
X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.3, random_state=100)

In [294]:
X_train.shape

(545, 376)

## Decision Tree before Undersampling/Oversampling

In [295]:
from scipy.stats import loguniform

rvs = loguniform.rvs(1e-2, 1e0, size=1000)

In [296]:
import scipy as sp
sp.random.seed(123345) 
x=sp.random.uniform(low=0.001,high=0.2,size=10)

In [297]:
def Decision_tree(X_train_array, y_train_array,N):
    
    param_dist = {'max_depth': list(range(2,N)),
              'min_samples_split': rvs,
              'min_samples_leaf': x,
              "criterion": ["gini", "entropy"]}


    tree = DecisionTreeClassifier(random_state=42)


    tree_cv = RandomizedSearchCV(tree, param_dist, cv=10, random_state=42)


    tree_cv.fit(X_train_array,y_train_array)

    return tree_cv

In [298]:
decision_tree = Decision_tree(X_train, y_train,545)

In [299]:
print("Tuned Decision Tree Parameters: {}".format(decision_tree.best_params_))

Tuned Decision Tree Parameters: {'min_samples_split': 0.055638707672742094, 'min_samples_leaf': 0.04035790659541334, 'max_depth': 467, 'criterion': 'entropy'}


In [300]:
y_pred_decision_tree = decision_tree.predict(X_test)

In [301]:
def metrics_model( model_name,predict, test):
    new_dic = {model_name:{}}
    new_dic[model_name]['accuracy'] = accuracy_score(predict, test)
    new_dic[model_name]['precision'] = precision_score(predict, test)#, average=None)
    new_dic[model_name]['recall'] = recall_score(predict, test)#, average=None)
    new_dic[model_name]['f1_score'] = f1_score(predict, test)#,average=None)
    
    return new_dic

In [302]:
metrics_model('decision_tree',y_pred_decision_tree, y_test)

{'decision_tree': {'accuracy': 0.9700854700854701,
  'precision': 0.7777777777777778,
  'recall': 0.5833333333333334,
  'f1_score': 0.6666666666666666}}

In [303]:
print(classification_report(y_test, y_pred_decision_tree))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98       225
           1       0.58      0.78      0.67         9

    accuracy                           0.97       234
   macro avg       0.79      0.88      0.83       234
weighted avg       0.98      0.97      0.97       234



## KNN before Undersampling/Oversampling

In [308]:
def spearman_selector(target, df, k):
    """Returns a df comprising all the features highly correlated (above k and below -k)
    with the binary target according Spearman coefficient. Target must be a pd Series"""
    df = df.select_dtypes(include="number")
    target = target.map({target.unique()[0]:0, target.unique()[1]:1})
    new_df = pd.DataFrame()
    for col in df:
        spearman = stats.spearmanr(df[col], target)[0]
        if spearman > k or spearman < -k:
            new_df[col] = df[col]
    return new_df

In [309]:
df_vocal_channel_train = spearman_selector(df_vocal_channel_train['vocal_channel'], df_vocal_channel_train, 0.5)

In [310]:
dip_indp_variables_knn = dip_indp_variables(df_vocal_channel_train,"vocal_channel")

In [311]:
dip_indp_variables_knn['result']

{0: 748, 1: 1080}

In [312]:
rows_remove_knn = rows_to_remove(df_vocal_channel_train,"vocal_channel", 31, dip_indp_variables_knn)

In [313]:
drop_rows_knn = drop_rows(df_vocal_channel_train, rows_remove_knn)

In [314]:
dip_indip_after_remove_knn = dip_indip_after_remove(drop_rows_knn['df_after_remove'], 
                                                'vocal_channel',
                                                rows_remove_knn['rows_remove'], dip_indp_variables_knn)

In [315]:
X2_knn = np.array(dip_indip_after_remove_knn['X2'])
y2_knn = np.array(dip_indip_after_remove_knn['y2'].ravel())

In [316]:
np.unique(y2_knn, return_counts=True)

(array([0, 1]), array([748,  31]))

In [317]:
X_train_knn, X_test_knn, y_train_knn, y_test_knn = train_test_split(X2_knn, y2_knn, test_size=0.3, random_state=100)

In [318]:
X_train_knn.shape

(545, 86)

In [319]:
def Knn(X_train_array, y_train_array, N):   #N è il numero di samples in training test
    
    knn = KNeighborsClassifier()
    
    parameters = {
        'n_neighbors': list(range(2, ceil(N/2),10)),
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'cityblock', 'minkowski']
        }
    
    knn_cv = RandomizedSearchCV(knn,
                     parameters,
                     #scoring='f1_macro',
                     cv=10,
                    random_state = 42)
    
    knn_cv.fit(X_train_array, y_train_array)
    
    return knn_cv

In [320]:
knn = Knn(X_train_knn, y_train_knn,545)

In [321]:
print(knn.best_params_)

{'weights': 'uniform', 'n_neighbors': 22, 'metric': 'cityblock'}


In [322]:
y_pred_knn = knn.predict(X_test_knn)

In [323]:
metrics_model('knn',y_pred_knn, y_test_knn)

{'knn': {'accuracy': 0.9871794871794872,
  'precision': 0.6666666666666666,
  'recall': 1.0,
  'f1_score': 0.8}}

In [324]:
print(classification_report(y_test, y_pred_knn))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       225
           1       1.00      0.67      0.80         9

    accuracy                           0.99       234
   macro avg       0.99      0.83      0.90       234
weighted avg       0.99      0.99      0.99       234



## Undersampling

In [326]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import EditedNearestNeighbours

## Random Undersampling

In [327]:
def randomUnderSampler(X_train_array, y_train_array):
    
    
    rus = RandomUnderSampler(random_state=42, sampling_strategy='majority')
    
    X_res_rus, y_res_rus = rus.fit_resample(X_train_array, y_train_array)
    
    return X_res_rus, y_res_rus

In [328]:
X_res_rus, y_res_rus = randomUnderSampler(X_train, y_train)

In [329]:
X_res_rus_knn, y_res_rus_knn = randomUnderSampler(X_train_knn, y_train_knn)

In [330]:
print('Resampled dataset shape %s' % Counter(y_res_rus))

Resampled dataset shape Counter({0: 22, 1: 22})


In [331]:
print('Resampled dataset shape %s' % Counter(y_res_rus_knn))

Resampled dataset shape Counter({0: 22, 1: 22})


In [332]:
len(X_res_rus)

44

## KNN dopo RandomUndersampling

In [333]:
knn_under_rus = Knn(X_res_rus_knn, y_res_rus_knn, 44)

In [334]:
print(knn_under_rus.best_params_)

{'weights': 'uniform', 'n_neighbors': 2, 'metric': 'cityblock'}


In [335]:
y_pred_knn_under_rus = knn_under_rus.predict(X_test_knn)

In [336]:
metrics_model('Knn RandomUndersampling',y_pred_knn_under_rus, y_test_knn)

{'Knn RandomUndersampling': {'accuracy': 0.9829059829059829,
  'precision': 0.7777777777777778,
  'recall': 0.7777777777777778,
  'f1_score': 0.7777777777777778}}

In [337]:
print(classification_report(y_test_knn, y_pred_knn_under_rus))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       225
           1       0.78      0.78      0.78         9

    accuracy                           0.98       234
   macro avg       0.88      0.88      0.88       234
weighted avg       0.98      0.98      0.98       234



## Decision Tree dopo RandomUndersampling

In [338]:
decision_tree_under_rus = Decision_tree(X_res_rus, y_res_rus, 44)

In [339]:
print("Tuned Decision Tree Parameters: {}".format(decision_tree_under_rus.best_params_))

Tuned Decision Tree Parameters: {'min_samples_split': 0.5932099548853659, 'min_samples_leaf': 0.04035790659541334, 'max_depth': 14, 'criterion': 'gini'}


In [340]:
y_pred_decision_tree_under_rus = decision_tree_under_rus.predict(X_test)

In [341]:
print(classification_report(y_test, y_pred_decision_tree_under_rus))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       225
           1       0.54      0.78      0.64         9

    accuracy                           0.97       234
   macro avg       0.76      0.88      0.81       234
weighted avg       0.97      0.97      0.97       234



In [342]:
metrics_model('decision_tree RandomUndersampling',y_pred_decision_tree_under_rus, y_test)

{'decision_tree RandomUndersampling': {'accuracy': 0.9658119658119658,
  'precision': 0.7777777777777778,
  'recall': 0.5384615384615384,
  'f1_score': 0.6363636363636364}}

## EditedNearestNeighbors

In [343]:
def enn(X_train_array, y_train_array):
    
    enn = EditedNearestNeighbours()
    
    X_res_enn, y_res_enn = enn.fit_resample(X_train_array, y_train_array)
    
    return X_res_enn, y_res_enn

In [344]:
X_res_enn, y_res_enn = enn(X_train, y_train)

In [345]:
X_res_enn_knn, y_res_enn_knn = enn(X_train_knn, y_train_knn)

In [346]:
print('Resampled dataset shape %s' % Counter(y_res_enn))

Resampled dataset shape Counter({0: 519, 1: 22})


In [347]:
print('Resampled dataset shape %s' % Counter(y_res_enn_knn))

Resampled dataset shape Counter({0: 515, 1: 22})


In [348]:
len(X_res_enn)

541

In [349]:
len(X_res_enn_knn)

537

## KNN after EditedNearestNeighbors

In [351]:
knn_under_enn = Knn(X_res_enn_knn, y_res_enn_knn, 537)

In [352]:
print(knn_under_enn.best_params_)

{'weights': 'distance', 'n_neighbors': 2, 'metric': 'cityblock'}


In [353]:
y_pred_knn_under_enn = knn_under_enn.predict(X_test_knn)

In [354]:
metrics_model('knn EditedNearestNeighbors',y_pred_knn_under_enn, y_test_knn)

{'knn EditedNearestNeighbors': {'accuracy': 0.9743589743589743,
  'precision': 0.5555555555555556,
  'recall': 0.7142857142857143,
  'f1_score': 0.6250000000000001}}

In [355]:
print(classification_report(y_test_knn, y_pred_knn_under_enn))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       225
           1       0.71      0.56      0.63         9

    accuracy                           0.97       234
   macro avg       0.85      0.77      0.81       234
weighted avg       0.97      0.97      0.97       234



## Decision Tree after EditedNearestNeighbors

In [356]:
decision_tree_under_enn = Decision_tree(X_res_enn, y_res_enn, 537)

In [357]:
print("Tuned Decision Tree Parameters: {}".format(decision_tree_under_enn.best_params_))

Tuned Decision Tree Parameters: {'min_samples_split': 0.055638707672742094, 'min_samples_leaf': 0.04035790659541334, 'max_depth': 475, 'criterion': 'entropy'}


In [358]:
y_pred_decision_tree_under_enn = decision_tree_under_enn.predict(X_test)

In [359]:
metrics_model('decision_tree EditedNearestNeighbors',y_pred_decision_tree_under_enn, y_test)

{'decision_tree EditedNearestNeighbors': {'accuracy': 0.9700854700854701,
  'precision': 0.7777777777777778,
  'recall': 0.5833333333333334,
  'f1_score': 0.6666666666666666}}

In [360]:
print(classification_report(y_test, y_pred_decision_tree_under_enn))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98       225
           1       0.58      0.78      0.67         9

    accuracy                           0.97       234
   macro avg       0.79      0.88      0.83       234
weighted avg       0.98      0.97      0.97       234



## Oversampling

In [361]:
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN

## SMOTE

In [362]:
def smote(X_train_array, y_train_array):
    
    sm = SMOTE(random_state=42)
    
    X_res_sm, y_res_sm = sm.fit_resample(X_train_array, y_train_array)
    
    return X_res_sm, y_res_sm

In [363]:
X_res_sm, y_res_sm = smote(X_train, y_train)

In [364]:
X_res_sm_knn, y_res_sm_knn = smote(X_train_knn, y_train_knn)

In [365]:
print('Resampled dataset shape %s' % Counter(y_res_sm))

Resampled dataset shape Counter({0: 523, 1: 523})


In [366]:
print('Resampled dataset shape %s' % Counter(y_res_sm_knn))

Resampled dataset shape Counter({0: 523, 1: 523})


In [367]:
len(X_train)

545

In [368]:
len(X_train_knn)

545

## KNN after SMOTE

In [369]:
knn_over_sm = Knn(X_res_sm_knn, y_res_sm_knn, 545)

In [370]:
print(knn_over_sm.best_params_)

{'weights': 'uniform', 'n_neighbors': 22, 'metric': 'cityblock'}


In [371]:
y_pred_knn_over_sm = knn_over_sm.predict(X_test_knn)

In [372]:
metrics_model('knn SMOTE',y_pred_knn_over_sm, y_test_knn)

{'knn SMOTE': {'accuracy': 0.9572649572649573,
  'precision': 1.0,
  'recall': 0.47368421052631576,
  'f1_score': 0.6428571428571429}}

In [373]:
print(classification_report(y_test_knn, y_pred_knn_over_sm))

              precision    recall  f1-score   support

           0       1.00      0.96      0.98       225
           1       0.47      1.00      0.64         9

    accuracy                           0.96       234
   macro avg       0.74      0.98      0.81       234
weighted avg       0.98      0.96      0.96       234



## Decision Tree after SMOTE

In [374]:
decision_tree_over_sm = Decision_tree(X_res_sm, y_res_sm, 545)

In [375]:
print("Tuned Decision Tree Parameters: {}".format(decision_tree_over_sm.best_params_))

Tuned Decision Tree Parameters: {'min_samples_split': 0.4324481546688792, 'min_samples_leaf': 0.15207238355939207, 'max_depth': 101, 'criterion': 'entropy'}


In [376]:
y_pred_decision_tree_over_sm = decision_tree_over_sm.predict(X_test)

In [377]:
metrics_model('decision_tree SMOTE',y_pred_decision_tree_over_sm, y_test)

{'decision_tree SMOTE': {'accuracy': 0.9487179487179487,
  'precision': 0.8888888888888888,
  'recall': 0.42105263157894735,
  'f1_score': 0.5714285714285714}}

In [378]:
print(classification_report(y_test, y_pred_decision_tree_over_sm))

              precision    recall  f1-score   support

           0       1.00      0.95      0.97       225
           1       0.42      0.89      0.57         9

    accuracy                           0.95       234
   macro avg       0.71      0.92      0.77       234
weighted avg       0.97      0.95      0.96       234

