In [1]:
import pandas as pd
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
import imblearn
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from statistics import mean
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import silhouette_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import plot_confusion_matrix
import seaborn as sn
from sklearn.model_selection import cross_validate

In [2]:
#Import the required dataset
data = pd.read_excel("Telco_customer_churn.xlsx")
demo = pd.read_excel("Telco_customer_churn_demographics copy.xlsx")
serv = pd.read_excel("Telco_customer_churn_services.xlsx")

#Renaming, merging, and dropping columns
data = data.rename(columns = {"CustomerID" : "Customer ID"})
data = pd.merge(data, demo[['Customer ID','Age', 'Married']], on = "Customer ID")
data = pd.merge(data, serv[['Customer ID', 'Number of Referrals', 'Avg Monthly GB Download', 'Streaming Music', 'Unlimited Data', 'Total Refunds', 'Total Extra Data Charges', 'Total Long Distance Charges']], on = 'Customer ID')
data = data.drop(columns = ["City", "Lat Long", "Count", "Country", "State", "Churn Reason", 'Total Charges', 'Churn Label'])
data = pd.merge(data, serv[['Customer ID','Total Charges', 'Total Revenue']], on = "Customer ID")
data = data.drop(columns = ['Customer ID'])

#Re-arranging the columns
cols = list(data.columns.values)
cols = ['Zip Code','Latitude','Longitude','Gender','Senior Citizen','Partner','Dependents','Tenure Months',
 'Phone Service','Multiple Lines','Internet Service','Online Security','Online Backup','Device Protection',
 'Tech Support','Streaming TV','Streaming Movies','Contract','Paperless Billing','Payment Method','Monthly Charges',
 'Churn Score','CLTV','Age','Married','Number of Referrals','Avg Monthly GB Download','Streaming Music',
 'Unlimited Data','Total Refunds','Total Extra Data Charges','Total Long Distance Charges','Total Charges',
 'Total Revenue','Churn Value']
data = data[cols]

In [3]:
#Split features from target variable
x = data.iloc[:, :34]
y = data.iloc[: , -1]

In [4]:
x_rest, x_test, y_rest, y_test = train_test_split(x, y,
                                                    stratify=y, 
                                                    test_size=0.20)

In [5]:
x_train_70p, x_test_70p, y_train_70p, y_test_70p = train_test_split(x_rest, y_rest,
                                                    stratify=y_rest, 
                                                    test_size=0.125)
x_train_60p, x_test_60p, y_train_60p, y_test_60p = train_test_split(x_rest, y_rest,
                                                    stratify=y_rest, 
                                                    test_size=0.250)
x_train_50p, x_test_50p, y_train_50p, y_test_50p = train_test_split(x_rest, y_rest,
                                                    stratify=y_rest, 
                                                    test_size=0.375)
x_train_40p, x_test_40p, y_train_40p, y_test_40p = train_test_split(x_rest, y_rest,
                                                    stratify=y_rest, 
                                                    test_size=0.500)
x_train_30p, x_test_30p, y_train_30p, y_test_30p = train_test_split(x_rest, y_rest,
                                                    stratify=y_rest, 
                                                    test_size=0.625)
x_train_20p, x_test_20p, y_train_20p, y_test_20p = train_test_split(x_rest, y_rest,
                                                    stratify=y_rest, 
                                                    test_size=0.750)
x_train_10p, x_test_10p, y_train_10p, y_test_10p = train_test_split(x_rest, y_rest,
                                                    stratify=y_rest, 
                                                    test_size=0.875)

Baselines

In [81]:
dummy_clf_70 = DummyClassifier(strategy='most_frequent')
dummy_clf_60 = DummyClassifier(strategy='most_frequent')
dummy_clf_50 = DummyClassifier(strategy='most_frequent')
dummy_clf_40 = DummyClassifier(strategy='most_frequent')
dummy_clf_30 = DummyClassifier(strategy='most_frequent')
dummy_clf_20 = DummyClassifier(strategy='most_frequent')
dummy_clf_10 = DummyClassifier(strategy='most_frequent')

dummy_clf_70.fit(x_train_70p, y_train_70p)
y_baseline_70 = dummy_clf_70.predict(x_test)
print(classification_report(y_test, y_baseline_70))

dummy_clf_60.fit(x_train_60p, y_train_60p)
y_baseline_60 = dummy_clf_60.predict(x_test)
print(classification_report(y_test, y_baseline_60))

dummy_clf_50.fit(x_train_50p, y_train_50p)
y_baseline_50 = dummy_clf_50.predict(x_test)
print(classification_report(y_test, y_baseline_50))

dummy_clf_40.fit(x_train_40p, y_train_40p)
y_baseline_40 = dummy_clf_40.predict(x_test)
print(classification_report(y_test, y_baseline_40))

dummy_clf_30.fit(x_train_30p, y_train_30p)
y_baseline_30 = dummy_clf_30.predict(x_test)
print(classification_report(y_test, y_baseline_30))

dummy_clf_20.fit(x_train_20p, y_train_20p)
y_baseline_20 = dummy_clf_20.predict(x_test)
print(classification_report(y_test, y_baseline_20))

dummy_clf_10.fit(x_train_10p, y_train_10p)
y_baseline_10 = dummy_clf_10.predict(x_test)
print(classification_report(y_test, y_baseline_10))

              precision    recall  f1-score   support

           0       0.73      1.00      0.85      1035
           1       0.00      0.00      0.00       374

    accuracy                           0.73      1409
   macro avg       0.37      0.50      0.42      1409
weighted avg       0.54      0.73      0.62      1409

              precision    recall  f1-score   support

           0       0.73      1.00      0.85      1035
           1       0.00      0.00      0.00       374

    accuracy                           0.73      1409
   macro avg       0.37      0.50      0.42      1409
weighted avg       0.54      0.73      0.62      1409

              precision    recall  f1-score   support

           0       0.73      1.00      0.85      1035
           1       0.00      0.00      0.00       374

    accuracy                           0.73      1409
   macro avg       0.37      0.50      0.42      1409
weighted avg       0.54      0.73      0.62      1409

              preci

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [6]:
oversample = SMOTENC(categorical_features = [3,4,5,6,8,9,10,11,12,13,14,15,16,17,18,19,24,27,28])
x_train_70p, y_train_70p = oversample.fit_resample(x_train_70p, y_train_70p)
x_train_60p, y_train_60p = oversample.fit_resample(x_train_60p, y_train_60p)
x_train_50p, y_train_50p = oversample.fit_resample(x_train_50p, y_train_50p)
x_train_40p, y_train_40p = oversample.fit_resample(x_train_40p, y_train_40p)
x_train_30p, y_train_30p = oversample.fit_resample(x_train_30p, y_train_30p)
x_train_20p, y_train_20p = oversample.fit_resample(x_train_20p, y_train_20p)
x_train_10p, y_train_10p = oversample.fit_resample(x_train_10p, y_train_10p)

In [10]:
x_rest.to_csv(r'x_rest_tq3.csv', index = False, header = True)
x_test.to_csv(r'x_test_tq3.csv', index = False, header = True)

x_train_70p.to_csv(r'x_train_70p.csv', index = False, header = True)
x_train_60p.to_csv(r'x_train_60p.csv', index = False, header = True)
x_train_50p.to_csv(r'x_train_50p.csv', index = False, header = True)
x_train_40p.to_csv(r'x_train_40p.csv', index = False, header = True)
x_train_30p.to_csv(r'x_train_30p.csv', index = False, header = True)
x_train_20p.to_csv(r'x_train_20p.csv', index = False, header = True)
x_train_10p.to_csv(r'x_train_10p.csv', index = False, header = True)

In [11]:
train_t = pd.read_excel("x_rest_FAMD.xlsx")
test_t = pd.read_excel("x_test_FAMD.xlsx")

FAMD_train_70p = pd.read_excel("FAMD_train_70p.xlsx")
FAMD_train_60p = pd.read_excel("FAMD_train_60p.xlsx")
FAMD_train_50p = pd.read_excel("FAMD_train_50p.xlsx")
FAMD_train_40p = pd.read_excel("FAMD_train_40p.xlsx")
FAMD_train_30p = pd.read_excel("FAMD_train_30p.xlsx")
FAMD_train_20p = pd.read_excel("FAMD_train_20p.xlsx")
FAMD_train_10p = pd.read_excel("FAMD_train_10p.xlsx")

# SVM models

In [20]:
svm_model = svm.SVC()
kernel = ['poly', 'rbf', 'sigmoid']
C = [100, 10, 1.0, 0.1, 0.01]

# define grid search
grid = dict(kernel=kernel,C=C)
cv = StratifiedKFold(n_splits=10, shuffle=True)
grid_search = GridSearchCV(estimator=svm_model, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1',error_score=0)
grid_result = grid_search.fit(FAMD_train_70p, y_train_70p)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.925184 using {'C': 10, 'kernel': 'rbf'}
0.909430 (0.009928) with: {'C': 100, 'kernel': 'poly'}
0.918579 (0.009406) with: {'C': 100, 'kernel': 'rbf'}
0.854900 (0.013573) with: {'C': 100, 'kernel': 'sigmoid'}
0.913355 (0.008342) with: {'C': 10, 'kernel': 'poly'}
0.925184 (0.011192) with: {'C': 10, 'kernel': 'rbf'}
0.851972 (0.014136) with: {'C': 10, 'kernel': 'sigmoid'}
0.908199 (0.009898) with: {'C': 1.0, 'kernel': 'poly'}
0.921595 (0.011328) with: {'C': 1.0, 'kernel': 'rbf'}
0.861557 (0.016758) with: {'C': 1.0, 'kernel': 'sigmoid'}
0.879450 (0.010554) with: {'C': 0.1, 'kernel': 'poly'}
0.902737 (0.011283) with: {'C': 0.1, 'kernel': 'rbf'}
0.894836 (0.010864) with: {'C': 0.1, 'kernel': 'sigmoid'}
0.803168 (0.011081) with: {'C': 0.01, 'kernel': 'poly'}
0.861934 (0.013228) with: {'C': 0.01, 'kernel': 'rbf'}
0.885158 (0.013916) with: {'C': 0.01, 'kernel': 'sigmoid'}


In [21]:
svm_model = svm.SVC()
kernel = ['poly', 'rbf', 'sigmoid']
C = [100, 10, 1.0, 0.1, 0.01]

# define grid search
grid = dict(kernel=kernel,C=C)
cv = StratifiedKFold(n_splits=10, shuffle=True)
grid_search = GridSearchCV(estimator=svm_model, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1',error_score=0)
grid_result = grid_search.fit(FAMD_train_60p, y_train_60p)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.919598 using {'C': 10, 'kernel': 'rbf'}
0.895150 (0.006536) with: {'C': 100, 'kernel': 'poly'}
0.905981 (0.009496) with: {'C': 100, 'kernel': 'rbf'}
0.834386 (0.014255) with: {'C': 100, 'kernel': 'sigmoid'}
0.900261 (0.008210) with: {'C': 10, 'kernel': 'poly'}
0.919598 (0.009122) with: {'C': 10, 'kernel': 'rbf'}
0.837198 (0.012094) with: {'C': 10, 'kernel': 'sigmoid'}
0.898510 (0.012000) with: {'C': 1.0, 'kernel': 'poly'}
0.909113 (0.011423) with: {'C': 1.0, 'kernel': 'rbf'}
0.833726 (0.014896) with: {'C': 1.0, 'kernel': 'sigmoid'}
0.870172 (0.010733) with: {'C': 0.1, 'kernel': 'poly'}
0.892837 (0.009392) with: {'C': 0.1, 'kernel': 'rbf'}
0.880701 (0.013553) with: {'C': 0.1, 'kernel': 'sigmoid'}
0.797214 (0.012399) with: {'C': 0.01, 'kernel': 'poly'}
0.853321 (0.008633) with: {'C': 0.01, 'kernel': 'rbf'}
0.873693 (0.009911) with: {'C': 0.01, 'kernel': 'sigmoid'}


In [22]:
svm_model = svm.SVC()
kernel = ['poly', 'rbf', 'sigmoid']
C = [100, 10, 1.0, 0.1, 0.01]

# define grid search
grid = dict(kernel=kernel,C=C)
cv = StratifiedKFold(n_splits=10, shuffle=True)
grid_search = GridSearchCV(estimator=svm_model, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1',error_score=0)
grid_result = grid_search.fit(FAMD_train_50p, y_train_50p)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.919030 using {'C': 10, 'kernel': 'rbf'}
0.892165 (0.012243) with: {'C': 100, 'kernel': 'poly'}
0.908668 (0.006814) with: {'C': 100, 'kernel': 'rbf'}
0.829017 (0.013316) with: {'C': 100, 'kernel': 'sigmoid'}
0.904545 (0.013416) with: {'C': 10, 'kernel': 'poly'}
0.919030 (0.009541) with: {'C': 10, 'kernel': 'rbf'}
0.836501 (0.015965) with: {'C': 10, 'kernel': 'sigmoid'}
0.907948 (0.014807) with: {'C': 1.0, 'kernel': 'poly'}
0.914454 (0.012264) with: {'C': 1.0, 'kernel': 'rbf'}
0.838282 (0.014917) with: {'C': 1.0, 'kernel': 'sigmoid'}
0.864854 (0.010492) with: {'C': 0.1, 'kernel': 'poly'}
0.899978 (0.013920) with: {'C': 0.1, 'kernel': 'rbf'}
0.885902 (0.013869) with: {'C': 0.1, 'kernel': 'sigmoid'}
0.789574 (0.008127) with: {'C': 0.01, 'kernel': 'poly'}
0.845654 (0.018868) with: {'C': 0.01, 'kernel': 'rbf'}
0.871932 (0.016422) with: {'C': 0.01, 'kernel': 'sigmoid'}


In [23]:
svm_model = svm.SVC()
kernel = ['poly', 'rbf', 'sigmoid']
C = [100, 10, 1.0, 0.1, 0.01]

# define grid search
grid = dict(kernel=kernel,C=C)
cv = StratifiedKFold(n_splits=10, shuffle=True)
grid_search = GridSearchCV(estimator=svm_model, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1',error_score=0)
grid_result = grid_search.fit(FAMD_train_40p, y_train_40p)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.910664 using {'C': 10, 'kernel': 'rbf'}
0.883724 (0.015304) with: {'C': 100, 'kernel': 'poly'}
0.900751 (0.012585) with: {'C': 100, 'kernel': 'rbf'}
0.831861 (0.020090) with: {'C': 100, 'kernel': 'sigmoid'}
0.899175 (0.011579) with: {'C': 10, 'kernel': 'poly'}
0.910664 (0.009048) with: {'C': 10, 'kernel': 'rbf'}
0.830129 (0.021119) with: {'C': 10, 'kernel': 'sigmoid'}
0.897805 (0.011593) with: {'C': 1.0, 'kernel': 'poly'}
0.901827 (0.009621) with: {'C': 1.0, 'kernel': 'rbf'}
0.834128 (0.015638) with: {'C': 1.0, 'kernel': 'sigmoid'}
0.855731 (0.013088) with: {'C': 0.1, 'kernel': 'poly'}
0.883106 (0.009396) with: {'C': 0.1, 'kernel': 'rbf'}
0.879070 (0.011590) with: {'C': 0.1, 'kernel': 'sigmoid'}
0.785391 (0.005984) with: {'C': 0.01, 'kernel': 'poly'}
0.838019 (0.016228) with: {'C': 0.01, 'kernel': 'rbf'}
0.861951 (0.011811) with: {'C': 0.01, 'kernel': 'sigmoid'}


In [24]:
svm_model = svm.SVC()
kernel = ['poly', 'rbf', 'sigmoid']
C = [100, 10, 1.0, 0.1, 0.01]

# define grid search
grid = dict(kernel=kernel,C=C)
cv = StratifiedKFold(n_splits=10, shuffle=True)
grid_search = GridSearchCV(estimator=svm_model, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1',error_score=0)
grid_result = grid_search.fit(FAMD_train_30p, y_train_30p)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.897546 using {'C': 10, 'kernel': 'rbf'}
0.874612 (0.017885) with: {'C': 100, 'kernel': 'poly'}
0.887796 (0.013695) with: {'C': 100, 'kernel': 'rbf'}
0.831131 (0.023912) with: {'C': 100, 'kernel': 'sigmoid'}
0.883969 (0.017348) with: {'C': 10, 'kernel': 'poly'}
0.897546 (0.013990) with: {'C': 10, 'kernel': 'rbf'}
0.816312 (0.022889) with: {'C': 10, 'kernel': 'sigmoid'}
0.883214 (0.019752) with: {'C': 1.0, 'kernel': 'poly'}
0.895300 (0.017538) with: {'C': 1.0, 'kernel': 'rbf'}
0.842521 (0.025229) with: {'C': 1.0, 'kernel': 'sigmoid'}
0.845256 (0.020005) with: {'C': 0.1, 'kernel': 'poly'}
0.878818 (0.024881) with: {'C': 0.1, 'kernel': 'rbf'}
0.874682 (0.024296) with: {'C': 0.1, 'kernel': 'sigmoid'}
0.788113 (0.018100) with: {'C': 0.01, 'kernel': 'poly'}
0.840932 (0.023410) with: {'C': 0.01, 'kernel': 'rbf'}
0.865814 (0.016799) with: {'C': 0.01, 'kernel': 'sigmoid'}


In [25]:
svm_model = svm.SVC()
kernel = ['poly', 'rbf', 'sigmoid']
C = [100, 10, 1.0, 0.1, 0.01]

# define grid search
grid = dict(kernel=kernel,C=C)
cv = StratifiedKFold(n_splits=10, shuffle=True)
grid_search = GridSearchCV(estimator=svm_model, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1',error_score=0)
grid_result = grid_search.fit(FAMD_train_20p, y_train_20p)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.908556 using {'C': 10, 'kernel': 'rbf'}
0.880711 (0.018235) with: {'C': 100, 'kernel': 'poly'}
0.894091 (0.023535) with: {'C': 100, 'kernel': 'rbf'}
0.837295 (0.032746) with: {'C': 100, 'kernel': 'sigmoid'}
0.894804 (0.017857) with: {'C': 10, 'kernel': 'poly'}
0.908556 (0.018234) with: {'C': 10, 'kernel': 'rbf'}
0.837540 (0.032164) with: {'C': 10, 'kernel': 'sigmoid'}
0.890932 (0.021938) with: {'C': 1.0, 'kernel': 'poly'}
0.899004 (0.020603) with: {'C': 1.0, 'kernel': 'rbf'}
0.858737 (0.027851) with: {'C': 1.0, 'kernel': 'sigmoid'}
0.845750 (0.024270) with: {'C': 0.1, 'kernel': 'poly'}
0.891054 (0.019856) with: {'C': 0.1, 'kernel': 'rbf'}
0.893535 (0.024781) with: {'C': 0.1, 'kernel': 'sigmoid'}
0.762880 (0.022681) with: {'C': 0.01, 'kernel': 'poly'}
0.829600 (0.016978) with: {'C': 0.01, 'kernel': 'rbf'}
0.862291 (0.022658) with: {'C': 0.01, 'kernel': 'sigmoid'}


In [26]:
svm_model = svm.SVC()
kernel = ['poly', 'rbf', 'sigmoid']
C = [100, 10, 1.0, 0.1, 0.01]

# define grid search
grid = dict(kernel=kernel,C=C)
cv = StratifiedKFold(n_splits=10, shuffle=True)
grid_search = GridSearchCV(estimator=svm_model, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1',error_score=0)
grid_result = grid_search.fit(FAMD_train_10p, y_train_10p)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.899596 using {'C': 1.0, 'kernel': 'rbf'}
0.863911 (0.024201) with: {'C': 100, 'kernel': 'poly'}
0.886953 (0.028821) with: {'C': 100, 'kernel': 'rbf'}
0.859993 (0.039674) with: {'C': 100, 'kernel': 'sigmoid'}
0.881640 (0.024242) with: {'C': 10, 'kernel': 'poly'}
0.893261 (0.032013) with: {'C': 10, 'kernel': 'rbf'}
0.872001 (0.024202) with: {'C': 10, 'kernel': 'sigmoid'}
0.882220 (0.014077) with: {'C': 1.0, 'kernel': 'poly'}
0.899596 (0.026042) with: {'C': 1.0, 'kernel': 'rbf'}
0.886147 (0.030953) with: {'C': 1.0, 'kernel': 'sigmoid'}
0.817899 (0.019653) with: {'C': 0.1, 'kernel': 'poly'}
0.878666 (0.022020) with: {'C': 0.1, 'kernel': 'rbf'}
0.885986 (0.026737) with: {'C': 0.1, 'kernel': 'sigmoid'}
0.730053 (0.020643) with: {'C': 0.01, 'kernel': 'poly'}
0.830369 (0.025566) with: {'C': 0.01, 'kernel': 'rbf'}
0.861822 (0.023047) with: {'C': 0.01, 'kernel': 'sigmoid'}


In [27]:
svm_model_70 = svm.SVC(C = 10, kernel = 'rbf')
svm_model_60 = svm.SVC(C = 10, kernel = 'rbf')
svm_model_50 = svm.SVC(C = 10, kernel = 'rbf')
svm_model_40 = svm.SVC(C = 10, kernel = 'rbf')
svm_model_30 = svm.SVC(C = 10, kernel = 'rbf')
svm_model_20 = svm.SVC(C = 10, kernel = 'rbf')
svm_model_10 = svm.SVC(C = 1.0, kernel = 'rbf')

svm_model_70.fit(FAMD_train_70p, y_train_70p)
y_pred_70 = svm_model_70.predict(test_t)

svm_model_60.fit(FAMD_train_60p, y_train_60p)
y_pred_60 = svm_model_60.predict(test_t)

svm_model_50.fit(FAMD_train_50p, y_train_50p)
y_pred_50 = svm_model_50.predict(test_t)

svm_model_40.fit(FAMD_train_40p, y_train_40p)
y_pred_40 = svm_model_40.predict(test_t)

svm_model_30.fit(FAMD_train_30p, y_train_30p)
y_pred_30 = svm_model_30.predict(test_t)

svm_model_20.fit(FAMD_train_20p, y_train_20p)
y_pred_20 = svm_model_20.predict(test_t)

svm_model_10.fit(FAMD_train_10p, y_train_10p)
y_pred_10 = svm_model_10.predict(test_t)

print(classification_report(y_test, y_pred_70))
print(classification_report(y_test, y_pred_60))
print(classification_report(y_test, y_pred_50))
print(classification_report(y_test, y_pred_40))
print(classification_report(y_test, y_pred_30))
print(classification_report(y_test, y_pred_20))
print(classification_report(y_test, y_pred_10))

              precision    recall  f1-score   support

           0       0.72      0.52      0.60      1035
           1       0.25      0.45      0.32       374

    accuracy                           0.50      1409
   macro avg       0.49      0.49      0.46      1409
weighted avg       0.60      0.50      0.53      1409

              precision    recall  f1-score   support

           0       0.72      0.53      0.61      1035
           1       0.24      0.42      0.31       374

    accuracy                           0.50      1409
   macro avg       0.48      0.47      0.46      1409
weighted avg       0.59      0.50      0.53      1409

              precision    recall  f1-score   support

           0       0.72      0.54      0.61      1035
           1       0.25      0.42      0.31       374

    accuracy                           0.50      1409
   macro avg       0.48      0.48      0.46      1409
weighted avg       0.59      0.50      0.53      1409

              preci

# KNN models

In [28]:
# define models and parameters
knn_model = KNeighborsClassifier()
n_neighbors = range(1, 21, 2)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']
# define grid search
grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)
cv = StratifiedKFold(n_splits=10, shuffle=True)
grid_search = GridSearchCV(estimator=knn_model, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1',error_score=0)
grid_result = grid_search.fit(FAMD_train_70p, y_train_70p)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.887199 using {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'distance'}
0.860615 (0.014662) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'uniform'}
0.860615 (0.014662) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'distance'}
0.878189 (0.011766) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
0.879564 (0.011350) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
0.882257 (0.012112) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}
0.883700 (0.011996) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
0.882968 (0.013434) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'uniform'}
0.884503 (0.012410) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'distance'}
0.885127 (0.014709) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}
0.887199 (0.013799) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'distance'}
0.883704 

In [29]:
# define models and parameters
knn_model = KNeighborsClassifier()
n_neighbors = range(1, 21, 2)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']
# define grid search
grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)
cv = StratifiedKFold(n_splits=10, shuffle=True)
grid_search = GridSearchCV(estimator=knn_model, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1',error_score=0)
grid_result = grid_search.fit(FAMD_train_60p, y_train_60p)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.885305 using {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'distance'}
0.856164 (0.016470) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'uniform'}
0.856164 (0.016470) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'distance'}
0.875283 (0.012823) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
0.875922 (0.012662) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
0.879860 (0.014325) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}
0.881158 (0.013540) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
0.882194 (0.015076) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'uniform'}
0.885305 (0.014415) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'distance'}
0.881397 (0.013914) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}
0.883946 (0.013880) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'distance'}
0.881334 

In [30]:
# define models and parameters
knn_model = KNeighborsClassifier()
n_neighbors = range(1, 21, 2)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']
# define grid search
grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)
cv = StratifiedKFold(n_splits=10, shuffle=True)
grid_search = GridSearchCV(estimator=knn_model, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1',error_score=0)
grid_result = grid_search.fit(FAMD_train_50p, y_train_50p)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.880912 using {'metric': 'euclidean', 'n_neighbors': 11, 'weights': 'distance'}
0.856748 (0.014967) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'uniform'}
0.856748 (0.014967) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'distance'}
0.874806 (0.010602) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
0.875680 (0.010776) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
0.878562 (0.013540) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}
0.880021 (0.012904) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
0.878605 (0.011242) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'uniform'}
0.879581 (0.011498) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'distance'}
0.878829 (0.010422) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}
0.880216 (0.010179) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'distance'}
0.879598

In [31]:
# define models and parameters
knn_model = KNeighborsClassifier()
n_neighbors = range(1, 21, 2)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']
# define grid search
grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)
cv = StratifiedKFold(n_splits=10, shuffle=True)
grid_search = GridSearchCV(estimator=knn_model, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1',error_score=0)
grid_result = grid_search.fit(FAMD_train_40p, y_train_40p)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.884576 using {'metric': 'manhattan', 'n_neighbors': 15, 'weights': 'distance'}
0.858864 (0.014586) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'uniform'}
0.858864 (0.014586) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'distance'}
0.872076 (0.016124) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
0.872596 (0.015807) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
0.876303 (0.015730) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}
0.878404 (0.017220) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
0.880184 (0.022302) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'uniform'}
0.882178 (0.022361) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'distance'}
0.880465 (0.021669) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}
0.882241 (0.021210) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'distance'}
0.879573

In [32]:
# define models and parameters
knn_model = KNeighborsClassifier()
n_neighbors = range(1, 21, 2)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']
# define grid search
grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)
cv = StratifiedKFold(n_splits=10, shuffle=True)
grid_search = GridSearchCV(estimator=knn_model, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1',error_score=0)
grid_result = grid_search.fit(FAMD_train_30p, y_train_30p)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.879663 using {'metric': 'manhattan', 'n_neighbors': 7, 'weights': 'distance'}
0.859293 (0.018557) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'uniform'}
0.859293 (0.018557) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'distance'}
0.867036 (0.016293) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
0.867570 (0.016138) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
0.872014 (0.010339) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}
0.873248 (0.010347) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
0.873425 (0.013784) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'uniform'}
0.875145 (0.013307) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'distance'}
0.871280 (0.016355) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}
0.873587 (0.016027) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'distance'}
0.870308 

In [33]:
# define models and parameters
knn_model = KNeighborsClassifier()
n_neighbors = range(1, 21, 2)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']
# define grid search
grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)
cv = StratifiedKFold(n_splits=10, shuffle=True)
grid_search = GridSearchCV(estimator=knn_model, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1',error_score=0)
grid_result = grid_search.fit(FAMD_train_20p, y_train_20p)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.883404 using {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
0.861546 (0.017924) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'uniform'}
0.861546 (0.017924) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'distance'}
0.874573 (0.016963) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
0.874263 (0.017190) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
0.883023 (0.018914) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}
0.883404 (0.020312) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
0.877155 (0.017494) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'uniform'}
0.876731 (0.017550) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'distance'}
0.876780 (0.018047) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}
0.878499 (0.017953) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'distance'}
0.875228 

In [34]:
# define models and parameters
knn_model = KNeighborsClassifier()
n_neighbors = range(1, 21, 2)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']
# define grid search
grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)
cv = StratifiedKFold(n_splits=10, shuffle=True)
grid_search = GridSearchCV(estimator=knn_model, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1',error_score=0)
grid_result = grid_search.fit(FAMD_train_10p, y_train_10p)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.883395 using {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'uniform'}
0.832317 (0.029264) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'uniform'}
0.832317 (0.029264) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'distance'}
0.867726 (0.026297) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
0.867726 (0.026297) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
0.865391 (0.025719) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}
0.866220 (0.026256) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
0.870045 (0.022521) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'uniform'}
0.870045 (0.022521) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'distance'}
0.866392 (0.021311) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}
0.868786 (0.022820) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'distance'}
0.863416 (

In [40]:
knn_model_70 = KNeighborsClassifier(metric = "euclidean", n_neighbors = 9, weights = "distance")
knn_model_60 = KNeighborsClassifier(metric = "euclidean", n_neighbors = 7, weights = "distance")
knn_model_50 = KNeighborsClassifier(metric = "euclidean", n_neighbors = 11, weights = "distance")
knn_model_40 = KNeighborsClassifier(metric = "manhattan", n_neighbors = 15, weights = "distance")
knn_model_30 = KNeighborsClassifier(metric = "manhattan", n_neighbors = 7, weights = "distance")
knn_model_20 = KNeighborsClassifier(metric = "euclidean", n_neighbors = 5, weights = "distance")
knn_model_10 = KNeighborsClassifier(metric = "manhattan", n_neighbors = 9, weights = "uniform")


knn_model_70.fit(FAMD_train_70p, y_train_70p)
y_pred_70 = knn_model_70.predict(test_t)

knn_model_60.fit(FAMD_train_60p, y_train_60p)
y_pred_60 = knn_model_60.predict(test_t)

knn_model_50.fit(FAMD_train_50p, y_train_50p)
y_pred_50 = knn_model_50.predict(test_t)

knn_model_40.fit(FAMD_train_40p, y_train_40p)
y_pred_40 = knn_model_40.predict(test_t)

knn_model_30.fit(FAMD_train_30p, y_train_30p)
y_pred_30 = knn_model_30.predict(test_t)

knn_model_20.fit(FAMD_train_20p, y_train_20p)
y_pred_20 = knn_model_20.predict(test_t)

knn_model_10.fit(FAMD_train_10p, y_train_10p)
y_pred_10 = knn_model_10.predict(test_t)


print(classification_report(y_test, y_pred_70))
print(classification_report(y_test, y_pred_60))
print(classification_report(y_test, y_pred_50))
print(classification_report(y_test, y_pred_40))
print(classification_report(y_test, y_pred_30))
print(classification_report(y_test, y_pred_20))
print(classification_report(y_test, y_pred_10))

              precision    recall  f1-score   support

           0       0.77      0.46      0.57      1035
           1       0.29      0.62      0.40       374

    accuracy                           0.50      1409
   macro avg       0.53      0.54      0.49      1409
weighted avg       0.64      0.50      0.53      1409

              precision    recall  f1-score   support

           0       0.77      0.47      0.58      1035
           1       0.29      0.61      0.40       374

    accuracy                           0.51      1409
   macro avg       0.53      0.54      0.49      1409
weighted avg       0.64      0.51      0.53      1409

              precision    recall  f1-score   support

           0       0.78      0.44      0.56      1035
           1       0.30      0.66      0.41       374

    accuracy                           0.50      1409
   macro avg       0.54      0.55      0.48      1409
weighted avg       0.65      0.50      0.52      1409

              preci

# Hybrid approach

In [41]:
range_n_clusters = list (range(2,9))
for n_clusters in range_n_clusters:
    clusterer = KMeans(n_clusters=n_clusters)
    preds = clusterer.fit_predict(test_t)
    centers = clusterer.cluster_centers_
    score = silhouette_score(test_t, preds)
    print("For n_clusters = {}, silhouette score is {})".format(n_clusters, score))

For n_clusters = 2, silhouette score is 0.28914070568253053)
For n_clusters = 3, silhouette score is 0.23330829710904064)
For n_clusters = 4, silhouette score is 0.2424787445829663)
For n_clusters = 5, silhouette score is 0.18637180286824748)
For n_clusters = 6, silhouette score is 0.18433750950501895)
For n_clusters = 7, silhouette score is 0.13947343374505888)
For n_clusters = 8, silhouette score is 0.12453150604905466)


---

In [42]:
kmeans = KMeans(n_clusters=2)
train_clusters = kmeans.fit(FAMD_train_70p)
test_clusters = train_clusters.predict(test_t)

In [43]:
FAMD_train_70p['cluster'] = train_clusters.labels_
test_t['cluster'] = test_clusters

In [44]:
knn_model_70 = KNeighborsClassifier(metric = "euclidean", n_neighbors = 9, weights = "distance")
knn_model_70.fit(FAMD_train_70p, y_train_70p)
y_pred_70 = knn_model_70.predict(test_t)
print(classification_report(y_test, y_pred_70))

              precision    recall  f1-score   support

           0       0.77      0.46      0.57      1035
           1       0.29      0.62      0.40       374

    accuracy                           0.50      1409
   macro avg       0.53      0.54      0.49      1409
weighted avg       0.64      0.50      0.53      1409



In [45]:
svm_model_70 = svm.SVC(C = 10, kernel = 'rbf')
svm_model_70.fit(FAMD_train_70p, y_train_70p)
y_pred_70 = svm_model_70.predict(test_t)
print(classification_report(y_test, y_pred_70))

              precision    recall  f1-score   support

           0       0.72      0.52      0.60      1035
           1       0.25      0.45      0.32       374

    accuracy                           0.50      1409
   macro avg       0.49      0.48      0.46      1409
weighted avg       0.60      0.50      0.53      1409



---

In [46]:
del test_t["cluster"]

In [47]:
kmeans = KMeans(n_clusters=2)
train_clusters = kmeans.fit(FAMD_train_60p)
test_clusters = train_clusters.predict(test_t)

In [48]:
FAMD_train_60p['cluster'] = train_clusters.labels_
test_t['cluster'] = test_clusters

In [49]:
knn_model_60 = KNeighborsClassifier(metric = "euclidean", n_neighbors = 7, weights = "distance")
knn_model_60.fit(FAMD_train_60p, y_train_60p)
y_pred_60 = knn_model_60.predict(test_t)
print(classification_report(y_test, y_pred_60))

              precision    recall  f1-score   support

           0       0.77      0.47      0.58      1035
           1       0.29      0.61      0.40       374

    accuracy                           0.51      1409
   macro avg       0.53      0.54      0.49      1409
weighted avg       0.64      0.51      0.53      1409



In [50]:
svm_model_60 = svm.SVC(C = 10, kernel = 'rbf')
svm_model_60.fit(FAMD_train_60p, y_train_60p)
y_pred_60 = svm_model_60.predict(test_t)
print(classification_report(y_test, y_pred_60))

              precision    recall  f1-score   support

           0       0.72      0.53      0.61      1035
           1       0.24      0.42      0.31       374

    accuracy                           0.50      1409
   macro avg       0.48      0.48      0.46      1409
weighted avg       0.59      0.50      0.53      1409



---

In [51]:
del test_t["cluster"]

In [52]:
kmeans = KMeans(n_clusters=2)
train_clusters = kmeans.fit(FAMD_train_50p)
test_clusters = train_clusters.predict(test_t)

In [53]:
FAMD_train_50p['cluster'] = train_clusters.labels_
test_t['cluster'] = test_clusters

In [54]:
knn_model_50 = KNeighborsClassifier(metric = "euclidean", n_neighbors = 11, weights = "distance")
knn_model_50.fit(FAMD_train_50p, y_train_50p)
y_pred_50 = knn_model_50.predict(test_t)
print(classification_report(y_test, y_pred_50))

              precision    recall  f1-score   support

           0       0.78      0.44      0.56      1035
           1       0.30      0.66      0.41       374

    accuracy                           0.50      1409
   macro avg       0.54      0.55      0.48      1409
weighted avg       0.65      0.50      0.52      1409



In [55]:
svm_model_50 = svm.SVC(C = 10, kernel = 'rbf')
svm_model_50.fit(FAMD_train_50p, y_train_50p)
y_pred_50 = svm_model_50.predict(test_t)
print(classification_report(y_test, y_pred_50))

              precision    recall  f1-score   support

           0       0.72      0.54      0.61      1035
           1       0.25      0.42      0.31       374

    accuracy                           0.50      1409
   macro avg       0.48      0.48      0.46      1409
weighted avg       0.59      0.50      0.53      1409



---

In [56]:
del test_t["cluster"]

In [57]:
kmeans = KMeans(n_clusters=2)
train_clusters = kmeans.fit(FAMD_train_40p)
test_clusters = train_clusters.predict(test_t)

In [58]:
FAMD_train_40p['cluster'] = train_clusters.labels_
test_t['cluster'] = test_clusters

In [59]:
knn_model_40 = KNeighborsClassifier(metric = "manhattan", n_neighbors = 15, weights = "distance")
knn_model_40.fit(FAMD_train_40p, y_train_40p)
y_pred_40 = knn_model_40.predict(test_t)
print(classification_report(y_test, y_pred_40))

              precision    recall  f1-score   support

           0       0.83      0.47      0.60      1035
           1       0.33      0.73      0.46       374

    accuracy                           0.54      1409
   macro avg       0.58      0.60      0.53      1409
weighted avg       0.70      0.54      0.56      1409



In [60]:
svm_model_40 = svm.SVC(C = 10, kernel = 'rbf')
svm_model_40.fit(FAMD_train_40p, y_train_40p)
y_pred_40 = svm_model_40.predict(test_t)
print(classification_report(y_test, y_pred_40))

              precision    recall  f1-score   support

           0       0.72      0.52      0.61      1035
           1       0.25      0.44      0.32       374

    accuracy                           0.50      1409
   macro avg       0.49      0.48      0.46      1409
weighted avg       0.60      0.50      0.53      1409



---

In [61]:
del test_t["cluster"]

In [62]:
kmeans = KMeans(n_clusters=2)
train_clusters = kmeans.fit(FAMD_train_30p)
test_clusters = train_clusters.predict(test_t)

In [63]:
FAMD_train_30p['cluster'] = train_clusters.labels_
test_t['cluster'] = test_clusters

In [64]:
knn_model_30 = KNeighborsClassifier(metric = "manhattan", n_neighbors = 7, weights = "distance")
knn_model_30.fit(FAMD_train_30p, y_train_30p)
y_pred_30 = knn_model_30.predict(test_t)
print(classification_report(y_test, y_pred_30))

              precision    recall  f1-score   support

           0       0.74      0.41      0.53      1035
           1       0.27      0.60      0.37       374

    accuracy                           0.46      1409
   macro avg       0.51      0.51      0.45      1409
weighted avg       0.62      0.46      0.49      1409



In [65]:
svm_model_30 = svm.SVC(C = 10, kernel = 'rbf')
svm_model_30.fit(FAMD_train_30p, y_train_30p)
y_pred_30 = svm_model_30.predict(test_t)
print(classification_report(y_test, y_pred_30))

              precision    recall  f1-score   support

           0       0.68      0.50      0.58      1035
           1       0.20      0.36      0.26       374

    accuracy                           0.46      1409
   macro avg       0.44      0.43      0.42      1409
weighted avg       0.55      0.46      0.49      1409



---

In [66]:
del test_t["cluster"]

In [67]:
kmeans = KMeans(n_clusters=2)
train_clusters = kmeans.fit(FAMD_train_20p)
test_clusters = train_clusters.predict(test_t)

In [68]:
FAMD_train_20p['cluster'] = train_clusters.labels_
test_t['cluster'] = test_clusters

In [69]:
knn_model_20 = KNeighborsClassifier(metric = "euclidean", n_neighbors = 5, weights = "distance")
knn_model_20.fit(FAMD_train_20p, y_train_20p)
y_pred_20 = knn_model_20.predict(test_t)
print(classification_report(y_test, y_pred_20))

              precision    recall  f1-score   support

           0       0.87      0.47      0.61      1035
           1       0.36      0.80      0.49       374

    accuracy                           0.56      1409
   macro avg       0.61      0.64      0.55      1409
weighted avg       0.73      0.56      0.58      1409



In [70]:
svm_model_20 = svm.SVC(C = 10, kernel = 'rbf')
svm_model_20.fit(FAMD_train_20p, y_train_20p)
y_pred_20 = svm_model_20.predict(test_t)
print(classification_report(y_test, y_pred_20))

              precision    recall  f1-score   support

           0       0.79      0.59      0.67      1035
           1       0.33      0.56      0.42       374

    accuracy                           0.58      1409
   macro avg       0.56      0.57      0.54      1409
weighted avg       0.67      0.58      0.60      1409



---

In [71]:
del test_t["cluster"]

In [73]:
kmeans = KMeans(n_clusters=2)
train_clusters = kmeans.fit(FAMD_train_10p)
test_clusters = train_clusters.predict(test_t)

In [75]:
FAMD_train_10p['cluster'] = train_clusters.labels_
test_t['cluster'] = test_clusters

In [76]:
knn_model_10 = KNeighborsClassifier(metric = "manhattan", n_neighbors = 9, weights = "uniform")
knn_model_10.fit(FAMD_train_10p, y_train_10p)
y_pred_10 = knn_model_10.predict(test_t)


              precision    recall  f1-score   support

           0       0.87      0.43      0.57      1035
           1       0.34      0.83      0.49       374

    accuracy                           0.53      1409
   macro avg       0.61      0.63      0.53      1409
weighted avg       0.73      0.53      0.55      1409



In [77]:
svm_model_10 = svm.SVC(C = 1.0, kernel = 'rbf')
svm_model_10.fit(FAMD_train_10p, y_train_10p)
y_pred_10 = svm_model_10.predict(test_t)
print(classification_report(y_test, y_pred_10))

              precision    recall  f1-score   support

           0       0.82      0.67      0.73      1035
           1       0.39      0.59      0.47       374

    accuracy                           0.65      1409
   macro avg       0.60      0.63      0.60      1409
weighted avg       0.70      0.65      0.66      1409

