Import Packages

In [14]:
#Import the required packages
import pandas as pd
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
import imblearn
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from statistics import mean
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import silhouette_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import plot_confusion_matrix
import seaborn as sn
from sklearn.model_selection import cross_validate

Data wrangling

In [15]:
#Import the required dataset
# Three workbooks are read: the base churn sheet, the demographics sheet and
# the services sheet. They are joined on the customer identifier below.
data = pd.read_excel("Telco_customer_churn.xlsx")
demo = pd.read_excel("Telco_customer_churn_demographics copy.xlsx")
serv = pd.read_excel("Telco_customer_churn_services.xlsx")

#Renaming, merging, and dropping columns
# Align the key column name with the one used by the merges that follow.
data = data.rename(columns = {"CustomerID" : "Customer ID"})
data = pd.merge(data, demo[['Customer ID','Age', 'Married']], on = "Customer ID")
data = pd.merge(data, serv[['Customer ID', 'Number of Referrals', 'Avg Monthly GB Download', 'Streaming Music', 'Unlimited Data', 'Total Refunds', 'Total Extra Data Charges', 'Total Long Distance Charges']], on = 'Customer ID')
# 'Total Charges' is removed from the merged frame here and re-attached from
# the services sheet (together with 'Total Revenue') by the next merge.
data = data.drop(columns = ["City", "Lat Long", "Count", "Country", "State", "Churn Reason", 'Total Charges', 'Churn Label'])
data = pd.merge(data, serv[['Customer ID','Total Charges', 'Total Revenue']], on = "Customer ID")
# The identifier was only needed as a join key.
data = data.drop(columns = ['Customer ID'])

#Re-arranging the columns
# Explicit column order: the 34 feature columns first, the target
# ('Churn Value') last.
cols = ['Zip Code','Latitude','Longitude','Gender','Senior Citizen','Partner','Dependents','Tenure Months',
 'Phone Service','Multiple Lines','Internet Service','Online Security','Online Backup','Device Protection',
 'Tech Support','Streaming TV','Streaming Movies','Contract','Paperless Billing','Payment Method','Monthly Charges',
 'Churn Score','CLTV','Age','Married','Number of Referrals','Avg Monthly GB Download','Streaming Music',
 'Unlimited Data','Total Refunds','Total Extra Data Charges','Total Long Distance Charges','Total Charges',
 'Total Revenue','Churn Value']
data = data[cols]

Split into X and Y

In [16]:
#Split features from target variable
# Select by column name rather than by position, so the split stays correct
# if columns are added to or removed from `data`.
x = data.drop(columns = ['Churn Value'])
y = data['Churn Value']

Train/test split

In [17]:
# Hold out 20% of the rows as the test set. The split is stratified on the
# target and seeded so it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=50,
    stratify=y,
)

Oversample train X and Y

In [18]:
# Class distribution of the training target before oversampling.
y_train.value_counts()

0    4139
1    1495
Name: Churn Value, dtype: int64

In [19]:
# Columns SMOTENC must treat as categorical. They are listed by name and
# resolved to positions, instead of hard-coding positional indices.
categorical_cols = ['Gender', 'Senior Citizen', 'Partner', 'Dependents',
                    'Phone Service', 'Multiple Lines', 'Internet Service',
                    'Online Security', 'Online Backup', 'Device Protection',
                    'Tech Support', 'Streaming TV', 'Streaming Movies',
                    'Contract', 'Paperless Billing', 'Payment Method',
                    'Married', 'Streaming Music', 'Unlimited Data']
categorical_idx = [x_train.columns.get_loc(name) for name in categorical_cols]

# Seed the sampler so the synthetic minority rows are the same on every run.
oversample = SMOTENC(categorical_features = categorical_idx, random_state = 50)
# Only the training split is resampled; the test split is left untouched.
x_train, y_train = oversample.fit_resample(x_train, y_train)

In [20]:
# Class distribution of the training target after oversampling.
y_train.value_counts()

0    4139
1    4139
Name: Churn Value, dtype: int64

In [21]:
# Write the (oversampled) training features and the test features to CSV,
# with a header row and without the index column.
# NOTE(review): presumably these files feed the external FAMD step whose
# outputs are loaded in the next cell; confirm.
for frame, path in ((x_train, 'x_train_q1.csv'), (x_test, 'x_test_q1.csv')):
    frame.to_csv(path, header = True, index = False)

In [22]:
# Numeric suffixes of the FAMD workbooks to load, in the order they are read.
famd_dims = (43, 23, 16, 11, 7)

#Train sets (x)
(FAMD_train_43, FAMD_train_23, FAMD_train_16,
 FAMD_train_11, FAMD_train_7) = (
    pd.read_excel(f"FAMD_train_{dim}.xlsx") for dim in famd_dims
)

#Test sets (x)
(FAMD_test_43, FAMD_test_23, FAMD_test_16,
 FAMD_test_11, FAMD_test_7) = (
    pd.read_excel(f"FAMD_test_{dim}.xlsx") for dim in famd_dims
)

# SVM models

In [23]:
# Grid search over SVM kernel and C on the 43-column FAMD training set,
# scored by F1 with 10-fold stratified cross-validation.
# NOTE(review): y_train was oversampled before this search, so synthetic rows
# can land in both the fitting and validation folds; the CV scores are likely
# optimistic compared with the held-out test report. Confirm.
svm_model = svm.SVC()
kernel = ['poly', 'rbf', 'sigmoid']
C = [50, 10, 1.0, 0.1, 0.01]

# define grid search
grid = dict(kernel=kernel, C=C)
# Seed the shuffled folds so the scores are reproducible across runs.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=50)
grid_search = GridSearchCV(estimator=svm_model, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1', error_score=0)
grid_result = grid_search.fit(FAMD_train_43, y_train)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
# The loop variable is not called `mean`, because that would overwrite the
# `mean` function imported from `statistics` in the imports cell.
for mean_score, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean_score, stdev, param))

Best: 0.944949 using {'C': 1.0, 'kernel': 'rbf'}
0.927531 (0.007023) with: {'C': 50, 'kernel': 'poly'}
0.936917 (0.008948) with: {'C': 50, 'kernel': 'rbf'}
0.894743 (0.011158) with: {'C': 50, 'kernel': 'sigmoid'}
0.935727 (0.006072) with: {'C': 10, 'kernel': 'poly'}
0.943531 (0.006333) with: {'C': 10, 'kernel': 'rbf'}
0.894975 (0.010544) with: {'C': 10, 'kernel': 'sigmoid'}
0.936956 (0.007063) with: {'C': 1.0, 'kernel': 'poly'}
0.944949 (0.006345) with: {'C': 1.0, 'kernel': 'rbf'}
0.897585 (0.007383) with: {'C': 1.0, 'kernel': 'sigmoid'}
0.898417 (0.009814) with: {'C': 0.1, 'kernel': 'poly'}
0.928846 (0.007614) with: {'C': 0.1, 'kernel': 'rbf'}
0.930215 (0.010130) with: {'C': 0.1, 'kernel': 'sigmoid'}
0.795577 (0.008581) with: {'C': 0.01, 'kernel': 'poly'}
0.866667 (0.014054) with: {'C': 0.01, 'kernel': 'rbf'}
0.903176 (0.009988) with: {'C': 0.01, 'kernel': 'sigmoid'}


In [24]:
# Grid search over SVM kernel and C on the 23-column FAMD training set,
# scored by F1 with 10-fold stratified cross-validation.
# NOTE(review): y_train was oversampled before this search, so the CV scores
# are likely optimistic compared with the held-out test report. Confirm.
svm_model = svm.SVC()
kernel = ['poly', 'rbf', 'sigmoid']
C = [50, 10, 1.0, 0.1, 0.01]

# define grid search
grid = dict(kernel=kernel, C=C)
# Seed the shuffled folds so the scores are reproducible across runs.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=50)
grid_search = GridSearchCV(estimator=svm_model, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1', error_score=0)
grid_result = grid_search.fit(FAMD_train_23, y_train)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
# The loop variable is not called `mean`, because that would overwrite the
# `mean` function imported from `statistics` in the imports cell.
for mean_score, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean_score, stdev, param))

Best: 0.945637 using {'C': 10, 'kernel': 'rbf'}
0.930432 (0.009437) with: {'C': 50, 'kernel': 'poly'}
0.941060 (0.008646) with: {'C': 50, 'kernel': 'rbf'}
0.886251 (0.011007) with: {'C': 50, 'kernel': 'sigmoid'}
0.938372 (0.008835) with: {'C': 10, 'kernel': 'poly'}
0.945637 (0.006204) with: {'C': 10, 'kernel': 'rbf'}
0.886646 (0.010923) with: {'C': 10, 'kernel': 'sigmoid'}
0.934036 (0.009922) with: {'C': 1.0, 'kernel': 'poly'}
0.944785 (0.007783) with: {'C': 1.0, 'kernel': 'rbf'}
0.890186 (0.010387) with: {'C': 1.0, 'kernel': 'sigmoid'}
0.901207 (0.009145) with: {'C': 0.1, 'kernel': 'poly'}
0.928705 (0.011263) with: {'C': 0.1, 'kernel': 'rbf'}
0.925646 (0.006407) with: {'C': 0.1, 'kernel': 'sigmoid'}
0.803422 (0.010579) with: {'C': 0.01, 'kernel': 'poly'}
0.870375 (0.014064) with: {'C': 0.01, 'kernel': 'rbf'}
0.905247 (0.009583) with: {'C': 0.01, 'kernel': 'sigmoid'}


In [25]:
# Grid search over SVM kernel and C on the 16-column FAMD training set,
# scored by F1 with 10-fold stratified cross-validation.
# NOTE(review): y_train was oversampled before this search, so the CV scores
# are likely optimistic compared with the held-out test report. Confirm.
svm_model = svm.SVC()
kernel = ['poly', 'rbf', 'sigmoid']
C = [50, 10, 1.0, 0.1, 0.01]

# define grid search
grid = dict(kernel=kernel, C=C)
# Seed the shuffled folds so the scores are reproducible across runs.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=50)
grid_search = GridSearchCV(estimator=svm_model, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1', error_score=0)
grid_result = grid_search.fit(FAMD_train_16, y_train)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
# The loop variable is not called `mean`, because that would overwrite the
# `mean` function imported from `statistics` in the imports cell.
for mean_score, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean_score, stdev, param))

Best: 0.925255 using {'C': 10, 'kernel': 'rbf'}
0.910542 (0.009781) with: {'C': 50, 'kernel': 'poly'}
0.920686 (0.010532) with: {'C': 50, 'kernel': 'rbf'}
0.850573 (0.012216) with: {'C': 50, 'kernel': 'sigmoid'}
0.915904 (0.008401) with: {'C': 10, 'kernel': 'poly'}
0.925255 (0.008597) with: {'C': 10, 'kernel': 'rbf'}
0.855564 (0.013739) with: {'C': 10, 'kernel': 'sigmoid'}
0.914782 (0.007073) with: {'C': 1.0, 'kernel': 'poly'}
0.922630 (0.007218) with: {'C': 1.0, 'kernel': 'rbf'}
0.858363 (0.011816) with: {'C': 1.0, 'kernel': 'sigmoid'}
0.884397 (0.005523) with: {'C': 0.1, 'kernel': 'poly'}
0.904611 (0.007504) with: {'C': 0.1, 'kernel': 'rbf'}
0.886598 (0.009033) with: {'C': 0.1, 'kernel': 'sigmoid'}
0.807380 (0.007578) with: {'C': 0.01, 'kernel': 'poly'}
0.863392 (0.009224) with: {'C': 0.01, 'kernel': 'rbf'}
0.883906 (0.007950) with: {'C': 0.01, 'kernel': 'sigmoid'}


In [26]:
# Grid search over SVM kernel and C on the 11-column FAMD training set,
# scored by F1 with 10-fold stratified cross-validation.
# NOTE(review): y_train was oversampled before this search, so the CV scores
# are likely optimistic compared with the held-out test report. Confirm.
svm_model = svm.SVC()
kernel = ['poly', 'rbf', 'sigmoid']
C = [50, 10, 1.0, 0.1, 0.01]

# define grid search
grid = dict(kernel=kernel, C=C)
# Seed the shuffled folds so the scores are reproducible across runs.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=50)
grid_search = GridSearchCV(estimator=svm_model, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1', error_score=0)
grid_result = grid_search.fit(FAMD_train_11, y_train)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
# The loop variable is not called `mean`, because that would overwrite the
# `mean` function imported from `statistics` in the imports cell.
for mean_score, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean_score, stdev, param))

Best: 0.890555 using {'C': 50, 'kernel': 'rbf'}
0.886280 (0.012401) with: {'C': 50, 'kernel': 'poly'}
0.890555 (0.012584) with: {'C': 50, 'kernel': 'rbf'}
0.789765 (0.008159) with: {'C': 50, 'kernel': 'sigmoid'}
0.882403 (0.011533) with: {'C': 10, 'kernel': 'poly'}
0.889805 (0.013898) with: {'C': 10, 'kernel': 'rbf'}
0.789396 (0.018017) with: {'C': 10, 'kernel': 'sigmoid'}
0.880768 (0.011785) with: {'C': 1.0, 'kernel': 'poly'}
0.883657 (0.013677) with: {'C': 1.0, 'kernel': 'rbf'}
0.787888 (0.017790) with: {'C': 1.0, 'kernel': 'sigmoid'}
0.862047 (0.010748) with: {'C': 0.1, 'kernel': 'poly'}
0.875119 (0.011043) with: {'C': 0.1, 'kernel': 'rbf'}
0.819758 (0.015691) with: {'C': 0.1, 'kernel': 'sigmoid'}
0.805763 (0.010869) with: {'C': 0.01, 'kernel': 'poly'}
0.846857 (0.012255) with: {'C': 0.01, 'kernel': 'rbf'}
0.852914 (0.011491) with: {'C': 0.01, 'kernel': 'sigmoid'}


In [27]:
# Grid search over SVM kernel and C on the 7-column FAMD training set,
# scored by F1 with 10-fold stratified cross-validation.
# NOTE(review): y_train was oversampled before this search, so the CV scores
# are likely optimistic compared with the held-out test report. Confirm.
svm_model = svm.SVC()
kernel = ['poly', 'rbf', 'sigmoid']
C = [50, 10, 1.0, 0.1, 0.01]

# define grid search
grid = dict(kernel=kernel, C=C)
# Seed the shuffled folds so the scores are reproducible across runs.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=50)
grid_search = GridSearchCV(estimator=svm_model, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1', error_score=0)
grid_result = grid_search.fit(FAMD_train_7, y_train)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
# The loop variable is not called `mean`, because that would overwrite the
# `mean` function imported from `statistics` in the imports cell.
for mean_score, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean_score, stdev, param))

Best: 0.891112 using {'C': 50, 'kernel': 'rbf'}
0.883218 (0.012741) with: {'C': 50, 'kernel': 'poly'}
0.891112 (0.009126) with: {'C': 50, 'kernel': 'rbf'}
0.780758 (0.012504) with: {'C': 50, 'kernel': 'sigmoid'}
0.882501 (0.011826) with: {'C': 10, 'kernel': 'poly'}
0.886164 (0.010813) with: {'C': 10, 'kernel': 'rbf'}
0.780490 (0.013143) with: {'C': 10, 'kernel': 'sigmoid'}
0.878634 (0.011547) with: {'C': 1.0, 'kernel': 'poly'}
0.882615 (0.011487) with: {'C': 1.0, 'kernel': 'rbf'}
0.780519 (0.017598) with: {'C': 1.0, 'kernel': 'sigmoid'}
0.864382 (0.009219) with: {'C': 0.1, 'kernel': 'poly'}
0.873270 (0.010491) with: {'C': 0.1, 'kernel': 'rbf'}
0.806243 (0.013020) with: {'C': 0.1, 'kernel': 'sigmoid'}
0.808829 (0.008929) with: {'C': 0.01, 'kernel': 'poly'}
0.845321 (0.016309) with: {'C': 0.01, 'kernel': 'rbf'}
0.846542 (0.011860) with: {'C': 0.01, 'kernel': 'sigmoid'}


In [33]:
# Fit one RBF-kernel SVM per FAMD dimensionality, using the C value that the
# corresponding grid search reported as best, and predict on the matching
# FAMD test projection. `fit` returns the estimator, so fit and predict are
# chained.

# 100% variance = 43 dims
svm_model_43 = svm.SVC(kernel = 'rbf', C = 1.0)
y_pred_43 = svm_model_43.fit(FAMD_train_43, y_train).predict(FAMD_test_43)

# 90% variance = 23 dims
svm_model_23 = svm.SVC(kernel = 'rbf', C = 10)
y_pred_23 = svm_model_23.fit(FAMD_train_23, y_train).predict(FAMD_test_23)

# 80% variance = 16 dims
svm_model_16 = svm.SVC(kernel = 'rbf', C = 10)
y_pred_16 = svm_model_16.fit(FAMD_train_16, y_train).predict(FAMD_test_16)

# 70% variance = 11 dims
svm_model_11 = svm.SVC(kernel = 'rbf', C = 50)
y_pred_11 = svm_model_11.fit(FAMD_train_11, y_train).predict(FAMD_test_11)

# 60% variance = 7 dims
svm_model_7 = svm.SVC(kernel = 'rbf', C = 50)
y_pred_7 = svm_model_7.fit(FAMD_train_7, y_train).predict(FAMD_test_7)


# One report per model against the held-out labels, from most to fewest
# dimensions.
for test_predictions in (y_pred_43, y_pred_23, y_pred_16, y_pred_11, y_pred_7):
    print(classification_report(y_test, test_predictions))

              precision    recall  f1-score   support

           0       0.86      0.66      0.75      1035
           1       0.43      0.70      0.53       374

    accuracy                           0.67      1409
   macro avg       0.64      0.68      0.64      1409
weighted avg       0.75      0.67      0.69      1409

              precision    recall  f1-score   support

           0       0.85      0.66      0.74      1035
           1       0.42      0.68      0.52       374

    accuracy                           0.66      1409
   macro avg       0.64      0.67      0.63      1409
weighted avg       0.74      0.66      0.68      1409

              precision    recall  f1-score   support

           0       0.84      0.61      0.71      1035
           1       0.39      0.69      0.50       374

    accuracy                           0.63      1409
   macro avg       0.62      0.65      0.61      1409
weighted avg       0.72      0.63      0.66      1409

              preci

# KNN models

In [28]:
# Grid search over KNN neighbours, weighting and distance metric on the
# 43-column FAMD training set, scored by F1 with 10-fold stratified CV.
# NOTE(review): y_train was oversampled before this search, so the CV scores
# are likely optimistic compared with the held-out test report. Confirm.
# define models and parameters
knn_model = KNeighborsClassifier()
n_neighbors = range(1, 21, 2)  # odd k from 1 to 19
weights = ['uniform', 'distance']
# NOTE(review): with the estimator's default p=2, 'minkowski' should score the
# same as 'euclidean'; consider dropping it to shrink the grid.
metric = ['euclidean', 'manhattan', 'minkowski']
# define grid search
grid = dict(n_neighbors=n_neighbors, weights=weights, metric=metric)
# Seed the shuffled folds so the scores are reproducible across runs.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=50)
grid_search = GridSearchCV(estimator=knn_model, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1', error_score=0)
grid_result = grid_search.fit(FAMD_train_43, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
# The loop variable is not called `mean`, because that would overwrite the
# `mean` function imported from `statistics` in the imports cell.
for mean_score, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean_score, stdev, param))

Best: 0.893493 using {'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'}
0.868389 (0.010149) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'uniform'}
0.868389 (0.010149) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'distance'}
0.885739 (0.008040) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
0.885968 (0.007813) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
0.885895 (0.006586) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}
0.886856 (0.006660) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
0.886861 (0.005510) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'uniform'}
0.887408 (0.005471) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'distance'}
0.885154 (0.007935) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}
0.886304 (0.007743) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'distance'}
0.882630 

In [29]:
# Grid search over KNN neighbours, weighting and distance metric on the
# 23-column FAMD training set, scored by F1 with 10-fold stratified CV.
# NOTE(review): y_train was oversampled before this search, so the CV scores
# are likely optimistic compared with the held-out test report. Confirm.
# define models and parameters
knn_model = KNeighborsClassifier()
n_neighbors = range(1, 21, 2)  # odd k from 1 to 19
weights = ['uniform', 'distance']
# NOTE(review): with the estimator's default p=2, 'minkowski' should score the
# same as 'euclidean'; consider dropping it to shrink the grid.
metric = ['euclidean', 'manhattan', 'minkowski']
# define grid search
grid = dict(n_neighbors=n_neighbors, weights=weights, metric=metric)
# Seed the shuffled folds so the scores are reproducible across runs.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=50)
grid_search = GridSearchCV(estimator=knn_model, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1', error_score=0)
grid_result = grid_search.fit(FAMD_train_23, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
# The loop variable is not called `mean`, because that would overwrite the
# `mean` function imported from `statistics` in the imports cell.
for mean_score, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean_score, stdev, param))

Best: 0.890599 using {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
0.868549 (0.003959) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'uniform'}
0.868549 (0.003959) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'distance'}
0.884882 (0.006345) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
0.885043 (0.006057) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
0.889895 (0.006857) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}
0.890599 (0.007688) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
0.888085 (0.005712) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'uniform'}
0.888613 (0.005464) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'distance'}
0.887845 (0.006528) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}
0.888949 (0.006716) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'distance'}
0.883340 

In [30]:
# Grid search over KNN neighbours, weighting and distance metric on the
# 16-column FAMD training set, scored by F1 with 10-fold stratified CV.
# NOTE(review): y_train was oversampled before this search, so the CV scores
# are likely optimistic compared with the held-out test report. Confirm.
# define models and parameters
knn_model = KNeighborsClassifier()
n_neighbors = range(1, 21, 2)  # odd k from 1 to 19
weights = ['uniform', 'distance']
# NOTE(review): with the estimator's default p=2, 'minkowski' should score the
# same as 'euclidean'; consider dropping it to shrink the grid.
metric = ['euclidean', 'manhattan', 'minkowski']
# define grid search
grid = dict(n_neighbors=n_neighbors, weights=weights, metric=metric)
# Seed the shuffled folds so the scores are reproducible across runs.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=50)
grid_search = GridSearchCV(estimator=knn_model, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1', error_score=0)
grid_result = grid_search.fit(FAMD_train_16, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
# The loop variable is not called `mean`, because that would overwrite the
# `mean` function imported from `statistics` in the imports cell.
for mean_score, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean_score, stdev, param))

Best: 0.891311 using {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
0.869591 (0.016481) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'uniform'}
0.869591 (0.016481) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'distance'}
0.885458 (0.015104) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
0.887594 (0.015157) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
0.889463 (0.013610) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}
0.891311 (0.013492) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
0.888552 (0.011983) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'uniform'}
0.891102 (0.011799) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'distance'}
0.887777 (0.012610) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}
0.890522 (0.011704) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'distance'}
0.888247 

In [31]:
# Grid search over KNN neighbours, weighting and distance metric on the
# 11-column FAMD training set, scored by F1 with 10-fold stratified CV.
# NOTE(review): y_train was oversampled before this search, so the CV scores
# are likely optimistic compared with the held-out test report. Confirm.
# define models and parameters
knn_model = KNeighborsClassifier()
n_neighbors = range(1, 21, 2)  # odd k from 1 to 19
weights = ['uniform', 'distance']
# NOTE(review): with the estimator's default p=2, 'minkowski' should score the
# same as 'euclidean'; consider dropping it to shrink the grid.
metric = ['euclidean', 'manhattan', 'minkowski']
# define grid search
grid = dict(n_neighbors=n_neighbors, weights=weights, metric=metric)
# Seed the shuffled folds so the scores are reproducible across runs.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=50)
grid_search = GridSearchCV(estimator=knn_model, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1', error_score=0)
grid_result = grid_search.fit(FAMD_train_11, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
# The loop variable is not called `mean`, because that would overwrite the
# `mean` function imported from `statistics` in the imports cell.
for mean_score, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean_score, stdev, param))

Best: 0.882271 using {'metric': 'manhattan', 'n_neighbors': 7, 'weights': 'distance'}
0.852051 (0.012979) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'uniform'}
0.852051 (0.012979) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'distance'}
0.873004 (0.012175) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
0.874193 (0.012259) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
0.876449 (0.008149) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}
0.877973 (0.008522) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
0.875622 (0.009115) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'uniform'}
0.878546 (0.007228) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'distance'}
0.877409 (0.008734) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}
0.880769 (0.007963) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'distance'}
0.877337 

In [32]:
# Grid search over KNN neighbours, weighting and distance metric on the
# 7-column FAMD training set, scored by F1 with 10-fold stratified CV.
# NOTE(review): y_train was oversampled before this search, so the CV scores
# are likely optimistic compared with the held-out test report. Confirm.
# define models and parameters
knn_model = KNeighborsClassifier()
n_neighbors = range(1, 21, 2)  # odd k from 1 to 19
weights = ['uniform', 'distance']
# NOTE(review): with the estimator's default p=2, 'minkowski' should score the
# same as 'euclidean'; consider dropping it to shrink the grid.
metric = ['euclidean', 'manhattan', 'minkowski']
# define grid search
grid = dict(n_neighbors=n_neighbors, weights=weights, metric=metric)
# Seed the shuffled folds so the scores are reproducible across runs.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=50)
grid_search = GridSearchCV(estimator=knn_model, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1', error_score=0)
grid_result = grid_search.fit(FAMD_train_7, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
# The loop variable is not called `mean`, because that would overwrite the
# `mean` function imported from `statistics` in the imports cell.
for mean_score, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean_score, stdev, param))

Best: 0.885423 using {'metric': 'euclidean', 'n_neighbors': 17, 'weights': 'distance'}
0.857561 (0.014363) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'uniform'}
0.857561 (0.014363) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'distance'}
0.881026 (0.010649) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
0.881375 (0.011369) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
0.877656 (0.012708) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}
0.879816 (0.012189) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
0.877943 (0.008810) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'uniform'}
0.881242 (0.009110) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'distance'}
0.879935 (0.009696) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}
0.882994 (0.009399) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'distance'}
0.881843

In [34]:
# Fit one distance-weighted KNN classifier per FAMD dimensionality, using the
# metric and k that the corresponding grid search reported as best, and
# predict on the matching FAMD test projection. `fit` returns the estimator,
# so fit and predict are chained.

# 100% variance = 43 dims
knn_model_43 = KNeighborsClassifier(n_neighbors = 5, weights = "distance", metric = "manhattan")
y_pred_43 = knn_model_43.fit(FAMD_train_43, y_train).predict(FAMD_test_43)

# 90% variance = 23 dims
knn_model_23 = KNeighborsClassifier(n_neighbors = 5, weights = "distance", metric = "euclidean")
y_pred_23 = knn_model_23.fit(FAMD_train_23, y_train).predict(FAMD_test_23)

# 80% variance = 16 dims
knn_model_16 = KNeighborsClassifier(n_neighbors = 5, weights = "distance", metric = "euclidean")
y_pred_16 = knn_model_16.fit(FAMD_train_16, y_train).predict(FAMD_test_16)

# 70% variance = 11 dims
knn_model_11 = KNeighborsClassifier(n_neighbors = 7, weights = "distance", metric = "manhattan")
y_pred_11 = knn_model_11.fit(FAMD_train_11, y_train).predict(FAMD_test_11)

# 60% variance = 7 dims
knn_model_7 = KNeighborsClassifier(n_neighbors = 17, weights = "distance", metric = "euclidean")
y_pred_7 = knn_model_7.fit(FAMD_train_7, y_train).predict(FAMD_test_7)


# One report per model against the held-out labels, from most to fewest
# dimensions.
for test_predictions in (y_pred_43, y_pred_23, y_pred_16, y_pred_11, y_pred_7):
    print(classification_report(y_test, test_predictions))

              precision    recall  f1-score   support

           0       0.90      0.47      0.62      1035
           1       0.37      0.86      0.52       374

    accuracy                           0.58      1409
   macro avg       0.64      0.67      0.57      1409
weighted avg       0.76      0.58      0.59      1409

              precision    recall  f1-score   support

           0       0.90      0.51      0.65      1035
           1       0.38      0.85      0.53       374

    accuracy                           0.60      1409
   macro avg       0.64      0.68      0.59      1409
weighted avg       0.77      0.60      0.62      1409

              precision    recall  f1-score   support

           0       0.89      0.59      0.71      1035
           1       0.41      0.79      0.54       374

    accuracy                           0.65      1409
   macro avg       0.65      0.69      0.63      1409
weighted avg       0.76      0.65      0.67      1409

              preci