In [1]:
#Import the required packages
import pandas as pd
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
import imblearn
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from statistics import mean
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import silhouette_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import plot_confusion_matrix
import seaborn as sn
from sklearn.model_selection import cross_validate

In [2]:
#Import the required dataset
data = pd.read_excel("Telco_customer_churn.xlsx")
demo = pd.read_excel("Telco_customer_churn_demographics copy.xlsx")
serv = pd.read_excel("Telco_customer_churn_services.xlsx")

#Renaming, merging, and dropping columns
data = data.rename(columns = {"CustomerID" : "Customer ID"})
data = pd.merge(data, demo[['Customer ID','Age', 'Married']], on = "Customer ID")
data = pd.merge(data, serv[['Customer ID', 'Number of Referrals', 'Avg Monthly GB Download', 'Streaming Music', 'Unlimited Data', 'Total Refunds', 'Total Extra Data Charges', 'Total Long Distance Charges']], on = 'Customer ID')
data = data.drop(columns = ["City", "Lat Long", "Count", "Country", "State", "Churn Reason", 'Total Charges', 'Churn Label'])
data = pd.merge(data, serv[['Customer ID','Total Charges', 'Total Revenue']], on = "Customer ID")
data = data.drop(columns = ['Customer ID'])

#Re-arranging the columns
cols = list(data.columns.values)
cols = ['Zip Code','Latitude','Longitude','Gender','Senior Citizen','Partner','Dependents','Tenure Months',
 'Phone Service','Multiple Lines','Internet Service','Online Security','Online Backup','Device Protection',
 'Tech Support','Streaming TV','Streaming Movies','Contract','Paperless Billing','Payment Method','Monthly Charges',
 'Churn Score','CLTV','Age','Married','Number of Referrals','Avg Monthly GB Download','Streaming Music',
 'Unlimited Data','Total Refunds','Total Extra Data Charges','Total Long Distance Charges','Total Charges',
 'Total Revenue','Churn Value']
data = data[cols]

In [3]:
#Split features from target variable
x = data.iloc[:, :34]
y = data.iloc[: , -1]

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    stratify=y, 
                                                    test_size=0.20, random_state = 50)

In [5]:
y_train.value_counts()

0    4139
1    1495
Name: Churn Value, dtype: int64

In [6]:
oversample = SMOTENC(categorical_features = [3,4,5,6,8,9,10,11,12,13,14,15,16,17,18,19,24,27,28])
x_train, y_train = oversample.fit_resample(x_train, y_train)

In [7]:
y_train.value_counts()

0    4139
1    4139
Name: Churn Value, dtype: int64

In [8]:
x_train = pd.get_dummies(x_train.iloc[:, 1 :])

x_train.head()

Unnamed: 0,Latitude,Longitude,Tenure Months,Monthly Charges,Churn Score,CLTV,Age,Number of Referrals,Avg Monthly GB Download,Total Refunds,...,Payment Method_Bank transfer (automatic),Payment Method_Credit card (automatic),Payment Method_Electronic check,Payment Method_Mailed check,Married_No,Married_Yes,Streaming Music_No,Streaming Music_Yes,Unlimited Data_No,Unlimited Data_Yes
0,34.613476,-118.256358,11,20.45,29,2642,63,0,0,0.0,...,0,1,0,0,1,0,1,0,1,0
1,33.391181,-118.421305,25,58.6,99,4190,27,6,48,4.48,...,0,1,0,0,0,1,0,1,0,1
2,38.421458,-122.365048,36,63.7,31,2504,36,8,14,0.0,...,0,1,0,0,0,1,0,1,0,1
3,33.933565,-118.062611,2,91.15,52,2686,36,0,12,0.0,...,0,0,1,0,1,0,1,0,1,0
4,38.672708,-122.403219,9,48.75,97,4359,51,1,10,0.0,...,0,0,1,0,0,1,0,1,1,0


In [10]:
x_test = pd.get_dummies(x_test.iloc[:, 1 :])
x_test.head()

Unnamed: 0,Latitude,Longitude,Tenure Months,Monthly Charges,Churn Score,CLTV,Age,Number of Referrals,Avg Monthly GB Download,Total Refunds,...,Payment Method_Bank transfer (automatic),Payment Method_Credit card (automatic),Payment Method_Electronic check,Payment Method_Mailed check,Married_No,Married_Yes,Streaming Music_No,Streaming Music_Yes,Unlimited Data_No,Unlimited Data_Yes
1482,36.385818,-119.224243,1,20.2,87,5394,28,0,0,0.0,...,1,0,0,0,1,0,1,0,1,0
5461,41.017282,-121.468945,7,48.8,51,4785,56,1,28,0.0,...,0,0,1,0,0,1,1,0,0,1
2595,37.722727,-122.02157,59,75.95,46,5446,40,4,22,0.0,...,0,1,0,0,0,1,0,1,0,1
4405,33.887676,-118.127289,25,19.5,36,4761,29,0,0,0.0,...,1,0,0,0,1,0,1,0,1,0
424,41.750354,-120.403885,23,94.75,95,2740,77,0,23,0.0,...,0,0,1,0,1,0,1,0,0,1


In [11]:
sc = StandardScaler()
x_train_pca = sc.fit_transform(x_train)
x_test_pca = sc.fit_transform(x_test) 
model_100 = PCA(n_components = 42)
model_90 = PCA(n_components = 22)
model_80 = PCA(n_components = 16)
model_70 = PCA(n_components = 11)
model_60 = PCA(n_components = 7)
x_pca_comp_100_train = model_100.fit_transform(x_train_pca)
x_pca_comp_90_train = model_90.fit_transform(x_train_pca)
x_pca_comp_80_train = model_80.fit_transform(x_train_pca)
x_pca_comp_70_train = model_70.fit_transform(x_train_pca)
x_pca_comp_60_train = model_60.fit_transform(x_train_pca)

x_pca_comp_100_test = model_100.fit_transform(x_test_pca)
x_pca_comp_90_test = model_90.fit_transform(x_test_pca)
x_pca_comp_80_test = model_80.fit_transform(x_test_pca)
x_pca_comp_70_test = model_70.fit_transform(x_test_pca)
x_pca_comp_60_test = model_60.fit_transform(x_test_pca)

In [12]:
pca_df_100_train=pd.DataFrame(x_pca_comp_100_train,columns=['pca1','pca2','pca3','pca4','pca5','pca6','pca7','pca8','pca9','pca10',
                                        'pca11','pca12','pca13','pca14','pca15','pca16','pca17','pca18','pca19',
                                        'pca20','pca21','pca22','pca23','pca24','pca25','pca26','pca27','pca28',
                                        'pca29','pca30','pca31','pca32','pca33','pca34','pca35','pca36','pca37',
                                        'pca38','pca39','pca40','pca41','pca42'])
pca_df_100_test=pd.DataFrame(x_pca_comp_100_test,columns=['pca1','pca2','pca3','pca4','pca5','pca6','pca7','pca8','pca9','pca10',
                                        'pca11','pca12','pca13','pca14','pca15','pca16','pca17','pca18','pca19',
                                        'pca20','pca21','pca22','pca23','pca24','pca25','pca26','pca27','pca28',
                                        'pca29','pca30','pca31','pca32','pca33','pca34','pca35','pca36','pca37',
                                        'pca38','pca39','pca40','pca41','pca42'])

In [13]:
pca_df_90_train=pd.DataFrame(x_pca_comp_90_train,columns=['pca1','pca2','pca3','pca4','pca5','pca6','pca7','pca8','pca9','pca10',
                                        'pca11','pca12','pca13','pca14','pca15','pca16','pca17','pca18','pca19',
                                        'pca20','pca21','pca22'])
pca_df_90_test=pd.DataFrame(x_pca_comp_90_test,columns=['pca1','pca2','pca3','pca4','pca5','pca6','pca7','pca8','pca9','pca10',
                                        'pca11','pca12','pca13','pca14','pca15','pca16','pca17','pca18','pca19',
                                        'pca20','pca21','pca22'])

In [14]:
pca_df_80_train=pd.DataFrame(x_pca_comp_80_train,columns=['pca1','pca2','pca3','pca4','pca5','pca6','pca7','pca8','pca9','pca10',
                                        'pca11','pca12','pca13','pca14','pca15','pca16'])
pca_df_80_test=pd.DataFrame(x_pca_comp_80_test,columns=['pca1','pca2','pca3','pca4','pca5','pca6','pca7','pca8','pca9','pca10',
                                        'pca11','pca12','pca13','pca14','pca15','pca16'])

In [15]:
pca_df_70_train=pd.DataFrame(x_pca_comp_70_train,columns=['pca1','pca2','pca3','pca4','pca5','pca6','pca7','pca8','pca9','pca10',
                                        'pca11'])
pca_df_70_test=pd.DataFrame(x_pca_comp_70_test,columns=['pca1','pca2','pca3','pca4','pca5','pca6','pca7','pca8','pca9','pca10',
                                        'pca11'])

In [16]:
pca_df_60_train=pd.DataFrame(x_pca_comp_60_train,columns=['pca1','pca2','pca3','pca4','pca5','pca6','pca7'])
pca_df_60_test=pd.DataFrame(x_pca_comp_60_test,columns=['pca1','pca2','pca3','pca4','pca5','pca6','pca7'])

# SVM models

In [17]:
svm_model_43 = svm.SVC(C = 1.0, kernel = 'rbf')
svm_model_23 = svm.SVC(C = 10, kernel = 'rbf')
svm_model_16 = svm.SVC(C = 10, kernel = 'rbf')
svm_model_11 = svm.SVC(C = 50, kernel = 'rbf')
svm_model_7 = svm.SVC(C = 50, kernel = 'rbf')


# 100% variance = 43 dims
svm_model_43.fit(pca_df_100_train, y_train)
y_pred_43 = svm_model_43.predict(pca_df_100_test)

# 90% variance = 23 dims
svm_model_23.fit(pca_df_90_train, y_train)
y_pred_23 = svm_model_23.predict(pca_df_90_test)

# 80% variance = 16 dims
svm_model_16.fit(pca_df_80_train, y_train)
y_pred_16 = svm_model_16.predict(pca_df_80_test)

# 70% variance = 11 dims
svm_model_11.fit(pca_df_70_train, y_train)
y_pred_11 = svm_model_11.predict(pca_df_70_test)

# 60% variance = 7 dims
svm_model_7.fit(pca_df_60_train, y_train)
y_pred_7 = svm_model_7.predict(pca_df_60_test)


print(classification_report(y_test, y_pred_43))
print(classification_report(y_test, y_pred_23))
print(classification_report(y_test, y_pred_16))
print(classification_report(y_test, y_pred_11))
print(classification_report(y_test, y_pred_7))

              precision    recall  f1-score   support

           0       0.90      0.72      0.80      1035
           1       0.51      0.78      0.61       374

    accuracy                           0.74      1409
   macro avg       0.70      0.75      0.71      1409
weighted avg       0.80      0.74      0.75      1409

              precision    recall  f1-score   support

           0       0.89      0.64      0.75      1035
           1       0.44      0.79      0.56       374

    accuracy                           0.68      1409
   macro avg       0.67      0.71      0.65      1409
weighted avg       0.77      0.68      0.70      1409

              precision    recall  f1-score   support

           0       0.89      0.64      0.75      1035
           1       0.44      0.79      0.56       374

    accuracy                           0.68      1409
   macro avg       0.67      0.71      0.65      1409
weighted avg       0.77      0.68      0.70      1409

              preci

# K-NN models

In [18]:
knn_model_43 = KNeighborsClassifier(metric = "manhattan", n_neighbors = 5, weights = "distance")
knn_model_23 = KNeighborsClassifier(metric = "euclidean", n_neighbors = 5, weights = "distance")
knn_model_16 = KNeighborsClassifier(metric = "euclidean", n_neighbors = 5, weights = "distance")
knn_model_11 = KNeighborsClassifier(metric = "manhattan", n_neighbors = 7, weights = "distance")
knn_model_7 = KNeighborsClassifier(metric = "euclidean", n_neighbors = 17, weights = "distance")


# 100% variance = 43 dims
knn_model_43.fit(pca_df_100_train, y_train)
y_pred_43 = knn_model_43.predict(pca_df_100_test)

# 90% variance = 23 dims
knn_model_23.fit(pca_df_90_train, y_train)
y_pred_23 = knn_model_23.predict(pca_df_90_test)

# 80% variance = 16 dims
knn_model_16.fit(pca_df_80_train, y_train)
y_pred_16 = knn_model_16.predict(pca_df_80_test)

# 70% variance = 11 dims
knn_model_11.fit(pca_df_70_train, y_train)
y_pred_11 = knn_model_11.predict(pca_df_70_test)

# 60% variance = 7 dims
knn_model_7.fit(pca_df_60_train, y_train)
y_pred_7 = knn_model_7.predict(pca_df_60_test)


print(classification_report(y_test, y_pred_43))
print(classification_report(y_test, y_pred_23))
print(classification_report(y_test, y_pred_16))
print(classification_report(y_test, y_pred_11))
print(classification_report(y_test, y_pred_7))

              precision    recall  f1-score   support

           0       0.92      0.51      0.66      1035
           1       0.39      0.87      0.54       374

    accuracy                           0.61      1409
   macro avg       0.65      0.69      0.60      1409
weighted avg       0.78      0.61      0.62      1409

              precision    recall  f1-score   support

           0       0.92      0.58      0.71      1035
           1       0.42      0.86      0.57       374

    accuracy                           0.65      1409
   macro avg       0.67      0.72      0.64      1409
weighted avg       0.79      0.65      0.67      1409

              precision    recall  f1-score   support

           0       0.91      0.62      0.74      1035
           1       0.44      0.83      0.57       374

    accuracy                           0.67      1409
   macro avg       0.67      0.72      0.66      1409
weighted avg       0.78      0.67      0.69      1409

              preci