# COMP 7810 PROJECT 
## (Adilet Uvaliyev (24451703))

This project is about churn prediction in the telecom industry.

In [None]:
### Import libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# 1. Initial Exploratory Data Analysis

In [None]:
## Load the raw Telco customer churn data
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
## Take a look at first 5 rows
print(df.head())
## Print the shape and info
print(df.shape)
## DataFrame.info() writes its report to stdout itself and returns None,
## so it is called directly instead of being wrapped in print()
df.info()
## Get info about numerical features
print(df.describe())
## Get info about non numerical features
print(df.describe(include=["object"]))

In [None]:
## Plot the table about the statistics of each feature

# 2. Pre-processing

In [None]:
#### Pre-processing: clean the raw frame before modelling
### The customer ID is an identifier, not a feature
df = df.drop(columns=['customerID'])
## Remove rows that are exact duplicates of another row
df.drop_duplicates(inplace=True)


## Charges must be numeric; entries that cannot be parsed become NaN
for charge_col in ('TotalCharges', 'MonthlyCharges'):
    df[charge_col] = pd.to_numeric(df[charge_col], errors='coerce')

## SeniorCitizen becomes a Yes/No category, Churn becomes a 0/1 target
senior_labels = {0: "No", 1: "Yes"}
churn_codes = {"No": 0, "Yes": 1}
df["SeniorCitizen"] = df["SeniorCitizen"].map(senior_labels)
df["Churn"] = df["Churn"].map(churn_codes)



## Discard every row that still contains a missing value
df = df.dropna()


## Class balance of the target (displayed as the cell output)
df["Churn"].value_counts()



In [None]:
### Box plots of the numerical features, used for outlier detection
num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# One panel per numerical feature, laid out side by side in a single row
for position, column in enumerate(num_cols, start=1):
    plt.subplot(1, len(num_cols), position)
    sns.boxplot(y=df[column])
    plt.title(column)

# Tidy the spacing, then write the figure to disk before displaying it
plt.tight_layout()
plt.savefig('box_plot.png')
plt.show()

# 3. Exploratory Analysis after Pre-processing

In [None]:
## Take a look at first 5 rows
print(df.head())
## Print the shape and info
print(df.shape)
## DataFrame.info() writes its report to stdout itself and returns None,
## so it is called directly instead of being wrapped in print()
df.info()
## Get info about numerical features
print(df.describe())
## Get info about non numerical features
print(df.describe(include=["object"]))


## Print the frequency of every category of each qualitative (string) column
qual_cols = df.select_dtypes(include=['object']).columns
for col in qual_cols:
    print(df[col].value_counts())

# 4. Modelling

### 4.a One-Hot Encoding of Categorical Features

In [None]:
# String-typed columns are the categorical features; the target is never encoded
categoric_cols = [name for name in df.columns if name != 'Churn' and df[name].dtype == 'object']

# Replace each categorical column with one indicator column per category
df_enc = pd.get_dummies(df, columns=categoric_cols)

## Separate the target from the feature matrix
Y = df_enc['Churn']
X = df_enc.drop(columns='Churn')
# Remember the column labels of the feature matrix for later reporting
feature_names = X.columns

### 4.b Normalization

In [None]:
## Standardize every feature column; X is replaced by the transformed array
## NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
## split performed further down, so test rows contribute to the scaling
## statistics. Consider fitting on the training split only - confirm intent.
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

### 4.c Train test split and Data imbalance problem

In [None]:
## Hold out 20% of the rows for testing (seeded so the split is reproducible)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, shuffle=True, random_state = 0)

### Use SMOTE to upsample the minority class
### Only the training split is resampled; the test split is left untouched.
### random_state is fixed so the synthetic samples, and therefore every metric
### reported below, are reproducible between runs like the other seeded steps.
smote = SMOTE(sampling_strategy='minority', random_state=0)
X_train, Y_train = smote.fit_resample(X_train, Y_train)

## Collects one metrics dict per trained model for the comparison plot
results = []

In [None]:
## Peek at the resampled training labels.
## The variable is named Y_train (capital Y); `y_train` was never defined and raised NameError.
print(Y_train.head())

In [None]:
## Inspect the one-hot encoded frame.
## DataFrame.info() prints its own report and returns None, so it is not wrapped in print()
df_enc.info()
print(df_enc.head())

### 4.d SVM Model

In [None]:
## Define and train the SVC model
svc_model = SVC(random_state = 2)
svc_model.fit(X_train,Y_train)
Y_preds = svc_model.predict(X_test)
## ROC AUC measures how well the model ranks samples, so it needs continuous
## scores; hard 0/1 predictions collapse the ROC curve to a single threshold
Y_scores = svc_model.decision_function(X_test)


## Calculate every metric once (raw 0-1 values), then print them
svm_scores = {
    'Accuracy': accuracy_score(Y_test, Y_preds),
    'Precision': precision_score(Y_test, Y_preds, pos_label=1),
    'Recall': recall_score(Y_test, Y_preds, pos_label=1),
    'F1': f1_score(Y_test, Y_preds, pos_label=1),
    'AUC': roc_auc_score(Y_test, Y_scores),
}
print("Accuracy:", svm_scores['Accuracy'])
print("Precision:", svm_scores['Precision'])
print("Recall:", svm_scores['Recall'])
print("F1 Score:", svm_scores['F1'])
print("AUC:", svm_scores['AUC'])


## Save results (as percentages with 2 decimals) for later plot
result = {metric: round(float(100 * value), 2) for metric, value in svm_scores.items()}
result['Model'] = 'SVM'
results.append(result)


## Display and save confusion Matrix
disp = ConfusionMatrixDisplay(confusion_matrix(Y_test, Y_preds))
disp.plot().figure_.savefig('confusion_matrix_SVM.png')

### 4.e RF Model

In [None]:
## Show the metric dictionaries collected so far in `results`
print(results)

In [None]:
## Define and train RF model
RF = RandomForestClassifier(n_estimators=500 , random_state =1)
RF.fit(X_train, Y_train)
Y_preds = RF.predict(X_test)
## ROC AUC measures how well the model ranks samples, so it needs continuous
## scores: use the predicted probability of the positive class (label 1)
## instead of the hard 0/1 predictions
Y_scores = RF.predict_proba(X_test)[:, 1]

## Calculate every metric once (raw 0-1 values), then print them
rf_scores = {
    'Accuracy': accuracy_score(Y_test, Y_preds),
    'Precision': precision_score(Y_test, Y_preds, pos_label=1),
    'Recall': recall_score(Y_test, Y_preds, pos_label=1),
    'F1': f1_score(Y_test, Y_preds, pos_label=1),
    'AUC': roc_auc_score(Y_test, Y_scores),
}
print("Accuracy:", rf_scores['Accuracy'])
print("Precision:", rf_scores['Precision'])
print("Recall:", rf_scores['Recall'])
print("F1 Score:", rf_scores['F1'])
print("AUC:", rf_scores['AUC'])


## Display and save confusion Matrix
disp = ConfusionMatrixDisplay(confusion_matrix(Y_test, Y_preds))
disp.plot().figure_.savefig('confusion_matrix_RF.png')

## Save results (as percentages with 2 decimals) for later plot
result = {metric: round(float(100 * value), 2) for metric, value in rf_scores.items()}
result['Model'] = 'RF'
results.append(result)

## 4.F Decision Tree algorithm

In [None]:
## Define and train DT algorithm
## random_state is fixed so the tree, and the metrics below, are reproducible
## between runs, like the seeded SVM and RF models
d_tree = DecisionTreeClassifier(random_state=0)
d_tree.fit(X_train, Y_train)
Y_preds = d_tree.predict(X_test)
## ROC AUC measures how well the model ranks samples, so it needs continuous
## scores: use the predicted probability of the positive class (label 1)
## instead of the hard 0/1 predictions
Y_scores = d_tree.predict_proba(X_test)[:, 1]


## Calculate every metric once (raw 0-1 values), then print them
dt_scores = {
    'Accuracy': accuracy_score(Y_test, Y_preds),
    'Precision': precision_score(Y_test, Y_preds, pos_label=1),
    'Recall': recall_score(Y_test, Y_preds, pos_label=1),
    'F1': f1_score(Y_test, Y_preds, pos_label=1),
    'AUC': roc_auc_score(Y_test, Y_scores),
}
print("Accuracy:", dt_scores['Accuracy'])
print("Precision:", dt_scores['Precision'])
print("Recall:", dt_scores['Recall'])
print("F1 Score:", dt_scores['F1'])
print("AUC:", dt_scores['AUC'])


## Save results (as percentages with 2 decimals) for later plot
result = {metric: round(float(100 * value), 2) for metric, value in dt_scores.items()}
result['Model'] = 'Decision Tree'
results.append(result)

## Display and save confusion Matrix
disp = ConfusionMatrixDisplay(confusion_matrix(Y_test, Y_preds))
disp.plot().figure_.savefig('confusion_matrix_DT.png')

## 4.G Plot the comparison curve

In [None]:
### Plot the comparison figure: one group of bars per model, one bar per metric

# Short tags stored in `results` -> labels shown on the x-axis
display_names = {'RF': 'Random Forest'}

# Keep only the most recent entry per model. Re-running a training cell appends
# a second dict to `results`; without this the bar heights would no longer
# match the number of x positions.
latest = {}
for entry in results:
    latest[entry['Model']] = entry

# Labels are derived from the stored results so they always line up with the bars
models = [display_names.get(tag, tag) for tag in latest]

metric_keys = ['Accuracy', 'Precision', 'Recall', 'F1', 'AUC']
metric_labels = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC']
accuracy, precision, recall, f1, AUC = (
    [entry[key] for entry in latest.values()] for key in metric_keys
)
# Rows = metrics, columns = models
metrics = np.array([accuracy, precision, recall, f1, AUC])

# Set the bar width and positions
bar_width = 0.15
x = np.arange(len(models))
colors = ['#A1C6EA',
          '#FFD700',
          '#FFA500',
          '#003366',
          '#A9A9A9']

# Create the bar chart: the k-th metric is shifted k bar widths to the right
plt.figure(figsize=(12, 7))
for offset, (label, values, color) in enumerate(zip(metric_labels, metrics, colors)):
    plt.bar(x + offset * bar_width, values, width=bar_width, label=label, color=color, align='center')

# Adding labels and title
plt.xlabel('Models')
plt.ylabel('Scores')
plt.title('Performance Comparison of ML Models')
# Bars sit at offsets 0 .. (n-1) * bar_width, so the middle of a group is at
# (n-1)/2 bar widths (2 for five metrics, not 1.5)
plt.xticks(x + (len(metric_labels) - 1) / 2 * bar_width, models)
plt.ylim(0, 100)  # Scores are percentages
# Legend goes outside the axes; shrink the axes to leave room for it
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.subplots_adjust(right=0.85)
# The figure is only written to disk here, not shown
plt.savefig('performance_comparison_ml_models.png')

# 5. Feature Importance Analysis

In [None]:
## Tabulate the Random Forest feature importances and export them
# Importances rounded to 4 decimals, in the same order as feature_names
importance = np.array([round(score, 4) for score in RF.feature_importances_])

# Most important feature first
feat_imp = pd.DataFrame({'Feature': feature_names, 'Importance': importance})
feat_imp = feat_imp.sort_values('Importance', ascending=False)



# Write the ranked table to an Excel workbook
feat_imp.to_excel('feature importance.xlsx')

# 6. Drawing Business Insights (Top 3 features)

In [None]:
### Stacked histogram of Total Charges, coloured by churn status

ax = sns.histplot(data=df, x='TotalCharges', hue='Churn', multiple='stack', legend=True)
ax.set_title('Total Charges Distribution by Churn Status')
ax.set_xlabel('Total Charges')
ax.set_ylabel('Frequency')
# Replace the automatic 0/1 legend entries with readable labels.
# NOTE(review): the label order assumes the first legend handle is the
# Churn = 1 series - confirm against the rendered figure.
ax.legend(title='Churn', labels=['Yes', 'No'])
# Save before showing the figure
plt.savefig('TotalCharges Dist by Churn')
plt.show()

In [None]:
# Stacked histogram of tenure, coloured by churn status
# (Churn is still encoded as 0/1 in df here; it is not mapped back to text)

ax = sns.histplot(data=df, x='tenure', hue='Churn', multiple='stack', legend=True)
ax.set_title('Tenure Distribution by Churn Status')
ax.set_xlabel('Tenure (Months)')
ax.set_ylabel('Frequency')
# Replace the automatic 0/1 legend entries with readable labels.
# NOTE(review): the label order assumes the first legend handle is the
# Churn = 1 series - confirm against the rendered figure.
ax.legend(title='Churn', labels=['Yes', 'No'])
# Save before showing the figure
plt.savefig('Tenure Dist by Churn')
plt.show()

In [None]:
### Churn counts for the month-to-month contract indicator


# One-hot encode the string columns of df again so that the
# 'Contract_Month-to-month' indicator column is available for plotting
categoric_cols = [name for name in df.columns if name != 'Churn' and df[name].dtype == 'object']
df_enc_2 = pd.get_dummies(df, columns=categoric_cols)
# Use readable No/Yes labels for the hue legend instead of 0/1
churn_labels = {0: "No", 1: "Yes"}
df_enc_2["Churn"] = df_enc_2["Churn"].map(churn_labels)




ax = sns.countplot(data=df_enc_2, x='Contract_Month-to-month', hue='Churn')
ax.set_title('Distribution of Contract (Month-to-month)')
ax.set_xlabel('Class')
ax.set_ylabel('Count')
# Save before showing the figure
plt.savefig('Contract (Month to month).png')
plt.show()