# Part 1: Review and Select a Dataset.

The dataset we selected is from the Binary Classification Datasets collection. For our in-class group mini-project, we decided on the Customer Churn Dataset.

# Part 2: Preprocess the Data.

In [32]:
# Import necessary libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [33]:
# Load the dataset
data = pd.read_csv('customer-churn.csv')
data.head()

Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of use,Frequency of SMS,Distinct Called Numbers,Age Group,Tariff Plan,Status,Age,Customer Value,Churn
0,8,0,38,0,4370,71,5,17,3,1,1,30,197.64,0
1,0,0,39,0,318,5,7,4,2,1,2,25,46.035,0
2,10,0,37,0,2453,60,359,24,3,1,1,30,1536.52,0
3,10,0,38,0,4198,66,1,35,1,1,1,15,240.02,0
4,3,0,38,0,2393,58,2,33,1,1,1,15,145.805,0


In [34]:
# Seperate Features and Target
X = data.drop('Churn', axis=1)
y = data['Churn']
X.head()

Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of use,Frequency of SMS,Distinct Called Numbers,Age Group,Tariff Plan,Status,Age,Customer Value
0,8,0,38,0,4370,71,5,17,3,1,1,30,197.64
1,0,0,39,0,318,5,7,4,2,1,2,25,46.035
2,10,0,37,0,2453,60,359,24,3,1,1,30,1536.52
3,10,0,38,0,4198,66,1,35,1,1,1,15,240.02
4,3,0,38,0,2393,58,2,33,1,1,1,15,145.805


In [35]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Churn, dtype: int64

In [36]:
# Perform the train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [37]:
# Display the shapes of the resulting datasets
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(2520, 13)
(630, 13)
(2520,)
(630,)


# Part 3: Train and Evaluate the Models. (Linear Regression, KNN, Random Forest Classifier)

Model 1: Logistic Regression.

In [38]:
# Prepare the dataset
display(X_train, X_test, y_train, y_test)


Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of use,Frequency of SMS,Distinct Called Numbers,Age Group,Tariff Plan,Status,Age,Customer Value
2310,10,0,36,1,6143,99,7,10,3,1,1,30,277.680
2189,12,0,30,2,1933,48,42,72,5,1,1,55,92.715
2304,7,0,36,1,2618,62,7,31,2,1,1,25,152.100
2988,15,0,30,3,2330,53,208,13,3,2,1,30,927.320
221,7,0,33,0,6638,73,98,33,2,1,1,25,742.995
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3092,15,0,27,1,1530,38,26,15,2,1,1,25,187.560
1095,11,0,15,0,6088,135,78,40,1,2,1,15,771.265
1130,1,0,33,1,6540,94,12,22,4,1,1,45,195.850
1294,0,0,12,0,1310,18,11,8,2,1,1,25,109.260


Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of use,Frequency of SMS,Distinct Called Numbers,Age Group,Tariff Plan,Status,Age,Customer Value
2965,14,0,40,3,7515,103,201,28,3,1,1,30,1108.720
969,3,0,37,0,7508,127,384,43,2,1,1,25,2071.575
1385,0,0,28,0,3153,66,0,20,2,1,1,25,144.855
1233,21,0,33,3,15850,234,3,82,2,1,1,25,737.280
2996,23,0,18,2,9947,188,88,42,5,1,1,55,284.025
...,...,...,...,...,...,...,...,...,...,...,...,...,...
765,1,0,32,0,6500,80,178,25,3,1,1,30,975.200
1041,0,0,26,0,825,10,21,3,4,1,2,45,73.375
790,9,0,22,0,700,19,2,6,2,1,2,25,41.355
2815,6,0,39,2,6615,91,184,25,3,1,1,30,1004.240


2310    0
2189    0
2304    0
2988    0
221     0
       ..
3092    0
1095    0
1130    0
1294    0
860     0
Name: Churn, Length: 2520, dtype: int64

2965    0
969     0
1385    0
1233    0
2996    0
       ..
765     0
1041    0
790     0
2815    0
1565    0
Name: Churn, Length: 630, dtype: int64

In [39]:
# Scale the Features
scaler = StandardScaler()

In [40]:
# Fit the scaler on the TRAINING DATA and transform it
X_train_scaled = scaler.fit_transform(X_train)

In [41]:
# Use the same scaler to transform the TEST DATA    
X_test_scaled = scaler.transform(X_test)

In [42]:
# Initialize the Logistic Regression model
logistic_model = LogisticRegression(random_state=42)
logistic_model.fit(X_train_scaled, y_train)

Evaluate the Model.

In [43]:
# Make predictions on the scaled test data
y_pred = logistic_model.predict(X_test_scaled)

In [44]:
# Evaluate the model
log_accuracy = accuracy_score(y_test, y_pred)
log_conf_matrix = confusion_matrix(y_test, y_pred)
log_class_report = classification_report(y_test, y_pred)

In [45]:
# Print the evaluation metrics
print(f"Logistic Regression Accuracy: {log_accuracy}")
print("Confusion Matrix:")
print(log_conf_matrix)
print("Classification Report:")
print(log_class_report)

Logistic Regression Accuracy: 0.8698412698412699
Confusion Matrix:
[[504  16]
 [ 66  44]]
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.97      0.92       520
           1       0.73      0.40      0.52       110

    accuracy                           0.87       630
   macro avg       0.81      0.68      0.72       630
weighted avg       0.86      0.87      0.85       630



Model 2: K-Nearest Neighbor (KNN)

In [46]:
# Prepare and scale the dataset
scaler = StandardScaler()

In [47]:
# Fit the scaler on the training data and transform it
X_train_scaled = scaler.fit_transform(X_train)

In [48]:
# Use the same scaler to transform the test data
x_test_scaled = scaler.transform(X_test)

Train the KNN model.

In [49]:
# Initialize the KNN model
knn_model = KNeighborsClassifier(n_neighbors=5)

In [50]:
# Train the model on the scaled training data
knn_model.fit(X_train_scaled, y_train)

In [51]:
# Evaluate the model (After training the model, evaluate its performance on the test data)
# Make predictions on the scaled data
y=pred = knn_model.predict(X_test_scaled)

In [52]:
# Evaluate the model
k_accuracy = accuracy_score(y_test, y_pred)
k_conf_matrix = confusion_matrix(y_test, y_pred)
k_class_report = classification_report(y_test, y_pred)

In [63]:
# Print the evaluation metrics
print(f"KNN Accuracy: {k_accuracy}")
print("Confusion Matrix:")
print(k_conf_matrix)
print("Classification Report:")
print(k_class_report)


'KNN Accuracy: 0.8698412698412699'

'Confusion Matrix:'

array([[504,  16],
       [ 66,  44]])

'Classification Report:'

'              precision    recall  f1-score   support\n\n           0       0.88      0.97      0.92       520\n           1       0.73      0.40      0.52       110\n\n    accuracy                           0.87       630\n   macro avg       0.81      0.68      0.72       630\nweighted avg       0.86      0.87      0.85       630\n'

Model 3: Random Forest Classifier.


In [54]:
# Prepare and scale the dataset
scaler = StandardScaler()

In [55]:
# Fit the scaler on the training data and transform it
X_train_scaled = scaler.fit_transform(X_train)

In [56]:
# Use the same scaler to transform the test data
X_test_scaled = scaler.transform(X_test)

In [57]:
# Train the Random Forest Classifier
# Initialize the Random Forest Classifier
random_forest_model = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)


In [58]:
# Train the model on the scaled training data
random_forest_model.fit(X_train_scaled, y_train)

In [59]:
# Evaluate the model
# Make predictions on the scaled test data
y_pred = random_forest_model.predict(X_test_scaled)

In [60]:
# Evaluate the model
ran_accuracy = accuracy_score(y_test, y_pred)
ran_conf_matrix = confusion_matrix(y_test, y_pred)
ran_class_report = classification_report(y_test, y_pred)

In [61]:
# Print the evaluation metrics
print(f"Random Forest Classifier Accuracy: {ran_accuracy}")
print("Confusion Matrix:")
print(ran_conf_matrix)
print("Classification Report")
print(ran_class_report)

Random Forest Classifier Accuracy: 0.9380952380952381
Confusion Matrix:
[[504  16]
 [ 23  87]]
Classification Report
              precision    recall  f1-score   support

           0       0.96      0.97      0.96       520
           1       0.84      0.79      0.82       110

    accuracy                           0.94       630
   macro avg       0.90      0.88      0.89       630
weighted avg       0.94      0.94      0.94       630



In [62]:
print(f"     Logistic Regression Accuracy:  {format(log_accuracy * 100, '.2f')}%")
print(f"                     KNN Accuracy:  {format(k_accuracy * 100, '.2f')}%")
print(f"Random Forest Classifier Accuracy:  {format(ran_accuracy * 100, '.2f')}%")

     Logistic Regression Accuracy:  86.98%
                     KNN Accuracy:  86.98%
Random Forest Classifier Accuracy:  93.81%


# Part 4: Discuss Results.