In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')

In [2]:
########## ROUND 1 ##############

In [58]:
churnData = pd.read_csv('DATA_Customer-Churn.txt')

In [59]:
churnData.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [60]:
churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'], errors='coerce')

In [61]:
churnData.isnull().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [62]:
churnData.fillna(churnData.median(), inplace=True)
churnData.isnull().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [63]:
churnData['SeniorCitizen']

0       0
1       0
2       0
3       0
4       0
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: SeniorCitizen, Length: 7043, dtype: int64

In [64]:
X = churnData[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']]
y = churnData['Churn']

In [65]:
#Split the data into training ans testing sets
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2,random_state=42) # splitting the data into train and test sets

In [66]:
#Scale the features using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [68]:
# Convert the numpy arrays to pandas DataFrame objects
X_train = pd.DataFrame(X_train, columns=['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges'])
X_test = pd.DataFrame(X_test, columns=['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges'])

# Encode the categorical variables (SeniorCitizen) using pd.get_dummies
X_train = pd.concat([pd.get_dummies(X_train['SeniorCitizen']), X_train[['tenure', 'MonthlyCharges', 'TotalCharges']]], axis=1)
X_test = pd.concat([pd.get_dummies(X_test['SeniorCitizen']), X_test[['tenure', 'MonthlyCharges', 'TotalCharges']]], axis=1)


In [34]:
churnData_dummy = pd.get_dummies(churnData)
churnData_dummy.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,OnlineSecurity_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,Churn_No,Churn_Yes
0,0,1,29.85,29.85,1,0,0,1,1,0,1,0,1,0,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0
1,0,34,56.95,1889.5,0,1,1,0,1,0,0,1,0,0,1,1,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1,0
2,0,2,53.85,108.15,0,1,1,0,1,0,0,1,0,0,1,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1
3,0,45,42.3,1840.75,0,1,1,0,1,0,1,0,0,0,1,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,1,0,1,0
4,0,2,70.7,151.65,1,0,1,0,1,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1


In [21]:
########## ROUND 2 #########

In [69]:
# Fit a logistic regression model
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train, y_train)

LogisticRegression(random_state=42)

In [70]:
# Fit a KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [71]:
# Fit a decision tree classifier
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

DecisionTreeClassifier(random_state=42)

In [72]:
# Evaluate the models on the train and test sets
models = {'Logistic Regression': logreg, 'KNN': knn, 'Decision Tree': dt}
for name, model in models.items():
    print(name)
    print('Train set:')
    y_train_pred = model.predict(X_train)
    print('Accuracy: {:.3f}'.format(accuracy_score(y_train, y_train_pred)))
    print('Precision: {:.3f}'.format(precision_score(y_train, y_train_pred,  pos_label='Yes')))
    print('Recall: {:.3f}'.format(recall_score(y_train, y_train_pred,  pos_label='Yes')))
    print('Test set:')
    y_test_pred = model.predict(X_test)
    print('Accuracy: {:.3f}'.format(accuracy_score(y_test, y_test_pred)))
    print('Precision: {:.3f}'.format(precision_score(y_test, y_test_pred,  pos_label='Yes')))
    print('Recall: {:.3f}'.format(recall_score(y_test, y_test_pred,  pos_label='Yes')))
    print()

Logistic Regression
Train set:
Accuracy: 0.793
Precision: 0.669
Recall: 0.436
Test set:
Accuracy: 0.798
Precision: 0.676
Recall: 0.453

KNN
Train set:
Accuracy: 0.828
Precision: 0.718
Recall: 0.578
Test set:
Accuracy: 0.779
Precision: 0.595
Recall: 0.512

Decision Tree
Train set:
Accuracy: 0.991
Precision: 0.992
Recall: 0.973
Test set:
Accuracy: 0.718
Precision: 0.469
Recall: 0.507



In [73]:
from sklearn.model_selection import cross_val_score

models = {'Logistic Regression': logreg, 'KNN': knn, 'Decision Tree': dt}

for name, model in models.items():
    print(name)
    scores = cross_val_score(model, churnData[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']],
                             churnData['Churn'], cv=5, scoring='accuracy')
    print('Cross-validation scores:', scores)
    print('Mean cross-validation score: {:.3f}'.format(scores.mean()))
    print()


Logistic Regression
Cross-validation scores: [0.79418027 0.78566359 0.78140525 0.79332386 0.79403409]
Mean cross-validation score: 0.790

KNN
Cross-validation scores: [0.75585522 0.7707594  0.76721079 0.75568182 0.77059659]
Mean cross-validation score: 0.764

Decision Tree
Cross-validation scores: [0.71611072 0.72888573 0.70759404 0.69886364 0.72585227]
Mean cross-validation score: 0.715



In [76]:
from sklearn.ensemble import RandomForestClassifier

# create an instance of Random Forest Classifier
rfc = RandomForestClassifier(random_state=42)

# fit the model to the training data
rfc.fit(X_train, y_train)

# evaluate the model on the training and test data
print('Train set:')
y_train_pred = rfc.predict(X_train)
print('Accuracy: {:.3f}'.format(accuracy_score(y_train, y_train_pred)))
print('Precision: {:.3f}'.format(precision_score(y_train, y_train_pred, pos_label='Yes')))
print('Recall: {:.3f}'.format(recall_score(y_train, y_train_pred,  pos_label='Yes')))
print('Test set:')
y_test_pred = model.predict(X_test)
print('Accuracy: {:.3f}'.format(accuracy_score(y_test, y_test_pred)))
print('Precision: {:.3f}'.format(precision_score(y_test, y_test_pred,  pos_label='Yes')))
print('Recall: {:.3f}'.format(recall_score(y_test, y_test_pred,  pos_label='Yes')))
print()

Train set:
Accuracy: 0.991
Precision: 0.979
Recall: 0.985
Test set:
Accuracy: 0.718
Precision: 0.469
Recall: 0.507



In [77]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rfc = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rfc, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print('Best parameters:', grid_search.best_params_)
print('Train set score:', grid_search.best_score_)
print('Test set score:', grid_search.score(X_test, y_test))


Best parameters: {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 300}
Train set score: 0.7873607764211561
Test set score: 0.794889992902768


In [79]:
rfc = RandomForestClassifier(random_state=42, **grid_search.best_params_)
rfc.fit(X_train, y_train)

# Evaluate the model on the train and test sets
models = {'Random Forest Classifier': rfc}
for name, model in models.items():
    print(name)
    print('Train set')
    y_train_pred = rfc.predict(X_train)
    print('Accuracy: {:.3f}'.format(accuracy_score(y_train, y_train_pred)))
    print('Precision: {:.3f}'.format(precision_score(y_train, y_train_pred, pos_label='Yes')))
    print('Recall: {:.3f}'.format(recall_score(y_train, y_train_pred,  pos_label='Yes')))
    print('Test set:')
    y_test_pred = model.predict(X_test)
    print('Accuracy: {:.3f}'.format(accuracy_score(y_test, y_test_pred)))
    print('Precision: {:.3f}'.format(precision_score(y_test, y_test_pred,  pos_label='Yes')))
    print('Recall: {:.3f}'.format(recall_score(y_test, y_test_pred,  pos_label='Yes')))
    print()

Random Forest Classifier
Train set
Accuracy: 0.837
Precision: 0.799
Recall: 0.517
Test set:
Accuracy: 0.795
Precision: 0.667
Recall: 0.450

