In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import plot_tree
import warnings
warnings.filterwarnings('ignore')

In [7]:
churnData_df = pd.read_csv(r'DATA_Customer-Churn.csv')
churnData_df.fillna(churnData_df.mean())
churnData_df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [8]:
churnData_df['TotalCharges'] = pd.to_numeric(churnData_df['TotalCharges'], errors="coerce")

In [9]:
churnData_df.drop(["gender","Partner","Dependents","PhoneService","OnlineSecurity","OnlineBackup","DeviceProtection","TechSupport","StreamingTV","StreamingMovies","Contract"],axis=1,inplace=True)
churnData_df = churnData_df.fillna(churnData_df.mean())

In [10]:
churnData_df['Churn'] = churnData_df['Churn'].replace({'Yes': 1, 'No': 0})

In [11]:
print(churnData_df.isnull().sum())

SeniorCitizen     0
tenure            0
MonthlyCharges    0
TotalCharges      0
Churn             0
dtype: int64


In [12]:
churnData_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   SeniorCitizen   7043 non-null   int64  
 1   tenure          7043 non-null   int64  
 2   MonthlyCharges  7043 non-null   float64
 3   TotalCharges    7043 non-null   float64
 4   Churn           7043 non-null   int64  
dtypes: float64(2), int64(3)
memory usage: 275.2 KB


In [13]:
RAND_STATE = 34 # for reproducible shuffling
TT_RATIO = 0.3 # test/train

In [14]:
X = churnData_df.drop('Churn', axis=1)
y = churnData_df.Churn
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TT_RATIO, random_state=RAND_STATE)
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)
X_train.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
3273,0,33,68.25,2171.15
3672,0,54,70.15,3715.65
13,0,49,103.7,5036.3
4538,0,3,85.8,272.2
5126,0,57,103.05,5925.75


In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=11)

In [17]:
k=1
while k<15:
    model = DecisionTreeClassifier(max_depth=k)
    model.fit(X_train, y_train)
    y_pred_train_dt = model.predict(X_train)
    y_pred_test_dt = model.predict(X_test)
    print("Depth:",k)
    performance_df = pd.DataFrame({'Error_metric': ['Accuracy','Precision','Recall'],
                               'Train': [accuracy_score(y_train, y_pred_train_dt),
                                         precision_score(y_train, y_pred_train_dt),
                                         recall_score(y_train, y_pred_train_dt)],
                               'Test': [accuracy_score(y_test, y_pred_test_dt),
                                        precision_score(y_test, y_pred_test_dt),
                                        recall_score(y_test, y_pred_test_dt)]})
    print(performance_df)
    print("")
    k+=1

Depth: 1
  Error_metric     Train      Test
0     Accuracy  0.736917  0.729295
1    Precision  0.000000  0.000000
2       Recall  0.000000  0.000000

Depth: 2
  Error_metric     Train      Test
0     Accuracy  0.787018  0.778514
1    Precision  0.647904  0.662500
2       Recall  0.417116  0.370629

Depth: 3
  Error_metric     Train      Test
0     Accuracy  0.787018  0.778514
1    Precision  0.647904  0.662500
2       Recall  0.417116  0.370629

Depth: 4
  Error_metric     Train      Test
0     Accuracy  0.792089  0.781827
1    Precision  0.660000  0.657224
2       Recall  0.432537  0.405594

Depth: 5
  Error_metric     Train      Test
0     Accuracy  0.802840  0.774255
1    Precision  0.691402  0.636888
2       Recall  0.452583  0.386364

Depth: 6
  Error_metric     Train      Test
0     Accuracy  0.805882  0.770469
1    Precision  0.715736  0.635514
2       Recall  0.434850  0.356643

Depth: 7
  Error_metric     Train      Test
0     Accuracy  0.817647  0.772835
1    Precision  0.728

In [19]:
X = churnData_df.drop('Churn', axis=1)
y = churnData_df.Churn
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TT_RATIO, random_state=RAND_STATE)
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

In [20]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [21]:
def evaluate_classification_model(y_train, y_pred_train, y_test, y_pred_test):
    """
    Generates performance metrics and comparisons of labels with their predicted values
    :param y_train: training labels
    :param y_pred_train: predictions from the model on the training set
    :param y_test: test labels
    :param y_pred_test: predictions from the model on the test set
    :returns: 3 dfs - Accuracy, Precision, and Recall scores, y_train vs. y_pred_train and y_test vs. y_pred_dist
    """
    performance_df = pd.DataFrame({'Error_metric': ['Accuracy','Precision','Recall'],
                               'Train': [accuracy_score(y_train, y_pred_train),
                                         precision_score(y_train, y_pred_train),
                                         recall_score(y_train, y_pred_train)],
                               'Test': [accuracy_score(y_test, y_pred_test),
                                        precision_score(y_test, y_pred_test),
                                        recall_score(y_test, y_pred_test)]})
    
    pd.options.display.float_format = '{:.2f}'.format

    df_train = pd.DataFrame({'Real': y_train, 'Predicted': y_pred_train})
    df_test  = pd.DataFrame({'Real': y_test,  'Predicted': y_pred_test})

    return performance_df, df_train, df_test

In [22]:
model = KNeighborsClassifier(n_neighbors=5,weights='uniform')
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
y_pred_train=model.predict(X_train_scaled)

In [23]:
k=1
while k<15:
    model = KNeighborsClassifier(n_neighbors=k,weights='uniform')
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    y_pred_train=model.predict(X_train_scaled)
    print("Number of Neigbors:",k)
    error_metrics_df,y_train_vs_predicted, y_test_vs_predicted =evaluate_classification_model(y_train, y_pred_train,y_test, y_pred)
    print(error_metrics_df)
    print("")
    k+=1

Number of Neigbors: 1
  Error_metric  Train  Test
0     Accuracy   0.99  0.73
1    Precision   0.98  0.51
2       Recall   0.98  0.49

Number of Neigbors: 2
  Error_metric  Train  Test
0     Accuracy   0.86  0.76
1    Precision   0.99  0.62
2       Recall   0.48  0.28

Number of Neigbors: 3
  Error_metric  Train  Test
0     Accuracy   0.86  0.76
1    Precision   0.78  0.56
2       Recall   0.66  0.47

Number of Neigbors: 4
  Error_metric  Train  Test
0     Accuracy   0.84  0.76
1    Precision   0.85  0.61
2       Recall   0.48  0.34

Number of Neigbors: 5
  Error_metric  Train  Test
0     Accuracy   0.84  0.76
1    Precision   0.74  0.57
2       Recall   0.59  0.44

Number of Neigbors: 6
  Error_metric  Train  Test
0     Accuracy   0.83  0.76
1    Precision   0.79  0.59
2       Recall   0.47  0.36

Number of Neigbors: 7
  Error_metric  Train  Test
0     Accuracy   0.82  0.76
1    Precision   0.72  0.58
2       Recall   0.54  0.45

Number of Neigbors: 8
  Error_metric  Train  Test
0    

In [24]:
model1 = DecisionTreeClassifier()
model2 = LogisticRegression()
model3 = KNeighborsClassifier()

model_pipeline = [model1, model2, model3]
model_names = ['Classification Tree', 'Logistic Regression', 'KNN']
scores = {}
i=0
for model in model_pipeline:
    mean_score = np.mean(cross_val_score(model, X_train, y_train, cv=5))
    scores[model_names[i]] = mean_score
    i = i+1
print(scores)

NameError: name 'LogisticRegression' is not defined