In [100]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import missingno as msno
import warnings
import joblib
import pydotplus

In [101]:
from datetime import date
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.tree import DecisionTreeClassifier, export_graphviz, export_text
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, validation_curve

In [102]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.float_format", lambda x: "%.3f" %x)
pd.set_option("display.width", 500)

In [103]:
df = pd.read_csv("DATA/churn.csv")

In [104]:
def grab_col_names(dataframe, cat_th=15, car_th=20):
    """
    Veri setindeki kategorik, numerik ve kategorik fakat kordinal değişkenlerin isimlerini verir
    Parameters
    ----------
    dataframe: DataFrame
        değişkeni isimleri alınmak istenen DataFrame'dir
    cat_th: int, float
        numerik fakat kategorik olan değişkenler için eşik değeri
    car_th int, float
        numerik fakat kordinal olan değişenler için eşik değeri

    Returns
    -------
    cat_cols: list
        Kategorik değeşken listesi
    num_cols: list
        Numerik değişken listesi
    cat_but_car: list
        Kategorik görünümlü kordinal değişken listesi

    Notes
    -------
    cat_cols + num_cols + cat_but_car = toplam değişken sayısı
    num_but_cat cat_cols'un içinderdir

    """
    cat_cols = [col for col in dataframe.columns if str(dataframe[col].dtypes) in ["object", "category", "bool"]]

    num_but_cat = [col for col in dataframe.columns if dataframe[col].nunique() < 15 and dataframe[col].dtypes in ["int64", "float64", "int", "float"]]

    cat_but_car = [col for col in dataframe.columns if dataframe[col].nunique() > 20 and dataframe[col].dtypes in ["object", "category"]]

    cat_cols = cat_cols + num_but_cat

    cat_cols = [col for col in cat_cols if col not in cat_but_car]

    num_cols = [col for col in dataframe.columns if dataframe[col].dtypes in ["int64", "float64", ]]

    num_cols = [col for col in num_cols if col not in num_but_cat]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f"cat_cols: {len(cat_cols)}")
    print(f"num_cols: {len(num_cols)}")
    print(f"cat_but_car: {len(cat_but_car)}")
    print(f"num_but_cat: {len(num_but_cat)}")

    return cat_cols, num_cols, cat_but_car

In [105]:
cat_cols, num_cols, cat_but_car = grab_col_names(df)

Observations: 5000
Variables: 18
cat_cols: 4
num_cols: 14
cat_but_car: 0
num_but_cat: 1


In [106]:
binary_cols = [col for col in df.columns if df[col].dtype not in ["int64", "float64"]
              and df[col].nunique() == 2]
binary_cols

['churn', 'internationalplan', 'voicemailplan']

In [107]:
df[binary_cols].head()

Unnamed: 0,churn,internationalplan,voicemailplan
0,No,no,yes
1,No,no,yes
2,No,no,no
3,No,yes,no
4,No,yes,no


In [108]:
def label_encoder(dataframe, binary_col):
    labelencoder = LabelEncoder()
    dataframe[binary_col] = labelencoder.fit_transform(dataframe[binary_col])
    return dataframe

In [109]:
for col in binary_cols:
    label_encoder(df, col)

In [110]:
df[binary_cols].head()

Unnamed: 0,churn,internationalplan,voicemailplan
0,0,0,1
1,0,0,1
2,0,0,0
3,0,1,0
4,0,1,0


In [111]:
df.head()

Unnamed: 0,churn,accountlength,internationalplan,voicemailplan,numbervmailmessages,totaldayminutes,totaldaycalls,totaldaycharge,totaleveminutes,totalevecalls,totalevecharge,totalnightminutes,totalnightcalls,totalnightcharge,totalintlminutes,totalintlcalls,totalintlcharge,numbercustomerservicecalls
0,0,128,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1
1,0,107,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1
2,0,137,0,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0
3,0,84,1,0,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2
4,0,75,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3


In [112]:
df["numbercustomerservicecalls"].value_counts()

numbercustomerservicecalls
1    1786
2    1127
0    1023
3     665
4     252
5      96
6      34
7      13
9       2
8       2
Name: count, dtype: int64

In [113]:
mms = MinMaxScaler()
for col in num_cols:
    df[col] = mms.fit_transform(df[[col]])
df[num_cols].head()

Unnamed: 0,accountlength,numbervmailmessages,totaldayminutes,totaldaycalls,totaldaycharge,totaleveminutes,totalevecalls,totalevecharge,totalnightminutes,totalnightcalls,totalnightcharge,totalintlminutes,totalintlcalls,totalintlcharge
0,0.525,0.481,0.754,0.667,0.754,0.543,0.582,0.543,0.619,0.52,0.62,0.5,0.15,0.5
1,0.438,0.5,0.46,0.745,0.46,0.538,0.606,0.538,0.644,0.589,0.644,0.685,0.15,0.685
2,0.562,0.0,0.692,0.691,0.692,0.333,0.647,0.333,0.412,0.594,0.412,0.61,0.25,0.609
3,0.343,0.0,0.852,0.43,0.852,0.17,0.518,0.17,0.498,0.509,0.499,0.33,0.35,0.33
4,0.306,0.0,0.474,0.685,0.474,0.408,0.718,0.408,0.473,0.691,0.473,0.505,0.15,0.506


# KNN 

In [114]:
df = df.drop(["numbercustomerservicecalls"], axis=1)

In [115]:
x = df.drop(["churn"], axis = 1)

In [116]:
y = df["churn"]

In [94]:
x.columns

Index(['accountlength', 'internationalplan', 'voicemailplan', 'numbervmailmessages', 'totaldayminutes', 'totaldaycalls', 'totaldaycharge', 'totaleveminutes', 'totalevecalls', 'totalevecharge', 'totalnightminutes', 'totalnightcalls', 'totalnightcharge', 'totalintlminutes', 'totalintlcalls', 'totalintlcharge'], dtype='object')

In [19]:
knn_model = KNeighborsClassifier().fit(x, y)

In [20]:
random_user = x.sample(1, random_state=45)
random_user

Unnamed: 0,accountlength,internationalplan,voicemailplan,numbervmailmessages,totaldayminutes,totaldaycalls,totaldaycharge,totaleveminutes,totalevecalls,totalevecharge,totalnightminutes,totalnightcalls,totalnightcharge,totalintlminutes,totalintlcalls,totalintlcharge
4988,0.326,0,0,0.0,0.447,0.612,0.447,0.574,0.747,0.574,0.287,0.623,0.287,0.81,0.1,0.809


In [21]:
knn_model.predict(random_user)

array([0])

In [22]:
y_pred = knn_model.predict(x)
y_prob = knn_model.predict_proba(x)[:, 1]
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.99      0.95      4293
           1       0.92      0.46      0.61       707

    accuracy                           0.92      5000
   macro avg       0.92      0.73      0.78      5000
weighted avg       0.92      0.92      0.91      5000



In [23]:
roc_auc_score(y, y_prob)

0.9428939449800027

In [24]:
cv_results = cross_validate(knn_model, x, y, cv=5, scoring=["accuracy", "f1", "roc_auc"])
cv_results

{'fit_time': array([0.00501657, 0.00433326, 0.00400496, 0.00378418, 0.00500369]),
 'score_time': array([0.14802146, 0.14373231, 0.14509344, 0.14620423, 0.14635396]),
 'test_accuracy': array([0.891, 0.891, 0.905, 0.891, 0.898]),
 'test_f1': array([0.47342995, 0.44102564, 0.53658537, 0.45226131, 0.50961538]),
 'test_roc_auc': array([0.73553282, 0.76755918, 0.79888374, 0.76220083, 0.77906776])}

In [25]:
cv_results["test_accuracy"].mean()

0.8952

In [26]:
cv_results["test_f1"].mean()

0.48258352994363374

In [27]:
cv_results["test_roc_auc"].mean()

0.7686488666857643

In [28]:
knn_model = KNeighborsClassifier()
knn_model

In [29]:
knn_model.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [30]:
knn_params = {"n_neighbors": range(2, 250)}

In [31]:
knn_gs_best = GridSearchCV(knn_model,
                           knn_params,
                           cv=5,
                           n_jobs=-1,
                           verbose=1).fit(x, y)

Fitting 5 folds for each of 248 candidates, totalling 1240 fits


In [32]:
knn_gs_best.best_params_

{'n_neighbors': 9}

In [33]:
knn_final = knn_model.set_params(**knn_gs_best.best_params_).fit(x, y)

In [34]:
cv_results1 = cross_validate(knn_final,
                            x,
                            y,
                            cv=5,
                            scoring=["accuracy", "f1", "roc_auc"])

In [35]:
cv_results1

{'fit_time': array([0.00399971, 0.00310278, 0.00486398, 0.0047226 , 0.00511622]),
 'score_time': array([0.14956522, 0.14808917, 0.14674354, 0.15535617, 0.152107  ]),
 'test_accuracy': array([0.892, 0.893, 0.903, 0.895, 0.899]),
 'test_f1': array([0.44329897, 0.44559585, 0.4921466 , 0.46153846, 0.49751244]),
 'test_roc_auc': array([0.75060478, 0.77920888, 0.8140878 , 0.80136413, 0.80961292])}

In [36]:
cv_results1["test_accuracy"].mean()

0.8964000000000001

In [37]:
cv_results1["test_f1"].mean()

0.468018464040498

In [38]:
cv_results1["test_roc_auc"].mean()

0.7909757006817236

In [39]:
random_user = x.sample(1)
knn_final.predict(random_user)

array([0])

# Karar Ağacı

In [117]:
cart_model = DecisionTreeClassifier(random_state=1).fit(x, y)

In [118]:
y_pred = cart_model.predict(x)

In [119]:
y_prob = cart_model.predict_proba(x)[:, 1]

In [120]:
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4293
           1       1.00      1.00      1.00       707

    accuracy                           1.00      5000
   macro avg       1.00      1.00      1.00      5000
weighted avg       1.00      1.00      1.00      5000



In [121]:
roc_auc_score(y, y_prob)

1.0

## holdout yöntemi

In [122]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=17)

In [123]:
cart_model = DecisionTreeClassifier(random_state=17).fit(x_train, y_train)

In [124]:
y_pred = cart_model.predict(x_train)

In [125]:
y_prob = cart_model.predict_proba(x_train)[:, 1]
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3443
           1       1.00      1.00      1.00       557

    accuracy                           1.00      4000
   macro avg       1.00      1.00      1.00      4000
weighted avg       1.00      1.00      1.00      4000



In [126]:
roc_auc_score(y_train, y_prob)

1.0

## Test hatası

In [127]:
y_pred = cart_model.predict(x_test)

In [128]:
y_prob = cart_model.predict_proba(x_test)[:, 1]

In [129]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.90      0.91       850
           1       0.48      0.53      0.51       150

    accuracy                           0.84      1000
   macro avg       0.70      0.72      0.71      1000
weighted avg       0.85      0.84      0.85      1000



In [53]:
roc_auc_score(y_test, y_prob)

0.7273221526298139

### cv ile başarı değerlendirme

In [54]:
cart_model = DecisionTreeClassifier(random_state=17).fit(x, y)
cv_results = cross_validate(cart_model,
                            x,y,
                            cv=5,
                            scoring=["accuracy", "f1", "roc_auc"])
cv_results

{'fit_time': array([0.0580647 , 0.05459881, 0.05695224, 0.05591559, 0.05199623]),
 'score_time': array([0.00722337, 0.00610566, 0.00644994, 0.00676656, 0.00645447]),
 'test_accuracy': array([0.862, 0.869, 0.901, 0.878, 0.871]),
 'test_f1': array([0.53691275, 0.5559322 , 0.64259928, 0.56115108, 0.57425743]),
 'test_roc_auc': array([0.73886839, 0.74887095, 0.78824544, 0.74084835, 0.76321449])}

In [55]:
cv_results["test_accuracy"].mean()

0.8762000000000001

In [56]:
cv_results["test_f1"].mean()

0.5741705475850574

In [57]:
cv_results["test_roc_auc"].mean()

0.7560095227747016

### Hiperparametre Optimizasyonu

In [58]:
cart_model.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 17,
 'splitter': 'best'}

In [65]:
cart_params = {"max_depth": range(1, 31),
               "min_samples_split": range(2, 42)}

In [66]:
cart_best_grid = GridSearchCV(cart_model,
                              cart_params,
                              n_jobs=-1,
                              verbose=1).fit(x, y)

Fitting 5 folds for each of 1200 candidates, totalling 6000 fits


In [67]:
cart_best_grid.best_params_

{'max_depth': 8, 'min_samples_split': 7}

In [68]:
cart_best_grid.best_score_

0.9246000000000001

In [69]:
random = x.sample(1, random_state=45)

In [70]:
cart_best_grid.predict(random)

array([0])

### final model

In [71]:
cart_final = DecisionTreeClassifier(**cart_best_grid.best_params_, random_state=17).fit(x, y)

In [72]:
cart_final = cart_model.set_params(**cart_best_grid.best_params_).fit(x, y)

In [73]:
cart_final.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 8,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 7,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 17,
 'splitter': 'best'}

In [74]:
cv_results = cross_validate(cart_model,
                            x,y,
                            cv=5,
                            scoring=["accuracy", "f1", "roc_auc"])

In [75]:
cv_results["test_accuracy"].mean()

0.9246000000000001

In [76]:
cv_results["test_f1"].mean()

0.6705137567746264

In [77]:
cv_results["test_roc_auc"].mean()

0.7863771520084064