In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import warnings
import optuna
warnings.filterwarnings("ignore")

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report, RocCurveDisplay
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 500)
pd.set_option("display.float_format", lambda x: "%.3f" % x)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Veri Seti Okutma
df = pd.read_csv("Telco-Customer-Churn.csv")
def check_df(dataframe):
    """Print a quick structural overview of ``dataframe``.

    The report has five sections, each introduced by a banner line:
    shape, column dtypes, first three rows, last three rows and the
    number of missing values per column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    # (banner, thunk) pairs; thunks keep each value lazily evaluated right before it is printed.
    sections = (
        ("##################### Shape #####################", lambda: dataframe.shape),
        ("##################### Types #####################", lambda: dataframe.dtypes),
        ("##################### Head #####################", lambda: dataframe.head(3)),
        ("##################### Tail #####################", lambda: dataframe.tail(3)),
        ("##################### NA #####################", lambda: dataframe.isnull().sum()),
    )
    for banner, compute in sections:
        print(banner)
        print(compute())
check_df(df)

##################### Shape #####################
(7043, 21)
##################### Types #####################
customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object
##################### Head #####################
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService     MultipleLines InternetService OnlineSecurity OnlineBackup DeviceProtection TechSupport StreamingTV StreamingMovies        Contract PaperlessBilling     Payme

In [5]:
# Fix the columns whose dtype does not match their content.
# Normalise the column names to Title case first (e.g. "TotalCharges" -> "Totalcharges").
df.columns = df.columns.str.title()
# "Totalcharges" is numeric but was read as object; unparsable entries become NaN.
df["Totalcharges"] = pd.to_numeric(df["Totalcharges"], errors="coerce")
# Encode the target as integers: "Yes" -> 1, anything else -> 0.
df["Churn"] = df["Churn"].eq("Yes").astype("int64")

In [7]:
# Değişkenleri Sınıflandıralım.
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """
    Return the names of the categorical, numerical and categorical-but-cardinal columns.
    Note: numeric-looking columns with few distinct values are reported among the categoricals.

    Parameters
    ----------
    dataframe: dataframe
        Frame whose column names are to be classified
    cat_th: int, float
        A non-object column with fewer than ``cat_th`` distinct values is treated as categorical
    car_th: int, float
        An object column with more than ``car_th`` distinct values is treated as cardinal

    Returns
    ------
        cat_cols: list
                Categorical column names
        num_cols: list
                Numerical column names
        cat_but_car: list
                Object columns that are cardinal rather than categorical

    Examples
    ------
        import seaborn as sns
        df = sns.load_dataset("iris")
        print(grab_col_names(df))


    Notes
    ------
        cat_cols + num_cols + cat_but_car = total number of columns
        num_but_cat is contained in cat_cols (it is only printed, not returned).
    """
    columns = list(dataframe.columns)

    # Look up dtype kind and cardinality once per column instead of inside every filter.
    is_object = {name: dataframe[name].dtypes == "O" for name in columns}
    distinct = {name: dataframe[name].nunique() for name in columns}

    num_but_cat = [name for name in columns if not is_object[name] and distinct[name] < cat_th]
    cat_but_car = [name for name in columns if is_object[name] and distinct[name] > car_th]

    # Object columns first (in frame order), then the numeric-looking categoricals.
    cat_cols = [name for name in columns if is_object[name] and name not in cat_but_car]
    cat_cols.extend(num_but_cat)

    num_cols = [name for name in columns if not is_object[name] and name not in num_but_cat]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')
    return cat_cols, num_cols, cat_but_car
cat_cols, num_cols, cat_but_car = grab_col_names(df)

Observations: 7043
Variables: 21
cat_cols: 17
num_cols: 3
cat_but_car: 1
num_but_cat: 2


In [8]:
# Numerik ve kategorik değişkenlerin veri içindeki dağılımını gözlemleyelim.

# Kategorik Değişkenler için:
def cat_summary(dataframe, col_name, plot=False):
    """Print the frequency table of a categorical column.

    The table has one row per level with its count (column ``col_name``)
    and its share of all rows in percent (column ``Ratio``), followed by a
    separator line. With ``plot=True`` a seaborn count plot is shown as well.
    """
    level_counts = dataframe[col_name].value_counts()
    table = pd.DataFrame({col_name: level_counts,
                          "Ratio": 100 * level_counts / len(dataframe)})
    print(table)
    print("##########################################")
    if not plot:
        return
    sns.countplot(x=dataframe[col_name], data=dataframe)
    plt.show(block=True)

for col in cat_cols:
    cat_summary(df, col)

        Gender  Ratio
Gender               
Male      3555 50.476
Female    3488 49.524
##########################################
         Partner  Ratio
Partner                
No          3641 51.697
Yes         3402 48.303
##########################################
            Dependents  Ratio
Dependents                   
No                4933 70.041
Yes               2110 29.959
##########################################
              Phoneservice  Ratio
Phoneservice                     
Yes                   6361 90.317
No                     682  9.683
##########################################
                  Multiplelines  Ratio
Multiplelines                         
No                         3390 48.133
Yes                        2971 42.184
No phone service            682  9.683
##########################################
                 Internetservice  Ratio
Internetservice                        
Fiber optic                 3096 43.959
DSL                         24

In [None]:
# Numerik değişkenler için:
def num_summary(dataframe, numerical_col, plot=False):
    """Print descriptive statistics of a numeric column at a fixed set of percentiles.

    With ``plot=True`` a 20-bin histogram of the column is shown as well,
    titled and x-labelled with the column name.
    """
    percentiles = [0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.99]
    column = dataframe[numerical_col]
    stats = column.describe(percentiles).T
    print(stats)

    if not plot:
        return
    column.hist(bins=20)
    plt.xlabel(numerical_col)
    plt.title(numerical_col)
    plt.show(block=True)

for col in num_cols:
    print(f"############## {col} ############")
    num_summary(df, col)

In [None]:
#  Kategorik değişkenler ile hedef değişken incelemesini yapalım.
def target_category(dataframe,  target, col_category):
    """Print the mean of ``target`` for every level of ``col_category``, then a separator line."""
    grouped = dataframe.groupby(col_category)
    mean_per_level = grouped.agg({target: "mean"})
    print(mean_per_level)
    print("#" * 40)

for col in cat_cols:
    print(f"######### {col.upper()} #########")
    target_category(df, "Churn", col)

In [None]:
# Aykırı Değer İncelemesi Yapalım.
def outlier_thresholds(dataframe, col_name, q1=0.01, q3=0.99):
    """Return the ``(low_limit, up_limit)`` outlier fences of a column.

    The fences are ``quantile(q1) - 1.5 * range`` and ``quantile(q3) + 1.5 * range``
    where ``range = quantile(q3) - quantile(q1)``. Despite the parameter names the
    defaults are the 1st and 99th percentiles, not the quartiles, which makes the
    fences deliberately wide.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    col_name : str
        Numeric column to analyse.
    q1, q3 : float
        Lower and upper quantile used to build the fences.

    Returns
    -------
    tuple
        ``(low_limit, up_limit)``.
    """
    quartile1 = dataframe[col_name].quantile(q1)
    quartile3 = dataframe[col_name].quantile(q3)
    interquantile_range = quartile3 - quartile1
    up_limit = quartile3 + 1.5 * interquantile_range
    low_limit = quartile1 - 1.5 * interquantile_range
    return low_limit, up_limit


def check_outlier(dataframe, col_name, q1=0.01, q3=0.99):
    """Return True when ``col_name`` has at least one value outside its outlier fences.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    col_name : str
        Numeric column to analyse.
    q1, q3 : float
        Forwarded to ``outlier_thresholds``.

    Returns
    -------
    bool
    """
    low, up = outlier_thresholds(dataframe, col_name, q1, q3)
    values = dataframe[col_name]
    # Test the mask itself. Calling .any() on the selected rows would instead ask
    # whether those rows contain a truthy value, which is False for an outlier row
    # that consists only of zeros.
    return bool(((values < low) | (values > up)).any())

for col in num_cols:
    print(col, check_outlier(df, col))

def replace_with_thresholds(dataframe, col_name):
    """Cap the values of ``col_name`` at its outlier fences, modifying ``dataframe`` in place.

    Values above the upper fence are set to the upper fence first, then values
    below the lower fence are set to the lower fence. Nothing is returned.
    """
    lower_fence, upper_fence = outlier_thresholds(dataframe, col_name)

    too_high = dataframe[col_name] > upper_fence
    dataframe.loc[too_high, col_name] = upper_fence

    too_low = dataframe[col_name] < lower_fence
    dataframe.loc[too_low, col_name] = lower_fence

# AYKIRI DEĞER YOKTUR.

In [None]:
# Korelasyon İnceleyelim.
def corr_map(df, width=14, height=6, annot_kws=15, corr_th=0.7):
    """Draw a lower-triangle correlation heatmap and list the highly correlated columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose pairwise column correlations are plotted (``df.corr()``).
    width, height : int or float
        Figure size passed to ``plt.subplots``.
    annot_kws : int
        Font size of the numbers written into the heatmap cells.
    corr_th : float
        A column is reported when its absolute correlation with an earlier
        column exceeds this threshold.

    Returns
    -------
    list
        Column names whose absolute correlation with a preceding column is
        greater than ``corr_th``.
    """
    # Compute the correlation matrix once; it is reused for the list, the mask and the plot.
    corr = df.corr()

    # Strict upper triangle: each pair is inspected once and the diagonal is skipped.
    strict_upper = np.triu(np.ones(corr.shape), k=1).astype(bool)
    upper_triangle_matrix = corr.abs().where(strict_upper)
    drop_list = [col for col in upper_triangle_matrix.columns if any(upper_triangle_matrix[col] > corr_th)]

    # Structural mask (diagonal + upper triangle). Deriving the mask from the correlation
    # values themselves would leave a cell visible whenever its correlation is exactly 0.
    mtx = np.triu(np.ones(corr.shape)).astype(bool)

    f, ax = plt.subplots(figsize=(width, height))
    sns.heatmap(corr,
                annot=True,
                fmt=".2f",
                ax=ax,
                vmin=-1,
                vmax=1,
                cmap="RdBu",
                mask=mtx,
                linewidth=0.4,
                linecolor="black",
                annot_kws={"size": annot_kws})
    plt.yticks(rotation=0, size=15)
    plt.xticks(rotation=75, size=15)
    plt.title('\nCorrelation Map\n', size=40)
    plt.show()
    return drop_list
corr_map(df[num_cols])

In [None]:
# Eksik Gözlem Var mı İnceleyelim.
def missing_values_table(dataframe, na_name=False):
    """Print the count and percentage of missing values for every column that has any.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    na_name : bool
        When True, the names of the columns with missing values are returned.

    Returns
    -------
    list or None
        Column names with at least one missing value if ``na_name`` is True,
        otherwise None.
    """
    null_counts = dataframe.isnull().sum()
    na_columns = [name for name in dataframe.columns if null_counts[name] > 0]

    per_column = null_counts[na_columns]
    n_miss = per_column.sort_values(ascending=False)
    ratio = (per_column / dataframe.shape[0] * 100).sort_values(ascending=False)

    print(pd.concat([n_miss, ratio], axis=1, keys=["n_miss", "ratio"]))
    return na_columns if na_name else None

missing_values_table(df)

In [None]:
# Burada eksik değerlere baktık ve anlamlı bir eksiklik mi diye düşündük.Sonunda ise bu eksikliklerin aslında müşterilerin
# daha yeni oldukları için boş olduğunu gördük ve bunları "Monthlycharges" değişkeni değerleri ile doldurduk.
df.loc[df["Totalcharges"].isnull(), "Totalcharges"] = df.loc[df["Totalcharges"].isnull(), "Monthlycharges"]

# Doldurduğumuz değişkenleri kontrol edelim.
df.loc[df["Tenure"] == 0]

In [None]:
# Yeni gelen müşterilerin "Tenure" değişkeni 0'dır.Her gözlemin "Tenure" değişkenini 1 arttıralım.
df["Tenure"] = df["Tenure"] + 1

In [None]:
# BASE MODEL KURULUMU
df_base = df.copy()

# Encoding İşlemleri
def label_encoder(dataframe, binary_col):
    """Replace ``binary_col`` with integer codes from a freshly fitted ``LabelEncoder``.

    The column is overwritten on the frame that was passed in; the same frame
    object is returned for convenience.
    """
    encoded = LabelEncoder().fit_transform(dataframe[binary_col])
    dataframe[binary_col] = encoded
    return dataframe

binary_cols = [col for col in df_base.columns if df_base[col].dtypes == "O" and df_base[col].nunique() == 2]
for col in binary_cols:
    label_encoder(df_base, col)

def one_hot_encoder(dataframe, categorical_col, drop_first=True):
    """Return a copy of ``dataframe`` with the listed columns one-hot encoded as integers.

    ``categorical_col`` is the list of column names handed to ``pd.get_dummies``;
    with ``drop_first=True`` the first level of every encoded column is dropped.
    The input frame is not modified.
    """
    return pd.get_dummies(dataframe, columns=categorical_col, drop_first=drop_first, dtype=int)

# Object columns with more than two levels are one-hot encoded, except the customer id.
# The id is excluded by equality: `col not in "Customerid"` would be a substring test and
# would also skip any column whose name happens to be contained in "Customerid".
ohe_cols = [col for col in df_base.columns
            if df_base[col].dtypes == "O" and df_base[col].nunique() > 2 and col != "Customerid"]

df_base = one_hot_encoder(df_base, ohe_cols)

In [None]:
# Modelling stage: baseline comparison on the encoded copy of the raw data.
X = df_base.drop(["Customerid", "Churn"], axis=1)
y = df_base["Churn"]

# The scikit-learn tree learners are seeded (same 42 used elsewhere in the notebook) so the
# printed scores are the same on every run of this cell.
models = [("LR", LogisticRegression()),
          ("CART", DecisionTreeClassifier(random_state=42)),
          ("KNN", KNeighborsClassifier()),
          ("GBM", GradientBoostingClassifier(random_state=42)),
          ("RF", RandomForestClassifier(random_state=42)),
          ("XGBoost", XGBClassifier()),
          ("LightGBM", LGBMClassifier(verbosity=-1)),
          ("CatBoost", CatBoostClassifier(verbose=False))]

# (printed label, cross_validate result key), in the order the report is printed.
report_lines = [("Accuracy", "test_accuracy"),
                ("Auc", "test_roc_auc"),
                ("Recall", "test_recall"),
                ("Precision", "test_precision"),
                ("F1", "test_f1")]

for name, model in models:
    cv_results = cross_validate(model, X, y, cv=5, scoring=["accuracy", "f1", "recall", "precision", "roc_auc"])
    print(f"########## {name} ##########")
    for label, key in report_lines:
        print(f"{label}: {round(cv_results[key].mean(), 4)}")

In [None]:
# FEATURE ENGINEERING

# First look at missing values and outliers in the data set again.
missing_values_table(df)  # Author's note: no missing values.
# Author's note: no outliers.
for col in num_cols:
    print(col, check_outlier(df, col))

In [None]:
# Create new features.

# Is the payment method automatic? (case-sensitive substring match on the method name)
df["Pay_Automatic"] = df["Paymentmethod"].str.contains("automatic", regex=False).astype(int)

# Is the contract month-to-month?
df["Contract_Type"] = df["Contract"].str.contains("Month-to-month", case=False).astype(int)

# Tenure bucket in years. "Tenure" was shifted by +1 in an earlier cell.
df.loc[(df["Tenure"] >= 0) & (df["Tenure"] <= 12), "New_Tenure_Year"] = "0-1 Year"
df.loc[(df["Tenure"] > 12) & (df["Tenure"] <= 24), "New_Tenure_Year"] = "1-2 Year"
df.loc[(df["Tenure"] > 24) & (df["Tenure"] <= 36), "New_Tenure_Year"] = "2-3 Year"
df.loc[(df["Tenure"] > 36) & (df["Tenure"] <= 48), "New_Tenure_Year"] = "3-4 Year"
df.loc[(df["Tenure"] > 48) & (df["Tenure"] <= 60), "New_Tenure_Year"] = "4-5 Year"
df.loc[(df["Tenure"] > 60) & (df["Tenure"] <= 72), "New_Tenure_Year"] = "5-6 Year"
df.loc[df["Tenure"] > 72, "New_Tenure_Year"] = "6+ Year"

# Number of support/protection add-ons the customer has, computed once and reused below.
tech_cols = ["Onlinesecurity", "Onlinebackup", "Deviceprotection", "Techsupport"]
tech_count = (df[tech_cols] == "Yes").sum(axis=1)

# Does the customer have at least one support add-on?
df["Tech_IsNot"] = (tech_count > 0).astype(int)

# Total number of support add-ons.
df["Total_Tech"] = tech_count

# Total number of services. "Internetservice" stores the connection type ("DSL",
# "Fiber optic", ...) rather than "Yes", so comparing it with "Yes" never counted it.
# NOTE(review): assumes customers without internet are labelled exactly "No" - confirm
# against the column's value counts.
yes_no_services = ["Phoneservice", "Streamingtv", "Streamingmovies"]
df["Total_Service"] = (df[yes_no_services] == "Yes").sum(axis=1) + (df["Internetservice"] != "No").astype(int)

# Total charges per service; customers with no service keep their plain total
# (the divisor is replaced by 1 where the count is 0).
df["Totalcharges_Service"] = df["Totalcharges"] / df["Total_Service"].where(df["Total_Service"] > 0, 1)

# Total charges per support add-on, with the same zero-count fallback.
df["Totalcharges_Tech"] = df["Totalcharges"] / df["Total_Tech"].where(df["Total_Tech"] > 0, 1)

# Senior citizen combined with the payment flag.
# NOTE(review): the name says "automatic" but the condition selects seniors WITHOUT
# automatic payment (Pay_Automatic == 0). Kept as written - confirm which one is intended.
df["Senior_Automatic"] = ((df["Seniorcitizen"] == 1) & (df["Pay_Automatic"] == 0)).astype(int)

# Senior citizen on a month-to-month contract.
df["Senior_Contract"] = ((df["Seniorcitizen"] == 1) & (df["Contract_Type"] == 1)).astype(int)

# Re-classify the columns now that new ones exist.
cat_cols, num_cols, cat_but_car = grab_col_names(df)

In [None]:
# Encoding: label-encode two-level object columns, one-hot encode the rest.
binary_cols = [col for col in df.columns if df[col].dtypes == "O" and df[col].nunique() == 2]

for col in binary_cols:
    label_encoder(df, col)

# The customer id is excluded by equality: `col not in "Customerid"` would be a substring
# test and would also skip any column whose name is contained in "Customerid".
ohe_cols = [col for col in df.columns if df[col].dtypes == "O" and df[col].nunique() > 2 and col != "Customerid"]
df = one_hot_encoder(df, ohe_cols)

In [None]:
# Scale İşlemleri
scale_rs = RobustScaler()
df[num_cols] = scale_rs.fit_transform(df[num_cols])

In [None]:
# Modelling stage: same comparison as the baseline, now on the engineered features.
X = df.drop(["Customerid", "Churn"], axis=1)
y = df["Churn"]

# The scikit-learn tree learners are seeded (same 42 used elsewhere in the notebook) so the
# printed scores are the same on every run of this cell.
models = [("LR", LogisticRegression()),
          ("CART", DecisionTreeClassifier(random_state=42)),
          ("KNN", KNeighborsClassifier()),
          ("GBM", GradientBoostingClassifier(random_state=42)),
          ("RF", RandomForestClassifier(random_state=42)),
          ("XGBoost", XGBClassifier()),
          ("LightGBM", LGBMClassifier(verbosity=-1)),
          ("CatBoost", CatBoostClassifier(verbose=False))]

# (printed label, cross_validate result key), in the order the report is printed.
report_lines = [("Accuracy", "test_accuracy"),
                ("Auc", "test_roc_auc"),
                ("Recall", "test_recall"),
                ("Precision", "test_precision"),
                ("F1", "test_f1")]

for name, model in models:
    cv_results = cross_validate(model, X, y, cv=5, scoring=["accuracy", "f1", "recall", "precision", "roc_auc"])
    print(f"########## {name} ##########")
    for label, key in report_lines:
        print(f"{label}: {round(cv_results[key].mean(), 4)}")

#### Model olarak LogisticReg ve GBM ile devam edeceğim.

In [None]:
log_model = LogisticRegression()
# Hyper-parameter optimisation

# Only penalty/solver pairs that LogisticRegression can fit are listed:
#   * 'l2' works with every solver below,
#   * 'l1' works with 'liblinear' only,
#   * 'elasticnet' is not supported by any of these solvers, so it is left out.
# A single flat grid would schedule all the unsupported pairs as well; those fits fail
# and are scored NaN, so they cost time without ever being selected.
log_C = np.logspace(-4, 4, 10)
log_max_iter = [100, 150, 200, 300]

log_param = [{'penalty': ['l2'],
              'C': log_C,
              'solver': ['newton-cg', 'lbfgs', 'liblinear'],
              'max_iter': log_max_iter},
             {'penalty': ['l1'],
              'C': log_C,
              'solver': ['liblinear'],
              'max_iter': log_max_iter}]

log_model_best_grid = GridSearchCV(log_model, log_param, cv=5, n_jobs=-1, verbose=True).fit(X, y)

In [None]:
# Final model: apply the best grid-search parameters, refit on all data, then cross-validate.
log_final_model = log_model.set_params(**log_model_best_grid.best_params_).fit(X, y)
cv_results_final = cross_validate(log_final_model, X, y, cv=5,
                                  scoring=["accuracy", "f1", "recall", "precision", "roc_auc"])

# Report the mean of every metric over the five folds.
for label, key in (("Accuracy", "test_accuracy"),
                   ("Auc", "test_roc_auc"),
                   ("Recall", "test_recall"),
                   ("Precision", "test_precision"),
                   ("F1", "test_f1")):
    print(f"{label}: {round(cv_results_final[key].mean(), 4)}")

# Hiperparametre optimizasyonu ile bir şekilde modelimizin başarısını arttırdık.

#### Model olarak GBM'i kullanalım fakat hiperparametre optimizasyonu olarak da OPTUNA kullanalım.

In [None]:
gbm_model = GradientBoostingClassifier()

# Hiperparametre Optimizasyonu(Optuna ile)
study = optuna.create_study(direction="maximize")

# Optuna için modelin parametrelerinin belirlendiği fonksiyon
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated accuracy of a gradient boosting model.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``learning_rate``, ``max_depth``, ``min_samples_split``,
        ``subsample`` and ``n_estimators``.

    Returns
    -------
    float
        Mean test accuracy over the folds, rounded to 4 decimals. Reads the
        notebook-level ``X`` and ``y``.
    """
    # Sample the hyper-parameter values for this trial.
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.1)
    max_depth = trial.suggest_int("max_depth", 3, 8)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 5)
    subsample = trial.suggest_float("subsample", 0.5, 1.0)
    n_estimators = trial.suggest_int("n_estimators", 100, 1000)

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=subsample,
        n_estimators=n_estimators,
        min_samples_split=min_samples_split,
        random_state=42
    )

    # No model.fit(X, y) beforehand: cross_validate clones the estimator and fits the
    # clones on each training fold, so a full fit here would be computed and thrown
    # away on every trial.
    cv_results = cross_validate(model, X, y, cv=5,
                                scoring=["accuracy"])

    accuracy = round(cv_results['test_accuracy'].mean(), 4)
    return accuracy

# Optuna ile en iyi parametreleri getirecek kod
study.optimize(objective, n_trials=20)

print("En iyi hiperparametreler: ", study.best_params)
print("En iyi doğruluk skoru: ", study.best_value)

In [None]:
# GBM final model: apply the best Optuna parameters, refit on all data, then cross-validate.
gbm_final_model = gbm_model.set_params(**study.best_params, random_state=42).fit(X, y)
cv_results_final = cross_validate(gbm_final_model, X, y, cv=5,
                                  scoring=["accuracy", "f1", "recall", "precision", "roc_auc"])

# Report the mean of every metric over the five folds.
for label, key in (("Accuracy", "test_accuracy"),
                   ("Auc", "test_roc_auc"),
                   ("Recall", "test_recall"),
                   ("Precision", "test_precision"),
                   ("F1", "test_f1")):
    print(f"{label}: {round(cv_results_final[key].mean(), 4)}")

#### Değişkenlerin önem düzeyini belirten feature_importance fonksiyonunu kullanarak özelliklerin sıralamasını çizdirelim.

In [None]:
# Logistic Reg için değişken önem düzeyi
def plot_importance(model, features, num=None, save=False):
    """Bar-plot the largest feature weights of a fitted model.

    Parameters
    ----------
    model : fitted estimator
        Its ``feature_importances_`` are used when present, otherwise the first
        row of ``coef_`` (signed coefficients, e.g. for logistic regression).
    features : pandas.DataFrame
        Frame the model was fitted on; only ``features.columns`` is read.
    num : int or None
        Number of top-ranked features to draw. ``None`` (default) draws all of them.
    save : bool
        When True the figure is also written to ``importances.png``.
    """
    if hasattr(model, "feature_importances_"):
        values = model.feature_importances_
    else:
        values = model.coef_[0]

    feature_imp = pd.DataFrame({"Value": values, "Feature": features.columns})
    ranked = feature_imp.sort_values(by="Value", ascending=False)[0:num]

    plt.figure(figsize=(10, 6))
    sns.set(font_scale=1)
    sns.barplot(x="Value", y="Feature", data=ranked)
    plt.title("Features")
    plt.tight_layout()
    # Save before showing: after plt.show() the current figure is no longer the one
    # that was drawn, so saving afterwards writes an empty image.
    if save:
        plt.savefig("importances.png")
    plt.show()

plot_importance(log_final_model, X, num=15)
# Oluşturduğumuz "New_Tenure_Year" değişkeninin modele çok fazla etki ettiğini görüyoruz.

In [None]:
# GBM için değişken önem düzeyi
def plot_importance(model, features, num=None, save=False):
    """Bar-plot the largest feature weights of a fitted model.

    Parameters
    ----------
    model : fitted estimator
        Its ``feature_importances_`` are used when present (e.g. gradient boosting),
        otherwise the first row of ``coef_``.
    features : pandas.DataFrame
        Frame the model was fitted on; only ``features.columns`` is read.
    num : int or None
        Number of top-ranked features to draw. ``None`` (default) draws all of them.
    save : bool
        When True the figure is also written to ``importances.png``.
    """
    if hasattr(model, "feature_importances_"):
        values = model.feature_importances_
    else:
        values = model.coef_[0]

    feature_imp = pd.DataFrame({"Value": values, "Feature": features.columns})
    ranked = feature_imp.sort_values(by="Value", ascending=False)[0:num]

    plt.figure(figsize=(10, 6))
    sns.set(font_scale=1)
    sns.barplot(x="Value", y="Feature", data=ranked)
    plt.title("Features")
    plt.tight_layout()
    # Save before showing: after plt.show() the current figure is no longer the one
    # that was drawn, so saving afterwards writes an empty image.
    if save:
        plt.savefig("importances.png")
    plt.show()

plot_importance(gbm_final_model, X, num=10)
# Oluşturduğumuz değişkenlerin modele etki ettiğini görüyoruz.
