In [43]:
from sklearn.preprocessing import RobustScaler, MinMaxScaler, MaxAbsScaler, StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score, train_test_split, RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.feature_selection import SelectFromModel, SelectFdr
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
import sklearn
import pickle
import skopt

In [44]:
# Load the raw Telco customer-churn CSV and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
df = pd.read_csv("C:/Users/Edish/Documents/Python Taskilled/Project/telco-customer-churn.csv")
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [45]:
# Remove leading/trailing whitespace from every header label so that later
# column look-ups by name are not broken by stray spaces.
clean_labels = df.columns.str.strip()
df.columns = clean_labels
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [46]:
# Print the frame summary (row count, per-column non-null counts and dtypes)
# to check for missing values and columns that were read with the wrong type.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [47]:
# Some TotalCharges entries are blank strings rather than numbers.
# Treat every empty or whitespace-only entry as missing (not just a single
# space, which would leave '' or '  ' behind and make the cast fail), then
# convert the column to float. Genuinely malformed values still raise here
# instead of being silently coerced.
df['TotalCharges'] = (
    df['TotalCharges']
    .replace(r'^\s*$', np.nan, regex=True)
    .astype(float)
)

In [48]:
# Per-column profile: cardinality, dtype and, for low-cardinality columns
# only, the distinct values themselves.
output_data = []

for col in df.columns:
    series = df.loc[:, col]
    n_unique = series.nunique()
    # Listing the values is only readable for a handful of categories;
    # anything with more than 5 distinct values gets a dash placeholder.
    shown_values = series.unique() if n_unique <= 5 else "-"
    output_data.append([col, n_unique, shown_values, series.dtype])

output_df = pd.DataFrame(
    output_data,
    columns=['Column Name', 'Number of Unique Values', ' Unique Values ', 'Data Type'],
)

output_df

Unnamed: 0,Column Name,Number of Unique Values,Unique Values,Data Type
0,customerID,7043,-,object
1,gender,2,"[Female, Male]",object
2,SeniorCitizen,2,"[0, 1]",int64
3,Partner,2,"[Yes, No]",object
4,Dependents,2,"[No, Yes]",object
5,tenure,73,-,int64
6,PhoneService,2,"[No, Yes]",object
7,MultipleLines,3,"[No phone service, No, Yes]",object
8,InternetService,3,"[DSL, Fiber optic, No]",object
9,OnlineSecurity,3,"[No, Yes, No internet service]",object


In [49]:
# Integer code tables used to label-encode the categorical columns.

# Binary yes/no answers.
yes_no = {
    'Yes': 1,
    'No': 0,
}

# Customer gender.
g = {
    'Male': 1,
    'Female': 0,
}

# Internet add-on services: 0 means the customer has no internet service at all.
in_s = {
    'No': 1,
    'Yes': 2,
    'No internet service': 0,
}

# Multiple phone lines: 0 means the customer has no phone service at all.
s = {
    'No phone service': 0,
    'No': 1,
    'Yes': 2,
}

# Type of internet connection.
net_s = {
    'DSL': 2,
    'Fiber optic': 1,
    'No': 0,
}

# Contract length, ordered from shortest to longest commitment.
cont = {
    'Month-to-month': 0,
    'One year': 1,
    'Two year': 2,
}

# Payment method.
paym = {
    'Electronic check': 0,
    'Mailed check': 1,
    'Bank transfer (automatic)': 2,
    'Credit card (automatic)': 3,
}

In [50]:
# Which code table applies to which categorical column. Keeping this as data
# replaces one hand-written statement per column and makes the encoding plan
# readable at a glance.
column_encodings = {
    'gender': g,
    'Partner': yes_no,
    'Dependents': yes_no,
    'PhoneService': yes_no,
    'PaperlessBilling': yes_no,
    'Churn': yes_no,
    'MultipleLines': s,
    'OnlineSecurity': in_s,
    'OnlineBackup': in_s,
    'DeviceProtection': in_s,
    'TechSupport': in_s,
    'StreamingTV': in_s,
    'StreamingMovies': in_s,
    'InternetService': net_s,
    'Contract': cont,
    'PaymentMethod': paym,
}

for column_name, code_table in column_encodings.items():
    # `replace` only touches values that are keys of the table, so values that
    # are already encoded pass through unchanged.
    # The explicit cast matters: newer pandas no longer downcasts the replaced
    # object column to an integer dtype on its own, and the later numeric-only
    # steps (correlation, `select_dtypes(include='number')`) need real ints.
    # It also fails loudly if a label is missing from its code table.
    df[column_name] = df[column_name].replace(code_table).astype('int64')

In [51]:
# Absolute correlation of every numeric column with the target, strongest first.
# Non-numeric columns (customerID is still a string column at this point) are
# filtered out explicitly: older pandas dropped them silently inside
# `corrwith`, while pandas >= 2.0 raises on them.
df.select_dtypes(include='number').corrwith(df['Churn']).abs().sort_values(ascending=False)

Churn               1.000000
Contract            0.396713
tenure              0.352229
PaymentMethod       0.262818
TotalCharges        0.199484
MonthlyCharges      0.193356
PaperlessBilling    0.191825
StreamingTV         0.164673
Dependents          0.164221
StreamingMovies     0.163220
SeniorCitizen       0.150889
Partner             0.150448
DeviceProtection    0.084654
OnlineBackup        0.074205
InternetService     0.047291
MultipleLines       0.036310
TechSupport         0.027037
OnlineSecurity      0.023309
PhoneService        0.011942
gender              0.008612
dtype: float64

In [52]:
# Discard the row identifier together with the six features that showed the
# weakest absolute correlation with Churn.
columns_to_drop = [
    'customerID',
    'gender',
    'PhoneService',
    'OnlineSecurity',
    'TechSupport',
    'MultipleLines',
    'InternetService',
]
df = df.drop(columns=columns_to_drop)
df.head()

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,OnlineBackup,DeviceProtection,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,1,0,1,2,1,1,1,0,1,0,29.85,29.85,0
1,0,0,0,34,1,2,1,1,1,0,1,56.95,1889.5,0
2,0,0,0,2,2,1,1,1,0,1,1,53.85,108.15,1
3,0,0,0,45,1,2,1,1,1,0,2,42.3,1840.75,0
4,0,0,0,2,1,1,1,1,0,1,0,70.7,151.65,1


In [53]:
# Separate the target from the predictors.
Y = df['Churn']
X = df.drop(columns=['Churn'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [55]:
# Base estimator: used both inside the model-based feature selector and as
# the final classifier of the pipeline.
algorithm = LogisticRegression(solver='liblinear', max_iter=500, random_state=42)

# Numeric preprocessing: fill missing values with the median, then scale.
numeric_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', RobustScaler()),
    ]
)

# Apply the numeric preprocessing to every numeric column of the training frame.
feature_transformer = ColumnTransformer(
    transformers=[
        (
            'numeric_transformer',
            numeric_pipeline,
            X_train.select_dtypes(include='number').columns.tolist(),
        )
    ],
    n_jobs=-1,
)

# Full model: preprocessing -> feature selection -> classification.
pipe = Pipeline(
    steps=[
        ('feature_transformer', feature_transformer),
        ('feature_selector', SelectFromModel(estimator=algorithm)),
        ('classifier', algorithm),
    ]
)

# Display the pipeline architecture
pipe

In [56]:
def apply_bayesian_optimization(model = None, hyperparameters = None, n_iterations = 50, metric = None, train_features = None, train_labels = None, cv = None, random_state = 42):
    """Tune ``model`` with scikit-optimize's ``BayesSearchCV`` and return the best estimator.

    Parameters
    ----------
    model : estimator
        Estimator (e.g. a pipeline) handed to ``BayesSearchCV`` as ``estimator``.
    hyperparameters : dict
        Search space, handed to ``BayesSearchCV`` as ``search_spaces``.
    n_iterations : int, default 50
        Number of parameter settings to sample (``n_iter``).
    metric : str or callable, optional
        Scoring specification handed to ``BayesSearchCV`` as ``scoring``.
    train_features, train_labels
        Training data the search is fitted on.
    cv : optional
        Cross-validation strategy. When omitted, a ``RepeatedStratifiedKFold``
        with its default split/repeat counts and seeded with ``random_state``
        is used, which matches the previous hard-coded behaviour.
    random_state : int, default 42
        Seed shared by the default splitter and the optimiser.

    Returns
    -------
    The ``best_estimator_`` attribute of the fitted search object.

    Raises
    ------
    ValueError
        If ``model``, ``hyperparameters`` or ``train_features`` is missing.
    """
    # Fail early with a readable message instead of an obscure error from
    # deep inside the search object.
    required = {'model': model, 'hyperparameters': hyperparameters, 'train_features': train_features}
    missing = [name for name, value in required.items() if value is None]
    if missing:
        raise ValueError("apply_bayesian_optimization requires: " + ", ".join(missing))

    if cv is None:
        cv = RepeatedStratifiedKFold(random_state = random_state)

    bayesian_optimization = skopt.BayesSearchCV(
        estimator = model,
        search_spaces = hyperparameters,
        n_iter = n_iterations,
        scoring = metric,
        n_jobs = -1,
        cv = cv,
        random_state = random_state,
    )
    bayesian_optimization.fit(X = train_features, y = train_labels)

    return bayesian_optimization.best_estimator_

In [57]:
# Search space for the Bayesian optimiser. Keys address nested parameters of
# `pipe` with the `step__parameter` naming convention.
search_spaces = {
    # Which scaler to use inside the numeric preprocessing pipeline.
    'feature_transformer__numeric_transformer__scaler': skopt.space.Categorical(
        categories = [RobustScaler(), MinMaxScaler(), MaxAbsScaler(), StandardScaler()]
    ),
    # How missing numeric values are filled in.
    'feature_transformer__numeric_transformer__imputer__strategy': skopt.space.Categorical(
        categories = ['mean', 'median', 'constant', 'most_frequent']
    ),
    # Model-based selection versus univariate false-discovery-rate selection.
    'feature_selector': skopt.space.Categorical(
        categories = [SelectFromModel(estimator = algorithm), SelectFdr()]
    ),
    # Both ranges span eight orders of magnitude. With a uniform prior almost
    # every sample would land above 1 and small values would effectively never
    # be tried; a log-uniform prior samples each decade equally often.
    'classifier__C': skopt.space.Real(low = 1e-6, high = 100, prior = 'log-uniform'),
    'classifier__tol': skopt.space.Real(low = 1e-6, high = 100, prior = 'log-uniform'),
}

# Run the search on the training split, optimising accuracy, and show the winning pipeline.
best_pipe = apply_bayesian_optimization(
    model = pipe,
    hyperparameters = search_spaces,
    n_iterations = 100,
    metric = 'accuracy',
    train_features = X_train,
    train_labels = y_train,
)
best_pipe