In [30]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# from xgboost.sklearn import XGBClassifier


from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error

from sklearn.tree import plot_tree
from sklearn.tree import export_graphviz
from sklearn.tree import export_text

from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from hyperopt import hp
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK

from sklearn.ensemble import StackingClassifier

In [10]:
df = pd.read_csv('customer_churn_dataset-training-master.csv')

In [11]:
df[df.isnull().any(axis=1)] #Код, который находит строку с NaN
df = df.drop(index = 199295)


In [12]:
df2 = df.copy()
labelEncoder = LabelEncoder()
lists_for_encode = ['Gender', 'Subscription Type', 'Contract Length']

for column in lists_for_encode:
    df2[column] = labelEncoder.fit_transform(df2[column])

In [13]:
for_x = df2.drop(['Churn', 'CustomerID'], axis=1)
x = for_x
y = df2['Churn']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.4)

In [14]:
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)

tree.score(X_test, y_test)

y_pred = tree.predict(X_test)
 
print("Accurcy: ", accuracy_score(y_test, y_pred))                 
print(metrics.classification_report(y_test, y_pred))

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("MAE:", mae)
print("MSE:", mse)


Accurcy:  0.9998468806179218
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     76186
         1.0       1.00      1.00      1.00    100147

    accuracy                           1.00    176333
   macro avg       1.00      1.00      1.00    176333
weighted avg       1.00      1.00      1.00    176333

MAE: 0.000153119382078227
MSE: 0.000153119382078227


In [17]:
#Далее необходимо посмотреть confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

Confusion Matrix:
 [[ 76176     10]
 [    17 100130]]


In [18]:
#Как трактовать результаты?
# 1) True Positives (TP):  99 983 - Модель правильно предсказала 99 983 положительных случаев.
# 2) True Negatives (TN):  76145 - Модель правильно предсказала 76 319 отрицательных случаев.
# 3) False Positives (FP):  13 - Модель неправильно предсказала 13 отрицательных случая как положительные. 
# 4) False Negatives (FN):  18 - Модель неправильно предсказала 18 положительных случаев как отрицательные. 

In [19]:
predict = df2.copy()
predict['predicted_exited'] = tree.predict(for_x)
predict

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn,predicted_exited
0,2.0,30.0,0,39.0,14.0,5.0,18.0,2,0,932.00,17.0,1.0,1.0
1,3.0,65.0,0,49.0,1.0,10.0,8.0,0,1,557.00,6.0,1.0,1.0
2,4.0,55.0,0,14.0,4.0,6.0,18.0,0,2,185.00,3.0,1.0,1.0
3,5.0,58.0,1,38.0,21.0,7.0,7.0,2,1,396.00,29.0,1.0,1.0
4,6.0,23.0,1,32.0,20.0,5.0,8.0,0,1,617.00,20.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
440828,449995.0,42.0,1,54.0,15.0,1.0,3.0,1,0,716.38,8.0,0.0,0.0
440829,449996.0,25.0,0,8.0,13.0,1.0,20.0,1,0,745.38,2.0,0.0,0.0
440830,449997.0,26.0,1,35.0,27.0,1.0,5.0,2,2,977.31,9.0,0.0,0.0
440831,449998.0,28.0,1,55.0,14.0,2.0,0.0,2,2,602.55,2.0,0.0,0.0


In [20]:
#Юзаем Hyperopt

#Определяем пространство поиска

intro = {
    'max_depth': hp.choice('max_depth', [round(i) for i in np.arange(2, 11, 1)]), 
    'min_samples_split': hp.uniform('min_samples_split', 0.01, 0.2),
    'criterion': hp.choice('criterion', ['gini', 'entropy'])
}

#Прогоняет модель (вариант по умолчанию) + выдает точность
def TreeClass(params):
    model = DecisionTreeClassifier(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return {'loss': -accuracy, 'status': STATUS_OK}


#Определяет лучшие гиперпараметры для модели
trials = Trials()
best = fmin(fn=TreeClass,
            space=intro,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)


100%|██████████| 100/100 [00:33<00:00,  2.96trial/s, best loss: -0.9947485723035393]


In [25]:
best

{'criterion': 1, 'max_depth': 8, 'min_samples_split': 0.01024987143887619}

In [26]:
final_model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=8,
    min_samples_split=0.01
)

final_model.fit(X_train, y_train)

final_model.score(X_test, y_test)

y_pred = final_model.predict(X_test)
 
print("Accurcy: ", accuracy_score(y_test, y_pred))                 
print(metrics.classification_report(y_test, y_pred))

Accurcy:  0.987852529021794
              precision    recall  f1-score   support

         0.0       0.97      1.00      0.99     76186
         1.0       1.00      0.98      0.99    100147

    accuracy                           0.99    176333
   macro avg       0.99      0.99      0.99    176333
weighted avg       0.99      0.99      0.99    176333



In [23]:
#Как мы наблюдаем - использования параметров испортило ситуацию, ибо до функции подбора лучших гиперпарам

In [24]:
#GridSearchCV - это инструмент из библиотеки scikit-learn, который позволяет выполнять поиск по сетке (grid search) для оптимизации гиперпараметров модели. 
#В отличие от Hyperopt, который использует более сложные алгоритмы оптимизации, GridSearchCV перебирает все возможные комбинации гиперпараметров из заданного пространства поиска и 
#выбирает лучшую комбинацию на основе метрики качества.

In [47]:
base = [
    ('xgb',XGBClassifier()),
    ('kn',KNeighborsClassifier()),
    ('dt',DecisionTreeClassifier()),
    ('lr',LogisticRegression()),
    ('rf',RandomForestClassifier()),
]

meta_model = DecisionTreeClassifier()

stacking_models = StackingClassifier(estimators=base, final_estimator=meta_model, cv=5)

final_predict = stacking_models.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [48]:
final_predict.score(X_test, y_test)

y_pred = final_predict.predict(X_test)

print("Accurcy: ", accuracy_score(y_test, y_pred))                 
print(metrics.classification_report(y_test, y_pred))

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("MAE:", mae)
print("MSE:", mse)



Accurcy:  0.9999829867353246
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     76186
         1.0       1.00      1.00      1.00    100147

    accuracy                           1.00    176333
   macro avg       1.00      1.00      1.00    176333
weighted avg       1.00      1.00      1.00    176333

MAE: 1.7013264675358556e-05
MSE: 1.7013264675358556e-05


In [42]:
for name, model in base:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy {name}: {accuracy}')

Accuracy xgb: 0.9999376180295236
Accuracy kn: 0.8870149092909438
Accuracy dt: 0.9998525517061468


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy lr: 0.8482019814782259
Accuracy rf: 0.999665405794718


In [39]:
final_predict