In [67]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Загрузка данных
df = pd.read_csv("dataset/Churn.csv")
df.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,9091.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,4.99769,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.894723,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,2.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [68]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2.0,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8.0,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1.0,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.1,0


In [69]:
# Предобработка данных

# Perform one-hot encoding
df = pd.get_dummies(df, columns=['Geography'], drop_first=True)
df = pd.get_dummies(df, columns=['Gender'], drop_first=True)

# Impute missing values in 'tenure' with the median
df['Tenure'].fillna(df['Tenure'].median(), inplace=True)

df.isnull().sum()

RowNumber            0
CustomerId           0
Surname              0
CreditScore          0
Age                  0
Tenure               0
Balance              0
NumOfProducts        0
HasCrCard            0
IsActiveMember       0
EstimatedSalary      0
Exited               0
Geography_Germany    0
Geography_Spain      0
Gender_Male          0
dtype: int64

In [70]:
# Выделение признаков и целевой переменной
X = df.drop(['RowNumber', 'CustomerId', 'Surname', 'Exited'], axis=1)
y = df['Exited']

print(X.head())
print(y.head())

   CreditScore  Age  Tenure    Balance  NumOfProducts  HasCrCard  \
0          619   42     2.0       0.00              1          1   
1          608   41     1.0   83807.86              1          0   
2          502   42     8.0  159660.80              3          1   
3          699   39     1.0       0.00              2          0   
4          850   43     2.0  125510.82              1          1   

   IsActiveMember  EstimatedSalary  Geography_Germany  Geography_Spain  \
0               1        101348.88              False            False   
1               1        112542.58              False             True   
2               0        113931.57              False            False   
3               0         93826.63              False            False   
4               1         79084.10              False             True   

   Gender_Male  
0        False  
1        False  
2        False  
3        False  
4        False  
0    1
1    0
2    1
3    0
4    0
Name: Exi

In [71]:
# Разделение выборки на обучающую и тестовую
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [72]:
# Создание модели
logistic_reg = LogisticRegression()

# Определение сетки параметров для GridSearchCV
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

# Подбор параметров с использованием GridSearchCV
grid_search = GridSearchCV(logistic_reg, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Получение лучших параметров
best_params = grid_search.best_params_

# Строим модель с лучшими параметрами
best_logistic_reg = LogisticRegression(**best_params)
best_logistic_reg.fit(X_train, y_train)

# Предсказания
y_pred_logistic = best_logistic_reg.predict(X_test)

# Оценка модели
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
classification_report_logistic = classification_report(y_test, y_pred_logistic)

print(f"Лучшие параметры для Логистической регрессии: {best_params}")
print(f"Точность Логистической регрессии: {accuracy_logistic}")
print("Отчет о классификации для Логистической регрессии:\n", classification_report_logistic)


Лучшие параметры для Логистической регрессии: {'C': 0.001}
Точность Логистической регрессии: 0.8005
Отчет о классификации для Логистической регрессии:
               precision    recall  f1-score   support

           0       0.81      0.98      0.89      1607
           1       0.45      0.07      0.12       393

    accuracy                           0.80      2000
   macro avg       0.63      0.53      0.51      2000
weighted avg       0.74      0.80      0.74      2000



In [73]:
from sklearn.svm import SVC

# Создание модели
svm_model = SVC()

# Строим модель SVM
best_svm_model = SVC()
best_svm_model.fit(X_train, y_train)

# Предсказания
y_pred_svm = best_svm_model.predict(X_test)

# Оценка модели
accuracy_svm = accuracy_score(y_test, y_pred_svm)
classification_report_svm = classification_report(y_test, y_pred_svm,zero_division=1)

print(f"Точность SVM: {accuracy_svm}")
print("Отчет о классификации для SVM:\n", classification_report_svm)


Точность SVM: 0.8035
Отчет о классификации для SVM:
               precision    recall  f1-score   support

           0       0.80      1.00      0.89      1607
           1       1.00      0.00      0.00       393

    accuracy                           0.80      2000
   macro avg       0.90      0.50      0.45      2000
weighted avg       0.84      0.80      0.72      2000



In [74]:
from sklearn.ensemble import RandomForestClassifier

# Создание модели
random_forest_model = RandomForestClassifier()

# Строим модель Случайного леса
best_rf_model = RandomForestClassifier()
best_rf_model.fit(X_train, y_train)

# Предсказания
y_pred_rf = best_rf_model.predict(X_test)

# Оценка модели
accuracy_rf = accuracy_score(y_test, y_pred_rf)
classification_report_rf = classification_report(y_test, y_pred_rf)

print(f"Точность Случайного леса: {accuracy_rf}")
print("Отчет о классификации для Случайного леса:\n", classification_report_rf)


Точность Случайного леса: 0.8675
Отчет о классификации для Случайного леса:
               precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.76      0.48      0.59       393

    accuracy                           0.87      2000
   macro avg       0.82      0.72      0.75      2000
weighted avg       0.86      0.87      0.86      2000

