In [2]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn import datasets
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%config InLineBackend.figure_format = 'svg'
from sklearn.metrics import classification_report

### Этап 1. Загрузка данных

In [3]:
# Импортируем датасет
ch_df = pd.read_csv("C:/Users/Даниил/Desktop/ОМО/churn.csv")


# Отбрасываем ненужные для анализа данные
ch_df_x = ch_df.drop(columns=['RowNumber', 'CustomerId', 'Surname', 'Exited'])
ch_df_y = ch_df['Exited']
ch_df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2.0,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8.0,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1.0,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.1,0


In [4]:
ch_df

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2.0,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8.0,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1.0,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5.0,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10.0,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7.0,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3.0,75075.31,2,1,0,92888.52,1


### Этап 2. Предобработка данных

In [5]:
# Проверяем на пустые значения
ch_df_x.isnull().sum()

# Заменяем пустые значения на 0
ch_df_x.fillna(0, inplace=True)

# Проверяем
ch_df_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10000 non-null  int64  
 1   Geography        10000 non-null  object 
 2   Gender           10000 non-null  object 
 3   Age              10000 non-null  int64  
 4   Tenure           10000 non-null  float64
 5   Balance          10000 non-null  float64
 6   NumOfProducts    10000 non-null  int64  
 7   HasCrCard        10000 non-null  int64  
 8   IsActiveMember   10000 non-null  int64  
 9   EstimatedSalary  10000 non-null  float64
dtypes: float64(3), int64(5), object(2)
memory usage: 781.4+ KB


### Этап 3. Исследовательский анализ данных

In [6]:
le = LabelEncoder()

objList = ch_df_x.select_dtypes(include="object").columns
if all(column in ch_df_x.columns for column in ['Geography', 'Gender']):
    ch_df_x = pd.get_dummies(ch_df_x, columns=['Geography', 'Gender'])
    ch_df_x = ch_df_x.astype(int)
else:
    print("One or more specified columns do not exist in the DataFrame.")


In [7]:
ch_df_x.head(20)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,619,42,2,0,1,1,1,101348,1,0,0,1,0
1,608,41,1,83807,1,0,1,112542,0,0,1,1,0
2,502,42,8,159660,3,1,0,113931,1,0,0,1,0
3,699,39,1,0,2,0,0,93826,1,0,0,1,0
4,850,43,2,125510,1,1,1,79084,0,0,1,1,0
5,645,44,8,113755,2,1,0,149756,0,0,1,0,1
6,822,50,7,0,2,1,1,10062,1,0,0,0,1
7,376,29,4,115046,4,1,0,119346,0,1,0,1,0
8,501,44,4,142051,2,0,1,74940,1,0,0,0,1
9,684,27,2,134603,1,1,1,71725,1,0,0,0,1


In [8]:
# Составим таблицу корреляции
corr_data = pd.concat([ch_df_x, ch_df_y], axis=1)
corr_data.corr().round(2)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male,Exited
CreditScore,1.0,-0.0,0.0,0.01,0.01,-0.01,0.03,-0.0,-0.01,0.01,0.0,0.0,-0.0,-0.03
Age,-0.0,1.0,-0.01,0.03,-0.03,-0.01,0.09,-0.01,-0.04,0.05,-0.0,0.03,-0.03,0.29
Tenure,0.0,-0.01,1.0,-0.01,0.01,0.02,-0.03,0.01,-0.0,0.0,-0.0,-0.01,0.01,-0.01
Balance,0.01,0.03,-0.01,1.0,-0.3,-0.01,-0.01,0.01,-0.23,0.4,-0.13,-0.01,0.01,0.12
NumOfProducts,0.01,-0.03,0.01,-0.3,1.0,0.0,0.01,0.01,0.0,-0.01,0.01,0.02,-0.02,-0.05
HasCrCard,-0.01,-0.01,0.02,-0.01,0.0,1.0,-0.01,-0.01,0.0,0.01,-0.01,-0.01,0.01,-0.01
IsActiveMember,0.03,0.09,-0.03,-0.01,0.01,-0.01,1.0,-0.01,0.0,-0.02,0.02,-0.02,0.02,-0.16
EstimatedSalary,-0.0,-0.01,0.01,0.01,0.01,-0.01,-0.01,1.0,-0.0,0.01,-0.01,0.01,-0.01,0.01
Geography_France,-0.01,-0.04,-0.0,-0.23,0.0,0.0,0.0,-0.0,1.0,-0.58,-0.58,-0.01,0.01,-0.1
Geography_Germany,0.01,0.05,0.0,0.4,-0.01,0.01,-0.02,0.01,-0.58,1.0,-0.33,0.02,-0.02,0.17


In [9]:
c = pd.DataFrame(ch_df_y.value_counts())
c
ch_df_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   CreditScore        10000 non-null  int32
 1   Age                10000 non-null  int32
 2   Tenure             10000 non-null  int32
 3   Balance            10000 non-null  int32
 4   NumOfProducts      10000 non-null  int32
 5   HasCrCard          10000 non-null  int32
 6   IsActiveMember     10000 non-null  int32
 7   EstimatedSalary    10000 non-null  int32
 8   Geography_France   10000 non-null  int32
 9   Geography_Germany  10000 non-null  int32
 10  Geography_Spain    10000 non-null  int32
 11  Gender_Female      10000 non-null  int32
 12  Gender_Male        10000 non-null  int32
dtypes: int32(13)
memory usage: 507.9 KB


In [10]:
# Разделяем датасет на обучающие признаки, тестовые признаки, целевые обучающие, целевые тестовые
x_train, x_test, y_train, y_test = train_test_split(
    ch_df_x, ch_df_y, test_size=0.3, random_state=0)
target_names = ['Not Exited', 'Exited']
sc = StandardScaler()
sc.fit(x_train)
x_train_std = sc.transform(x_train)
x_test_std = sc.transform(x_test)

### Логистическая регрессия

In [11]:
from sklearn.linear_model import LogisticRegression

# Создаем логистическую регрессию и обучаем ее (fit) 
lr = LogisticRegression(C=100.0, random_state = 1)
lr.fit(x_train_std, y_train)

In [12]:
lr.predict_proba(x_test_std[:3, :])

# Делаем тестовое предсказание
lr.predict(x_test_std[0, :].reshape(1, -1)) 

x_test_std[0, :]

x_test_std[0, :].reshape(1, -1)

y_pred_lr = lr.predict(x_test_std)

print(classification_report(y_test, y_pred_lr, target_names=target_names))

              precision    recall  f1-score   support

  Not Exited       0.83      0.96      0.89      2379
      Exited       0.58      0.23      0.33       621

    accuracy                           0.81      3000
   macro avg       0.70      0.59      0.61      3000
weighted avg       0.77      0.81      0.77      3000



### Машина опорных векторов (SVC)

In [13]:
from sklearn.svm import SVC

# Обьявляем
svm = SVC(kernel='linear', C=1.0, random_state=1)

# Обучение модели
svm.fit(x_train_std, y_train)

In [14]:
y_pred_svm = svm.predict(x_test_std)

# Выводим отчёт
print(classification_report(y_test, y_pred_svm, target_names=target_names, zero_division='warn'))

              precision    recall  f1-score   support

  Not Exited       0.79      1.00      0.88      2379
      Exited       0.00      0.00      0.00       621

    accuracy                           0.79      3000
   macro avg       0.40      0.50      0.44      3000
weighted avg       0.63      0.79      0.70      3000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Нелинейная классификация

In [15]:
svm_rbf = SVC(kernel='rbf', random_state=1, gamma=0.10, C=10.0)
svm_rbf.fit(x_train_std, y_train)

In [16]:
y_pred_svm_rbf = svm_rbf.predict(x_test_std)

# Выводим отчёт
print(classification_report(y_test, y_pred_svm_rbf, target_names=target_names))

              precision    recall  f1-score   support

  Not Exited       0.88      0.95      0.91      2379
      Exited       0.73      0.52      0.60       621

    accuracy                           0.86      3000
   macro avg       0.80      0.73      0.76      3000
weighted avg       0.85      0.86      0.85      3000



### Метод ближайших соседей (kNN)

In [17]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors = 5, p=2, metric='manhattan')
knn.fit(x_train_std, y_train)

y_pred_knn = knn.predict(x_test_std)
# Выводим отчёт
print(classification_report(y_test, y_pred_knn, target_names=target_names))

              precision    recall  f1-score   support

  Not Exited       0.86      0.94      0.90      2379
      Exited       0.66      0.41      0.50       621

    accuracy                           0.83      3000
   macro avg       0.76      0.68      0.70      3000
weighted avg       0.82      0.83      0.82      3000



### Дерево принятия решении

In [18]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(criterion='gini',
                              max_depth=4,
                              random_state=1)
tree.fit(x_train_std, y_train)

y_pred_tree = tree.predict(x_test_std)
# Выводим отчёт
print(classification_report(y_test, y_pred_tree, target_names=target_names))

              precision    recall  f1-score   support

  Not Exited       0.87      0.96      0.91      2379
      Exited       0.75      0.45      0.56       621

    accuracy                           0.85      3000
   macro avg       0.81      0.71      0.74      3000
weighted avg       0.84      0.85      0.84      3000



#### Попробуем подобрать более оптимальные параметры для модели при помощи GridSearch

In [19]:
tree_model = DecisionTreeClassifier()
criterion_values = ['gini', 'entropy', 'log_loss']
splitter_values = ['best', 'random']
max_depth_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 50]

grid = dict(criterion=criterion_values, splitter=splitter_values, max_depth=max_depth_values) # degree=poly_degree_values, coef0=coef0_values
grid_search = GridSearchCV(estimator=tree_model, param_grid=grid, n_jobs=-1,error_score=0 )
grid_result = grid_search.fit(x_train_std, y_train)
print(grid_result.best_score_, grid_result.best_params_)

0.854857142857143 {'criterion': 'gini', 'max_depth': 6, 'splitter': 'best'}


#### Подставим подобранные значения в модель

In [20]:
from sklearn.tree import DecisionTreeClassifier

tree2 = DecisionTreeClassifier(criterion='gini',
                              max_depth=6, splitter='best',
                              random_state=1)
tree2.fit(x_train_std, y_train)

y_pred_tree2 = tree2.predict(x_test_std)
# Выводим отчёт
print(classification_report(y_test, y_pred_tree2, target_names=target_names))

              precision    recall  f1-score   support

  Not Exited       0.88      0.96      0.92      2379
      Exited       0.75      0.50      0.60       621

    accuracy                           0.86      3000
   macro avg       0.81      0.73      0.76      3000
weighted avg       0.85      0.86      0.85      3000



#### Подобранные значения точности при помощи GridSearchCV немного лучше, чем взятые наугад: 0.85 и 0.84

### Оценим качество моделей через метрики RMSE и R2

In [27]:
# импортируем модуль метрик
from sklearn import metrics
 
# выведем корень среднеквадратической ошибки
# сравним тестовые и прогнозные значения цен на жилье
print('RMSE:')
print('1. Linear regression. Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test, y_pred_lr)))
print('2. SVM. Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test, y_pred_svm)))
print('3. kNN. Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test, y_pred_knn)))
print('4. Tree. Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test, y_pred_tree2)))
print('\n')
print('R2:')
print('1. Linear regression. R2:', np.round(metrics.r2_score(y_test, y_pred_lr), 2))
print('2. SVM. R2:', np.round(metrics.r2_score(y_test, y_pred_svm), 2))
print('3. kNN. R2:', np.round(metrics.r2_score(y_test, y_pred_knn), 2))
print('4. Tree. R2:', np.round(metrics.r2_score(y_test, y_pred_tree2), 2))

RMSE:
1. Linear regression. Root Mean Squared Error (RMSE): 0.44083254568297625
2. SVM. Root Mean Squared Error (RMSE): 0.454972526643093
3. kNN. Root Mean Squared Error (RMSE): 0.408656334834051
4. Tree. Root Mean Squared Error (RMSE): 0.3723797345005051


R2:
1. Linear regression. R2: -0.18
2. SVM. R2: -0.26
3. kNN. R2: -0.02
4. Tree. R2: 0.16
