## Модель регрессии

In [1]:
# импортируем библиотеки
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd

### Этап 1. Загрузка данных

**Загрузка данных**. Скачивание датасета из сессионного хранилища

In [2]:
# загружаем датасет и выводим первые 5 строчек
churn_df = pd.read_csv('Churn.csv')
churn_df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2.0,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8.0,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1.0,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.1,0


In [3]:
# посмотрим с каким типом переменных нам предстоит работать
# для этого есть метод .info()
churn_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           9091 non-null   float64
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(3), int64(8), object(3)
memory usage: 1.1+ MB


In [4]:
# Удаляем ненужные столбцы, индексы и фамилии
del churn_df['RowNumber']
del churn_df['CustomerId']
del churn_df['Surname']

In [5]:
# посмотрим на основные статистические показатели (summary statistics)
# с помощью метода .describe()
churn_df.describe().round(2)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,9091.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,650.53,38.92,5.0,76485.89,1.53,0.71,0.52,100090.24,0.2
std,96.65,10.49,2.89,62397.41,0.58,0.46,0.5,57510.49,0.4
min,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,584.0,32.0,2.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.92,0.0
75%,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.25,0.0
max,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


### Этап 2. Предварительная обработка данных

In [6]:
# проверим, есть ли пропущенные значения
churn_df.isnull().sum()

CreditScore          0
Geography            0
Gender               0
Age                  0
Tenure             909
Balance              0
NumOfProducts        0
HasCrCard            0
IsActiveMember       0
EstimatedSalary      0
Exited               0
dtype: int64

In [7]:
# Удаляем пропуски
churn_df.dropna(inplace=True)

In [8]:
# Еще раз проверим, есть ли пропущенные значения
churn_df.isnull().sum()

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

Пропущенных значений больше нет

In [9]:
# Меняем столбец места жительства и пол с символов на цифры

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

le = LabelEncoder()
le.fit(churn_df['Geography'])
churn_df['Geography'] = le.transform(churn_df['Geography'])
le1 = LabelEncoder()
le1.fit(churn_df['Gender'])
churn_df['Gender'] = le1.transform(churn_df['Gender'])

In [10]:
#выводим 10 строчек датасета
churn_df.head(10)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,0,0,42,2.0,0.0,1,1,1,101348.88,1
1,608,2,0,41,1.0,83807.86,1,0,1,112542.58,0
2,502,0,0,42,8.0,159660.8,3,1,0,113931.57,1
3,699,0,0,39,1.0,0.0,2,0,0,93826.63,0
4,850,2,0,43,2.0,125510.82,1,1,1,79084.1,0
5,645,2,1,44,8.0,113755.78,2,1,0,149756.71,1
6,822,0,1,50,7.0,0.0,2,1,1,10062.8,0
7,376,1,0,29,4.0,115046.74,4,1,0,119346.88,1
8,501,0,1,44,4.0,142051.07,2,0,1,74940.5,0
9,684,0,1,27,2.0,134603.88,1,1,1,71725.73,0


### Этап 3. Исследовательский анализ данных (Exploratory Data Analysis)

In [11]:
# посчитаем коэффициент корреляции для всего датафрейма и округлим значение
# получается корреляционная матрица
corr_matrix = churn_df.corr().round(2)
corr_matrix

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
CreditScore,1.0,0.0,-0.01,-0.0,-0.0,0.0,0.01,-0.0,0.03,0.01,-0.02
Geography,0.0,1.0,0.0,0.02,-0.0,0.07,0.01,-0.01,0.01,-0.01,0.04
Gender,-0.01,0.0,1.0,-0.03,0.01,0.02,-0.03,0.01,0.03,-0.0,-0.11
Age,-0.0,0.02,-0.03,1.0,-0.01,0.03,-0.03,-0.01,0.08,-0.01,0.28
Tenure,-0.0,-0.0,0.01,-0.01,1.0,-0.01,0.01,0.03,-0.03,0.01,-0.02
Balance,0.0,0.07,0.02,0.03,-0.01,1.0,-0.3,-0.02,-0.0,0.01,0.12
NumOfProducts,0.01,0.01,-0.03,-0.03,0.01,-0.3,1.0,0.01,0.01,0.01,-0.05
HasCrCard,-0.0,-0.01,0.01,-0.01,0.03,-0.02,0.01,1.0,-0.01,-0.01,-0.01
IsActiveMember,0.03,0.01,0.03,0.08,-0.03,-0.0,0.01,-0.01,1.0,-0.02,-0.16
EstimatedSalary,0.01,-0.01,-0.0,-0.01,0.01,0.01,0.01,-0.01,-0.02,1.0,0.02


Каждый признак по отдельности мало влияет на результат

### Этап 4. Отбор и выделение признаков

In [12]:
# Помещаем все признаки в переменную X. Целевую переменную Exited в переменную y
# 'CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary'
X = churn_df[['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']]
y = churn_df['Exited']
print(type(X), type(y))

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>


In [13]:
from sklearn.model_selection import train_test_split

# разобьем данные на обучающую и тестовую выборку
# размер тестовой выборки составит 30%
# также зададим точку отсчета для воспроизводимости
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.3, 
                                                    random_state = 1)

In [14]:
# посмотрим на новую размерность обучающей
print(X_train.shape, y_train.shape)

# и тестовой выборки
print(X_test.shape, y_test.shape)

(6363, 10) (6363,)
(2728, 10) (2728,)


### Этап 5. Обучение и оценка качества модели

Линейная классификация

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

lr = LogisticRegression()
solvers = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
penalty = ['l2']
c = [0.01, 0.1, 1, 10, 100]
grid = dict(solver = solvers, penalty = penalty, C = c)
cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)
grid_search = GridSearchCV(estimator=lr, param_grid = grid, n_jobs = -1, cv = cv, scoring = 'accuracy', error_score=0)
grid_result = grid_search.fit(X_train, y_train)

print("accuracy = ", grid_result.best_score_ , "parameter ", grid_result.best_params_)

accuracy =  0.8066426909418822 parameter  {'C': 10, 'penalty': 'l2', 'solver': 'newton-cholesky'}


Дерево принятия решений

In [22]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

depth1 = 0
f11 = 0
accuracy = 0
for depth in range(1, 40):
    m_dt = DecisionTreeClassifier(random_state = 1, max_depth=depth)
    m_dt.fit(X_train, y_train)
    prediction = m_dt.predict(X_test)
    f1 = f1_score(y_test, prediction)
    accuracy = accuracy_score(y_test, prediction)
    if f1 > f11:
        f11 = f1
        accuracy1 = accuracy
        depth1 = depth
print("Лучшие настройки для данного датасета depth = ", depth1, "f1 при этом равен ", f1, "accuracy равен ", accuracy)    

Лучшие настройки для данного датасета depth =  6 f1 при этом равен  0.5004549590536852 accuracy равен  0.7987536656891495


Метод ближайших соседей

In [17]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
n_neighbors1 = [1, 10, 1]
leaf_size1 = [20, 40, 1]
p1 = [1, 2]
weights1 = ['uniform', 'distance']
metric1 = ['minkowski', 'chebyshev']
grid = dict(n_neighbors = n_neighbors1, leaf_size = leaf_size1, p = p1, weights = weights1, metric = metric1)
grid_search = GridSearchCV(estimator=knn, param_grid = grid, n_jobs = -1, cv = 5, scoring = 'accuracy', error_score=0)
grid_result = grid_search.fit(X_train, y_train)
print("Лучший accuracy = ", grid_result.best_score_ , "При настройках", grid_result.best_params_)

Лучший accuracy =  0.7909790669295034 При настройках {'leaf_size': 20, 'metric': 'chebyshev', 'n_neighbors': 10, 'p': 1, 'weights': 'uniform'}
