<h1>Содержание<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Подготовка-данных" data-toc-modified-id="Подготовка-данных-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Подготовка данных</a></span></li><li><span><a href="#Исследование-задачи" data-toc-modified-id="Исследование-задачи-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Исследование задачи</a></span></li><li><span><a href="#Борьба-с-дисбалансом" data-toc-modified-id="Борьба-с-дисбалансом-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Борьба с дисбалансом</a></span></li><li><span><a href="#Тестирование-модели" data-toc-modified-id="Тестирование-модели-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Тестирование модели</a></span></li></ul></div>

# Отток клиентов

Из «Бета-Банка» стали уходить клиенты. Каждый месяц. Немного, но заметно. Банковские маркетологи посчитали: сохранять текущих клиентов дешевле, чем привлекать новых.

Нужно спрогнозировать, уйдёт клиент из банка в ближайшее время или нет. Вам предоставлены исторические данные о поведении клиентов и расторжении договоров с банком. 

## Подготовка данных

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler 

In [28]:
try:
    df = pd.read_csv('Churn.csv')
except:
    df=pd.read_csv('/datasets/Churn.csv')

In [29]:
#смотрим инфо по данным
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           9091 non-null   float64
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(3), int64(8), object(3)
memory usage: 1.1+ MB


In [30]:
#приводим столбцы к нижнему регистру,удаляем пропуски и столбец дублирующий индексы
df.columns = df.columns.str.lower()
df = df.drop(['rownumber'],axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   customerid       10000 non-null  int64  
 1   surname          10000 non-null  object 
 2   creditscore      10000 non-null  int64  
 3   geography        10000 non-null  object 
 4   gender           10000 non-null  object 
 5   age              10000 non-null  int64  
 6   tenure           9091 non-null   float64
 7   balance          10000 non-null  float64
 8   numofproducts    10000 non-null  int64  
 9   hascrcard        10000 non-null  int64  
 10  isactivemember   10000 non-null  int64  
 11  estimatedsalary  10000 non-null  float64
 12  exited           10000 non-null  int64  
dtypes: float64(3), int64(7), object(3)
memory usage: 1015.8+ KB


In [31]:
#заполняем пропуски заглушкой -1 и приводим к типу object
df['tenure'] = df['tenure'].fillna(-1)
df['tenure'] = df['tenure'].astype('object')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   customerid       10000 non-null  int64  
 1   surname          10000 non-null  object 
 2   creditscore      10000 non-null  int64  
 3   geography        10000 non-null  object 
 4   gender           10000 non-null  object 
 5   age              10000 non-null  int64  
 6   tenure           10000 non-null  object 
 7   balance          10000 non-null  float64
 8   numofproducts    10000 non-null  int64  
 9   hascrcard        10000 non-null  int64  
 10  isactivemember   10000 non-null  int64  
 11  estimatedsalary  10000 non-null  float64
 12  exited           10000 non-null  int64  
dtypes: float64(2), int64(7), object(4)
memory usage: 1015.8+ KB


In [32]:
#удаляем столбцы с именами и идентификаторами клиента
df = df.drop(df[['surname','customerid']],axis=1)

In [33]:
#Кодируем строковые значения в числовые
df = pd.get_dummies(df,drop_first = True)
df

  uniques = Index(uniques)


Unnamed: 0,creditscore,age,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited,geography_Germany,geography_Spain,...,tenure_1.0,tenure_2.0,tenure_3.0,tenure_4.0,tenure_5.0,tenure_6.0,tenure_7.0,tenure_8.0,tenure_9.0,tenure_10.0
0,619,42,0.00,1,1,1,101348.88,1,0,0,...,0,1,0,0,0,0,0,0,0,0
1,608,41,83807.86,1,0,1,112542.58,0,0,1,...,1,0,0,0,0,0,0,0,0,0
2,502,42,159660.80,3,1,0,113931.57,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,699,39,0.00,2,0,0,93826.63,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,850,43,125510.82,1,1,1,79084.10,0,0,1,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,39,0.00,2,1,0,96270.64,0,0,0,...,0,0,0,0,1,0,0,0,0,0
9996,516,35,57369.61,1,1,1,101699.77,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9997,709,36,0.00,1,0,1,42085.58,1,0,0,...,0,0,0,0,0,0,1,0,0,0
9998,772,42,75075.31,2,1,0,92888.52,1,1,0,...,0,0,1,0,0,0,0,0,0,0


In [34]:
#разделяем выборки на обучающую,валидационную и тестовую (60%\20%\20%) и отделяем целевой признак.
X = df.drop(['exited'],axis=1)
y = df['exited']
X_train, X_time_test, y_train, y_time_test = train_test_split(X, y ,random_state=12345, test_size=0.4)
X_valid, X_test, y_valid, y_test = train_test_split(X_time_test, y_time_test, random_state=12345, test_size = 0.5)

In [35]:
df.columns

Index(['creditscore', 'age', 'balance', 'numofproducts', 'hascrcard',
       'isactivemember', 'estimatedsalary', 'exited', 'geography_Germany',
       'geography_Spain', 'gender_Male', 'tenure_0.0', 'tenure_1.0',
       'tenure_2.0', 'tenure_3.0', 'tenure_4.0', 'tenure_5.0', 'tenure_6.0',
       'tenure_7.0', 'tenure_8.0', 'tenure_9.0', 'tenure_10.0'],
      dtype='object')

In [36]:
# маштабируем данные
numeric = ['creditscore', 'age', 'balance', 'numofproducts', 'hascrcard',
       'isactivemember', 'estimatedsalary', 'geography_Germany',
       'geography_Spain', 'gender_Male', 'tenure_0.0', 'tenure_1.0',
       'tenure_2.0', 'tenure_3.0', 'tenure_4.0', 'tenure_5.0', 'tenure_6.0',
       'tenure_7.0', 'tenure_8.0', 'tenure_9.0', 'tenure_10.0']
scaler = StandardScaler()
scaler.fit(X_train[numeric])
X_train[numeric] = scaler.transform(X_train[numeric] )
X_valid[numeric]  = scaler.transform(X_valid[numeric] )
X_test[numeric]  = scaler.transform(X_test[numeric] )
pd.options.mode.chained_assignment = None

In [37]:
X_train

Unnamed: 0,creditscore,age,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,geography_Germany,geography_Spain,gender_Male,...,tenure_1.0,tenure_2.0,tenure_3.0,tenure_4.0,tenure_5.0,tenure_6.0,tenure_7.0,tenure_8.0,tenure_9.0,tenure_10.0
7479,-0.886751,-0.373192,1.232271,-0.891560,0.642466,-1.055187,-0.187705,-0.572475,1.728977,0.907278,...,-0.324936,-0.318945,-0.322107,-0.308367,-0.323994,-0.313204,-0.312562,3.048212,-0.314485,-0.213256
3411,0.608663,-0.183385,0.600563,-0.891560,-1.556504,-1.055187,-0.333945,-0.572475,-0.578377,-1.102198,...,-0.324936,-0.318945,-0.322107,-0.308367,-0.323994,-0.313204,-0.312562,3.048212,-0.314485,-0.213256
6027,2.052152,0.480939,1.027098,0.830152,-1.556504,0.947699,1.503095,1.746802,-0.578377,0.907278,...,-0.324936,-0.318945,3.104561,-0.308367,-0.323994,-0.313204,-0.312562,-0.328061,-0.314485,-0.213256
1247,-1.457915,-1.417129,-1.233163,0.830152,0.642466,-1.055187,-1.071061,-0.572475,-0.578377,0.907278,...,-0.324936,-0.318945,-0.322107,-0.308367,-0.323994,3.192809,-0.312562,-0.328061,-0.314485,-0.213256
3716,0.130961,-1.132419,1.140475,-0.891560,-1.556504,-1.055187,1.524268,1.746802,-0.578377,-1.102198,...,-0.324936,3.135335,-0.322107,-0.308367,-0.323994,-0.313204,-0.312562,-0.328061,-0.314485,-0.213256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4478,-1.073677,-0.752805,-1.233163,0.830152,-1.556504,0.947699,-1.278361,-0.572475,-0.578377,0.907278,...,-0.324936,-0.318945,-0.322107,3.242887,-0.323994,-0.313204,-0.312562,-0.328061,-0.314485,-0.213256
4094,-1.447531,-0.942612,-1.233163,0.830152,-1.556504,0.947699,-1.281307,-0.572475,-0.578377,0.907278,...,-0.324936,-0.318945,-0.322107,-0.308367,-0.323994,-0.313204,-0.312562,-0.328061,-0.314485,4.689190
3492,0.027113,0.575842,-0.310229,-0.891560,-1.556504,0.947699,-0.903158,-0.572475,-0.578377,-1.102198,...,-0.324936,-0.318945,-0.322107,-0.308367,-0.323994,-0.313204,-0.312562,-0.328061,-0.314485,-0.213256
2177,0.151731,-1.417129,-1.233163,0.830152,-1.556504,0.947699,-1.128539,-0.572475,-0.578377,-1.102198,...,-0.324936,-0.318945,-0.322107,3.242887,-0.323994,-0.313204,-0.312562,-0.328061,-0.314485,-0.213256


## Исследование задачи

In [38]:
#посмотрим на баланс классов
y.value_counts(normalize=True)

0    0.7963
1    0.2037
Name: exited, dtype: float64

In [39]:
#пробуем модель логистической регрессии
model = LogisticRegression(random_state=12345,solver='liblinear')
model.fit(X_train,y_train)
predict_valid = model.predict(X_valid)
f1_score(y_valid,predict_valid)

0.32830820770519265

In [40]:
%%time
best_depth = 0
best_f1 = 0
for depth in range(2, 22, 2):
    for estimator in range(100, 1001, 100):
        model = RandomForestClassifier(random_state=12345, n_estimators=estimator)
        model.fit(X_train,y_train)
        predict_valid = model.predict(X_valid)
        f1 = f1_score(y_valid,predict_valid)
        if f1 > best_f1:
            best_f1 = f1
            print('estimator:', estimator, f1, depth)

estimator: 100 0.5718654434250765 2
estimator: 200 0.5801526717557252 2
estimator: 300 0.583963691376702 2
Wall time: 4min 44s


## Борьба с дисбалансом

In [41]:
#воспользуемся взвешивание признаков
model = LogisticRegression(random_state=12345, solver='liblinear', class_weight='balanced')
model.fit(X_train,y_train)
predict_valid = model.predict(X_valid)
f1_score(y_valid,predict_valid)

0.4883920894239037

In [42]:
#воспользуемся взвешивание признаков
best_est = 0
best_f1 = 0
for depth in range(2,22,2):
    for est in range(10,100,10):
        model = RandomForestClassifier(random_state =12345, n_estimators = est, class_weight='balanced')
        model.fit(X_train,y_train)
        predict_valid = model.predict(X_valid)
        f1_score(y_valid,predict_valid)
        if f1 > best_f1:
            best_f1 = f1
            best_est = est
            print('estimator:',best_est,best_f1)

estimator: 10 0.5809379727685325


In [43]:
#уравняем вес признаков увеличением редких
def upsample(X,y,repeat):
    X_zeros = X[y == 0]
    X_ones = X[y == 1]
    y_zeros = y[y == 0]
    y_ones = y[y == 1]
    X_upsampled = pd.concat([X_zeros] + [X_ones] * repeat)
    y_upsampled = pd.concat([y_zeros] + [y_ones] * repeat)
    X_upsampled,y_upsampled = shuffle(X_upsampled,y_upsampled,random_state=12345)
    return X_upsampled,y_upsampled 

X_upsampled,y_upsampled = upsample(X_train,y_train,4)

In [44]:
best_est = 0
best_f1 = 0
best_depth = 0
for depth in range(2,22,2):
    for est in range(10,100,10):
        model = RandomForestClassifier(random_state =12345, n_estimators = est)
        model.fit(X_upsampled,y_upsampled)
        predict_valid = model.predict(X_valid)
        f1 = f1_score(y_valid,predict_valid)
        if f1 > best_f1:
            best_f1 = f1
            best_est = est
            best_depth = depth
            print('estimator:',best_est,best_f1,best_depth)

estimator: 10 0.5783132530120483 2
estimator: 20 0.6105263157894737 2
estimator: 30 0.6307277628032345 2


In [45]:
model = LogisticRegression(random_state = 12345,solver='liblinear')
model.fit(X_upsampled,y_upsampled)
predict_valid = model.predict(X_valid)
f1_score(y_valid,predict_valid)


0.49007765314926655

In [46]:
# уровняем вес признаков уменьшение частых
def downsample(X,y,fraction):
    X_zeros = X[y == 0]
    X_ones = X[y == 1]
    y_zeros = y[y == 0]
    y_ones = y[y == 1]
    X_downsampled = pd.concat([X_zeros.sample(frac=fraction,random_state=12345)] + [X_ones])
    y_downsampled = pd.concat([y_zeros.sample(frac=fraction,random_state = 12345)] + [y_ones])
    return X_downsampled,y_downsampled
X_downsampled,y_downsampled = downsample(X_train,y_train,0.2)

In [47]:

model = LogisticRegression(random_state = 12345,solver='liblinear')
model.fit(X_downsampled,y_downsampled)
predict_valid = model.predict(X_valid)
f1_score(y_valid,predict_valid)

0.4804896710022954

In [48]:
best_est = 0
best_f1 = 0
for depth in range(2,22,2):
    for est in range(10,100,10):
        model = RandomForestClassifier(random_state =12345,n_estimators = est)
        model.fit(X_downsampled,y_downsampled)
        predict_valid = model.predict(X_valid)
        f1 = f1_score(y_valid,predict_valid)
        if f1 > best_f1:
            best_f1 = f1
            best_est = est
            print('estimator:',best_est,best_f1)

estimator: 10 0.5576748410535877
estimator: 20 0.5602775368603643
estimator: 50 0.5627632687447346
estimator: 70 0.5637583892617449


In [49]:
%%time
# Пробуем модель случайного леса
best_est = 0
best_f1 = 0
for depth in range(2, 22, 2):
    for est in range(100, 1001, 100):
        model = RandomForestClassifier(random_state=12345,n_estimators=est)
        model.fit(X_train,y_train)
        predict_valid = model.predict(X_valid)
        f1 = f1_score(y_valid,predict_valid)
        if f1 > best_f1:
            best_f1 = f1
            best_est = est
            print('estimator:',best_est,best_f1)

estimator: 100 0.5718654434250765
estimator: 200 0.5801526717557252
estimator: 300 0.583963691376702
Wall time: 4min 42s


In [50]:
#Проверим метрику roc кривой
best_auc_roc = 0
best_depth = 0
best_est = 0
for depth in range(2, 22, 2):
    for est in range(100, 1001, 100):
        model = RandomForestClassifier(random_state=12345,n_estimators=est)
        model.fit(X_train,y_train)
        predicted_proba = model.predict_proba(X_valid)
        predicted_proba_one = predicted_proba[:,1]
        auc_roc = roc_auc_score(y_valid,predicted_proba_one)
        if auc_roc > best_auc_roc:
            best_auc_roc = auc_roc
            best_depht = depth
            best_est = est
print(best_auc_roc,best_depht,best_est)

0.8451334994767692 2 400


## Тестирование модели

In [51]:
#Применим лучшую модель с лучшими гиперпараметрами на тестовую выборку
model = RandomForestClassifier(random_state=12345, n_estimators=200, max_depth=10, class_weight='balanced')
model.fit(X_train,y_train)
predict_test = model.predict(X_test)
print(f1_score(y_test,predict_test))

0.6123399301513389


In [52]:
#Проверим метрику roc кривой на тестовой выборке
model = RandomForestClassifier(random_state=12345, n_estimators=200, max_depth=10, class_weight='balanced')
model.fit(X_train,y_train)

predicted_proba = model.predict_proba(X_test)
predicted_proba_one = predicted_proba[:,1]
auc_roc = roc_auc_score(y_test,predicted_proba_one)
print(auc_roc)

0.8534114059822717
