Григорьев А. - Машинное обучение. Портфолио реальных проектов

In [57]:
import pandas as pd

from sklearn.model_selection import train_test_split

In [58]:
# Customer churn dataset, downloaded as CSV straight from Google Drive.
DATA_URL = r'https://drive.google.com/uc?export=download&id=10VoHCAEu1Jqi4NkKhZE__zgyMLe4xSDP'
data = pd.read_csv(DATA_URL)

# Preview the first five rows.
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [59]:
# Transposed preview of the first rows: one line per column of the dataset.
data.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


**Подготовка исходных данных**

In [60]:
# Inspect the dtype pandas inferred for every column.
data.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [61]:
# Parse TotalCharges as numbers; anything that cannot be parsed becomes NaN.
total_charges = pd.to_numeric(data['TotalCharges'], errors='coerce')

# Show the customers whose TotalCharges value failed to parse.
unparsed_rows = total_charges.isna()
data.loc[unparsed_rows, ['customerID', 'TotalCharges']]

Unnamed: 0,customerID,TotalCharges
488,4472-LVYGI,
753,3115-CZMZD,
936,5709-LVOEQ,
1082,4367-NUYAO,
1340,1371-DWPAZ,
3331,7644-OMVMY,
3826,3213-VVOLG,
4380,2520-SGTTA,
5218,2923-ARZLG,
6670,4075-WKNIU,


In [62]:
# Re-parse TotalCharges as numeric and store 0 wherever parsing produced NaN.
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce').fillna(0)

In [63]:
# Normalise column names and string values for convenience:
# lower-case everything and replace spaces with underscores.

def _to_snake_case(labels):
    """Lower-case string labels and replace spaces with underscores."""
    return labels.str.lower().str.replace(' ', '_')


data.columns = _to_snake_case(data.columns)

string_columns = [name for name, dtype in data.dtypes.items() if dtype == 'object']
for col in string_columns:
    data[col] = _to_snake_case(data[col])

In [64]:
# Turn the categorical churn column into a number: 1 for 'yes', 0 otherwise.

data['churn'] = data['churn'].eq('yes').astype(int)

In [65]:
# Hold out 20% of the rows as the test set, then carve a validation set
# (33% of the remaining rows) out of the full training set.
data_train_full, data_test = train_test_split(data, test_size=0.2, random_state=0)
data_train, data_val = train_test_split(data_train_full, test_size=0.33, random_state=0)

# Pull the target out of the train/validation frames: pop() returns the
# column and removes it from the frame in one step.
y_train = data_train.pop('churn').values
y_val = data_val.pop('churn').values

**Исследовательский анализ данных**

In [66]:
# Count the missing values in each column of the full training set.
data_train_full.isnull().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [67]:
# Check how the values of the target variable are distributed.

data_train_full['churn'].value_counts()

churn
0    4133
1    1501
Name: count, dtype: int64

In [68]:
# Share of churned customers in the full training set (the overall churn rate).

global_mean = data_train_full['churn'].mean()

print(global_mean.round(3))

0.266


In [69]:
# Collect the feature names into categorical and numerical lists.

object_columns = [name for name, dtype in data.dtypes.items() if dtype == 'object']
int_columns = [name for name, dtype in data.dtypes.items() if dtype == 'int64']

# Categorical: every object column except the customer id, plus the
# integer-coded seniorcitizen flag.
categorical = [name for name in object_columns if name != 'customerid']
categorical += ['seniorcitizen']

# Numerical: the int64 columns without seniorcitizen and the churn target,
# plus the two charge columns.
numerical = [name for name in int_columns if name not in ('seniorcitizen', 'churn')]
numerical += ['monthlycharges', 'totalcharges']

print(categorical)
print(numerical)

['gender', 'partner', 'dependents', 'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod', 'seniorcitizen']
['tenure', 'monthlycharges', 'totalcharges']


In [70]:
# Count the distinct values of each categorical variable.

data_train_full[categorical].nunique()

gender              2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
seniorcitizen       2
dtype: int64

In [71]:
# Churn rates for individual groups of customers.

def _churn_rate(column, value):
    """Return the mean of churn over rows of data_train_full where column equals value."""
    selected = data_train_full[column] == value
    return data_train_full.loc[selected, 'churn'].mean()


female_mean = _churn_rate('gender', 'female')
male_mean = _churn_rate('gender', 'male')

print(female_mean, male_mean, global_mean)

partner_yes = _churn_rate('partner', 'yes')
partner_no = _churn_rate('partner', 'no')

print(partner_yes, partner_no)

0.2729854455094072 0.2598509052183174 0.26641817536386225
0.19801980198019803 0.3305813553491572


In [72]:
# Risk ratio: a group's churn rate divided by the overall churn rate.

female_risk, male_risk, partner_yes_risk, partner_no_risk = (
    group_rate / global_mean
    for group_rate in (female_mean, male_mean, partner_yes, partner_no)
)

print(female_risk, male_risk, partner_yes_risk, partner_no_risk)

1.0246502331778815 0.9753497668221187 0.7432668649942943 1.240836346460461


In [73]:
# Churn rate, difference from the overall rate, and risk ratio for every
# value of every categorical variable.
for col in categorical:
    churn_rate = data_train_full.groupby(col)['churn'].mean()
    data_group = churn_rate.to_frame(name='mean').assign(
        diff=churn_rate - global_mean,
        risk=churn_rate / global_mean,
    )
    display(data_group)

Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.272985,0.006567,1.02465
male,0.259851,-0.006567,0.97535


Unnamed: 0_level_0,mean,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.330581,0.064163,1.240836
yes,0.19802,-0.068398,0.743267


Unnamed: 0_level_0,mean,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.315016,0.048598,1.182414
yes,0.152522,-0.113896,0.572492


Unnamed: 0_level_0,mean,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.241697,-0.024721,0.907211
yes,0.269049,0.002631,1.009877


Unnamed: 0_level_0,mean,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.251103,-0.015315,0.942514
no_phone_service,0.241697,-0.024721,0.907211
yes,0.289629,0.023211,1.087122


Unnamed: 0_level_0,mean,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dsl,0.187435,-0.078983,0.703537
fiber_optic,0.419952,0.153534,1.576288
no,0.07856,-0.187858,0.294874


Unnamed: 0_level_0,mean,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.417114,0.150696,1.565637
no_internet_service,0.07856,-0.187858,0.294874
yes,0.14824,-0.118179,0.556417


Unnamed: 0_level_0,mean,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.409183,0.142765,1.535868
no_internet_service,0.07856,-0.187858,0.294874
yes,0.203998,-0.06242,0.765706


Unnamed: 0_level_0,mean,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.392799,0.126381,1.474371
no_internet_service,0.07856,-0.187858,0.294874
yes,0.223711,-0.042707,0.8397


Unnamed: 0_level_0,mean,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.416488,0.15007,1.563288
no_internet_service,0.07856,-0.187858,0.294874
yes,0.147826,-0.118592,0.554865


Unnamed: 0_level_0,mean,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.334531,0.068113,1.25566
no_internet_service,0.07856,-0.187858,0.294874
yes,0.302059,0.035641,1.13378


Unnamed: 0_level_0,mean,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.342638,0.07622,1.286093
no_internet_service,0.07856,-0.187858,0.294874
yes,0.29393,0.027512,1.103264


Unnamed: 0_level_0,mean,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
month-to-month,0.430774,0.164356,1.61691
one_year,0.102954,-0.163465,0.386436
two_year,0.028443,-0.237975,0.106761


Unnamed: 0_level_0,mean,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.163884,-0.102534,0.615138
yes,0.335914,0.069496,1.260853


Unnamed: 0_level_0,mean,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bank_transfer_(automatic),0.162903,-0.103515,0.611457
credit_card_(automatic),0.144054,-0.122365,0.540705
electronic_check,0.459916,0.193497,1.726292
mailed_check,0.195552,-0.070866,0.734005


Unnamed: 0_level_0,mean,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.237026,-0.029392,0.889677
1,0.418401,0.151983,1.570467


In [74]:
# вычисление взаимной информации

from sklearn.metrics import mutual_info_score

def calculate_mi(series):
    """Return the mutual information score between ``series`` and the churn column.

    The churn labels are read from the module-level ``data_train_full`` frame.
    """
    churn_labels = data_train_full['churn']
    return mutual_info_score(series, churn_labels)

# Score every categorical variable against churn and list them from the
# highest mutual information to the lowest.
data_mi = (
    data_train_full[categorical]
    .apply(calculate_mi)
    .sort_values(ascending=False)
    .to_frame(name='MI')
)

data_mi

Unnamed: 0,MI
contract,0.101598
onlinesecurity,0.062598
techsupport,0.062494
internetservice,0.054856
onlinebackup,0.049249
paymentmethod,0.047916
deviceprotection,0.042962
streamingmovies,0.031053
streamingtv,0.030457
paperlessbilling,0.019033


In [75]:
# Correlation coefficient between each numerical variable and churn.

data_train_full[numerical].corrwith(data_train_full['churn'])

tenure           -0.362305
monthlycharges    0.190375
totalcharges     -0.208863
dtype: float64

**Конструирование признаков**

In [76]:
# Convert the training frame into a list of dicts, one dict per row.

feature_columns = categorical + numerical
train_dict = data_train[feature_columns].to_dict(orient='records')

train_dict

[{'gender': 'female',
  'partner': 'no',
  'dependents': 'yes',
  'phoneservice': 'yes',
  'multiplelines': 'no',
  'internetservice': 'dsl',
  'onlinesecurity': 'no',
  'onlinebackup': 'no',
  'deviceprotection': 'no',
  'techsupport': 'no',
  'streamingtv': 'no',
  'streamingmovies': 'no',
  'contract': 'month-to-month',
  'paperlessbilling': 'yes',
  'paymentmethod': 'electronic_check',
  'seniorcitizen': 0,
  'tenure': 12,
  'monthlycharges': 44.55,
  'totalcharges': 480.6},
 {'gender': 'male',
  'partner': 'no',
  'dependents': 'no',
  'phoneservice': 'yes',
  'multiplelines': 'no',
  'internetservice': 'fiber_optic',
  'onlinesecurity': 'no',
  'onlinebackup': 'yes',
  'deviceprotection': 'no',
  'techsupport': 'yes',
  'streamingtv': 'yes',
  'streamingmovies': 'yes',
  'contract': 'two_year',
  'paperlessbilling': 'yes',
  'paymentmethod': 'credit_card_(automatic)',
  'seniorcitizen': 0,
  'tenure': 41,
  'monthlycharges': 99.65,
  'totalcharges': 4220.35},
 {'gender': 'male',


In [None]:
# One-hot encode the list of dicts into the feature matrix for the model.

from sklearn.feature_extraction import DictVectorizer

# sparse=False makes the vectorizer return a dense NumPy array.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)

X_train[0]

array([  1.  ,   0.  ,   0.  ,   0.  ,   1.  ,   1.  ,   0.  ,   0.  ,
         1.  ,   0.  ,   1.  ,   0.  ,   0.  ,  44.55,   1.  ,   0.  ,
         0.  ,   1.  ,   0.  ,   0.  ,   1.  ,   0.  ,   0.  ,   0.  ,
         1.  ,   1.  ,   0.  ,   0.  ,   0.  ,   1.  ,   0.  ,   0.  ,
         1.  ,   0.  ,   1.  ,   0.  ,   0.  ,   1.  ,   0.  ,   0.  ,
         1.  ,   0.  ,   0.  ,  12.  , 480.6 ])

In [78]:
# Names of the encoded features produced by the fitted vectorizer.
dv.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'dependents=no', 'dependents=yes',
       'deviceprotection=no', 'deviceprotection=no_internet_service',
       'deviceprotection=yes', 'gender=female', 'gender=male',
       'internetservice=dsl', 'internetservice=fiber_optic',
       'internetservice=no', 'monthlycharges', 'multiplelines=no',
       'multiplelines=no_phone_service', 'multiplelines=yes',
       'onlinebackup=no', 'onlinebackup=no_internet_service',
       'onlinebackup=yes', 'onlinesecurity=no',
       'onlinesecurity=no_internet_service', 'onlinesecurity=yes',
       'paperlessbilling=no', 'paperlessbilling=yes', 'partner=no',
       'partner=yes', 'paymentmethod=bank_transfer_(automatic)',
       'paymentmethod=credit_card_(automatic)',
       'paymentmethod=electronic_check', 'paymentmethod=mailed_check',
       'phoneservice=no', 'phoneservice=yes', 'seniorcitizen',
       'streamingmovies=no', 'streamingmovies=no_internet_service',

**Создание модели машинного обучения для классификации**

In [79]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the liblinear solver; random_state is fixed at 0.
model = LogisticRegression(solver='liblinear', random_state=0)
# Fit on the encoded training matrix and its churn labels.
model.fit(X_train, y_train)

In [80]:
# Encode the validation rows with the vectorizer already fitted on the
# training data.

val_records = data_val[categorical + numerical]
val_dict = val_records.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [81]:
# Keep column 1 of predict_proba: the predicted probability of churn = 1.
y_pred = model.predict_proba(X_val)[:, 1]
y_pred

array([0.36122879, 0.01677782, 0.19925009, ..., 0.14466862, 0.00572386,
       0.62995606], shape=(1860,))

In [82]:
# Hard decision: predict churn when the probability is at least 0.5.
churn = y_pred>=0.5
churn

array([False, False, False, ..., False, False,  True], shape=(1860,))

In [83]:
# Model accuracy: the share of validation rows where the prediction matches the label.

(y_val == churn).mean()

np.float64(0.8021505376344086)

In [85]:
# Accuracy on the held-out test set.
# NOTE(review): the model used here was fitted on data_train only, not on
# data_train_full — confirm this is intended.
y_test = data_test['churn'].values

test_dict = data_test[categorical + numerical].to_dict(orient='records')
X_test = dv.transform(test_dict)
y_pred_test = model.predict_proba(X_test)[:, 1]

# Same 0.5 decision threshold as for the validation set.
churn = y_pred_test >= 0.5

(y_test == churn).mean()

np.float64(0.7991483321504613)