In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

# Описание датасета (взято с kaggle)

id	(continous)	Уникальный идентификатор Клиента.

Age	(continous)	Возраст клиента.

Gender	(dichotomous)	Пол клиента.

Driving_License	(dichotomous)	0 для клиента, у которого нет водительских прав, 1 для клиента, у которого есть водительские права.

Region_Code	(nominal)	Уникальный код для региона клиента.

Previously_Insured	(dichotomous)	0 для клиента, не имеющего страховки на транспортное средство, 1 для клиента, имеющего страховку на транспортное средство.

Vehicle_Age	(nominal)	Возраст транспортного средства.

Vehicle_Damage	(dichotomous)	1: В прошлом у клиента был поврежден автомобиль. 0: В прошлом у клиента не было повреждений автомобиля.

Annual_Premium	(continous)	Сумма, которую клиент должен заплатить в качестве взноса в течение года.

Policy_Sales_Channel	(nominal)	Анонимизированный код для канала связи с клиентом, т.е. Различные агенты, по почте, телефону, лично и т.д.

Vintage	(continous)	Количество дней, в течение которых Клиент был связан с компанией.

Response (Dependent Feature)	(dichotomous)	1 означает, что Клиент заинтересован, 0 - что Клиент не заинтересован.

In [2]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,0,Male,21,1,35.0,0,1-2 Year,Yes,65101.0,124.0,187,0
1,1,Male,43,1,28.0,0,> 2 Years,Yes,58911.0,26.0,288,1
2,2,Female,25,1,14.0,1,< 1 Year,No,38043.0,152.0,254,0
3,3,Female,35,1,1.0,0,1-2 Year,Yes,2630.0,156.0,76,0
4,4,Female,36,1,15.0,1,1-2 Year,No,31951.0,152.0,294,0


In [3]:
df_train = df.iloc[:int(df.count()[0]*0.5)]

  df_train = df.iloc[:int(df.count()[0]*0.5)]


In [4]:
df_test = df.iloc[int(df.count()[0]*0.5)+1:]
df_test.head()

  df_test = df.iloc[int(df.count()[0]*0.5)+1:]


Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
5752400,5752400,Female,23,1,8.0,1,< 1 Year,No,38259.0,152.0,80,0
5752401,5752401,Female,34,1,28.0,0,1-2 Year,Yes,55855.0,124.0,278,0
5752402,5752402,Male,40,1,8.0,0,1-2 Year,No,31527.0,124.0,196,0
5752403,5752403,Female,45,1,28.0,0,> 2 Years,Yes,30294.0,124.0,90,0
5752404,5752404,Male,28,1,6.0,0,< 1 Year,Yes,53436.0,124.0,258,1


# Предобработка

In [5]:
df_train = df_train.drop('id', axis=1) # Удаление бесполезного признака id
df_test = df_test.drop('id', axis=1)
df_train.dtypes

Gender                   object
Age                       int64
Driving_License           int64
Region_Code             float64
Previously_Insured        int64
Vehicle_Age              object
Vehicle_Damage           object
Annual_Premium          float64
Policy_Sales_Channel    float64
Vintage                   int64
Response                  int64
dtype: object

### Заменим значения признака Vehicle_Age на порядковые, а признаков Vehicle_Damage, Driving_license, Previously_Insured - на булевые значения

In [6]:
v_age_mapping = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}

df_train['Vehicle_Age'] = df_train['Vehicle_Age'].map(v_age_mapping)
df_test['Vehicle_Age'] = df_test['Vehicle_Age'].map(v_age_mapping)


v_damage_mapping = {'Yes': True, 'No': False}

df_train['Vehicle_Damage'] = df_train['Vehicle_Damage'].map(v_damage_mapping)
df_test['Vehicle_Damage'] = df_test['Vehicle_Damage'].map(v_damage_mapping)


bool_mapping = {1: True, 0: False}

df_train['Driving_License'] = df_train['Driving_License'].map(bool_mapping)
df_train['Previously_Insured'] = df_train['Previously_Insured'].map(bool_mapping)

df_test['Driving_License'] = df_test['Driving_License'].map(bool_mapping)
df_test['Previously_Insured'] = df_test['Previously_Insured'].map(bool_mapping)

df_train.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,Male,21,True,35.0,False,1,True,65101.0,124.0,187,0
1,Male,43,True,28.0,False,2,True,58911.0,26.0,288,1
2,Female,25,True,14.0,True,0,False,38043.0,152.0,254,0
3,Female,35,True,1.0,False,1,True,2630.0,156.0,76,0
4,Female,36,True,15.0,True,1,False,31951.0,152.0,294,0


### Преобразуем признаки Policy_Sales_Channel и Region_Code в категориальные, ибо они несут именно такой смысл

In [7]:
df_train['Policy_Sales_Channel'] = df_train['Policy_Sales_Channel'].astype('object')
df_train['Region_Code'] = df_train['Region_Code'].astype('object')

df_test['Policy_Sales_Channel'] = df_test['Policy_Sales_Channel'].astype('object')
df_test['Region_Code'] = df_test['Region_Code'].astype('object')

df_train.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,Male,21,True,35.0,False,1,True,65101.0,124.0,187,0
1,Male,43,True,28.0,False,2,True,58911.0,26.0,288,1
2,Female,25,True,14.0,True,0,False,38043.0,152.0,254,0
3,Female,35,True,1.0,False,1,True,2630.0,156.0,76,0
4,Female,36,True,15.0,True,1,False,31951.0,152.0,294,0


### Нормировка

In [15]:
numerical_features = df_train.select_dtypes(exclude=['object', 'bool']).columns
numerical_features = np.delete(numerical_features, np.where(numerical_features == 'Response'))
numerical_features = np.delete(numerical_features, np.where(numerical_features == 'Vehicle_Age'))

In [17]:
scaler = StandardScaler()
df_train[numerical_features] = scaler.fit_transform(df_train[numerical_features])
df_test[numerical_features] = scaler.transform(df_test[numerical_features])

df_train.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,Male,-1.15932,True,35.0,False,0.698932,True,2.103491,124.0,0.288608,0
1,Male,0.307726,True,28.0,False,2.460103,True,1.727573,26.0,1.551348,1
2,Female,-0.892584,True,14.0,True,-1.062239,False,0.46026,152.0,1.126267,0
3,Female,-0.225745,True,1.0,False,0.698932,True,-1.69037,156.0,-1.099157,0
4,Female,-0.159061,True,15.0,True,0.698932,False,0.090293,152.0,1.626363,0


In [18]:
df_train = df_train.drop(['Policy_Sales_Channel', 'Region_Code'], axis=1)
df_test = df_test.drop(['Policy_Sales_Channel', 'Region_Code'], axis=1)

df_train = pd.get_dummies(df_train, drop_first=True)
df_test = pd.get_dummies(df_test, drop_first=True)
df_train.head()

Unnamed: 0,Age,Driving_License,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Vintage,Response,Gender_Male
0,-1.15932,True,False,0.698932,True,2.103491,0.288608,0,True
1,0.307726,True,False,2.460103,True,1.727573,1.551348,1,True
2,-0.892584,True,True,-1.062239,False,0.46026,1.126267,0,False
3,-0.225745,True,False,0.698932,True,-1.69037,-1.099157,0,False
4,-0.159061,True,True,0.698932,False,0.090293,1.626363,0,False


In [20]:

X_train = df_train.drop('Response', axis=1)
y_train = df_train['Response']

X_test = df_test.drop('Response', axis=1)
y_test = df_test['Response']



### Попробуем без достаточно сильно коррелированного признака Vehicle_age

In [None]:
X_train = X_train.drop('Vehicle_Age', axis=1)

In [22]:
X_test = X_test.drop('Vehicle_Age', axis=1)

In [23]:
tree_model = DecisionTreeClassifier(random_state=42, max_depth=80, min_samples_split=2)
tree_model.fit(X_train, y_train)
tree_pred = tree_model.predict(X_test)
f1_score(y_test, tree_pred)

0.40162630257011206

In [29]:
import pickle

with open('baseline.pkl', 'wb') as f:
    pickle.dump({'model': tree_model, 'scaler': scaler}, f)