1. Построить обобщенную линейную модель (GLM) для прогнозирования наступления страховых случаев на рассмотренных в ноутбуке данных. 
2. Подобрать необходимое распределение и тип связи, при необходимости ознакомиться с документацией H2O. 
3. Придумать и использовать дополнительные факторы при построении модели (например, пересечения признаков или функции от них и т.д.). 
4. Оценить результаты построенной модели при помощи различных метрик (можно использовать и другие метрики помимо представленных в ноутбуке).
5. Проанализировать вероятные проблемы. 
6. Предложить способы их решения и/или попробовать их решить, улучшив результат.

In [1]:
# !pip install h2o
import numpy as np
import pandas as pd
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

In [2]:
# Load the policy file, keep only rows whose Dataset is 5-9, then drop the
# Dataset column, all-empty columns and duplicate rows, and renumber the index.
df = (
    pd.read_csv('~/Yandex.Disk/geekbrains/Python/MLInBusines/lesson_6/freMPL-R.csv', low_memory=False)
    .loc[lambda frame: frame.Dataset.isin([5, 6, 7, 8, 9])]
    .drop(columns='Dataset')
    .dropna(axis=1, how='all')
    .drop_duplicates()
    .reset_index(drop=True)
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115155 entries, 0 to 115154
Data columns (total 20 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Exposure           115155 non-null  float64
 1   LicAge             115155 non-null  int64  
 2   RecordBeg          115155 non-null  object 
 3   RecordEnd          59455 non-null   object 
 4   Gender             115155 non-null  object 
 5   MariStat           115155 non-null  object 
 6   SocioCateg         115155 non-null  object 
 7   VehUsage           115155 non-null  object 
 8   DrivAge            115155 non-null  int64  
 9   HasKmLimit         115155 non-null  int64  
 10  BonusMalus         115155 non-null  int64  
 11  ClaimAmount        115155 non-null  float64
 12  ClaimInd           115155 non-null  int64  
 13  ClaimNbResp        115155 non-null  float64
 14  ClaimNbNonResp     115155 non-null  float64
 15  ClaimNbParking     115155 non-null  float64
 16  Cl

In [3]:
# Pull out the rows with a negative claim amount and check which ClaimInd
# values they carry.
NegClaimAmount = df.loc[df['ClaimAmount'].lt(0), ['ClaimAmount', 'ClaimInd']]
print('Unique values of ClaimInd:', NegClaimAmount['ClaimInd'].unique())
NegClaimAmount.head()

Unique values of ClaimInd: [0]


Unnamed: 0,ClaimAmount,ClaimInd
82,-74.206042,0
175,-1222.585196,0
177,-316.288822,0
363,-666.75861,0
375,-1201.600604,0


In [4]:
# Floor negative claim amounts at zero; non-negative amounts are left as they are.
df['ClaimAmount'] = df['ClaimAmount'].clip(lower=0)

In [5]:
def SeriesFactorizer(series):
    """Encode a series as integer codes and report the code-to-value mapping.

    Parameters
    ----------
    series : array-like
        Values to encode; passed straight to ``pd.factorize``.

    Returns
    -------
    tuple
        ``(codes, reference)`` where ``codes`` is the array of integer codes
        returned by ``pd.factorize`` and ``reference`` is a dict mapping each
        code to the original value it stands for. The mapping is also printed.
    """
    codes, uniques = pd.factorize(series)
    # Position in `uniques` is the code assigned to that value.
    reference = dict(enumerate(uniques))
    print(reference)
    return codes, reference

In [6]:
# Replace the two binary text columns with integer codes and keep the
# code-to-label dictionaries for later reference.
df['Gender'], GenderRef = SeriesFactorizer(df['Gender'])
df['MariStat'], MariStatRef = SeriesFactorizer(df['MariStat'])

{0: 'Male', 1: 'Female'}
{0: 'Other', 1: 'Alone'}


In [7]:
# Distinct vehicle-usage categories present in the data.
df['VehUsage'].unique().tolist()

['Professional', 'Private+trip to office', 'Private', 'Professional run']

In [8]:
# Preview of a one-hot encoding of VehUsage (one column per category, none dropped).
# NOTE(review): VU_dummies is not referenced again in the visible cells; the
# frame itself is encoded by a later pd.get_dummies(df, columns=[...]) call.
VU_dummies = pd.get_dummies(df.VehUsage, prefix='VehUsg', drop_first=False)
VU_dummies.head()

Unnamed: 0,VehUsg_Private,VehUsg_Private+trip to office,VehUsg_Professional,VehUsg_Professional run
0,0,0,1,0
1,0,0,1,0
2,0,1,0,0
3,0,1,0,0
4,1,0,0,0


In [9]:
# Keep only the first four characters of each SocioCateg code.
df['SocioCateg'] = df['SocioCateg'].str[:4]

In [10]:
# Frequency of each SocioCateg group, rarest first.
# The column is named explicitly via to_frame() instead of renaming the
# value_counts() result: the name of that result differs between pandas
# versions, so a rename keyed on 'SocioCateg' can silently do nothing.
df['SocioCateg'].value_counts().sort_values().to_frame('Frequency')

Unnamed: 0,Frequency
CSP7,14
CSP3,1210
CSP1,2740
CSP2,3254
CSP4,7648
CSP6,24833
CSP5,75456


In [11]:
# One-hot encode VehUsage and SocioCateg; the result is bound back to df.
# NOTE(review): re-running this cell on the already-encoded frame will presumably
# fail because the source columns no longer exist — run it once per fresh kernel.
df = pd.get_dummies(df, columns=['VehUsage','SocioCateg'])

In [12]:
# Keep only non-object columns. Per the earlier df.info() output the object
# columns still left at this point are RecordBeg and RecordEnd, so they are dropped here.
df = df.select_dtypes(exclude=['object'])

In [13]:
# Squared driver age as an additional feature (vectorised power on the column).
df['DrivAgeSq'] = df['DrivAge'] ** 2
df.head()

Unnamed: 0,Exposure,LicAge,Gender,MariStat,DrivAge,HasKmLimit,BonusMalus,ClaimAmount,ClaimInd,ClaimNbResp,...,VehUsage_Professional,VehUsage_Professional run,SocioCateg_CSP1,SocioCateg_CSP2,SocioCateg_CSP3,SocioCateg_CSP4,SocioCateg_CSP5,SocioCateg_CSP6,SocioCateg_CSP7,DrivAgeSq
0,0.083,332,0,0,46,0,50,0.0,0,0.0,...,1,0,0,0,0,0,1,0,0,2116
1,0.916,333,0,0,46,0,50,0.0,0,0.0,...,1,0,0,0,0,0,1,0,0,2116
2,0.55,173,0,0,32,0,68,0.0,0,0.0,...,0,0,0,0,0,0,1,0,0,1024
3,0.089,364,1,0,52,0,50,0.0,0,0.0,...,0,0,0,0,0,0,1,0,0,2704
4,0.233,426,0,0,57,0,50,0.0,0,0.0,...,0,0,0,0,0,0,0,1,0,3249


In [14]:
# Per-type claim counters that get folded into a single ClaimsCount column.
claim_nb_columns = ["ClaimNbResp", "ClaimNbNonResp", "ClaimNbParking", "ClaimNbFireTheft", "ClaimNbWindscreen"]
# ClaimsCount = ClaimInd + sum of the per-type counters (NaN propagates, as with `+`).
# NOTE(review): ClaimInd is added on top of the per-type counts — confirm this is intended.
df['ClaimsCount'] = df[['ClaimInd'] + claim_nb_columns].sum(axis=1, skipna=False)
# Rows with a zero claim amount are treated as having no claims.
df.loc[df['ClaimAmount'].eq(0), 'ClaimsCount'] = 0
# The individual counters are no longer needed once aggregated.
df.drop(columns=claim_nb_columns, inplace=True)

In [15]:
# Number of policy rows per ClaimsCount value.
# The column is named explicitly via to_frame() instead of renaming the
# value_counts() result: the name of that result differs between pandas
# versions, so a rename keyed on 'ClaimsCount' can silently do nothing.
df['ClaimsCount'].value_counts().to_frame('Policies')

Unnamed: 0,Policies
0.0,104286
2.0,3529
1.0,3339
3.0,2310
4.0,1101
5.0,428
6.0,127
7.0,26
8.0,6
9.0,2


In [16]:
# Column/dtype overview of the fully prepared frame before splitting.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115155 entries, 0 to 115154
Data columns (total 24 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   Exposure                         115155 non-null  float64
 1   LicAge                           115155 non-null  int64  
 2   Gender                           115155 non-null  int64  
 3   MariStat                         115155 non-null  int64  
 4   DrivAge                          115155 non-null  int64  
 5   HasKmLimit                       115155 non-null  int64  
 6   BonusMalus                       115155 non-null  int64  
 7   ClaimAmount                      115155 non-null  float64
 8   ClaimInd                         115155 non-null  int64  
 9   OutUseNb                         115155 non-null  float64
 10  RiskArea                         115155 non-null  float64
 11  VehUsage_Private                 115155 non-null  uint8  
 12  Ve

In [17]:
# Split the dataset into train / validation / test (70% / 15% / 15%).
# ClaimInd is the target; ClaimAmount and ClaimsCount are excluded from the features.
target_ind = df.ClaimInd
features_ind = df.drop(columns=['ClaimInd', 'ClaimAmount', 'ClaimsCount'])
x_train_ind, x_holdout_ind, y_train_ind, y_holdout_ind = train_test_split(
    features_ind, target_ind, test_size=0.3, random_state=1
)
# The 30% hold-out is halved into validation and test parts.
x_valid_ind, x_test_ind, y_valid_ind, y_test_ind = train_test_split(
    x_holdout_ind, y_holdout_ind, test_size=0.5, random_state=1
)

In [18]:
# Check the size and columns of the training feature matrix.
x_train_ind.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 80608 entries, 54963 to 98539
Data columns (total 21 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Exposure                         80608 non-null  float64
 1   LicAge                           80608 non-null  int64  
 2   Gender                           80608 non-null  int64  
 3   MariStat                         80608 non-null  int64  
 4   DrivAge                          80608 non-null  int64  
 5   HasKmLimit                       80608 non-null  int64  
 6   BonusMalus                       80608 non-null  int64  
 7   OutUseNb                         80608 non-null  float64
 8   RiskArea                         80608 non-null  float64
 9   VehUsage_Private                 80608 non-null  uint8  
 10  VehUsage_Private+trip to office  80608 non-null  uint8  
 11  VehUsage_Professional            80608 non-null  uint8  
 12  VehUsage_Profe

In [19]:
# Connect to a local H2O instance; in the recorded run none was found, so a
# local server was started.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.7" 2020-04-14; OpenJDK Runtime Environment (build 11.0.7+10-post-Ubuntu-2ubuntu219.10); OpenJDK 64-Bit Server VM (build 11.0.7+10-post-Ubuntu-2ubuntu219.10, mixed mode, sharing)
  Starting server from /home/aleksandr/anaconda3/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp41ypz1x6
  JVM stdout: /tmp/tmp41ypz1x6/h2o_aleksandr_started_from_python.out
  JVM stderr: /tmp/tmp41ypz1x6/h2o_aleksandr_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,05 secs
H2O_cluster_timezone:,Asia/Aqtobe
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.3
H2O_cluster_version_age:,"14 days, 15 hours and 52 minutes"
H2O_cluster_name:,H2O_from_python_aleksandr_e9rtzk
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.902 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


In [20]:
# Convert each split to an H2OFrame, with the target appended as the last column.
h2o_train_ind, h2o_valid_ind, h2o_test_ind = (
    h2o.H2OFrame(pd.concat([features, target], axis=1))
    for features, target in (
        (x_train_ind, y_train_ind),
        (x_valid_ind, y_valid_ind),
        (x_test_ind, y_test_ind),
    )
)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [21]:
# Convert the ClaimInd target to a categorical (factor) column in every H2O frame.
for h2o_frame in (h2o_train_ind, h2o_valid_ind, h2o_test_ind):
    h2o_frame['ClaimInd'] = h2o_frame['ClaimInd'].asfactor()

In [28]:
# Initialise the GLM with 5-fold cross-validation.
# NOTE: despite the variable name, this is a binomial (logistic) model for the
# binary ClaimInd target, not a Poisson model.
# A fixed seed is passed so the cross-validation results are reproducible
# between runs.

glm_poisson = H2OGeneralizedLinearEstimator(family="binomial", nfolds=5, seed=1)

In [29]:
# Pick predictors by name rather than by position, so the selection does not
# depend on Exposure being the first column and ClaimInd the last one.
# Exposure is excluded because it is passed as the weights column; ClaimInd is the target.
glm_predictors = [name for name in h2o_train_ind.names if name not in ("Exposure", "ClaimInd")]
glm_poisson.train(y="ClaimInd", x=glm_predictors, training_frame=h2o_train_ind, validation_frame=h2o_valid_ind, weights_column="Exposure")

glm Model Build progress: |███████████████████████████████████████████████| 100%


In [30]:
# Model parameters: distribution family, link function, regularization hyperparameters, number of explanatory variables used

glm_poisson.summary()


GLM Model: summary


Unnamed: 0,Unnamed: 1,family,link,regularization,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
0,,binomial,logit,"Elastic Net (alpha = 0.5, lambda = 3.295E-5 )",20,19,3,py_1_sid_9b3d




In [31]:
# Model quality metrics: mean/sd across folds and the value on each cross-validation fold

glm_poisson.cross_validation_metrics_summary()


Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,accuracy,0.50177556,0.09978487,0.547093,0.37196717,0.5541791,0.6117961,0.4238426
1,auc,0.56742203,0.006320417,0.57394934,0.5612734,0.561128,0.5737166,0.56704295
2,aucpr,0.15526523,0.0065451325,0.16566706,0.1485934,0.15686117,0.15162405,0.15358043
3,err,0.4982244,0.09978487,0.452907,0.6280328,0.44582093,0.3882039,0.5761574
4,err_count,3556.6296,737.19183,3217.152,4488.182,3149.431,2754.307,4174.076
5,f0point5,0.17747174,0.008400764,0.18626326,0.1655264,0.18288156,0.18024877,0.17243868
6,f1,0.24071655,0.005899519,0.24895051,0.23443726,0.24349752,0.23579505,0.24090236
7,f2,0.37629128,0.025434632,0.3752374,0.40164915,0.364217,0.34082532,0.39952758
8,lift_top_group,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,logloss,0.3800826,0.0073391143,0.38699064,0.3755703,0.38728178,0.37030974,0.38026065



See the whole table with table.as_data_frame()




In [32]:
# Model coefficient table (depending on the model, standard error, z-score and p-value may also be shown)
# NOTE(review): this reads the private _model_json attribute, which may change between H2O versions.

glm_poisson._model_json['output']['coefficients_table'].as_data_frame()

Unnamed: 0,names,coefficients,standardized_coefficients
0,Intercept,-2.174604,-1.939603
1,LicAge,-0.000189,-0.030147
2,Gender,0.015184,0.007353
3,MariStat,-0.052343,-0.018717
4,DrivAge,-0.003914,-0.058577
5,HasKmLimit,-0.425867,-0.132326
6,BonusMalus,0.00663,0.099958
7,OutUseNb,0.068515,0.045966
8,RiskArea,0.014767,0.03267
9,VehUsage_Private,-0.157718,-0.0748


In [33]:
# Standardized coefficients of the overall model and of each cross-validation model.
# Columns: 'overall' first, then one column per fold keyed by its index.

pmodels = {'overall': glm_poisson.coef_norm()}
pmodels.update(
    (fold, cv_model.coef_norm())
    for fold, cv_model in enumerate(glm_poisson.cross_validation_models())
)
pd.DataFrame.from_dict(pmodels).round(5)

Unnamed: 0,overall,0,1,2,3,4
Intercept,-1.9396,-1.94791,-1.93516,-1.94835,-1.92829,-1.9397
LicAge,-0.03015,-0.01102,-0.00625,-0.07874,-0.01811,-0.02738
Gender,0.00735,0.00779,0.01126,-0.001,0.00628,0.01338
MariStat,-0.01872,-0.02403,-0.02392,-0.01287,-0.023,-0.01126
DrivAge,-0.05858,-0.06333,-0.16164,-0.03795,-0.05693,-0.04986
HasKmLimit,-0.13233,-0.13631,-0.13927,-0.12993,-0.12779,-0.1287
BonusMalus,0.09996,0.09541,0.10134,0.09745,0.09702,0.1059
OutUseNb,0.04597,0.05193,0.04792,0.03984,0.04573,0.04478
RiskArea,0.03267,0.0428,0.03505,0.01544,0.025,0.04542
VehUsage_Private,-0.0748,-0.06266,-0.05594,-0.07796,-0.07135,-0.07941


In [34]:
# Score the training, validation and test frames and bring the predictions into pandas.

c_train_pred, c_valid_pred, c_test_pred = (
    glm_poisson.predict(h2o_frame).as_data_frame()
    for h2o_frame in (h2o_train_ind, h2o_valid_ind, h2o_test_ind)
)

glm prediction progress: |████████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%


In [35]:
# Compute the classification metrics imported above (accuracy, F1, confusion
# matrix) for the training and test samples; the validation sample is not
# evaluated in this cell.
true_train, true_test = (
    h2o_frame['ClaimInd'].as_data_frame()['ClaimInd'].values
    for h2o_frame in (h2o_train_ind, h2o_test_ind)
)
pred_train, pred_test = c_train_pred['predict'].values, c_test_pred['predict'].values

# Same three metrics, in the same order, for each sample.
_metric_fns = (accuracy_score, f1_score, confusion_matrix)
acu_train, f1_train, matr_train = (fn(true_train, pred_train) for fn in _metric_fns)
acu_test, f1_test, matr_test = (fn(true_test, pred_test) for fn in _metric_fns)

In [36]:
# Report accuracy, F1 and the confusion matrix, first for the training sample
# and then for the test sample.
print(f'Accuracy score train: {acu_train}')
print(f'F_score train: {f1_train}')
print(f'confusion_matrix train: \n{matr_train}')
print(f'Accuracy score test: {acu_test}')
print(f'F_score test: {f1_test}')
print(f'confusion_matrix test: \n{matr_test}')

Accuracy score train: 0.5353066693132196
F_score train: 0.18459663024075926
confusion_matrix train: 
[[38910 34063]
 [ 3395  4240]]
Accuracy score test: 0.536934120643742
F_score test: 0.18568665377176016
confusion_matrix test: 
[[8363 7286]
 [ 713  912]]


In [37]:
# Save the trained model under the "model_lesson_7" directory, overwriting an
# existing file (force=True); the return value of save_model is kept in model_glm_poisson.
model_glm_poisson = h2o.save_model(model=glm_poisson, path="model_lesson_7", force=True)

In [38]:
# Class balance of the target: number of rows per ClaimInd value.
df['ClaimInd'].value_counts()

0    104286
1     10869
Name: ClaimInd, dtype: int64

Результаты работы модели неудовлетворительные; возможная причина — несбалансированность классов. Я не успел изучить влияние балансировки классов на работу модели. Также для улучшения качества модели можно поэкспериментировать с генерацией новых признаков.