## Постановка задачи
Задача страхового скоринга: https://www.kaggle.com/c/prudential-life-insurance-assessment

Требуется провести классификацию клиентов по уровню благонадежности для страхования жизни (всего 8 градаций) - Response. Для оценки доступно несколько параметров: виды страховки (Product_Info), возраст (Ins_Age), рост (Ht), вес (Wt), индекс массы тела (BMI), данные о работе (Employment_Info), данные страховки (InsuredInfo), история страхования (Insurance_History), семья (Family_Hist), медицинские данные (Medical_History) и медицинские термины (Medical_Keyword) - всего 126 переменных.

## Описание задания

Загрузите данные, приведите их к числовым, заполните пропуски, нормализуйте данные и оптимизируйте память.

Сформируйте параллельный ансамбль из CatBoost, градиентного бустинга, опорных векторов и LightGBM. Используйте лучшие гиперпараметры, подобранные ранее, или найдите их через перекрестную проверку. Итоговое решение рассчитайте на основании самого точного предсказания класса у определенной модели ансамбля: выберите для каждого класса модель, которая предсказывает его лучше всего.

Проведите расчеты и выгрузите результат в виде submission.csv

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.svm import SVC
import lightgbm as lgb
from catboost import Pool, CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
# Загрузка данных
data = pd.read_csv('./data/train.csv.gz')
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59381 entries, 0 to 59380
Columns: 128 entries, Id to Response
dtypes: float64(18), int64(109), object(1)
memory usage: 58.0+ MB
None


In [3]:
# Предобработка данных
def data_preprocess (df):
    df["Product_Info_2_1"] = df["Product_Info_2"].str.slice(0, 1)
    df["Product_Info_2_2"] = pd.to_numeric(df["Product_Info_2"].str.slice(1, 2))
    df.drop(labels=["Product_Info_2"], axis=1, inplace=True)
    for l in df["Product_Info_2_1"].unique():
        df["Product_Info_2_1" + l] = df["Product_Info_2_1"].isin([l]).astype("int8")
    df.drop(labels=["Product_Info_2_1"], axis=1, inplace=True)
    df.fillna(value=-1, inplace=True)
    data["Response"] = data["Response"] - 1
    return df

In [4]:
data = data_preprocess(data)

In [5]:
# Набор столбцов для рассчета
columns_groups = ["Insurance_History", "InsurеdInfo", "Medical_Keyword",
                  "Family_Hist", "Medical_History", "Product_Info"]
columns = ["Wt", "Ht", "Ins_Age", "BMI"]
for cg in columns_groups:
    columns.extend(data.columns[data.columns.str.startswith(cg)])
print ('Columns:', len(columns))

Columns: 118


In [6]:
# Нормализация данных
scaler = preprocessing.StandardScaler()
data_transformed = pd.DataFrame(scaler.fit_transform(pd.DataFrame(data, columns=columns)))
columns_transformed = data_transformed.columns
data_transformed["Response"] = data["Response"]

In [7]:
# Оптимизация памяти
def reduce_mem_usage (df):
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type)[:5] == "float":
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.finfo("f2").min and c_max < np.finfo("f2").max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo("f4").min and c_max < np.finfo("f4").max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        elif str(col_type)[:3] == "int":
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.iinfo("i1").min and c_max < np.iinfo("i1").max:
                df[col] = df[col].astype(np.int8)
            elif c_min > np.iinfo("i2").min and c_max < np.iinfo("i2").max:
                df[col] = df[col].astype(np.int16)
            elif c_min > np.iinfo("i4").min and c_max < np.iinfo("i4").max:
                df[col] = df[col].astype(np.int32)
            elif c_min > np.iinfo("i8").min and c_max < np.iinfo("i8").max:
                df[col] = df[col].astype(np.int64)
        else:
            df[col] = df[col].astype("category")
    end_mem = df.memory_usage().sum() / 1024**2
    print('Потребление памяти меньше на', round(start_mem - end_mem, 2), 'Мб (минус', round(100 * (start_mem - end_mem) / start_mem, 1), '%)')
    return df

In [8]:
data_transformed = reduce_mem_usage(data_transformed)
print(data_transformed.info())

Потребление памяти меньше на 40.49 Мб (минус 75.1 %)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59381 entries, 0 to 59380
Columns: 119 entries, 0 to Response
dtypes: float16(118), int8(1)
memory usage: 13.4 MB
None


## Рассчет моделей

In [9]:
data_train, data_test = train_test_split(data_transformed, test_size=0.2)
x = pd.DataFrame(data_train, columns=columns_transformed)
y = data_train['Response']

## CatBoost

In [10]:
train_dataset = Pool(data=x, label=y)

model_cb = CatBoostClassifier(iterations=100,
        learning_rate=0.58,
        depth=7,
        l2_leaf_reg=1,
        random_seed=17, loss_function="MultiClass",
        bootstrap_type="MVS", custom_metric="WKappa")

model_cb.fit(train_dataset)

0:	learn: 1.5247446	total: 184ms	remaining: 18.2s
1:	learn: 1.3773391	total: 282ms	remaining: 13.8s
2:	learn: 1.3350589	total: 372ms	remaining: 12s
3:	learn: 1.3035095	total: 473ms	remaining: 11.4s
4:	learn: 1.2824761	total: 570ms	remaining: 10.8s
5:	learn: 1.2648602	total: 669ms	remaining: 10.5s
6:	learn: 1.2517940	total: 767ms	remaining: 10.2s
7:	learn: 1.2413652	total: 859ms	remaining: 9.88s
8:	learn: 1.2338946	total: 950ms	remaining: 9.61s
9:	learn: 1.2270294	total: 1.05s	remaining: 9.44s
10:	learn: 1.2210753	total: 1.14s	remaining: 9.26s
11:	learn: 1.2142604	total: 1.23s	remaining: 9.05s
12:	learn: 1.2051687	total: 1.35s	remaining: 9.04s
13:	learn: 1.2002528	total: 1.48s	remaining: 9.06s
14:	learn: 1.1934799	total: 1.6s	remaining: 9.07s
15:	learn: 1.1752871	total: 1.72s	remaining: 9.04s
16:	learn: 1.1704791	total: 1.84s	remaining: 9.01s
17:	learn: 1.1668390	total: 1.97s	remaining: 8.96s
18:	learn: 1.1597529	total: 2.1s	remaining: 8.95s
19:	learn: 1.1500299	total: 2.23s	remaining: 

<catboost.core.CatBoostClassifier at 0x7fbf6ab1b520>

## Градиентный бустинг

In [11]:
# model_gb = GradientBoostingClassifier(random_state=17, max_depth=13, max_features=26, min_samples_leaf=21, n_estimators=75)
model_gb = GradientBoostingClassifier(random_state=17, max_depth=7, max_features=14, min_samples_leaf=20, n_estimators=38)
model_gb.fit(x, y)

GradientBoostingClassifier(max_depth=7, max_features=14, min_samples_leaf=20,
                           n_estimators=38, random_state=17)

## Метод опорных векторов

In [12]:
model_svm = SVC(kernel='linear', probability=True, max_iter=1000)
model_svm.fit(x, y)



SVC(kernel='linear', max_iter=1000, probability=True)

## LightGBM

In [13]:
model_lgb = lgb.LGBMRegressor(random_state=17, max_depth=16,
            min_child_samples=17, num_leaves=36, n_estimators=1000,
            objective='multiclass', num_class=8)
model_lgb.fit(x, y)

LGBMRegressor(max_depth=16, min_child_samples=17, n_estimators=1000,
              num_class=8, num_leaves=36, objective='multiclass',
              random_state=17)

## Выбор для каждого класса модели, которая предсказывает его лучше всего

In [14]:
# Рассчитаем модели
x_test = pd.DataFrame(data_test, columns=columns_transformed)

data_test['target_cb'] = model_cb.predict(x_test)
data_test['target_gb'] = model_gb.predict(x_test)
data_test['target_svm'] = model_svm.predict(x_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [15]:
def calc_model_lgb(x):
    return np.argmax([x])
data_test['target_lgb'] = x_test.apply(calc_model_lgb, axis=1, result_type='expand')
data_test['target_lgb'] = data_test['target_lgb'].apply(lambda x:0 if x<0 else 7 if x>7 else x)

In [16]:
# Определим лучшую модель для каждого класса
models = [model_cb, model_gb, model_lgb, model_svm]
opt_models_by_class = [x for x in range(8)]
models_result = np.zeros((8, 4))

for i, row in data_test.iterrows():    
    r = int(row['Response'])
    if row['Response'] == row['target_cb']: 
        models_result[r][0] += 1
    if row['Response'] == row['target_gb']: 
        models_result[r][1] += 1
    if row['Response'] == row['target_lgb']: 
        models_result[r][2] += 1
    if row['Response'] == row['target_svm']: 
        models_result[r][3] += 1        

for i, cl in enumerate(models_result):
    model_class = str(type(models[np.argmax(cl)])).split('.')[-1]
    print(i, ':', model_class)
    opt_models_by_class[i] = models[np.argmax(cl)]

0 : SVC'>
1 : CatBoostClassifier'>
2 : CatBoostClassifier'>
3 : GradientBoostingClassifier'>
4 : GradientBoostingClassifier'>
5 : GradientBoostingClassifier'>
6 : CatBoostClassifier'>
7 : LGBMRegressor'>


## Загрузка данных для рассчетов

In [17]:
data_test = pd.read_csv("./data/test.csv.gz")
data_test = data_preprocess(data_test)
data_test = reduce_mem_usage(data_test)
data_test_transformed = pd.DataFrame(scaler.transform(pd.DataFrame(data_test, columns=columns)))
columns_transformed = data_test_transformed.columns
print (data_test_transformed.info())

Потребление памяти меньше на 16.34 Мб (минус 84.9 %)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19765 entries, 0 to 19764
Columns: 118 entries, 0 to 117
dtypes: float32(118)
memory usage: 8.9 MB
None


## Предсказание данных

In [18]:
x_test = pd.DataFrame(data_test_transformed, columns=columns_transformed)

data_test_cb = pd.DataFrame(model_cb.predict_proba(x_test))
data_test_gb = pd.DataFrame(model_gb.predict_proba(x_test))
data_test_lgb = pd.DataFrame(model_lgb.predict(x_test))
data_test_svm = pd.DataFrame(model_svm.predict_proba(x_test))

In [19]:
def vote_class (x):
    return np.argmax(x.values)

In [20]:
data_test_proba = data_test_cb.copy()
for i in range(0, 8):
    data_test_proba[i] = data_test_proba[i]
    data_test_proba[i] = data_test_proba[i] + data_test_gb[i]
    data_test_proba[i] = data_test_proba[i] + data_test_lgb[i]
    data_test_proba[i] = data_test_proba[i] + data_test_svm[i]
data_test_proba["voted_class"] = data_test_proba.apply(vote_class, axis=1)

In [29]:
def getOptModel(x):
    cb_res = model_cb.predict(x)
    gb_res = model_gb.predict([x])
    lgb_res = model_lgb.predict([x])
    lgb_res = np.argmax(lgb_res)
    lgb_res = 0 if lgb_res<0 else 7 if lgb_res>7 else lgb_res
    svm_res = model_svm.predict([x])

    n = 0
    model = opt_models_by_class[int(cb_res)]
    if str(type(model)).find('CatBoost'): 
        n += 1
    
    model = opt_models_by_class[int(gb_res)]
    if str(type(model)).find('GradientBoosting'):
        n += 1
    
    model = opt_models_by_class[int(lgb_res)]
    if str(type(model)).find('LGBM'):
        n += 1
    
    model = opt_models_by_class[int(svm_res)]
    if str(type(model)).find('SVC'):
        n += 1

    # Если никакая модель не выдала свой самый точный класс, или несколько моделей выдало свои самые точные классы,
    # то берем наилучшую модель для класса, выбранный мягким голосованием
    if n != 1:
        voted_class = data_test_proba['voted_class'].values[x.name]
        model = opt_models_by_class[voted_class]

    return model

In [30]:

def calculate_model(x):
    model = getOptModel(x)
        
    # LGB дает список вероятностей по классам
    if str(type(model)).find('LGBM'):
        y = model.predict([x])   
        y = np.argmax(y)    
        y = 0 if y<0 else 7 if y>7 else y  
    else:
        y = model.predict(x)
    x['Response'] = y

    return x

data_test_transformed = data_test_transformed.apply(calculate_model, axis=1, result_type='expand')

## Подготовка результатов

In [31]:
submission = pd.read_csv("./data/sample_submission.csv.gz")

In [32]:
submission["Response"] = data_test_transformed["Response"].astype("int8")

In [33]:
submission.to_csv("./data/submission.csv", index=False)