Наконец, используя тестовые датасеты и ансамбль из 8 моделей LightGBM (по одной на каждый фолд), получаем предсказания.

In [None]:
import numpy as np
import pandas as pd
import math
import os
import gc
import sys
import pickle
import lightgbm as lgb
from sklearn import preprocessing

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce its memory usage.

    * Signed / unsigned integer columns are cast to the narrowest integer
      type of the same signedness whose range contains the column's
      [min, max] (bounds inclusive).
    * Float columns are cast to float16 or float32 when their range fits,
      float64 otherwise.  Note that float16 keeps only about three
      significant decimal digits, so values are rounded by this step.
    * Object / string columns are converted to ``category``.
    * Every other dtype (bool, datetime, timedelta, already-categorical and
      other extension dtypes) is left untouched, so calling the function
      again on an already-reduced frame is safe.

    The memory usage before and after, and the relative decrease, are
    printed.

    Args:
        df: pandas DataFrame; it is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print(('Memory usage of dataframe is {:.2f}'
           'MB').format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes (category, nullable integers, ...) have no
        # meaningful numeric range to test against; leave them alone.
        if not isinstance(col_type, np.dtype):
            continue

        # dtype.kind is used instead of np.issubdtype because timedelta64
        # is a subclass of np.signedinteger and must not be downcast here.
        kind = col_type.kind
        if kind == 'i':
            int_candidates = signed_types
        elif kind == 'u':
            int_candidates = unsigned_types
        elif kind == 'f':
            int_candidates = None
        else:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if int_candidates is not None:
            # An empty column yields NaN for min/max: no comparison holds
            # and the column keeps its dtype.
            for candidate in int_candidates:
                bounds = np.iinfo(candidate)
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            f16 = np.finfo(np.float16)
            f32 = np.finfo(np.float32)
            if f16.min <= c_min and c_max <= f16.max:
                df[col] = df[col].astype(np.float16)
            elif f32.min <= c_min and c_max <= f32.max:
                df[col] = df[col].astype(np.float32)
            else:
                # Also reached for all-NaN or infinite columns.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print(('Memory usage after optimization is: {:.2f}'
           'MB').format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem)
                                        / start_mem))

    return df

In [None]:
# Load the base validation and test feature tables (validation first).
df_valid_lgb, df_test_lgb = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/alfabattle-1-stat/alfa1_df_valid6.csv',
        '../input/alfabattle-1-stat-test/df_test_nn.csv',
    )
)

In [None]:
# Replace every missing value in both frames with the placeholder string
# 'nothing' (in place).
for frame in (df_valid_lgb, df_test_lgb):
    frame.fillna('nothing', inplace=True)

In [None]:
# Downcast both base frames; the validation frame is processed first.
df_valid_lgb, df_test_lgb = map(reduce_mem_usage, (df_valid_lgb, df_test_lgb))

In [None]:
# Label encoder over the distinct 'lag_1' values of the validation frame;
# it is used later to map predicted class indices back to labels.
lag_1_values = df_valid_lgb['lag_1'].unique()
le1 = preprocessing.LabelEncoder()
le1.fit(lag_1_values)

In [None]:
# Load the first pair of extra feature tables (validation, then test).
df_valid_lgb_exp, df_test_lgb_exp = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/alfabattle-1-stat4/alfa1_df_valid10.csv',
        '../input/alfabattle-1-stat-test/df_test_exp.csv',
    )
)

In [None]:
# Downcast the first pair of extra tables (validation first).
df_valid_lgb_exp, df_test_lgb_exp = map(
    reduce_mem_usage, (df_valid_lgb_exp, df_test_lgb_exp)
)

In [None]:
# Load the second pair of extra feature tables (validation, then test).
df_valid_lgb_exp1, df_test_lgb_exp1 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/alfabattle-1-stat5/alfa1_df_valid11.csv',
        '../input/alfabattle-1-stat-test/df_test_exp1.csv',
    )
)

In [None]:
# Downcast the second pair of extra tables (validation first).
df_valid_lgb_exp1, df_test_lgb_exp1 = map(
    reduce_mem_usage, (df_valid_lgb_exp1, df_test_lgb_exp1)
)

In [None]:
# Load the third pair of extra feature tables (validation, then test).
df_valid_lgb_exp2, df_test_lgb_exp2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/alfabattle-1-stat6/alfa1_df_valid12.csv',
        '../input/alfabattle-1-stat-test/df_test_exp2.csv',
    )
)

In [None]:
# Downcast the third pair of extra tables (validation first).
df_valid_lgb_exp2, df_test_lgb_exp2 = map(
    reduce_mem_usage, (df_valid_lgb_exp2, df_test_lgb_exp2)
)

In [None]:
# Columns of the extra tables that are NOT copied into the main frames.
non_feature_cols = ['client_pin', 'lag_1', 'lag_2', 'weight']

# Index.drop returns the remaining column labels in their original order
# (and raises KeyError on a missing label, like DataFrame.drop) without
# building a full copy of each DataFrame just to read its column names.
aug_lgb = df_valid_lgb_exp.columns.drop(non_feature_cols)
aug_lgb1 = df_valid_lgb_exp1.columns.drop(non_feature_cols)
aug_lgb2 = df_valid_lgb_exp2.columns.drop(non_feature_cols)

In [None]:
# Copy the extra feature columns from each extra table into the main
# validation / test frames, one table pair at a time.
# NOTE(review): the assignment relies on the main and extra frames having
# matching row indexes - confirm the CSVs share the same row order.
for aug_cols, valid_source, test_source in (
    (aug_lgb, df_valid_lgb_exp, df_test_lgb_exp),
    (aug_lgb1, df_valid_lgb_exp1, df_test_lgb_exp1),
    (aug_lgb2, df_valid_lgb_exp2, df_test_lgb_exp2),
):
    df_valid_lgb[aug_cols] = valid_source[aug_cols]
    df_test_lgb[aug_cols] = test_source[aug_cols]

In [None]:
# Add a constant 'class_weight' column (all ones) to both frames.
for frame in (df_valid_lgb, df_test_lgb):
    frame['class_weight'] = 1

In [None]:
# Columns that are not passed to the models: 'lag_1' is the label being
# predicted; the rest are the client identifier and the weight columns.
to_drop = ['lag_1', 'client_pin', 'weight', 'class_weight']

n_splits = 8    # number of per-fold models stored on disk
n_classes = 10  # width of each model's probability output

# The model inputs are the same for every fold, so build them once instead
# of re-dropping the columns on each iteration.
X_valid = df_valid_lgb.drop(to_drop, axis=1)
X_test = df_test_lgb.drop(to_drop, axis=1)

# Average the class probabilities predicted by the per-fold models.
lgb_proba_valid = np.zeros((X_valid.shape[0], n_classes))
lgb_proba_test = np.zeros((X_test.shape[0], n_classes))
for i in range(n_splits):
    # NOTE: pickle.load can execute arbitrary code - only load model files
    # from a trusted source.
    with open(f'../input/lgb-models/lgb_model8_fold{i}.pkl', 'rb') as fin:
        model_lgb = pickle.load(fin)
    lgb_proba_valid += model_lgb.predict(X_valid) / n_splits
    lgb_proba_test += model_lgb.predict(X_test) / n_splits

# Release the temporary model inputs.
del X_valid, X_test

# Predicted class index per row = column with the highest averaged
# probability. This must run AFTER the probabilities above are computed.
lgb_valid = list(np.argmax(lgb_proba_valid, axis=1))
lgb_test = list(np.argmax(lgb_proba_test, axis=1))

In [None]:
# Map the predicted class indices back to the original 'lag_1' labels and
# store them in the test frame.
predicted_labels = le1.inverse_transform(lgb_test)
df_test_lgb['lag_1'] = predicted_labels

In [None]:
# Load the sample prediction file; it is reused below as the output frame.
sample_path = '../input/alfabattle1/alfabattle2_abattle_sample_prediction.csv'
df_sample = pd.read_csv(sample_path)

In [None]:
# Remove the placeholder 'prediction' column; it is re-created by the merge
# with the model output.
df_sample.drop(columns=['prediction'], inplace=True)

In [None]:
# Attach the predicted label to every client of the sample file; the left
# join keeps all rows of df_sample.
client_predictions = df_test_lgb[['client_pin', 'lag_1']].rename(
    {'lag_1': 'prediction'}, axis=1
)
df_sample = df_sample.merge(client_predictions, how='left', on='client_pin')

In [None]:
# Write the final predictions to CSV without the row index.
submission_path = 'alfa1_lgb.csv'
df_sample.to_csv(submission_path, index=False)