# Importing Libraries

In [141]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from lightgbm import LGBMClassifier
from sklearn.pipeline import make_pipeline

In [142]:
from sklearn.preprocessing import StandardScaler
import catboost as catb
from sklearn.ensemble import StackingClassifier

# Cleaning Outliers

In [143]:
# Default upper bounds used by clean_number_features, keyed by column name.
# A bound of None means "no upper limit": only negative values are blanked.
NUM_COL_UPPER_BOUND = {

    'balance_amt_avg': 207000000,
    'balance_amt_max': 382000000,
    'balance_amt_min': 200000000,
    'balance_amt_day_avg': 207000000,

    'rko_start_months': 300,
    'max_end_fact_fin_deals': 26,
    'max_start_non_fin_deals': 109,
    'min_end_plan_non_fin_deals': 120,
    'min_start_non_fin_deals': 10,
    'min_start_fin_deals': 140,
    'ogrn_days_end_month': 140,

    'ft_registration_date': 80000,
    'max_founderpres': 50000,
    'min_founderpres': 30000,
    'ogrn_exist_months': None,
    'sum_of_paym_2m': 10000000000,
    'sum_of_paym_6m': 10000000000,
    'sum_of_paym_1y': None,
    'sum_a_oper_1m': 10000000,
    'cnt_a_oper_1m': 1000,
    'sum_b_oper_1m': 10000000000,
    'cnt_b_oper_1m': 300,
    'sum_c_oper_1m': None,
    'cnt_c_oper_1m': 30000,
    'sum_deb_d_oper_1m': 1000000000,
    'cnt_deb_d_oper_1m': None,

    'cnt_days_deb_e_oper_1m': 35,
    'cnt_days_cred_e_oper_1m': 35,
    'cnt_days_deb_f_oper_1m': 35,
    'cnt_days_cred_f_oper_1m': 35,
    'cnt_days_deb_g_oper_1m': 35,
    'cnt_days_cred_g_oper_1m': 35,
    'cnt_days_deb_h_oper_1m': 35,
    'cnt_days_cred_h_oper_1m': 35,

    'cnt_days_deb_e_oper_3m': 120,
    'cnt_days_cred_e_oper_3m': 120,
    'cnt_days_deb_f_oper_3m': 120,
    'cnt_days_cred_f_oper_3m': 120,
    'cnt_days_deb_g_oper_3m': 120,
    'cnt_days_cred_g_oper_3m': 120,
    'cnt_days_deb_h_oper_3m': 120,
    'cnt_days_cred_h_oper_3m': 120,

    # This key used to appear twice in the literal (first as None, then as
    # 1000000000). Python keeps the last value, so only that one is listed.
    'sum_cred_d_oper_1m': 1000000000,
    'cnt_cred_d_oper_1m': 1500,
    'sum_deb_e_oper_1m': 10000000000,
    'cnt_deb_e_oper_1m': 50000,
    'sum_cred_e_oper_1m': 10000000000,
    'cnt_cred_e_oper_1m': 10000,
    'sum_deb_f_oper_1m': None,
    'cnt_deb_f_oper_1m': 10000,
    'sum_cred_f_oper_1m': 1000000000,
    'cnt_cred_f_oper_1m': 10000,
    'sum_deb_g_oper_1m': None,
    'cnt_deb_g_oper_1m': 10000,
    'sum_cred_g_oper_1m': None,
    'cnt_cred_g_oper_1m': None,
    'sum_deb_h_oper_1m': 10000000000,
    'cnt_deb_h_oper_1m': 10000,
    'sum_cred_h_oper_1m': 10000000000,
    'cnt_cred_h_oper_1m': 10000,
    'sum_a_oper_3m': 100000000,
    'cnt_a_oper_3m': 1000,
    'sum_b_oper_3m': 10000000000,
    'cnt_b_oper_3m': 1000,
    'sum_c_oper_3m': 100000000,
    'cnt_c_oper_3m': 10000,
    'sum_deb_d_oper_3m': 1000000000,
    'cnt_deb_d_oper_3m': 10000,
    'sum_cred_d_oper_3m': None,
    'cnt_cred_d_oper_3m': 10000,
    'sum_deb_e_oper_3m': None,
    'cnt_deb_e_oper_3m': 1000000,
    'sum_cred_e_oper_3m': 10000000000,
    'cnt_cred_e_oper_3m': 100000,
    'sum_deb_f_oper_3m': 1000000000,
    'cnt_deb_f_oper_3m': 100000,
    'sum_cred_f_oper_3m': 1000000000,
    'cnt_cred_f_oper_3m': None,
    'sum_deb_g_oper_3m': None,
    'cnt_deb_g_oper_3m': 10000,
    'sum_cred_g_oper_3m': 10000000000,
    'cnt_cred_g_oper_3m': None,
    'sum_deb_h_oper_3m': 10000000000,
    'cnt_deb_h_oper_3m': 10000,
    'sum_cred_h_oper_3m': 10000000000,
    'cnt_cred_h_oper_3m': None
}


def clean_number_features(df, upper_bounds=None):
    """Replace out-of-range values in numeric columns with NaN.

    For every column listed in ``upper_bounds``, values below zero are set to
    NaN; when the column's bound is not ``None``, values strictly greater
    than the bound are set to NaN as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    upper_bounds : dict, optional
        Mapping ``column name -> upper bound or None``. Defaults to
        ``NUM_COL_UPPER_BOUND``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    KeyError
        If a column named in ``upper_bounds`` is missing from ``df``.
    """
    if upper_bounds is None:
        upper_bounds = NUM_COL_UPPER_BOUND

    for column_name, upper_bound in upper_bounds.items():

        column_filter = (df[column_name] < 0)

        if upper_bound is not None:
            column_filter = column_filter | (df[column_name] > upper_bound)

        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        df.loc[column_filter, column_name] = np.nan
    return df

# Feature Engineering

In [144]:
def add_city_type_features(df):
    """Add a boolean indicator column for each tracked ``city_type`` code.

    For the codes ``'3597'`` and ``'1252'`` a column ``city_type_<code>`` is
    written that is True where ``df['city_type']`` equals that string.
    ``df`` is modified in place and returned.
    """
    city_column = df['city_type']

    for code in ('3597', '1252'):
        df[f'city_type_{code}'] = city_column.eq(code)

    return df

In [145]:
def add_segment_features(df):
    """Add boolean indicator columns ``segment_0`` .. ``segment_3``.

    Column ``segment_<i>`` is True where ``df['segment']`` equals the string
    form of ``i``. ``df`` is modified in place and returned.
    """
    segment_column = df['segment']

    for label in map(str, range(4)):
        df['segment_' + label] = segment_column.eq(label)

    return df

In [146]:
def add_paym_ratio(df):
    """Add the ``add_paym_ratio_1`` payment feature to ``df``.

    The column holds ``sum_of_paym_1y - sum_of_paym_6m``. ``df`` is modified
    in place and returned, matching the other feature helpers so the result
    can be reassigned (``df = add_paym_ratio(df)``).

    NOTE(review): despite the name, the stored value is a difference, not a
    ratio. An earlier version also computed ``sum_of_paym_6m -
    sum_of_paym_2m`` without using it, so a ratio of the two differences may
    have been intended; confirm before relying on this feature.
    """
    df['add_paym_ratio_1'] = df['sum_of_paym_1y'] - df['sum_of_paym_6m']

    return df

In [147]:
def add_features(df):
    """Run the feature-engineering steps on ``df`` in order and return the result.

    Steps applied: ``add_city_type_features`` followed by
    ``add_segment_features``.
    """
    for step in (add_city_type_features, add_segment_features):
        df = step(df)

    return df

# Data

In [148]:
# Raw string: the Windows path contains backslash sequences (\D, \M, \A, \S, \T)
# that are invalid escapes in a normal string literal and trigger a warning.
df = pd.read_parquet(r"C:\Documents\Machine Learning\Alfa\Siberian Alfa Hack Materials\Train.parquet")

In [149]:
# Blank out-of-range numeric values first, then add the engineered indicator columns.
df = add_features(clean_number_features(df))

In [150]:
# The identifier and every target column are kept out of the feature matrix.
non_feature_columns = ['id', 'target_1', 'target_2', 'total_target']
y = df['total_target']
X = df.drop(columns=non_feature_columns)

In [151]:
# Keep only the columns whose dtype is not ``object``.
is_not_object = X.dtypes != 'object'
X = X.loc[:, is_not_object]

# Model

In [152]:
# Single-step pipeline: LightGBM with default hyper-parameters.
model1 = make_pipeline(LGBMClassifier())


In [153]:
# CatBoost preceded by most-frequent imputation and standard scaling.
catboost_steps = [
    SimpleImputer(strategy='most_frequent'),
    StandardScaler(),
    catb.CatBoostClassifier(),
]
model2 = make_pipeline(*catboost_steps)


Stacking of model1 and model2:

In [154]:
# Stack the LightGBM and CatBoost pipelines under the names 'lgbm' and 'catb'.
base_estimators = [('lgbm', model1), ('catb', model2)]
model3 = make_pipeline(StackingClassifier(base_estimators))

In [155]:
# Fit the stacked model on the full training set.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, so in
# the recorded run model3 never finished fitting; rerun to completion before
# the prediction cell uses it.
model3.fit(X,y)

[LightGBM] [Info] Number of positive: 25315, number of negative: 334685
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.051532 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21119
[LightGBM] [Info] Number of data points in the train set: 360000, number of used features: 96
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.070319 -> initscore=-2.581793
[LightGBM] [Info] Start training from score -2.581793


KeyboardInterrupt: 

In [None]:
# Fit the LightGBM pipeline on its own; the feature-importance cells below
# read the fitted estimator through model1.steps[0][1].
model1.fit(X,y)

[LightGBM] [Info] Number of positive: 25315, number of negative: 334685
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.053628 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21119
[LightGBM] [Info] Number of data points in the train set: 360000, number of used features: 96
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.070319 -> initscore=-2.581793
[LightGBM] [Info] Start training from score -2.581793


In [None]:
# Fit the CatBoost pipeline on its own; the feature-importance cells below
# read the fitted estimator through model2.steps[2][1].
model2.fit(X, y)

Learning rate set to 0.127197
0:	learn: 0.5305546	total: 66.8ms	remaining: 1m 6s
1:	learn: 0.4146032	total: 109ms	remaining: 54.2s
2:	learn: 0.3398421	total: 150ms	remaining: 49.8s
3:	learn: 0.2947984	total: 189ms	remaining: 47.1s
4:	learn: 0.2614476	total: 231ms	remaining: 46s
5:	learn: 0.2383968	total: 275ms	remaining: 45.5s
6:	learn: 0.2222182	total: 331ms	remaining: 46.9s
7:	learn: 0.2111565	total: 381ms	remaining: 47.2s
8:	learn: 0.2029603	total: 422ms	remaining: 46.5s
9:	learn: 0.1972912	total: 468ms	remaining: 46.3s
10:	learn: 0.1932171	total: 517ms	remaining: 46.5s
11:	learn: 0.1900364	total: 577ms	remaining: 47.5s
12:	learn: 0.1877061	total: 627ms	remaining: 47.6s
13:	learn: 0.1855750	total: 673ms	remaining: 47.4s
14:	learn: 0.1843656	total: 722ms	remaining: 47.4s
15:	learn: 0.1830346	total: 774ms	remaining: 47.6s
16:	learn: 0.1822141	total: 814ms	remaining: 47.1s
17:	learn: 0.1812342	total: 854ms	remaining: 46.6s
18:	learn: 0.1805221	total: 896ms	remaining: 46.3s
19:	learn: 0

# The Most and Least Important Features for LGBM and CatBoost

In [None]:
# Map each feature position to its LightGBM importance score.
important_features_dict = dict(enumerate(model1.steps[0][1].feature_importances_))

# Feature positions ordered from the lowest importance to the highest.
important_features_list = sorted(important_features_dict, key=important_features_dict.get)

In [None]:
# Pair every LightGBM feature name with its importance and sort ascending, so
# arr_1[:10] holds the least important features and arr_1[-10:] the most.
arr = model1.steps[0][1].feature_importances_
feature_names = model1.steps[0][1].feature_name_

# Iterate over `arr` (the importances). The previous loop ran over the
# still-empty `arr_1` and indexed into it, so it never produced any pairs.
arr_1 = sorted(
    ((feature_names[i], arr[i]) for i in range(len(arr))),
    key=lambda pair: pair[1],
)

Top 10 most important features in the LGBM model

In [None]:
# arr_1 is sorted ascending by importance, so the last ten entries rank highest.
print(arr_1[-10:])

[('min_founderpres', 59), ('cnt_cred_e_oper_3m', 60), ('ogrn_exist_months', 62), ('cnt_days_cred_e_oper_1m', 62), ('sum_of_paym_1y', 68), ('sum_deb_e_oper_1m', 73), ('rko_start_months', 87), ('cnt_days_cred_e_oper_3m', 88), ('sum_cred_e_oper_1m', 94), ('balance_amt_min', 138)]


Top 10 least important features in the LGBM model

In [None]:
# arr_1 is sorted ascending by importance, so the first ten entries rank lowest.
print(arr_1[:10])

[('city_type_1252', 0), ('segment_2', 0), ('segment_3', 0), ('cnt_b_oper_3m', 1), ('cnt_cred_f_oper_1m', 2), ('sum_b_oper_3m', 3), ('cnt_days_cred_f_oper_3m', 3), ('cnt_cred_d_oper_1m', 4), ('cnt_days_deb_f_oper_1m', 4), ('segment_1', 4)]


In [None]:
# Rebuild the (feature name, importance) pairs for the fitted LightGBM step
# and show the ten entries with the highest importance.
feature_names = model1.steps[0][1].feature_name_
arr = model1.steps[0][1].feature_importances_

arr_1 = [(feature_names[i], arr[i]) for i in range(len(arr))]
arr_1.sort(key=lambda pair: pair[1])
arr_1[-10:]

[('min_founderpres', 59),
 ('cnt_cred_e_oper_3m', 60),
 ('ogrn_exist_months', 62),
 ('cnt_days_cred_e_oper_1m', 62),
 ('sum_of_paym_1y', 68),
 ('sum_deb_e_oper_1m', 73),
 ('rko_start_months', 87),
 ('cnt_days_cred_e_oper_3m', 88),
 ('sum_cred_e_oper_1m', 94),
 ('balance_amt_min', 138)]

In [None]:
# CatBoost is the third step (index 2) of the model2 pipeline.
arr = model2.steps[2][1].feature_importances_
# NOTE(review): names are taken from X.columns, which assumes the imputer and
# scaler ahead of CatBoost keep every column in its original order - TODO confirm.
feature_names = X.columns

# (feature name, importance) pairs sorted from least to most important.
arr_2 = [(feature_names[i], arr[i]) for i in range(len(arr))]
arr_2.sort(key=lambda pair: pair[1])

[('sum_b_oper_3m', 0.01995421484917514),
 ('cnt_days_cred_f_oper_3m', 0.03241100610670362),
 ('segment_1', 0.04310210779358404),
 ('cnt_days_deb_g_oper_1m', 0.05522309946499522),
 ('cnt_cred_h_oper_1m', 0.06745682264202177),
 ('cnt_deb_h_oper_3m', 0.06932549258925934),
 ('segment_3', 0.08142956636362048),
 ('cnt_deb_g_oper_1m', 0.08637991912901972),
 ('city_type_1252', 0.0948986136437457),
 ('min_start_fin_deals', 0.09760784948610124)]

Top 10 most important features for CatBoost

In [None]:
# arr_2 is sorted ascending by importance, so the last ten entries rank highest.
arr_2[-10:]

[('sum_deb_e_oper_1m', 2.533223276406402),
 ('sum_a_oper_1m', 2.5491272121643926),
 ('cnt_cred_e_oper_3m', 2.678035029683883),
 ('cnt_days_cred_e_oper_3m', 3.0088749007936215),
 ('ogrn_exist_months', 3.0323314142560345),
 ('balance_amt_max', 3.0745973699151725),
 ('sum_cred_e_oper_1m', 4.091105745932757),
 ('balance_amt_day_avg', 4.4858393167119495),
 ('balance_amt_avg', 4.665099280104531),
 ('balance_amt_min', 5.648464779834567)]

Top 10 least important features for CatBoost

In [None]:
# arr_2 is sorted ascending by importance, so the first ten entries rank lowest.
arr_2[:10]

[('sum_b_oper_3m', 0.01995421484917514),
 ('cnt_days_cred_f_oper_3m', 0.03241100610670362),
 ('segment_1', 0.04310210779358404),
 ('cnt_days_deb_g_oper_1m', 0.05522309946499522),
 ('cnt_cred_h_oper_1m', 0.06745682264202177),
 ('cnt_deb_h_oper_3m', 0.06932549258925934),
 ('segment_3', 0.08142956636362048),
 ('cnt_deb_g_oper_1m', 0.08637991912901972),
 ('city_type_1252', 0.0948986136437457),
 ('min_start_fin_deals', 0.09760784948610124)]

# Test Data

In [None]:
# Raw string: the Windows path contains backslash sequences (\D, \M, \A, \S, \T)
# that are invalid escapes in a normal string literal and trigger a warning.
df_test = pd.read_parquet(r"C:\Documents\Machine Learning\Alfa\Siberian Alfa Hack Materials\Test.parquet")
# Apply the same outlier cleaning the training frame received, so the models
# score data prepared the same way as the data they were fitted on.
df_test = clean_number_features(df_test)
df_test = add_features(df_test)

In [None]:
# Drop the identifier, then keep only the columns whose dtype is not ``object``.
features_without_id = df_test.drop(columns='id')
X_test = features_without_id.loc[:, features_without_id.dtypes != 'object']

# Prediction

In [None]:
# Take column 1 of the stacked model's predict_proba output as the score
# (presumably the positive class - confirm against model3.classes_).
# NOTE(review): requires model3 to be fully fitted; the saved run of the
# model3.fit cell was interrupted.
y_pred = model3.predict_proba(X_test)[:, 1]

# Submission

In [None]:
# Build the submission from plain arrays so ids and scores pair up by position.
# pd.concat(axis=1) aligns on index labels instead, which would mismatch rows
# (and introduce NaNs) if df_test did not carry a default 0..n-1 index.
sub = pd.DataFrame({'id': df_test['id'].to_numpy(), 'score': y_pred})
sub.to_csv('submission_final.csv', index=False)