### Libraries

In [1]:
# Data handling
import pandas as pd
import numpy as np
# Gradient-boosting classifiers trained in the model cells below
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
# Cross-validation splitter and the evaluation metric (ROC AUC)
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
# Show up to 100 columns when a DataFrame is displayed
pd.set_option('display.max_columns', 100)
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from the libraries imported above.
warnings.filterwarnings("ignore")
# Feature-engineering helpers: quantile binning, label encoding, pairwise column combinations
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import LabelEncoder
from itertools import combinations

### Read data and perform basic preprocessing

In [2]:
def process_data(DATA_DIR):
    """Load the raw train/test CSVs and apply basic cleaning to their union.

    Parameters
    ----------
    DATA_DIR : str or path-like
        Directory containing ``train_Df64byy.csv`` and ``test_YCcRUnU.csv``.
        A trailing separator is optional.

    Returns
    -------
    train : pandas.DataFrame
        Train rows whose ``Region_Code`` also occurs in the test set, with a
        ``train_or_test`` column set to ``'train'``.
    test : pandas.DataFrame
        Test rows with a ``train_or_test`` column set to ``'test'``.
    df : pandas.DataFrame
        ``train`` stacked on top of ``test`` (original row labels kept), with
        ``Holding_Policy_Duration`` converted to float, missing holding-policy
        values replaced by -999 and the listed categorical columns
        label-encoded.
    """
    import os  # local import: only needed here to build the file paths

    train = pd.read_csv(os.path.join(DATA_DIR, "train_Df64byy.csv"))
    test = pd.read_csv(os.path.join(DATA_DIR, "test_YCcRUnU.csv"))

    # Remove train rows whose Region_Code is not present in the test set.
    # .copy() makes the filtered frame independent, so the column assignment
    # below does not write into a slice of the frame read from disk.
    regions_in_test = test['Region_Code'].unique()
    train = train[train['Region_Code'].isin(regions_in_test)].copy()

    # Marker column used later to split the combined frame back apart.
    train['train_or_test'] = 'train'
    test['train_or_test'] = 'test'
    df = pd.concat([train, test])

    # '14+' is mapped to 15 so the whole column can be cast to float.
    df['Holding_Policy_Duration'] = (
        df['Holding_Policy_Duration'].replace(['14+'], [15])
    ).astype(float)

    # -999 marks missing values. Assign the result back instead of calling
    # fillna(inplace=True) on a column selection: that form is chained
    # assignment and is not guaranteed to update df.
    for col in ['Holding_Policy_Duration', 'Holding_Policy_Type']:
        df[col] = df[col].fillna(-999)

    # Label-encode the categorical columns on train+test together so both
    # parts share one code per category. Missing values are encoded as the
    # string 'nan' because of the cast to str.
    le = LabelEncoder()
    for col in ['City_Code', 'Accomodation_Type', 'Reco_Insurance_Type', 'Health Indicator', 'Is_Spouse']:
        df[col] = le.fit_transform(df[col].astype('str'))

    return train, test, df

### Feature Engineering

In [3]:
def frequency_encoding(column_name,output_column_name,df):
    """Add a column holding the relative frequency of each value of a column.

    Parameters
    ----------
    column_name : str
        Column whose values are counted.
    output_column_name : str
        Column that receives the frequencies. It may equal ``column_name``,
        in which case the original values are overwritten.
    df : pandas.DataFrame
        Frame to read from and write to; it is modified in place.

    Returns
    -------
    None

    Notes
    -----
    The frequency of a value is its row count divided by ``len(df)``. Rows
    whose value is missing are not counted by ``groupby`` and receive NaN.
    """
    fe_pol = df.groupby(column_name).size() / len(df)
    # Series.map performs the lookup for the whole column at once instead of
    # calling a Python lambda per row, and yields NaN for keys that are absent.
    df[output_column_name] = df[column_name].map(fe_pol)

def _merge_group_aggregates(df, keys, spec, prefix):
    """Aggregate ``df`` per ``keys`` with ``spec`` and left-merge the result back onto ``df``.

    Each aggregate column is named ``prefix + '<column>_<statistic>'``.
    """
    grouped = df.groupby(keys).agg(spec)
    grouped.columns = [prefix + '_'.join(c).strip('_') for c in grouped.columns]
    return pd.merge(df, grouped, on=keys, how='left')


def feature_engineering(df):
    """Derive interaction, frequency and group-aggregate features.

    Parameters
    ----------
    df : pandas.DataFrame
        Combined train+test frame as produced by ``process_data``. The
        interaction and frequency columns are written into this object in
        place; the later merges produce new frames.

    Returns
    -------
    df : pandas.DataFrame
        The input columns plus all engineered features.
    cat_features : list of str
        Names of the pairwise interaction columns (one per pair of the eight
        base categorical columns).
    """
    cat_features = []

    # Interaction features: combine two categorical columns and replace the
    # combined label with its frequency.
    columns = ['City_Code', 'Accomodation_Type', 'Reco_Insurance_Type', 'Health Indicator',
               'Is_Spouse', 'Region_Code', 'Holding_Policy_Type', 'Reco_Policy_Cat']

    for first, second in combinations(columns, 2):
        pair_name = f'{first}_{second}'
        df[pair_name] = df[first].astype(str) + '_' + df[second].astype(str)
        frequency_encoding(pair_name, pair_name, df)
        cat_features.append(pair_name)

    # Frequency encoding of single columns. Holding_Policy_Duration is
    # overwritten by its frequency; the others get a separate '_fe' column.
    frequency_encoding('City_Code', 'City_Code_fe', df)
    frequency_encoding('Region_Code', 'Region_Code_fe', df)
    frequency_encoding('Holding_Policy_Duration', 'Holding_Policy_Duration', df)
    frequency_encoding('Holding_Policy_Type', 'Holding_Policy_Type_fe', df)

    def build_spec(counted_columns):
        # Numeric summaries of age and premium, followed by nunique/count for
        # each listed column. The insertion order fixes the output column order.
        spec = {'Lower_Age': ['mean', 'max', 'min', 'std'],
                'Reco_Policy_Premium': ['mean', 'max', 'min', 'std', 'sum']}
        for counted in counted_columns:
            spec[counted] = ['nunique', 'count']
        return spec

    # Characteristics of each city (alone and crossed with other keys).
    # NOTE(review): 'Region_Code_Reco_Policy_Cat' was overwritten with its
    # frequency above, so pairs that share a frequency fall into the same
    # group here -- confirm this is intended.
    city_level_groups = [
        (['City_Code'],
         ['Region_Code', 'Accomodation_Type', 'Reco_Insurance_Type',
          'Health Indicator', 'Holding_Policy_Type', 'Reco_Policy_Cat'],
         'city_aggregate_features'),
        (['City_Code', 'Region_Code'],
         ['Accomodation_Type', 'Reco_Insurance_Type',
          'Health Indicator', 'Holding_Policy_Type', 'Reco_Policy_Cat'],
         'city_region_aggregate_features'),
        (['City_Code', 'Reco_Policy_Cat'],
         ['Region_Code', 'Accomodation_Type', 'Reco_Insurance_Type',
          'Health Indicator', 'Holding_Policy_Type'],
         'city_recopolicycat_aggregate_features'),
        (['City_Code', 'Region_Code_Reco_Policy_Cat'],
         ['Region_Code', 'Accomodation_Type', 'Reco_Insurance_Type',
          'Health Indicator', 'Holding_Policy_Type', 'Reco_Policy_Cat'],
         'city_regioncoderecopolicycat_aggregate_features'),
    ]
    for keys, counted_columns, prefix in city_level_groups:
        df = _merge_group_aggregates(df, keys, build_spec(counted_columns), prefix)

    # Per-group max/min/mean/std of every interaction feature.
    transform_groups = [
        ('city', 'City_Code'),
        ('city_region', ['City_Code', 'Region_Code']),
        ('city_recopolicycat', ['City_Code', 'Reco_Policy_Cat']),
    ]
    for feature in cat_features:
        for label, keys in transform_groups:
            grouped_feature = df.groupby(keys)[feature]
            for stat in ('max', 'min', 'mean', 'std'):
                df[f'{label}_{feature}_{stat}'] = grouped_feature.transform(stat)

    # Age bins (14 quantile bins, ordinal codes); the bins are then used as a
    # grouping key for further aggregates.
    Lower_Age_Bins = KBinsDiscretizer(n_bins=14, encode='ordinal', strategy='quantile')
    df['Lower_Age_Bins'] = Lower_Age_Bins.fit_transform(df['Lower_Age'].values.reshape(-1, 1)).astype(int)

    # Characteristics of each age bin, holding-policy type and health indicator.
    attribute_level_groups = [
        (['Lower_Age_Bins'],
         ['Region_Code', 'Accomodation_Type', 'Reco_Insurance_Type',
          'Health Indicator', 'Holding_Policy_Type', 'City_Code'],
         'age_aggregate_features'),
        (['Holding_Policy_Type'],
         ['Region_Code', 'Accomodation_Type', 'Reco_Insurance_Type',
          'Health Indicator', 'City_Code'],
         'holdingpolicytype_aggregate_features'),
        (['Health Indicator'],
         ['Region_Code', 'Reco_Insurance_Type', 'Accomodation_Type',
          'Holding_Policy_Type', 'City_Code'],
         'Health_Indicator_aggregate_features'),
    ]
    for keys, counted_columns, prefix in attribute_level_groups:
        df = _merge_group_aggregates(df, keys, build_spec(counted_columns), prefix)

    # Premium statistics per interaction feature (these interaction features
    # were selected for aggregation based on feature importance).
    premium_spec = {'Reco_Policy_Premium': ['mean', 'max', 'min', 'std']}
    interaction_groups = [
        ('Region_Code_Reco_Policy_Cat', 'grpd_by_Region_Code_Reco_Policy_Cat_'),
        ('City_Code_Region_Code', 'grpd_by_City_CodeRegion_Code_'),
        ('City_Code_Reco_Policy_Cat', 'grpd_by_City_CodeReco_Policy_Cat_'),
        ('Holding_Policy_Type_Reco_Policy_Cat', 'grpd_by_Holding_Policy_TypeReco_Policy_Cat_'),
    ]
    for key, prefix in interaction_groups:
        df = _merge_group_aggregates(df, [key], premium_spec, prefix)

    return df, cat_features
    

### Remove unnecessary columns and prepare the train and test data for training

In [4]:
def preparedatafortraining(df,train,test):
    """Split the engineered frame into model inputs.

    ``train`` and ``test`` are accepted but not read: both subsets are
    re-derived from ``df`` through its ``train_or_test`` column.

    Returns a tuple ``(x, y, x_test)`` where ``x`` and ``x_test`` are the
    train/test rows without the identifier, target and helper columns, and
    ``y`` is a one-column DataFrame holding ``Response`` for the train rows.
    The shape of ``x`` is printed.
    """
    train_mask = df['train_or_test'] == 'train'
    test_mask = df['train_or_test'] == 'test'
    train_rows = df[train_mask]
    test_rows = df[test_mask]

    # Columns that must not reach the model: identifier, target and helpers.
    excluded = ['ID', 'Response', 'Upper_Age', 'Lower_Age_Bins', 'train_or_test']

    x = train_rows.drop(columns=excluded)
    y = train_rows[['Response']]
    x_test = test_rows.drop(columns=excluded)

    print(x.shape)

    return x, y, x_test

### Build the training and test data (saving to disk is currently disabled)

In [5]:
def savedata(DATA_DIR="../input/analytics-vidhya-jobathon/", **_unused):
    """Run the full preparation pipeline and return the model inputs.

    Parameters
    ----------
    DATA_DIR : str, optional
        Directory passed to ``process_data``. Defaults to the path that was
        previously hard-coded here, so ``savedata()`` behaves as before.
    **_unused
        Any other keyword arguments are accepted and ignored, as they were
        when the signature was ``savedata(**DATA_DIR)``.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test)`` as returned by
        ``preparedatafortraining``.
    """
    train, test, df = process_data(DATA_DIR)
    # The list of interaction-feature names is not needed here.
    df, _ = feature_engineering(df)
    x_train, y_train, x_test = preparedatafortraining(df, train, test)

    # Persisting the matrices is disabled; uncomment to write them to disk.
    #x_train.to_pickle("x_train_lgbm.pkl")
    #y_train.to_pickle("y_train_lgbm.pkl")
    #x_test.to_pickle("x_test_lgbm.pkl")

    return x_train, y_train, x_test

### Train the LGBM model and save the validation and test set predictions for ensembling

In [6]:
def lgbm_model():
    """Cross-validate a LightGBM classifier and write its predictions to CSV.

    Builds the model inputs with ``savedata()``, fits one ``LGBMClassifier``
    per fold of an 8-fold shuffled ``StratifiedKFold`` (early stopping on the
    fold's validation AUC), and prints each fold's AUC, the mean fold AUC and
    the overall out-of-fold AUC. The out-of-fold predictions go to
    ``lgbmoof.csv`` and the fold-averaged test predictions to
    ``lgbmpred.csv``. Returns nothing.
    """
    x, y, x_test = savedata()

    # NOTE(review): LightGBM documents `alpha` as a Huber/quantile-loss
    # parameter (L1 regularisation is `reg_alpha`), and `subsample` only takes
    # effect together with a non-zero `subsample_freq` -- confirm both
    # settings do what was intended.
    params = {
        'lambda': 2.8849054495567423,
        'alpha': 0.001054193185317787,
        'colsample_bytree': 0.5,
        'subsample': 0.4,
        'learning_rate': 0.014,
        'max_depth': 13,
        'random_state': 24,
        'min_child_weight': 5,
    }

    n_folds = 8
    fold_scores = []
    oof_pred = np.zeros(shape=(len(x)))
    test_pred = np.zeros(shape=(len(x_test)))

    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2020)

    for fold_no, (fit_idx, val_idx) in enumerate(splitter.split(x, y), start=1):
        x_fit, x_val = x.iloc[fit_idx], x.iloc[val_idx]
        y_fit, y_val = y.iloc[fit_idx], y.iloc[val_idx]

        model = LGBMClassifier(n_estimators=10000, **params, verbose=-1)

        # NOTE(review): recent LightGBM releases take early stopping through
        # `callbacks` rather than fit keyword arguments -- confirm against the
        # installed version.
        model.fit(x_fit, y_fit, eval_set=[(x_val, y_val)], early_stopping_rounds=30,
                  verbose=False, eval_metric='auc')

        val_pred = model.predict_proba(x_val)[:, 1]
        oof_pred[val_idx] = val_pred

        fold_auc = roc_auc_score(y_val, val_pred)
        print(fold_no, " err_lgm: ", fold_auc)
        fold_scores.append(fold_auc)

        # Accumulate test predictions; divided by the fold count after the loop.
        test_pred += model.predict_proba(x_test)[:, 1]

    test_pred = test_pred / n_folds

    print(f"Average StratifiedKFold Score : {sum(fold_scores)/n_folds} ")
    oof_score = roc_auc_score(y, oof_pred)
    print(f'\nOOF Auc is : {oof_score}')

    pd.DataFrame(oof_pred, columns=['lgbmoof']).to_csv('lgbmoof.csv', index=False)
    pd.DataFrame(test_pred, columns=['lgbmpred']).to_csv('lgbmpred.csv', index=False)

In [7]:
# Run the LightGBM cross-validation: prints the per-fold and out-of-fold AUC
# and writes lgbmoof.csv / lgbmpred.csv.
lgbm_model()

(48752, 533)
1  err_lgm:  0.8224546903570833
2  err_lgm:  0.8075914491950066
3  err_lgm:  0.8254098613745128
4  err_lgm:  0.8141514620067063
5  err_lgm:  0.8007063365772867
6  err_lgm:  0.8139061610305045
7  err_lgm:  0.8169790418878722
8  err_lgm:  0.8104893632353531
Average StratifiedKFold Score : 0.8139610457080408 

OOF Auc is : 0.8138034878946384


### Train the XGB model and save the validation and test set predictions for ensembling

In [8]:
def xgb_model():
    """Cross-validate an XGBoost classifier and write its predictions to CSV.

    Builds the model inputs with ``savedata()``, fits one ``XGBClassifier``
    per fold of an 8-fold shuffled ``StratifiedKFold`` (early stopping on the
    fold's validation AUC), and prints each fold's AUC, the mean fold AUC and
    the overall out-of-fold AUC. The out-of-fold predictions go to
    ``xgbmoof.csv`` and the fold-averaged test predictions to
    ``xgbmpred.csv``. Returns nothing.
    """
    x, y, x_test = savedata()

    params = {
        'lambda': 1.417495651744778,
        'alpha': 0.4281901245971981,
        'colsample_bytree': 0.7,
        'subsample': 0.8,
        'learning_rate': 0.016,
        'max_depth': 9,
        'random_state': 2020,
        'min_child_weight': 30,
    }

    n_folds = 8
    fold_scores = []
    oof_pred = np.zeros(shape=(len(x)))
    test_pred = np.zeros(shape=(len(x_test)))

    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2020)

    for fold_no, (fit_idx, val_idx) in enumerate(splitter.split(x, y), start=1):
        x_fit, x_val = x.iloc[fit_idx], x.iloc[val_idx]
        y_fit, y_val = y.iloc[fit_idx], y.iloc[val_idx]

        model = XGBClassifier(n_estimators=10000, **params)

        # NOTE(review): newer XGBoost releases expect `early_stopping_rounds`
        # and `eval_metric` on the estimator constructor rather than on
        # `fit` -- confirm against the installed version.
        model.fit(x_fit, y_fit, eval_set=[(x_val, y_val)], early_stopping_rounds=30,
                  verbose=False, eval_metric='auc')

        val_pred = model.predict_proba(x_val)[:, 1]
        oof_pred[val_idx] = val_pred

        fold_auc = roc_auc_score(y_val, val_pred)
        print(fold_no, " err_xgb: ", fold_auc)
        fold_scores.append(fold_auc)

        # Accumulate test predictions; divided by the fold count after the loop.
        test_pred += model.predict_proba(x_test)[:, 1]

    test_pred = test_pred / n_folds

    print(f"Average StratifiedKFold Score : {sum(fold_scores)/n_folds} ")
    oof_score = roc_auc_score(y, oof_pred)
    print(f'\nOOF Auc is : {oof_score}')

    pd.DataFrame(oof_pred, columns=['xgboof']).to_csv('xgbmoof.csv', index=False)
    pd.DataFrame(test_pred, columns=['xgbpred']).to_csv('xgbmpred.csv', index=False)

In [9]:
# Run the XGBoost cross-validation: prints the per-fold and out-of-fold AUC
# and writes xgbmoof.csv / xgbmpred.csv.
xgb_model()

(48752, 533)
1  err_xgb:  0.8222688379779438
2  err_xgb:  0.8095131302933026
3  err_xgb:  0.8262987173674321
4  err_xgb:  0.8158737728461399
5  err_xgb:  0.8017014659219515
6  err_xgb:  0.8138619347003774
7  err_xgb:  0.8185491873577966
8  err_xgb:  0.8103272246360889
Average StratifiedKFold Score : 0.8147992838876292 

OOF Auc is : 0.8145901095374922
