# Algebras Datathon

## Preprocessing

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import FeatureHasher

# For imputing NaNs
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.model_selection import train_test_split

In [2]:
SEED = 42

Read in the data

In [3]:
df_train = pd.read_csv('UCT_Train.csv')
df_test = pd.read_csv('UCT_Test.csv')

One-hot encode the categorical features

In [4]:
def one_hot_encode(df):
    """ One-hot encodes the df's categorical features. """
    cat_features = ['Building_Painted', 'Building_Fenced', 'Garden', 'Settlement', 'Building_Type', 'Residential']
    
    for feat in cat_features:
        dummy = pd.get_dummies(df[feat], prefix=feat)
        df = pd.concat([df, dummy], axis = 1)
        df.drop(feat, axis=1, inplace=True)
    return df 

Encode NumberOfWindows as numeric by changing .'s to the mean number of windows and >= 10 to 10

In [5]:
def encode_num_windows(df):
    """ Encode num windows with mean windows for the ` .` and 10 for `>=10` """
    df['NumberOfWindows'] = df['NumberOfWindows'].apply(lambda num_w : 10 if num_w == '>=10' else num_w)
    df['NumberOfWindows'] = df['NumberOfWindows'].apply(lambda num_w : 0 if num_w == '   .' else num_w)
    df['NumberOfWindows'] = df['NumberOfWindows'].astype(float)
    mean_windows = df[df['NumberOfWindows'] != 0]['NumberOfWindows'].mean()
    df['NumberOfWindows'] = df['NumberOfWindows'].apply(lambda num_w : mean_windows if num_w == 0 else num_w)
    return df

Encode Geo_Code using FeatureHashing

In [6]:
def encode_geocode(df):
    """ Encode geo_code with hashing. """
    fh = FeatureHasher(n_features=1, input_type='string')
    hashed_features = fh.fit_transform(df['Geo_Code'].astype(str))
    hashed_features = hashed_features.toarray()
    df['Geo_Code_Encoded'] = hashed_features
    return df

Do all the preprocessing

In [7]:
def preprocess(df, train_or_test):
    """ Preprocess a train or test df:
        1. One hot encode features
        2. NumberOfWindows change . to mean, and >= 10 to 10
        3. Encode geo_codes
    """
    df = one_hot_encode(df)
    df = encode_num_windows(df)
    df = encode_geocode(df) 
    return df

In [8]:
df_train = preprocess(df_train, train_or_test="train")
df_test = preprocess(df_test, train_or_test="test")

Impute the Nans using IterativeImputer

In [9]:
imp_mean = IterativeImputer(random_state=SEED)
imp_mean.fit(df_train.drop(["Customer Id", "Claim", "Geo_Code"], axis=1))

IterativeImputer(random_state=42)

In [10]:
df_train_imputed = pd.DataFrame(imp_mean.transform(df_train.drop(["Customer Id", "Claim", "Geo_Code"], axis=1)), columns=df_train.columns.drop(["Customer Id", "Claim", "Geo_Code"]))
df_test_imputed = pd.DataFrame(imp_mean.transform(df_test.drop(["Customer Id", "Geo_Code"], axis=1)), columns=df_test.columns.drop(["Customer Id", "Geo_Code"]))

In [11]:
df_train = pd.concat([df_train_imputed, df_train[['Claim']]], axis=1)
df_test = pd.concat([df_test_imputed, df_test[['Customer Id']]], axis=1)

Create a validation set for use in ensembling

In [12]:
def split_into_train_and_val(df_train, val_prop = 0.2, seed = SEED):
    df_train, df_val = train_test_split(df_train, test_size=val_prop, stratify = df_train['Claim'], random_state = SEED)
    return df_train, df_val

In [13]:
df_train, df_val = split_into_train_and_val(df_train)

Create Xs and ys for model fitting

In [14]:
def create_X_and_ys(df_train, df_val, df_test):
    X_train, y_train = df_train.drop(['Claim'], axis=1), df_train['Claim']
    X_val, y_val = df_val.drop(['Claim'], axis=1), df_val['Claim']
    X_test = df_test.drop('Customer Id', axis=1)
    return X_train, y_train, X_val, y_val, X_test

In [15]:
X_train, y_train, X_val, y_val, X_test = create_X_and_ys(df_train, df_val, df_test)

## Model Building

In [16]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
import catboost as cat

### XGBoost (with a grid search over hyperparameters)

In [17]:
xg = xgb.XGBClassifier()

In [18]:
param_dist = {'max_depth': [1, 2, 3],
              'learning_rate': [0.1, 0.01],
              'subsample': [0.8, 1],
              'colsample_bytree' : [0.8, 1],
              'random_state' : [SEED],
              'n_jobs':[-1],
              'n_estimators':[100, 500, 1000]}

random_search = GridSearchCV(xg, param_grid=param_dist, scoring="roc_auc")
random_search.fit(X_train, y_train)

GridSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None, validate_parameter

In [19]:
xg = random_search.best_estimator_

### Light GBM (with a grid search over hyperparameters)

In [29]:
lg = lgb.LGBMClassifier()
param_dist = {'max_depth': [1,2,5, 10],
              'learning_rate': [0.5, 0.1, 0.05, 0.01],
               'min_data_in_leaf' : [5, 10,20,30],
              'random_state' : [SEED]}
random_search = GridSearchCV(lg, param_grid=param_dist, scoring="roc_auc")
random_search.fit(X_train, y_train)

GridSearchCV(estimator=LGBMClassifier(),
             param_grid={'learning_rate': [0.5, 0.1, 0.05, 0.01],
                         'max_depth': [1, 2, 5, 10],
                         'min_data_in_leaf': [5, 10, 20, 30],
                         'random_state': [42]},
             scoring='roc_auc')

In [30]:
lg = random_search.best_estimator_

### Random Forest (with a grid search over hyperparameters)

In [24]:
rf = RandomForestClassifier()
param_dist = {
               'max_depth':[10,20],
              'n_estimators' : [100, 500, 1000],
               'max_features' : ['sqrt', 'log2'],
                'n_jobs': [-1],
              'random_state' : [SEED]}
random_search = GridSearchCV(rf, param_grid=param_dist, scoring="roc_auc")
random_search.fit(X_train, y_train)

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'max_depth': [10, 20],
                         'max_features': ['sqrt', 'log2'],
                         'n_estimators': [100, 500, 1000], 'n_jobs': [-1],
                         'random_state': [42]},
             scoring='roc_auc')

In [25]:
rf = random_search.best_estimator_

### CatBoost (with a grid search over hyperparameters)

In [45]:
ct = cat.CatBoostClassifier()

param_dist = {'max_depth': [1,2],
              'learning_rate': [0.1, 0.2],
              'n_estimators' : [500],
              'random_state' : [SEED]}

n_iter_search = 4
random_search = RandomizedSearchCV(ct, param_distributions=param_dist,
                                   n_iter=n_iter_search)
random_search.fit(X_train, y_train)

0:	learn: 0.6539761	total: 1.4ms	remaining: 701ms
1:	learn: 0.6231185	total: 2.93ms	remaining: 730ms
2:	learn: 0.5973278	total: 4.42ms	remaining: 732ms
3:	learn: 0.5825551	total: 5.81ms	remaining: 721ms
4:	learn: 0.5684162	total: 6.91ms	remaining: 684ms
5:	learn: 0.5572917	total: 7.94ms	remaining: 654ms
6:	learn: 0.5452247	total: 9.04ms	remaining: 637ms
7:	learn: 0.5393361	total: 10.3ms	remaining: 631ms
8:	learn: 0.5311306	total: 11.5ms	remaining: 627ms
9:	learn: 0.5243481	total: 12.5ms	remaining: 610ms
10:	learn: 0.5206676	total: 13.6ms	remaining: 606ms
11:	learn: 0.5156845	total: 14.6ms	remaining: 595ms
12:	learn: 0.5121018	total: 15.7ms	remaining: 587ms
13:	learn: 0.5088740	total: 16.9ms	remaining: 587ms
14:	learn: 0.5052476	total: 18ms	remaining: 581ms
15:	learn: 0.5036043	total: 19.1ms	remaining: 579ms
16:	learn: 0.5021571	total: 20.4ms	remaining: 580ms
17:	learn: 0.5005960	total: 21.5ms	remaining: 576ms
18:	learn: 0.4983222	total: 22.5ms	remaining: 570ms
19:	learn: 0.4968429	tota

RandomizedSearchCV(estimator=<catboost.core.CatBoostClassifier object at 0x7fa259959c70>,
                   n_iter=4,
                   param_distributions={'learning_rate': [0.1, 0.2],
                                        'max_depth': [1, 2],
                                        'n_estimators': [500],
                                        'random_state': [42]})

In [32]:
ct = random_search.best_estimator_

## Ensemble the predictions

In [33]:
def get_ens_probs(models, X, weights):
    preds = [model.predict_proba(X)[:,1] for model in models]
    return [np.sum(np.array(weights) * np.array(pred)) for pred in zip(*preds)]

In [34]:
def get_val_auc(*models,weights=[1]):

    ens_probs = get_ens_probs(models, X=X_val,weights=weights)

    return roc_auc_score(y_val, ens_probs)

In [46]:
xg.fit(X_train, y_train)
lg.fit(X_train, y_train)
ct.fit(X_train, y_train)
rf.fit(X_train, y_train)

Learning rate set to 0.02025
0:	learn: 0.6839250	total: 3.2ms	remaining: 3.19s
1:	learn: 0.6751065	total: 4.91ms	remaining: 2.45s
2:	learn: 0.6667826	total: 6.5ms	remaining: 2.16s
3:	learn: 0.6586772	total: 8.09ms	remaining: 2.01s
4:	learn: 0.6510842	total: 9.73ms	remaining: 1.94s
5:	learn: 0.6443992	total: 11.3ms	remaining: 1.87s
6:	learn: 0.6373203	total: 12.9ms	remaining: 1.82s
7:	learn: 0.6305651	total: 14.5ms	remaining: 1.79s
8:	learn: 0.6236329	total: 16.1ms	remaining: 1.78s
9:	learn: 0.6176176	total: 17.7ms	remaining: 1.75s
10:	learn: 0.6117768	total: 19.3ms	remaining: 1.73s
11:	learn: 0.6061203	total: 20.8ms	remaining: 1.72s
12:	learn: 0.6007978	total: 22.4ms	remaining: 1.7s
13:	learn: 0.5952835	total: 23.9ms	remaining: 1.69s
14:	learn: 0.5908910	total: 25.1ms	remaining: 1.65s
15:	learn: 0.5858592	total: 26.6ms	remaining: 1.64s
16:	learn: 0.5811820	total: 28.1ms	remaining: 1.63s
17:	learn: 0.5769509	total: 29.9ms	remaining: 1.63s
18:	learn: 0.5727334	total: 31.7ms	remaining: 1.

RandomForestClassifier(max_depth=10, max_features='sqrt', n_estimators=1000,
                       n_jobs=-1, random_state=42)

Individual Validation AUCs

In [36]:
get_val_auc(ct)

0.6970623446687276

In [37]:
get_val_auc(xg)

0.6980701036552099

In [38]:
get_val_auc(lg)

0.6985550403103594

In [39]:
get_val_auc(rf)

0.6944444444444444

Ensemble Validation AUC

In [40]:
weights = [0.25, 0.25, 0.25, 0.25]

In [41]:
get_val_auc(xg, lg, rf, ct, weights=weights)

0.7013661574831787

Refit on train and validation data combined

In [42]:
X_all = pd.concat([X_train, X_val], axis=0)
y_all = pd.concat([y_train, y_val], axis=0)

In [43]:

xg.fit(X_all, y_all)
lg.fit(X_all, y_all)
rf.fit(X_all, y_all)
ct.fit(X_all, y_all)

models = [xg, lg, rf, ct]
preds = get_ens_probs(models, X=X_test, weights=weights)

0:	learn: 0.6607574	total: 1.93ms	remaining: 963ms
1:	learn: 0.6290892	total: 3.06ms	remaining: 762ms
2:	learn: 0.6033080	total: 4.16ms	remaining: 689ms
3:	learn: 0.5832697	total: 5.26ms	remaining: 653ms
4:	learn: 0.5660478	total: 6.38ms	remaining: 632ms
5:	learn: 0.5530103	total: 7.5ms	remaining: 618ms
6:	learn: 0.5432710	total: 8.64ms	remaining: 609ms
7:	learn: 0.5346096	total: 9.78ms	remaining: 602ms
8:	learn: 0.5271649	total: 11.3ms	remaining: 616ms
9:	learn: 0.5211634	total: 13.1ms	remaining: 643ms
10:	learn: 0.5166412	total: 15.3ms	remaining: 680ms
11:	learn: 0.5131105	total: 17ms	remaining: 690ms
12:	learn: 0.5101590	total: 18.7ms	remaining: 699ms
13:	learn: 0.5078799	total: 20.2ms	remaining: 700ms
14:	learn: 0.5058042	total: 21.7ms	remaining: 700ms
15:	learn: 0.5041896	total: 23.1ms	remaining: 699ms
16:	learn: 0.5024093	total: 24.4ms	remaining: 694ms
17:	learn: 0.5011405	total: 25.6ms	remaining: 685ms
18:	learn: 0.5002272	total: 26.7ms	remaining: 677ms
19:	learn: 0.4985444	tota

## Submission

In [44]:
df_sub = df_test[['Customer Id']]
df_sub = df_sub.assign(Claim=preds)
df_sub.head()

Unnamed: 0,Customer Id,Claim
0,H9737,0.157874
1,H8692,0.337506
2,H1199,0.244232
3,H10656,0.046465
4,H11022,0.097918


In [183]:
df_sub.to_csv("Cat.csv", index=False)