In [15]:
import numpy as np
import pandas as pd
from pathlib import Path
import glob 
import os
import matplotlib.pyplot as plt 
import matplotlib
from datetime import datetime
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
import gc
from sklearn import preprocessing
from scipy.stats import linregress 
import pickle
from sklearn.model_selection import StratifiedKFold

In [13]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Return the mean of the normalized weighted Gini and the top-4% capture rate.

    Args:
        y_true: frame with a ``target`` column. Rows whose target equals 0 get
            weight 20, every other row gets weight 1.
        y_pred: frame with a ``prediction`` column; higher values rank first.

    The two frames are combined with ``pd.concat(axis='columns')``, so rows are
    paired by index label, not by position.

    Returns:
        ``0.5 * (G + D)``, where G is the weighted Gini of ``y_pred`` divided by
        the weighted Gini obtained when the labels themselves are used as the
        predictions, and D is the share of all ``target == 1`` rows that lie
        within the first 4% of cumulative weight after ranking.
    """

    def _rank_with_weights(labels, scores):
        # Pair labels with scores, order rows from highest to lowest score,
        # and attach the per-row weight used by both sub-metrics.
        ranked = pd.concat([labels, scores], axis='columns')
        ranked = ranked.sort_values('prediction', ascending=False)
        ranked['weight'] = ranked['target'].map(lambda label: 20 if label == 0 else 1)
        return ranked

    def _top_capture(labels, scores):
        ranked = _rank_with_weights(labels, scores)
        # Weight budget corresponding to 4% of the total weight (truncated to int).
        budget = int(0.04 * ranked['weight'].sum())
        within_budget = ranked['weight'].cumsum() <= budget
        positives = ranked['target'] == 1
        return positives[within_budget].sum() / positives.sum()

    def _gini(labels, scores):
        ranked = _rank_with_weights(labels, scores)
        weight = ranked['weight']
        # Cumulative share of weight seen so far (the "random" baseline curve).
        random_share = (weight / weight.sum()).cumsum()
        weighted_hits = ranked['target'] * weight
        # Cumulative share of weighted positives found so far.
        lorentz = weighted_hits.cumsum() / weighted_hits.sum()
        return ((lorentz - random_share) * weight).sum()

    # Using the labels as the predictions gives the Gini used for normalization.
    perfect_scores = y_true.rename(columns={'target': 'prediction'})
    gini_ratio = _gini(y_true, y_pred) / _gini(y_true, perfect_scores)
    capture_rate = _top_capture(y_true, y_pred)

    return 0.5 * (gini_ratio + capture_rate)

In [2]:
# Root of the competition archive. Set the AMEX_ARCHIVE_DIR environment variable to
# run the notebook on another machine; the original location remains the default.
path = Path(os.environ.get('AMEX_ARCHIVE_DIR', '/home/jovyan/workspace/amex-challenge/archive'))

In [7]:
# Read the feature table and the label table from feather files under `path`.
train_data_file = path / 'data/processed_train_data_v1.ftr'
train_labels_file = path / 'data/train_labels.ftr'

train_data = pd.read_feather(train_data_file)
train_labels = pd.read_feather(train_labels_file)

In [9]:
# Index both frames by customer_ID so the later `train_data.join(train_labels)`
# pairs rows on that key.
# Guarded on the index name: once the column has become the index, an unguarded
# set_index("customer_ID") raises KeyError, so the cell could not be re-run.
if train_data.index.name != "customer_ID":
    train_data = train_data.set_index("customer_ID")
if train_labels.index.name != "customer_ID":
    train_labels = train_labels.set_index("customer_ID")

In [10]:
# Attach the label column(s) to the features by index. A left join keeps every row
# of train_data; rows without a matching label would get NaN in the joined columns.
data = train_data.join(train_labels, how="left")

In [31]:
# Separate the label from the features. `y` stays a one-column DataFrame (not a
# Series) because amex_metric expects a frame with a `target` column.
target_column = "target"
y = data[[target_column]]
X = data.drop(columns=[target_column])
y

Unnamed: 0_level_0,target
customer_ID,Unnamed: 1_level_1
fd4c7bc1f25a81c1fb312163300b3c274290274db5bb9644bedbb6e93cb43b78,0
bc481708906edce95b12e8a5dae6388b4915297ff51335333726e9249c0bbdf5,1
1ac9d919f1d52dcf0948f9de5b103a2a73ed1ec307b8aa156f4398b40f33c850,1
50cdae209ecbbbc3ed7717e2a011185f3dcf9253778d409e47d8058b0f7d0426,0
a516564cbe710367599548f0fd4710ab2b549f36bb318c526d8439055187a320,0
...,...
d8aff595434eea892f97bc12fa24bed600b0d8cf71d579207db22a239d8fd513,0
a9466942532c1c68cbe20b3969acadea4faa6454baeaad2b23a6cfab9dfcd941,0
63e85b15549b1dedf5d69c4293b61953ab9d88ec29ba6e3224ec2633098b6487,1
ea48581777dd5c5c6603554d01558e7cefb02568b01bc8782e965a992d370651,0


In [None]:
# Stratified 4-fold cross-validation: one gradient-boosting model is trained per
# fold and collected in `models` (pickled by a later cell).
CV_SPLITS = 4
CV_SEED = 42  # fixed so repeated runs build the same trees

skf = StratifiedKFold(n_splits=CV_SPLITS)
models = []


def report_scores(model, features, labels, tag):
    """Print the MAE and the AMEX metric of `model` on `features`.

    Args:
        model: fitted classifier exposing `predict`, `predict_proba` and `classes_`.
        features: DataFrame of model inputs.
        labels: one-column DataFrame (`target`) sharing the index of `features`.
        tag: text inserted in the printed line, e.g. "validation" or "tot".

    Both prediction frames reuse `features.index`. amex_metric pairs labels and
    predictions with `pd.concat(axis='columns')`, i.e. by index label, so a frame
    with a default RangeIndex would not line up with the customer_ID-indexed labels
    and the metric would be computed on NaN-padded rows.
    """
    # Hard class labels: MAE on these is the misclassification rate for 0/1 targets.
    predicted_labels = pd.DataFrame(
        model.predict(features), index=features.index, columns=["prediction"]
    )
    # amex_metric ranks rows by `prediction`, so give it the probability of the
    # positive class (label 1) rather than 0/1 labels, which are almost all ties.
    positive_column = list(model.classes_).index(1)
    predicted_scores = pd.DataFrame(
        model.predict_proba(features)[:, positive_column],
        index=features.index,
        columns=["prediction"],
    )
    print(f"mae {tag}:", mean_absolute_error(labels, predicted_labels))
    print(f"amex metric {tag}:", amex_metric(labels, predicted_scores))


for train_index, test_index in skf.split(X, y.target):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model = GradientBoostingClassifier(verbose=True, random_state=CV_SEED)
    model.fit(X_train, y_train.target)
    models.append(model)

    # Held-out fold first, then the full data set (which includes the training rows,
    # so the "tot" figures are optimistic).
    report_scores(model, X_test, y_test, "validation")
    report_scores(model, X, y, "tot")
    

      Iter       Train Loss   Remaining Time 
         1           1.0463           34.39m


In [22]:
# Persist the list of per-fold models in a single pickle file.
# NOTE: the file name says "xgb", but the training cell fills `models` with
# scikit-learn GradientBoostingClassifier instances; the name is kept unchanged so
# anything that already loads this path keeps working.
filename = './models/xgb_stratified.sav'

# Create the target directory if needed so a long training run is not lost to a
# FileNotFoundError at save time.
os.makedirs(os.path.dirname(filename), exist_ok=True)

# The context manager flushes and closes the handle even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(models, model_file)