In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
import glob 
import os
import matplotlib.pyplot as plt 
import matplotlib
from datetime import datetime
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
import gc
from sklearn import preprocessing
from scipy.stats import linregress 

In [3]:
input_path = Path('/home/jovyan/workspace/amex-challenge/data')

In [4]:
train_data = pd.read_feather(input_path / 'train_data.ftr')

In [5]:
train_labels = pd.read_feather(input_path/ 'train_labels.ftr')

## Load Data

In [6]:
train_data = train_data.set_index("customer_ID")
train_labels = train_labels.set_index("customer_ID")

In [7]:
joined_data = train_data.join(train_labels, on="customer_ID")

In [8]:
_ = gc.collect()

# Handle Non-numerical Values

## Date Object to Integer

In [9]:
date_converted_data =  joined_data.copy()
date_converted_data["S_2"] = pd.to_datetime(joined_data["S_2"]).astype(int)/ 10**9

## Dropping Columns

In [None]:
to_drop = ["D_63", "D_64"]

## One-hot Encoding Columns

In [21]:
# just drop it for now
d63_encoded = date_converted_data.drop("D_63", axis=1)
d64_encoded = d63_encoded.drop("D_64", axis=1)

KeyError: 'D_63'

In [69]:
d63categories = date_converted_data["D_63"].value_counts().index.to_list()

TypeError: 'numpy.ndarray' object is not callable

In [67]:
for 
date_converted_data["D_63_CO"] = date_converted_data["D_63"].apply(lambda x: 0 if x=="CO" else 1)
date_converted_data["D_63_CR"] = date_converted_data["D_63"].apply(lambda x: 0 if x=="CO" else 1)

## Null Value Handling

In [30]:
null_handled_data = d64_encoded.fillna(0)

In [31]:
null_handled_data

Unnamed: 0_level_0,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,1.489018e+09,0.938469,0.001733,0.008724,1.006838,0.009228,0.124035,0.008771,0.004709,0.0,...,0.0,0.0,0.002427,0.003706,0.003818,0.0,0.000569,0.000610,0.002674,0
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,1.491523e+09,0.936665,0.005775,0.004923,1.000653,0.006151,0.126750,0.000798,0.002714,0.0,...,0.0,0.0,0.003954,0.003167,0.005032,0.0,0.009576,0.005492,0.009217,0
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,1.495930e+09,0.954180,0.091505,0.021655,1.009672,0.006815,0.123977,0.007598,0.009423,0.0,...,0.0,0.0,0.003269,0.007329,0.000427,0.0,0.003429,0.006986,0.002603,0
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,1.497312e+09,0.960384,0.002455,0.013683,1.002700,0.001373,0.117169,0.000685,0.005531,0.0,...,0.0,0.0,0.006117,0.004516,0.003200,0.0,0.008419,0.006527,0.009600,0
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,1.500163e+09,0.947248,0.002483,0.015193,1.000727,0.007605,0.117325,0.004653,0.009312,0.0,...,0.0,0.0,0.003671,0.004946,0.008889,0.0,0.001670,0.008126,0.009827,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6741ae09c0d1994fc3d82db5f1d0def2ddef2ce2eedbac1855d6a79bd20efb45,1.505347e+09,0.935964,0.065323,0.144130,0.118111,0.007775,0.179059,0.003264,0.112451,0.0,...,0.0,0.0,0.007345,0.002702,0.004573,0.0,0.008092,0.006998,0.004092,0
6741ae09c0d1994fc3d82db5f1d0def2ddef2ce2eedbac1855d6a79bd20efb45,1.509235e+09,0.984484,0.007234,0.151288,0.081358,0.008814,0.166747,0.008977,0.132621,0.0,...,0.0,0.0,0.001673,0.002989,0.006005,0.0,0.003539,0.008174,0.007820,0
6741ae09c0d1994fc3d82db5f1d0def2ddef2ce2eedbac1855d6a79bd20efb45,1.510099e+09,0.987328,0.005613,0.160568,0.084942,0.007989,0.170484,0.004548,0.130240,0.0,...,0.0,0.0,0.009667,0.000625,0.004987,0.0,0.001349,0.000387,0.009348,0
6741ae09c0d1994fc3d82db5f1d0def2ddef2ce2eedbac1855d6a79bd20efb45,1.513987e+09,0.952133,0.004498,0.117157,0.205511,0.006114,0.171415,0.002902,0.102681,0.0,...,0.0,0.0,0.001431,0.008863,0.005268,0.0,0.007844,0.004883,0.005416,0


## Coalescing Statements

In [19]:
co_data = null_handled_data.sort_values('S_2').groupby("customer_ID").tail(1)
co_data

Unnamed: 0_level_0,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6741ae09c0d1994fc3d82db5f1d0def2ddef2ce2eedbac1855d6a79bd20efb45,1.517270e+09,0.988497,0.002449,0.111594,0.161712,0.000048,0.171049,0.006524,0.094155,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
5575f0ac52adba2cf7d50bd439ab1d995cbe8455f98a3f17ece657d3987af637,1.519862e+09,0.393795,0.267857,0.364433,0.043562,0.002946,0.528196,0.159848,0.255350,0.149812,...,0.0,0.0,0.001388,0.001265,0.003240,0.000000,0.007173,0.005289,0.003837,1
2f75d7d737b1177ac50a9c85fa9320bb49ff60b150cc5a1cb7bbd7e93adc2762,1.519862e+09,0.995858,0.125595,0.023270,1.002422,0.001642,0.166702,0.007740,0.006197,0.000000,...,0.0,0.0,0.001546,0.000410,0.003658,0.000000,0.004105,0.002805,0.002660,0
09c16ea2443828d3594cfd71bc16fae3117d9984bd8454a833aa45fdb791404d,1.519862e+09,0.409347,0.008454,0.061898,0.811739,0.002153,0.218186,0.007712,0.017593,0.514809,...,0.0,0.0,1.004142,0.009367,0.872501,0.673629,1.005774,0.005064,0.915554,1
09c15997e04b58396cf64190ee254a3933e5411e1530a437a89808ec60477cdd,1.519862e+09,0.537817,0.215867,0.165552,0.193996,0.005643,0.143134,0.009226,0.119452,0.000000,...,0.0,0.0,0.007243,0.004295,0.006366,0.000000,0.006961,0.008826,0.005169,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5d8196ce0801d016057eefa4b5143baf7ac2ff89d6a6ea88aa410db63d10d22d,1.522454e+09,0.806488,0.245162,0.040643,1.005704,0.007031,0.083408,0.164064,0.009668,0.000000,...,0.0,0.0,0.006408,0.003713,0.004561,0.000000,0.007963,0.002290,0.009167,0
5a85acff5566766e33991a74998e956d4f4035c5ed3833a8a447bb116cccf649,1.522454e+09,0.031355,0.068570,0.406384,0.010538,0.500903,0.655850,0.001903,0.260901,0.077843,...,0.0,0.0,0.006872,0.008075,0.008933,0.000000,0.002180,0.007063,0.006520,1
090c7df4c2b6481e218780119f35b657db83370d8d091988fd1619dca5bf1874,1.522454e+09,0.604065,0.562667,0.047496,1.004274,0.001434,0.506520,0.000121,0.006500,0.000000,...,0.0,0.0,0.009273,0.006477,0.005610,0.000000,0.001974,0.007229,0.007651,0
531d4be7af11155139a1a8e32ed01477ba0fa73fee0aedb91617a1398d8d64c8,1.522454e+09,0.114801,0.445874,0.164222,0.038988,0.005417,0.615550,0.004005,0.564636,0.000000,...,0.0,0.0,0.000064,0.003073,0.007625,0.000000,0.003618,0.001056,0.002697,1


In [39]:
# slope_data = grouped_data.apply(lambda v: linregress(v.S_2, v.P_2)[0])
# slope_data

## Split Data

In [29]:
co_data["D_63"]

customer_ID
6741ae09c0d1994fc3d82db5f1d0def2ddef2ce2eedbac1855d6a79bd20efb45    CO
5575f0ac52adba2cf7d50bd439ab1d995cbe8455f98a3f17ece657d3987af637    CO
2f75d7d737b1177ac50a9c85fa9320bb49ff60b150cc5a1cb7bbd7e93adc2762    CO
09c16ea2443828d3594cfd71bc16fae3117d9984bd8454a833aa45fdb791404d    CO
09c15997e04b58396cf64190ee254a3933e5411e1530a437a89808ec60477cdd    CO
                                                                    ..
5d8196ce0801d016057eefa4b5143baf7ac2ff89d6a6ea88aa410db63d10d22d    CR
5a85acff5566766e33991a74998e956d4f4035c5ed3833a8a447bb116cccf649    CO
090c7df4c2b6481e218780119f35b657db83370d8d091988fd1619dca5bf1874    CO
531d4be7af11155139a1a8e32ed01477ba0fa73fee0aedb91617a1398d8d64c8    CO
622a266b9014d97b28dd504d8826153b1744b7fbd04b9c26273a42699fda90ca    CO
Name: D_63, Length: 185224, dtype: object

In [16]:
y = co_data.target
X = co_data.drop("target", axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [17]:
model = GradientBoostingClassifier(max_depth = 5, n_estimators=10, verbose=True)
model.fit(X_train, y_train)

ValueError: could not convert string to float: 'CO'