# Trimmed Sequential Data

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import copy
from Modules import *
sns.set()
%matplotlib inline
import tpot
import imblearn


  return f(*args, **kwds)
  return f(*args, **kwds)


### read in the full sequential data

In [2]:
df = pd.read_csv('Sequential_Data1.csv')
y = df['Y']

df.head().T

Unnamed: 0,0,1,2,3,4
LIMIT_BAL,20000.000000,120000.000000,90000.000000,50000.000000,50000.000000
AGE,24.000000,26.000000,34.000000,37.000000,57.000000
PAY_1,2.000000,-1.000000,0.000000,0.000000,-1.000000
PAY_2,2.000000,2.000000,0.000000,0.000000,0.000000
PAY_3,-1.000000,0.000000,0.000000,0.000000,-1.000000
PAY_4,-1.000000,0.000000,0.000000,0.000000,0.000000
PAY_5,-2.000000,0.000000,0.000000,0.000000,0.000000
PAY_6,-2.000000,2.000000,0.000000,0.000000,0.000000
BILL_AMT1,3913.000000,2682.000000,29239.000000,46990.000000,8617.000000
BILL_AMT2,3102.000000,1725.000000,14027.000000,48233.000000,5670.000000


In the `Naive_TPOT-Copy1.ipynb`, we didn't trim any of the intermediate features, in that there are still features whose values are used to create new features, which would inherently make all of these features too highly correlated with each other. So here, we will eliminate feautes used in calculation of the new feautures, which includes the pay_amt, bill_amt, and the outstanding balances.  

In [3]:
columns_drop = ['LIMIT_BAL', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4',
       'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3',
       'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'OUTSTANDING_BAL1',
       'OUTSTANDING_BAL2', 'OUTSTANDING_BAL3', 'OUTSTANDING_BAL4',
       'OUTSTANDING_BAL5', 'OUTSTANDING_BAL6', 'OUSTANDING_BAL_1_INDICATOR',
       'OUSTANDING_BAL_2_INDICATOR', 'OUSTANDING_BAL_3_INDICATOR',
       'OUSTANDING_BAL_4_INDICATOR', 'OUSTANDING_BAL_5_INDICATOR']

df_trimmed = df.drop(columns = columns_drop)

In [4]:
df_trimmed.head().T

Unnamed: 0,0,1,2,3,4
AGE,24.0,26.0,34.0,37.0,57.0
PAY_1,2.0,-1.0,0.0,0.0,-1.0
PAY_2,2.0,2.0,0.0,0.0,0.0
PAY_3,-1.0,0.0,0.0,0.0,-1.0
PAY_4,-1.0,0.0,0.0,0.0,0.0
PAY_5,-2.0,0.0,0.0,0.0,0.0
PAY_6,-2.0,2.0,0.0,0.0,0.0
Y,1.0,1.0,0.0,0.0,0.0
SEX_Female,1.0,1.0,1.0,1.0,0.0
SEX_Male,0.0,0.0,0.0,0.0,1.0


In [5]:
#df_trimmed.to_csv('Seqential_data_trimmed.csv', sep=',')

## Baseline evaluation

Using the baseline evaluation code in our `Modules.py` file

In [6]:
#evaluate using the baseline evaluator.
evaluate_baseline(df_trimmed, clf = 'Logistic')

10-fold f1 scores:
[0.43203883 0.4400978  0.45714286 0.49217002 0.46778043 0.47006652
 0.46511628 0.50717703 0.47906977 0.43349754]

corss-validation f1 score: 0.4644157079368223


## Evaluation of `Sequential_TPOT1.py` code

In [9]:
#build a pipeline with SMOTE and Sequential_TPOT1.py

from imblearn.pipeline import make_pipeline, Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import Binarizer, PolynomialFeatures
from sklearn.model_selection import KFold, cross_val_score, train_test_split

#read in the data, same as the TPOT pipeline
tpot_data = pd.read_csv('Sequential_data_trimmed.csv', sep=',', dtype=np.float64)
features = tpot_data.drop('Y', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['Y'].values, random_state=2019)

#instatiate pipeline sequences
#oversampling
smote = SMOTE(random_state = 2019, n_jobs=-1)

#steps from best tpot pipeline
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
binarizer = Binarizer(threshold=0.8)
clf = BernoulliNB(alpha=10.0, fit_prior=True)

#build the pipeline
pipeline = make_pipeline(smote, poly, binarizer, clf)
pipeline.fit(training_features, training_target)

#set criteria for cross_validation scoring
kf = KFold(n_splits = 10, random_state = 2019)

#store the scores in an array
scores = cross_val_score(pipeline, X = testing_features,  y= testing_target, cv=kf, scoring = 'f1')

print(scores)
print(f"cross-validation f1 score: {np.mean(scores)}")




[0.48979592 0.47651007 0.53687316 0.54237288 0.46905537 0.5129683
 0.49101796 0.54019293 0.4952381  0.5       ]
cross-validation f1 score: 0.5054024682839173


## Evaluation of basic XGBoost classifier

In [5]:
XGBoost_evaluate(df_trimmed)

10-fold f1 scores:
[0.42931937 0.48387097 0.44186047 0.45918367 0.49302326 0.39572193
 0.42592593 0.51515152 0.48430493 0.51648352]

corss-validation f1 score: 0.4644845549299378


## Create another TPOT run, using only XGBoost and RandomForests, ExtraTrees, GradientBoostingClassifier.

In [None]:
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

X = df.drop(columns = ['Y'])
y = df['Y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2019)

oversampler = SMOTE(random_state = 2019)
X_train_oversampled, y_train_oversampled = oversampler.fit_resample(X_train, y_train)

X_train_oversampled = pd.DataFrame(X_train_oversampled, columns = X_train.columns)
y_train_oversampled = pd.Series(y_train_oversampled)

#run tpot
tpot = TPOTClassifier(verbosity=2, n_jobs = -1, scoring = 'f1', random_state = 2019)
tpot.fit(X_train_oversampled, y_train_oversampled)
print(tpot.score(X_test, y_test))
tpot.export('tpot_sequential_data_trimmed.py')

  return f(*args, **kwds)


HBox(children=(IntProgress(value=0, description='Optimization Progress', max=10100, style=ProgressStyle(descri…



Generation 1 - Current best internal CV score: 0.8341928231513014
Generation 2 - Current best internal CV score: 0.8341928231513014
Generation 3 - Current best internal CV score: 0.8345135476152284
Generation 4 - Current best internal CV score: 0.8345135476152284
Generation 5 - Current best internal CV score: 0.8362626487166356


In [11]:
print(tpot.score(X_test, y_test))
tpot.export('tpot_mnist_pipeline.py')

0.822
