In [1]:
import yaml
import pandas as pd
from sklearn.model_selection import train_test_split

from src.data.prepare_data import prepare_data
from src.data.utils import resample_data
from src.models.utils import train_splits, imbalanced_sampling
from src.models.model_selection import GridSearch
from src.models.classification import Classification
from src.models.feature_selection import FeatureSelection
from src.models.evaluation import Evaluation

import warnings
# Silence every warning for the rest of the session: no category or module
# filter is given, so the "ignore" action applies to all of them.
# NOTE(review): this also hides warnings raised by the libraries used in the
# cells below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# read config
# The encoding is pinned explicitly: without it open() falls back to the
# platform's locale encoding, which is not UTF-8 on every machine.
with open('config.yml', 'r', encoding='utf-8') as config_file:
    # safe_load(stream) is the shorthand for load(stream, Loader=SafeLoader)
    config = yaml.safe_load(config_file)
# drop the (already closed) handle so it does not linger in the notebook namespace
del config_file

In [3]:
# Read the CSV named in the config, pass it through the project's
# preparation step and preview the first rows.
df = prepare_data(df=pd.read_csv(config['data_loader']['path']))
display(df.head())

# Build the working sample via `resample_data` with pos_share=0.01
# (presumably the target share of positive labels; the recorded run shows a
# 99% / 1% label split afterwards).
df_sampled = resample_data(df=df, pos_share=0.01)

# Print the normalised label frequencies before and after resampling.
print(
    *(frame['label'].value_counts(normalize=True) for frame in (df, df_sampled))
    )

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,label
0,-0.260648,-0.469648,2.496266,-0.083724,0.129681,0.732898,0.519014,-0.130006,0.727159,0.637735,...,-0.110552,0.217606,-0.134794,0.165959,0.12628,-0.434824,-0.08123,-0.151045,17982.1,0
1,0.9851,-0.356045,0.558056,-0.429654,0.27714,0.428605,0.406466,-0.133118,0.347452,0.529808,...,-0.194936,-0.605761,0.079469,-0.577395,0.19009,0.296503,-0.248052,-0.064512,6531.37,0
2,-0.260272,-0.949385,1.728538,-0.457986,0.074062,1.419481,0.743511,-0.095576,-0.261297,0.690708,...,-0.00502,0.702906,0.945045,-1.154666,-0.605564,-0.312895,-0.300258,-0.244718,2513.54,0
3,-0.152152,-0.508959,1.74684,-1.090178,0.249486,1.143312,0.518269,-0.06513,-0.205698,0.575231,...,-0.146927,-0.038212,-0.214048,-1.893131,1.003963,-0.51595,-0.165316,0.048424,5384.44,0
4,-0.20682,-0.16528,1.527053,-0.448293,0.106125,0.530549,0.658849,-0.21266,1.049921,0.968046,...,-0.106984,0.729727,-0.161666,0.312561,-0.414116,1.071126,0.023712,0.419117,14278.97,0


label
0    0.5
1    0.5
Name: proportion, dtype: float64 label
0    0.99
1    0.01
Name: proportion, dtype: float64


In [4]:
# split dataset into train and test sets
# The feature matrix is built by dropping the target by name instead of by
# position, so the target cannot end up in X if 'label' is ever not the last
# column. The target itself was already selected by name.
X_train, X_test, y_train, y_test = train_test_split(
    df_sampled.drop(columns='label')
    , df_sampled['label']
    , test_size=config['train_test_split']['test_size']
    , random_state=123  # fixed seed so the split is reproducible
    , shuffle=True
    , stratify=df_sampled['label']  # stratify on the label column
    )

# check class distributions
print(
    y_train.value_counts(normalize=True)
    , y_test.value_counts(normalize=True)
    )

label
0    0.99
1    0.01
Name: proportion, dtype: float64 label
0    0.990011
1    0.009989
Name: proportion, dtype: float64


In [5]:
# Oversample the training part only; X_test / y_test are not passed in and
# therefore stay as they came out of the split.
X_train_rs, y_train_rs = imbalanced_sampling(
    X_train=X_train
    , y_train=y_train
    , method='over'
)

# Print the normalised label frequencies before and after oversampling.
print(
    *(labels.value_counts(normalize=True) for labels in (y_train, y_train_rs))
    )

label
0    0.99
1    0.01
Name: proportion, dtype: float64 label
0    0.5
1    0.5
Name: proportion, dtype: float64


In [6]:
# Partition the oversampled training data with `train_splits`, driven by the
# 'train_test_split' section of the config.
train = train_splits(X_train_rs, y_train_rs, config['train_test_split'])

# Show the normalised frequencies of the last column (the label) per split.
[
    train[split_id].iloc[:, -1].value_counts(normalize=True)
    for split_id in train.keys()
]

[label
 1    0.500002
 0    0.499998
 Name: proportion, dtype: float64,
 label
 0    0.500002
 1    0.499998
 Name: proportion, dtype: float64]

In [7]:
# Search over the configured algorithms and hyperparameters on `train[1]`:
# the last column is passed as the target, all preceding columns as features.
search_split = train[1]
grid_search = GridSearch(config=config['optimization'])
grid_search.fit(X=search_split.iloc[:, :-1], y=search_split.iloc[:, -1])

# Report the best score stored for each entry of the search results.
for algorithm, outcome in grid_search.results.items():
    print(algorithm, '-', outcome['best_score'])

LogisticRegression - 0.9415288343282342
DecisionTreeClassifier - 0.958099756490161
RandomForestClassifier - 0.9557736263514643
XGBClassifier - 0.9996260242261192


In [8]:
# Greedy feature selection, run on `train[2]`.
# The estimator is rebuilt from the grid-search winner and its hyperparameters.
clf = Classification(
    algorithm=grid_search.best_algorithm, **grid_search.best_hyperparams
)

# Last column is passed as the target, all preceding columns as features.
selection_split = train[2]
feature_selection = FeatureSelection(
    X=selection_split.iloc[:, :-1]
    , y=selection_split.iloc[:, -1]
)
feature_selection.stats_test()
feature_selection.wrapper(clf=clf, config=config['optimization'])

# Keep the resulting feature names and show them as the cell output.
feats = feature_selection.features
feats

Index(['V4', 'V14', 'V17', 'V27'], dtype='object')

In [9]:
# fit best algorithm on most important features of training data
# A fresh estimator is built from the grid-search winner and fitted on the
# oversampled training data restricted to the selected features.
clf = Classification(
    algorithm=grid_search.best_algorithm
    , **grid_search.best_hyperparams
    )
clf.fit(X=X_train_rs[feats], y=y_train_rs)

# test set evaluation
# The object is named `evaluation` rather than `eval` so that the builtin
# eval() is not shadowed for the rest of the kernel session.
evaluation = Evaluation(clf=clf, threshold=0.5)
# Last expression of the cell: its return value is rendered as the cell output.
evaluation.fit(
    X_train=X_train_rs[feats], y_train=y_train_rs
    , X_test=X_test[feats], y_test=y_test
    )

Unnamed: 0,metric,train,test
0,accuracy,0.99982,0.99852
1,precision,0.99964,0.91724
2,recall,1.0,0.93662
3,f1_score,0.99982,0.92683
4,TP,267397.0,133.0
5,TN,267301.0,14062.0
6,FP,96.0,12.0
7,FN,0.0,9.0
