In [1]:
import pandas as pd
import numpy as np
import warnings
pd.set_option('display.max_columns', 500)
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix, f1_score
from imblearn.over_sampling import RandomOverSampler
from xgboost import XGBClassifier 
import json
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the raw transaction data set from the relative ../data directory.
InitDf = pd.read_csv(filepath_or_buffer="../data/transaction_dataset.csv")

In [3]:
# Drop the 'Unnamed: 0' and 'Index' columns in place.
# NOTE(review): presumably row-number artefacts of the CSV export — confirm.
# This cell is not idempotent: running it twice raises KeyError.
InitDf.drop(columns=['Unnamed: 0','Index'],inplace=True)

In [4]:
def resample(X,Y):
    """Split the data 70/30 and balance the training part by random oversampling.

    Parameters
    ----------
    X : feature table accepted by ``train_test_split``.
    Y : labels aligned with ``X``; they are integer-encoded with a
        ``LabelEncoder`` before splitting.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)`` where only the training pair
        has been passed through ``RandomOverSampler``; the test pair is
        returned exactly as produced by the split.
    """
    encoded_labels = LabelEncoder().fit_transform(Y)

    # Fixed seed so the same rows land in the same partition on every run.
    x_train, x_test, y_train, y_test = train_test_split(
        X, encoded_labels, train_size=0.7, test_size=0.3, random_state=42
    )

    # Oversample after splitting so duplicated rows never reach the test set.
    oversampler = RandomOverSampler(sampling_strategy='minority', random_state=42)
    x_train_balanced, y_train_balanced = oversampler.fit_resample(x_train, y_train)

    return x_train_balanced, x_test, y_train_balanced, y_test

In [5]:
def scaler(data):
    """Standardize every column of ``data`` with a freshly fitted StandardScaler.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame to scale. The scaler is fitted on all rows passed in.
        NOTE(review): assumes every column is numeric — confirm at the caller.

    Returns
    -------
    pandas.DataFrame
        The scaled values, carrying the same column labels and the same row
        index as ``data``.
    """
    sc = StandardScaler()
    scaled = sc.fit_transform(data)
#     pickle.dump(sc, open('../outputs/scaler.pkl','wb'))
    # Keep the caller's index as well as its columns: rebuilding the frame
    # without it would silently renumber rows 0..n-1 and break any later
    # index-based alignment with the original data or labels.
    return pd.DataFrame(scaled, columns=data.columns, index=data.index)

In [6]:
def model_assesment(ground_truth,predictions):
    """Summarise binary-classification quality from a confusion matrix.

    Parameters
    ----------
    ground_truth : true labels.
    predictions : predicted labels, aligned with ``ground_truth``.

    Returns
    -------
    dict
        Keys ``'Sensitivity'``, ``'Specificity'``, ``'Precision'`` and
        ``'Recall'`` hold ratios computed from the confusion-matrix cells.
        ``'F1'`` holds ``f1_score`` multiplied by 100 and rounded to two
        decimals, i.e. it is on a percentage scale unlike the other entries.
        ``'Sensitivity'`` and ``'Recall'`` are the same quantity.
    """
    matrix = confusion_matrix(ground_truth, predictions)

    # Index [1, 1] is treated as the positive/positive cell, [0, 0] as negative/negative.
    true_pos = matrix[1, 1]
    true_neg = matrix[0, 0]
    false_pos = matrix[0, 1]
    false_neg = matrix[1, 0]

    # TP / (TP + FN) is reported under both of its common names.
    hit_rate = true_pos / float(true_pos + false_neg)

    metrics = {
        'Sensitivity': hit_rate,
        'Specificity': true_neg / float(true_neg + false_pos),
        'Precision': true_pos / float(true_pos + false_pos),
        'Recall': hit_rate,
        'F1': round(f1_score(ground_truth, predictions) * 100, 2),
    }
    return metrics

In [7]:
def CICD(data, f1_threshold=0.8):
    """Retrain an XGBoost classifier on ``data`` and report whether it is good enough.

    The function reads the expected feature names from
    ``../outputs/ColNames.txt``, normalises the column headers of ``data``,
    imputes missing values with per-column medians, scales the features,
    splits/oversamples via ``resample``, fits an ``XGBClassifier`` and scores
    it on the held-out part with ``model_assesment``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'FLAG'`` column (the target) and, after header
        normalisation, every column listed in ``ColNames.txt``.
    f1_threshold : float, default 0.8
        Minimum acceptable F1 expressed as a fraction in [0, 1].

    Returns
    -------
    dict
        The metrics from ``model_assesment`` plus the key
        ``'Manual Retraining Necessity'`` set to ``'NO'`` when F1 reaches the
        threshold and ``'YES'`` otherwise.
    """
    # One feature name per line. Empty lines are skipped instead of blindly
    # popping the last entry, so a file without a trailing newline does not
    # lose its final column.
    with open('../outputs/ColNames.txt') as column_names:
        train_columns = [name for name in column_names.read().split("\n") if name]

    target = data['FLAG']
    # Positional slice: skips the first two columns and the last one.
    # NOTE(review): presumably an identifier column, FLAG and a trailing
    # non-feature column — confirm against the CSV layout.
    independent = data.iloc[:, 2:-1]

    # Normalise headers (trim whitespace, spaces -> underscores) so they match
    # the names stored in ColNames.txt. rename() returns a new frame, which
    # avoids in-place modification of a slice of ``data``.
    independent = independent.rename(
        columns=lambda name: name.strip().replace(" ", "_")
    )

    independent = independent[train_columns]

    # Median imputation as a single assignment; per-column
    # ``fillna(..., inplace=True)`` is chained assignment and is not reliable
    # under pandas Copy-on-Write.
    independent = independent.fillna(independent.median())

    # NOTE(review): medians and scaling statistics are computed on the full
    # data set before resample() splits it, so held-out rows influence the
    # preprocessing. Consider fitting them on the training part only.
    normalize_independent = scaler(independent)

    x_train, x_test, y_train, y_test = resample(normalize_independent, target)

    xgbclassifier = XGBClassifier().fit(x_train, y_train)
#     pickle.dump(xgbclassifier, open('../outputs/xbg.pkl', "wb"))

    pred = xgbclassifier.predict(x_test)

    assesment = model_assesment(y_test, pred)

    # model_assesment reports F1 as a percentage (0-100), so the fractional
    # threshold is moved onto that scale before comparing. A single if/else
    # guarantees a verdict for every score, including exact equality.
    good_enough = assesment['F1'] >= f1_threshold * 100
    assesment['Manual Retraining Necessity'] = 'NO' if good_enough else 'YES'
    return assesment
    

In [8]:
# Despite its name, train_df holds the metrics dict returned by CICD, not a DataFrame.
train_df = CICD(data=InitDf)

In [9]:
# Display the evaluation summary returned by CICD in the previous cell.
train_df

{'Sensitivity': 0.9646153846153847,
 'Specificity': 0.9900130264871906,
 'Precision': 0.9646153846153847,
 'Recall': 0.9646153846153847,
 'F1': 96.46,
 'Manual Retraining Necessity': 'NO'}