In [202]:
import pandas as pd
import numpy as np
import warnings
import pickle
from sklearn.ensemble import (RandomForestClassifier,
                              GradientBoostingClassifier,
                              AdaBoostClassifier) 
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier

from sklearn.model_selection import (train_test_split as tts,
                                     GridSearchCV as gsv,
                                     RandomizedSearchCV as rsv,
                                     cross_validate as cv)
from sklearn import metrics
from sklearn.preprocessing import (OneHotEncoder,
                                   LabelEncoder,
                                   MinMaxScaler,
                                   StandardScaler,
                                   RobustScaler,
                                   PowerTransformer,
                                   OrdinalEncoder)

from sklearn.compose import ColumnTransformer
from sklearn.impute import (KNNImputer,
                            SimpleImputer)
from sklearn.pipeline import (Pipeline,
                              make_pipeline)

from sklearn import metrics

import warnings

warnings.filterwarnings("ignore")



import sklearn

In [203]:

def calculate_metrics(model, X, y):
        y_pred = model.predict(X)

        model_metrics = {
        'accuracy': metrics.accuracy_score(y, y_pred),
        'balanced_accuracy': metrics.balanced_accuracy_score(y, y_pred),
        'precision': metrics.precision_score(y, y_pred),
        'recall': metrics.recall_score(y, y_pred),
        'f1': metrics.f1_score(y, y_pred),
    }

        try:
            y_pred_proba = model.predict_proba(X)[:,1]
            model_metrics['roc-auc'] = metrics.roc_auc_score(y, y_pred_proba)
            return model_metrics
        except:
            return model_metrics
        


def date_engineering(df: pd.DataFrame)->pd.DataFrame:
    df["date"]=pd.to_datetime(df["transaction_date"])
    df["date"]=pd.to_datetime(df["transaction_date"]).dt.date
    df["year"]=pd.to_datetime(df["transaction_date"]).dt.year
    df["month"]=pd.to_datetime(df["transaction_date"]).dt.month
    df["day"]=pd.to_datetime(df["transaction_date"]).dt.day
    df["weekday"]=pd.to_datetime(df["transaction_date"]).dt.weekday
    df["hour"]=pd.to_datetime(df["transaction_date"]).dt.hour
    df["min"]=pd.to_datetime(df["transaction_date"]).dt.minute

    

    return df

In [204]:
df=pd.read_html(r"https://gist.github.com/cloudwalk-tests/76993838e65d7e0f988f40f1b1909c97#file-transactional-sample-csv")[0]
df=df.drop(columns=["Unnamed: 0","transaction_id"])
df.head()
df.dropna(subset="device_id", inplace=True)
df

Unnamed: 0,merchant_id,user_id,card_number,transaction_date,transaction_amount,device_id,has_cbk
0,29744,97051,434505******9116,2019-12-01T23:16:32.812632,374.56,285475.0,False
1,92895,2708,444456******4210,2019-12-01T22:45:37.873639,734.87,497105.0,True
4,54075,64367,650487******6116,2019-12-01T21:30:53.347051,55.36,860232.0,False
5,59566,40759,516292******8220,2019-12-01T21:25:53.374213,60.49,192705.0,False
6,20917,25661,650485******9310,2019-12-01T21:25:19.532243,318.37,760932.0,False
...,...,...,...,...,...,...,...
2756,17348,57594,406168******4222,2019-11-11T23:14:56.480571,2771.39,645413.0,False
2763,95558,44280,650491******3107,2019-11-11T21:24:34.346913,151.21,743254.0,False
2846,26765,84730,527497******8763,2019-11-10T16:15:05.469610,561.34,723309.0,False
2866,91972,24644,464297******6840,2019-11-10T01:37:02.447645,136.27,977260.0,False


In [231]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2369 entries, 0 to 2920
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   merchant_id         2369 non-null   int64  
 1   user_id             2369 non-null   int64  
 2   card_number         2369 non-null   object 
 3   transaction_amount  2369 non-null   float64
 4   device_id           2369 non-null   float64
 5   has_cbk             2369 non-null   int64  
 6   date                2369 non-null   object 
 7   year                2369 non-null   int64  
 8   month               2369 non-null   int64  
 9   day                 2369 non-null   int64  
 10  weekday             2369 non-null   int64  
 11  hour                2369 non-null   int64  
 12  min                 2369 non-null   int64  
dtypes: float64(2), int64(9), object(2)
memory usage: 259.1+ KB


In [205]:
df=date_engineering(df)
# encoding my has_cbk for 0->False 1->True
df["has_cbk"]=df["has_cbk"].apply(lambda x :0 if x == False else 1)
df.drop(columns="transaction_date", inplace=True)

In [206]:
num_col=df.select_dtypes("number").columns.to_list()
cat_col=[col for col in df if col not in num_col]
cat_col=[
    #'card_number', 
    #'date'
    ]

In [207]:
num_col=[
'merchant_id',
 'user_id',
 'transaction_amount',
 'device_id',
 #'has_cbk',
 'year',
 'month',
 'day',
 'weekday',
 'hour',
 'min'
 ]
x=num_col+cat_col
y='has_cbk'

In [208]:
x_train, x_test, y_train, y_test=tts(df[x], df[y], test_size=0.25, random_state=101, stratify=df[y])
     

In [209]:
num_trans=Pipeline([
    ("imputer",SimpleImputer(strategy="median")),
    ("scaler", RobustScaler())
])

cat_trans=Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore",drop="if_binary", sparse_output=False))
])
     
preprocessing=ColumnTransformer([
    ("num", num_trans, num_col),
    ("cat", cat_trans, cat_col)
])

pipe=Pipeline([
    ("preprocess", preprocessing),
    ("model", LogisticRegression(class_weight="balanced"))
])
pipe

In [210]:
pipe=Pipeline([
    ("preprocess", preprocessing),
    ("model", LogisticRegression(class_weight="balanced"))
])
pipe

In [211]:

pipe.fit(x_train, y_train)

In [212]:
calculate_metrics(pipe,x_test, y_test)

{'accuracy': 0.718381112984823,
 'balanced_accuracy': 0.6810257523148149,
 'precision': 0.2712765957446808,
 'recall': 0.6296296296296297,
 'f1': 0.37918215613382894,
 'roc-auc': 0.7450810185185185}

In [213]:
pipe.set_params(model=RandomForestClassifier(class_weight="balanced")).fit(x_train,y_train)

In [214]:
calculate_metrics(pipe,x_test, y_test)

{'accuracy': 0.9055649241146712,
 'balanced_accuracy': 0.6751060956790124,
 'precision': 0.8787878787878788,
 'recall': 0.35802469135802467,
 'f1': 0.5087719298245613,
 'roc-auc': 0.8873577353395061}

In [215]:
models={
    "dummy":DummyClassifier(),
    "lr":LogisticRegression(class_weight="balanced"),
    "rf":RandomForestClassifier(class_weight="balanced", random_state=101),
    "gbc":GradientBoostingClassifier(random_state=101),
    "ada":AdaBoostClassifier(random_state=101)
}
     

In [216]:
results=[]
for model_name, modelo in models.items():
  print(modelo)
  pipe.set_params(model=modelo).fit(x_train, y_train)
  result=calculate_metrics(pipe,x_test, y_test)
  results.append(result)
  print(results)

DummyClassifier()
[{'accuracy': 0.863406408094435, 'balanced_accuracy': 0.5, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'roc-auc': 0.5}]
LogisticRegression(class_weight='balanced')
[{'accuracy': 0.863406408094435, 'balanced_accuracy': 0.5, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'roc-auc': 0.5}, {'accuracy': 0.718381112984823, 'balanced_accuracy': 0.6810257523148149, 'precision': 0.2712765957446808, 'recall': 0.6296296296296297, 'f1': 0.37918215613382894, 'roc-auc': 0.7450810185185185}]
RandomForestClassifier(class_weight='balanced', random_state=101)
[{'accuracy': 0.863406408094435, 'balanced_accuracy': 0.5, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'roc-auc': 0.5}, {'accuracy': 0.718381112984823, 'balanced_accuracy': 0.6810257523148149, 'precision': 0.2712765957446808, 'recall': 0.6296296296296297, 'f1': 0.37918215613382894, 'roc-auc': 0.7450810185185185}, {'accuracy': 0.9021922428330523, 'balanced_accuracy': 0.6783492476851851, 'precision': 0.8108108108108109, 'recall': 0.370

In [217]:

df_results=pd.DataFrame(results).T
df_results.columns=["dummy","lr","rf","gbc","ada"]
df_results

Unnamed: 0,dummy,lr,rf,gbc,ada
accuracy,0.863406,0.718381,0.902192,0.892074,0.878583
balanced_accuracy,0.5,0.681026,0.678349,0.651705,0.659481
precision,0.0,0.271277,0.810811,0.742857,0.591837
recall,0.0,0.62963,0.37037,0.320988,0.358025
f1,0.0,0.379182,0.508475,0.448276,0.446154
roc-auc,0.5,0.745081,0.887719,0.855903,0.81684


In [218]:
models={
    "dummy":DummyClassifier(),
    "lr":LogisticRegression(class_weight="balanced"),
    "rf":RandomForestClassifier(class_weight="balanced", random_state=101),
    "gbc":GradientBoostingClassifier(random_state=101),
    "ada":AdaBoostClassifier(random_state=101)
}
cv_results=[]
for model_name, modelo in models.items():
  print(modelo)
  pipe.set_params(model=modelo).fit(x_train,y_train)
  cross=cv(pipe, x_train, y_train, n_jobs=-1, cv=5,scoring=["precision", "accuracy", "f1","recall", "roc_auc" ])
  cross_result=pd.DataFrame(cross).mean().to_frame(model_name).T
  cv_results.append(cross_result)
     

DummyClassifier()


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


LogisticRegression(class_weight='balanced')
RandomForestClassifier(class_weight='balanced', random_state=101)
GradientBoostingClassifier(random_state=101)
AdaBoostClassifier(random_state=101)


In [219]:
df_cross=pd.concat(cv_results)
df_cross

Unnamed: 0,fit_time,score_time,test_precision,test_accuracy,test_f1,test_recall,test_roc_auc
dummy,0.042094,0.205497,0.0,0.863176,0.0,0.0,0.5
lr,0.101525,0.07155,0.297106,0.74043,0.408316,0.654507,0.769603
rf,1.047537,0.09369,0.837645,0.899766,0.472077,0.333163,0.896838
gbc,0.955937,0.040616,0.737229,0.899207,0.521722,0.406973,0.880791
ada,0.53712,0.120408,0.606124,0.88119,0.440915,0.349575,0.847239


In [220]:

params={
    "model":[
    DummyClassifier(),
    LogisticRegression(class_weight="balanced"),
    RandomForestClassifier(class_weight="balanced", random_state=101),
    GradientBoostingClassifier(random_state=101),
    AdaBoostClassifier(random_state=101)
    ]
}

In [221]:

random=rsv(pipe, params, cv=5, scoring="recall", verbose=1)
random.fit(x_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [222]:
random.best_estimator_

In [223]:
df_random=pd.DataFrame(random.cv_results_).sort_values("rank_test_score")
df_random

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,0.17547,0.217043,0.015453,0.005747,LogisticRegression(class_weight='balanced'),{'model': LogisticRegression(class_weight='bal...,0.55102,0.625,0.729167,0.714286,0.653061,0.654507,0.064389,1
3,0.650984,0.088804,0.012286,0.003101,GradientBoostingClassifier(random_state=101),{'model': GradientBoostingClassifier(random_st...,0.469388,0.3125,0.395833,0.346939,0.510204,0.406973,0.073764,2
4,0.335245,0.018817,0.039577,0.007581,AdaBoostClassifier(random_state=101),{'model': AdaBoostClassifier(random_state=101)},0.408163,0.25,0.395833,0.22449,0.469388,0.349575,0.095382,3
2,0.843046,0.37203,0.028208,0.005548,RandomForestClassifier(class_weight='balanced'...,{'model': RandomForestClassifier(class_weight=...,0.44898,0.25,0.375,0.265306,0.326531,0.333163,0.073114,4
0,0.123317,0.06233,0.029673,0.008001,DummyClassifier(),{'model': DummyClassifier()},0.0,0.0,0.0,0.0,0.0,0.0,0.0,5


In [224]:
df_random.loc[:,~df_random.columns.str.contains("split|time")]

Unnamed: 0,param_model,params,mean_test_score,std_test_score,rank_test_score
1,LogisticRegression(class_weight='balanced'),{'model': LogisticRegression(class_weight='bal...,0.654507,0.064389,1
3,GradientBoostingClassifier(random_state=101),{'model': GradientBoostingClassifier(random_st...,0.406973,0.073764,2
4,AdaBoostClassifier(random_state=101),{'model': AdaBoostClassifier(random_state=101)},0.349575,0.095382,3
2,RandomForestClassifier(class_weight='balanced'...,{'model': RandomForestClassifier(class_weight=...,0.333163,0.073114,4
0,DummyClassifier(),{'model': DummyClassifier()},0.0,0.0,5


In [225]:
param_grid={
    "model__penalty":["l1","l2"],
    "model__solver":["liblinear",'lbfgs','newton-cg'],
    "model__max_iter":[100,1000,2500],
    }

pipe=Pipeline([
    ("preprocess", preprocessing),
    ("model", LogisticRegression(class_weight="balanced"))
])
pipe

final_random=gsv(pipe,param_grid=param_grid, cv=5, scoring="recall",n_jobs=1,verbose=5)

final_random.fit(x_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END model__max_iter=100, model__penalty=l1, model__solver=liblinear;, score=0.571 total time=   0.1s
[CV 2/5] END model__max_iter=100, model__penalty=l1, model__solver=liblinear;, score=0.646 total time=   0.3s
[CV 3/5] END model__max_iter=100, model__penalty=l1, model__solver=liblinear;, score=0.688 total time=   0.1s
[CV 4/5] END model__max_iter=100, model__penalty=l1, model__solver=liblinear;, score=0.714 total time=   0.2s
[CV 5/5] END model__max_iter=100, model__penalty=l1, model__solver=liblinear;, score=0.653 total time=   0.1s
[CV 1/5] END model__max_iter=100, model__penalty=l1, model__solver=lbfgs;, score=nan total time=   0.0s
[CV 2/5] END model__max_iter=100, model__penalty=l1, model__solver=lbfgs;, score=nan total time=   0.0s
[CV 3/5] END model__max_iter=100, model__penalty=l1, model__solver=lbfgs;, score=nan total time=   0.0s
[CV 4/5] END model__max_iter=100, model__penalty=l1, model__solver=lbfgs;, sc

In [226]:

df_random_final=pd.DataFrame(final_random.cv_results_).set_index("rank_test_score").sort_index()
df_random_final.loc[:,~df_random_final.columns.str.contains("split|time")].head(1)

Unnamed: 0_level_0,param_model__max_iter,param_model__penalty,param_model__solver,params,mean_test_score,std_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,2500,l2,newton-cg,"{'model__max_iter': 2500, 'model__penalty': 'l...",0.654507,0.064389


In [227]:
pipe=Pipeline([
    ("preprocess", preprocessing),
    ("model", LogisticRegression(class_weight="balanced", 
                                 max_iter=2500, penalty="l2",solver="newton-cg"))
])
pipe

In [228]:
pipe.fit(x_train, y_train)

In [229]:
calculate_metrics(pipe,x_test, y_test)

{'accuracy': 0.718381112984823,
 'balanced_accuracy': 0.6810257523148149,
 'precision': 0.2712765957446808,
 'recall': 0.6296296296296297,
 'f1': 0.37918215613382894,
 'roc-auc': 0.7450810185185185}

In [230]:
with open('model.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)