In [1]:
import copy
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn .metrics import roc_auc_score, f1_score

In [2]:
df = pd.read_csv("./Preprocessed.csv")
df_test = pd.read_csv("./Test_Preprocessed.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,isFraud
0,0,0.520514,0.001118,4,0.397735,0.188,0.381679,3,0.481752,3,...,0.75,260,0,1,1,1,1,0,829,0
1,1,0.428925,0.001526,4,0.397275,0.972,0.381679,3,0.905109,2,...,0.75,260,0,1,1,1,1,0,829,0
2,2,0.834197,0.01203,4,0.808577,0.14,0.381679,3,0.014599,2,...,0.75,260,0,1,1,1,1,0,829,0
3,3,0.570549,0.003404,4,0.930559,0.442,0.381679,4,0.919708,3,...,0.75,260,0,1,1,1,1,0,829,0
4,4,0.214968,0.001526,4,0.125201,0.78,0.381679,4,0.919708,3,...,0.75,260,0,1,1,1,1,0,829,0


In [4]:
df.drop(labels = ['Unnamed: 0'], axis = 1, inplace=True)

In [5]:
Y = df["isFraud"]
X = df.drop(labels = ["isFraud"], axis = 1)

In [6]:
X.head()

Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,0.520514,0.001118,4,0.397735,0.188,0.381679,3,0.481752,3,0.059091,...,11,0.75,260,0,1,1,1,1,0,829
1,0.428925,0.001526,4,0.397275,0.972,0.381679,3,0.905109,2,0.452273,...,11,0.75,260,0,1,1,1,1,0,829
2,0.834197,0.01203,4,0.808577,0.14,0.381679,3,0.014599,2,0.052273,...,11,0.75,260,0,1,1,1,1,0,829
3,0.570549,0.003404,4,0.930559,0.442,0.381679,4,0.919708,3,0.452273,...,11,0.75,260,0,1,1,1,1,0,829
4,0.214968,0.001526,4,0.125201,0.78,0.381679,4,0.919708,3,0.384091,...,11,0.75,260,0,1,1,1,1,0,829


# Logistic Regression Hyper Parameter Tuning using GridSearchCV

In [7]:
model = LogisticRegression(penalty='none', solver='newton-cg', C=14.38449888287663, max_iter=2000)

In [8]:
# param_grid = [    
#     {
#     'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
#     # 'C' : np.logspace(-2, 2, 20),
#     'solver' : ['lbfgs','newton-cg','liblinear','sag','saga'],
#     # 'max_iter' : [500,1000,2000]
#     }
# ]

In [9]:
# clf = GridSearchCV(model, param_grid = param_grid, cv = 3, verbose=True, n_jobs=-1)

In [10]:
# best_clf = clf.fit(X,Y)

In [11]:
# best_clf.best_estimator_

The penalty parameter is a form of regularization. There are several common types of regularization you see
    L2 regularization
    L1 regularization
    'elasticnet': both L1 and L2 penalty terms are added.
    none.

C:float, default=1.0
    A high value of C tells the model to give more weight to the training data. A lower value of C will indicate the model to give complexity more weight at the cost of fitting the data. Thus, a high Hyper Parameter value C indicates that training data is more important and reflects the real world data, whereas low value is just the opposite of this.

    Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization.

solver{‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, default=’lbfgs’

    Algorithm to use in the optimization problem. Default is ‘lbfgs’. To choose a solver, you might want to consider the following aspects:

            For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and ‘saga’ are faster for large ones;

            For multiclass problems, only ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’ handle multinomial loss;

            ‘liblinear’ is limited to one-versus-rest schemes.

max_iterint, default=100

    Maximum number of iterations taken for the solvers to converge.



In [12]:
model = LogisticRegression(penalty='l2', 
solver='newton-cg', 
C=14.38449888287663, 
max_iter=2000)

In [14]:
model.fit(X,Y)



# Checking f1-score

In [15]:
y_pred = model.predict(X)

In [16]:
X

Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,0.520514,0.001118,4,0.397735,0.188,0.381679,3,0.481752,3,0.059091,...,11,0.75,260,0,1,1,1,1,0,829
1,0.428925,0.001526,4,0.397275,0.972,0.381679,3,0.905109,2,0.452273,...,11,0.75,260,0,1,1,1,1,0,829
2,0.834197,0.012030,4,0.808577,0.140,0.381679,3,0.014599,2,0.052273,...,11,0.75,260,0,1,1,1,1,0,829
3,0.570549,0.003404,4,0.930559,0.442,0.381679,4,0.919708,3,0.452273,...,11,0.75,260,0,1,1,1,1,0,829
4,0.214968,0.001526,4,0.125201,0.780,0.381679,4,0.919708,3,0.384091,...,11,0.75,260,0,1,1,1,1,0,829
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128215,0.038745,0.003656,4,0.864222,0.762,0.381679,3,0.014599,2,0.511364,...,11,0.75,260,0,1,1,1,1,0,829
128216,0.321036,0.009386,1,0.145091,0.910,0.381679,4,0.919708,3,0.522727,...,99,1.00,163,3,2,0,0,2,2,1728
128217,0.613280,0.005346,4,0.168947,0.910,0.381679,3,0.124088,3,0.286364,...,11,0.75,260,0,1,1,1,1,0,829
128218,0.784404,0.000900,4,0.685905,0.522,0.381679,4,0.919708,3,0.854545,...,11,0.75,260,0,1,1,1,1,0,829


In [17]:
print(f'f1 Score - : {f1_score(y_pred,Y):.3f}')
print(f'ROC AUC Score - : {roc_auc_score(y_pred,Y):.3f}')

f1 Score - : 0.660
ROC AUC Score - : 0.806


In [18]:
df_test.drop(labels = ["Unnamed: 0"], axis = 1, inplace=True)

In [19]:
y_preds = model.predict(df_test)

In [20]:
CSV4 = pd.DataFrame(y_preds)
file = CSV4.to_csv("PredictionsLogistic2.csv")