In [117]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
%matplotlib inline

In [119]:
train = pd.read_csv('training_data')

In [121]:
X_train = train.drop('output', axis = 1)

In [123]:
y_train = train['output']

In [125]:
test = pd.read_csv('testing_data')

In [127]:
X_test = test.drop('output', axis = 1)

In [129]:
y_test = test['output']

In [131]:
from sklearn.linear_model import LogisticRegression

In [133]:
log_model = LogisticRegression()

In [135]:
log_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [137]:
predictions = log_model.predict(X_test)

In [139]:
from sklearn.metrics import confusion_matrix, classification_report

In [145]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.00      0.00      0.00      2145

    accuracy                           1.00    555719
   macro avg       0.50      0.50      0.50    555719
weighted avg       0.99      1.00      0.99    555719



In [147]:
print(confusion_matrix(y_test, predictions))

[[553211    363]
 [  2145      0]]


### It seems that the model has problem with identifying the credit card frauds. Let's train it on balanced data instead of unbalanced one

In [150]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
undersample = RandomUnderSampler(sampling_strategy='majority')
X_under_train, y_under_train = undersample.fit_resample(X_train, y_train)
print("Undersampled class distribution:", Counter(y_under_train))

Undersampled class distribution: Counter({0: 7506, 1: 7506})


In [154]:
log_model1 = LogisticRegression()

In [156]:
log_model1.fit(X_under_train, y_under_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [158]:
predictions1 = log_model1.predict(X_test)

In [160]:
print(classification_report(y_test, predictions1))

              precision    recall  f1-score   support

           0       1.00      0.96      0.98    553574
           1       0.06      0.74      0.12      2145

    accuracy                           0.96    555719
   macro avg       0.53      0.85      0.55    555719
weighted avg       1.00      0.96      0.97    555719



In [162]:
print(confusion_matrix(y_test, predictions1))

[[530220  23354]
 [   555   1590]]


### Using balanced Data greatly increased rate of predictions of Fraud transactions, let's try changing the solver functions of models, in hope of better results

In [206]:
log_model2 = LogisticRegression(solver= 'saga')

In [208]:
log_model2.fit(X_under_train, y_under_train)



In [209]:
predictions2 = log_model2.predict(X_test)

In [211]:
print(classification_report(y_test, predictions2))

              precision    recall  f1-score   support

           0       1.00      0.79      0.88    553574
           1       0.01      0.77      0.03      2145

    accuracy                           0.79    555719
   macro avg       0.51      0.78      0.45    555719
weighted avg       1.00      0.79      0.88    555719



In [213]:
print(confusion_matrix(y_test, predictions2))

[[434967 118607]
 [   495   1650]]


In [216]:
log_model3 = LogisticRegression(solver= 'newton-cg')
log_model3.fit(X_under_train, y_under_train)

  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


In [218]:
predictions3 = log_model3.predict(X_test)

In [220]:
print(classification_report(y_test, predictions3))
print(confusion_matrix(y_test, predictions3))

              precision    recall  f1-score   support

           0       1.00      0.93      0.96    553574
           1       0.04      0.73      0.07      2145

    accuracy                           0.93    555719
   macro avg       0.52      0.83      0.52    555719
weighted avg       1.00      0.93      0.96    555719

[[514641  38933]
 [   569   1576]]


### Let's try increasing the number of iterations. We cannot scale the data since it also involves Hot one encoding

In [234]:
log_model4 = LogisticRegression(solver= 'newton-cg', max_iter= 10000)
log_model4.fit(X_under_train, y_under_train)

  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


In [236]:
predictions4 = log_model4.predict(X_test)

In [238]:
print(classification_report(y_test, predictions4))
print(confusion_matrix(y_test, predictions4))

              precision    recall  f1-score   support

           0       1.00      0.93      0.96    553574
           1       0.04      0.74      0.07      2145

    accuracy                           0.93    555719
   macro avg       0.52      0.83      0.52    555719
weighted avg       1.00      0.93      0.96    555719

[[512936  40638]
 [   564   1581]]


### Not much useful, now lets try letting go of hot one encoding, and then scaling the data

In [246]:
X_under_train1 = X_under_train.drop(['gender','entertainment', 'food_dining', 'gas_transport', 'grocery_net','grocery_pos', 'health_fitness', 'home', 'kids_pets', 'misc_net','misc_pos', 'personal_care', 'shopping_net', 'shopping_pos', 'travel'], axis = 1)

In [248]:
X_test1 = X_test.drop(['gender','entertainment', 'food_dining', 'gas_transport', 'grocery_net','grocery_pos', 'health_fitness', 'home', 'kids_pets', 'misc_net','misc_pos', 'personal_care', 'shopping_net', 'shopping_pos', 'travel'], axis = 1)

In [292]:
X_under_train1.columns

Index(['amt', 'lat', 'long', 'city_pop', 'merch_lat', 'merch_long',
       'trans_year', 'trans_month', 'trans_date', 'trans_hour', 'dob_year'],
      dtype='object')

In [250]:
from sklearn.preprocessing import StandardScaler

In [252]:
scaler = StandardScaler()

In [258]:
scaler.fit_transform(X_under_train1)

array([[-0.53161667, -0.37493469, -0.21275287, ...,  1.49974695,
        -0.63526693,  0.45357344],
       [-0.7803617 , -0.43047439, -0.57194631, ..., -1.56829505,
        -0.397875  , -1.08292427],
       [-0.59282814,  0.86324626,  0.9201297 , ..., -1.00013912,
         0.67038866, -0.53417509],
       ...,
       [-0.77291056, -1.58369168, -0.06044344, ...,  0.59069747,
        -1.3474427 ,  1.22182229],
       [-0.74233154, -1.31107235, -0.90500103, ...,  0.59069747,
        -1.22874674, -0.15005066],
       [-0.77301739, -1.58369168, -0.06044344, ...,  0.59069747,
        -1.22874674,  1.22182229]])

In [284]:
log_model5 = LogisticRegression( max_iter=2500)
log_model5.fit(X_under_train1, y_under_train)
predictions5 = log_model5.predict(X_test1)
print(classification_report(y_test, predictions5))
print(confusion_matrix(y_test, predictions5))

              precision    recall  f1-score   support

           0       1.00      0.97      0.98    553574
           1       0.08      0.74      0.14      2145

    accuracy                           0.96    555719
   macro avg       0.54      0.85      0.56    555719
weighted avg       1.00      0.96      0.98    555719

[[534571  19003]
 [   559   1586]]


In [286]:
log_model6 = LogisticRegression(solver= 'saga', max_iter=2500)
log_model6.fit(X_under_train1, y_under_train)
predictions6 = log_model6.predict(X_test1)
print(classification_report(y_test, predictions6))
print(confusion_matrix(y_test, predictions6))



              precision    recall  f1-score   support

           0       1.00      0.98      0.99    553574
           1       0.11      0.70      0.19      2145

    accuracy                           0.98    555719
   macro avg       0.56      0.84      0.59    555719
weighted avg       1.00      0.98      0.99    555719

[[541665  11909]
 [   653   1492]]


In [290]:
log_model7 = LogisticRegression(solver= 'newton-cg', max_iter=2500)
log_model7.fit(X_under_train1, y_under_train)
predictions7 = log_model7.predict(X_test1)
print(classification_report(y_test, predictions7))
print(confusion_matrix(y_test, predictions7))

  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(


              precision    recall  f1-score   support

           0       1.00      0.97      0.98    553574
           1       0.08      0.74      0.14      2145

    accuracy                           0.96    555719
   macro avg       0.54      0.85      0.56    555719
weighted avg       1.00      0.96      0.98    555719

[[534526  19048]
 [   560   1585]]


In [298]:
from sklearn.metrics import roc_auc_score

In [302]:
print(roc_auc_score(y_test,predictions7))

0.8522593042205597


In [304]:
print(roc_auc_score(y_test,predictions6))

0.8370290816220358


In [306]:
print(roc_auc_score(y_test,predictions5))

0.8525330494261477


In [308]:
print(roc_auc_score(y_test,predictions4))

0.8318263487100895


In [310]:
print(roc_auc_score(y_test,predictions3))

0.8322008403910733


In [312]:
print(roc_auc_score(y_test,predictions2))

0.7774869790183009


In [314]:
print(roc_auc_score(y_test,predictions1))

0.849535533129777


In [316]:
print(roc_auc_score(y_test,predictions))

0.4996721305552645


## From above models, it is evident that the model which gives the best outcome (based on ROC-AUC) is the model5 (with 'lbfgs' solver and scaled data)
