In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

In [None]:
# load the dataset
df = pd.read_csv('fraudTest.csv')
print(df.head())

   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2020-06-21 12:14:25  2291163933867244   
1           1   2020-06-21 12:14:33  3573030041201292   
2           2   2020-06-21 12:14:53  3598215285024754   
3           3   2020-06-21 12:15:15  3591919803438423   
4           4   2020-06-21 12:15:17  3526826139003047   

                               merchant        category    amt   first  \
0                 fraud_Kirlin and Sons   personal_care   2.86    Jeff   
1                  fraud_Sporer-Keebler   personal_care  29.84  Joanne   
2  fraud_Swaniawski, Nitzsche and Welch  health_fitness  41.28  Ashley   
3                     fraud_Haley Group        misc_pos  60.05   Brian   
4                 fraud_Johnston-Casper          travel   3.19  Nathan   

       last gender                       street  ...      lat      long  \
0   Elliott      M            351 Darlene Green  ...  33.9659  -80.9355   
1  Williams      F             3638 Marsh Union  ...  40.3207 

In [None]:
print(df.dtypes)

Unnamed: 0                 int64
trans_date_trans_time     object
cc_num                     int64
merchant                  object
category                  object
amt                      float64
first                     object
last                      object
gender                    object
street                    object
city                      object
state                     object
zip                        int64
lat                      float64
long                     float64
city_pop                   int64
job                       object
dob                       object
trans_num                 object
unix_time                  int64
merch_lat                float64
merch_long               float64
is_fraud                   int64
dtype: object


In [None]:
print(df.isnull().sum())

Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64


In [None]:
# Convert relevant columns to datetime formats, then split 'trans_date_trans_time' into separate 'trans_date' and 'trans_time' columns.
df['dob'] = pd.to_datetime(df['dob'])
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
df['trans_date'] = df['trans_date_trans_time'].dt.date
df['trans_date'] = pd.to_datetime(df['trans_date'])
df['trans_time'] = df['trans_date_trans_time'].dt.time
df['trans_time'] = pd.to_datetime(df['trans_time'], format='%H:%M:%S')

# drop the original column as it is not needed
df.drop(['trans_date_trans_time'], axis=1, inplace=True)

In [None]:
# removing not needed column
df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [None]:
# convert all categorical columns to numerical values
labelEncoder = LabelEncoder()

categorical_columns = df.select_dtypes(include=['object']).columns.tolist()

for column in categorical_columns:
    df[column] = labelEncoder.fit_transform(df[column])

df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,0,2291163933867244,319,10,2.86,151,115,1,341,...,33.9659,-80.9355,333497,275,376,98699,1371816865,33.986391,-81.200714,0
1,1,1,3573030041201292,591,10,29.84,163,457,0,354,...,40.3207,-110.436,302,392,760,108785,1371816873,39.450498,-109.960431,0
2,2,2,3598215285024754,611,5,41.28,24,249,0,865,...,40.6729,-73.5365,34496,259,421,433979,1371816893,40.49581,-74.196111,0
3,3,3,3591919803438423,222,9,60.05,42,457,1,320,...,28.5697,-80.8191,54767,407,718,71993,1371816915,28.812398,-80.883061,0
4,4,4,3526826139003047,292,13,3.19,247,261,1,548,...,44.2529,-85.017,1126,196,177,190585,1371816917,44.959148,-85.884734,0


In [None]:
df.dtypes

Unnamed: 0                 int64
trans_date_trans_time      int32
cc_num                     int64
merchant                   int32
category                   int32
amt                      float64
first                      int32
last                       int32
gender                     int32
street                     int32
city                       int32
state                      int32
zip                        int64
lat                      float64
long                     float64
city_pop                   int64
job                        int32
dob                        int32
trans_num                  int32
unix_time                  int64
merch_lat                float64
merch_long               float64
is_fraud                   int64
dtype: object

In [None]:
# Assigning feature variables to x and target variable to y
x = df.drop(['is_fraud'], axis=1)
y = df['is_fraud']

#Splitting dataset in test and train set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
print(x_test.shape)
print(y_test.shape)
print(x_train.shape)
print(y_train.shape)

(111144, 22)
(111144,)
(444575, 22)
(444575,)


In [None]:
# Convert datetime columns to unix timestamp for better simplicity
# Do for columns in x_train
for col in x_train.select_dtypes(include=['datetime']):
    x_train[col] = x_train[col].astype('int64') // 10**9

# Do the same for x_test
for col in x_test.select_dtypes(include=['datetime']):
    x_test[col] = x_test[col].astype('int64') // 10**9

In [None]:
# Initialize the logistic regression model
model = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)

# Train the model using the training data
model.fit(x_train, y_train)

# Make predictions on the test data
y_pred = model.predict(x_test)
y_prob = model.predict_proba(x_test)[:, 1]

# Evaluate the model using a confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

#Generating a detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Calculating ROC-AUC score
print("\nROC-AUC Score:")
print(roc_auc_score(y_test, y_prob))

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))


Confusion Matrix:
[[110718      0]
 [   426      0]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    110718
           1       0.00      0.00      0.00       426

    accuracy                           1.00    111144
   macro avg       0.50      0.50      0.50    111144
weighted avg       0.99      1.00      0.99    111144


ROC-AUC Score:
0.5


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
