In [1]:
# Dependencies
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter("ignore")
import psycopg2
from psycopg2 import sql

from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from imblearn.over_sampling import RandomOverSampler

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# READ Data from Database
#### Here we are going to read data from the database and creating a Dataframe

In [2]:
# Define your PostgreSQL connection parameters
db_params = {
    "host": "localhost",
    "database": "credit_card_data",
    "user": "postgres",
    "password": ""
}

cols = ["Transaction ID", "Date", "Day of Week", "Time", "Type of Card", "Entry Mode",
        "Amount", "Type of Transaction", "Merchant Group", "Country of Transaction",
        "Shipping Address", "Country of Residence", "Gender", "Age", "Bank", "Fraud"]
column_list = [
    ('transaction_id', 'varchar(9)'),
    ('date', 'varchar(9)'),
    ('day_of_week', 'varchar(9)'),
    ('time', 'int4'),
    ('type_of_card', 'varchar(10)'),
    ('entry_mode', 'varchar(3)'),
    ('amount', 'varchar'),
    ('type_of_transaction', 'varchar(6)'),
    ('merchant_group', 'varchar(13)'),
    ('country_of_transaction', 'varchar(14)'),
    ('shipping_address', 'varchar(14)'),
    ('country_of_residence', 'varchar(14)'),
    ('gender', 'varchar(1)'),
    ('age', 'numeric(4,1)'),
    ('bank', 'varchar(8)'),
    ('fraud', 'bit(1)')
]

# Establish a connection to the database
connection = psycopg2.connect(**db_params)
print("Connected Successfully")

# Create a cursor object
cursor = connection.cursor()

# Generate a SQL query to select the desired columns
select_columns = ", ".join([f'"{col_name}"' for col_name, _ in column_list])
query = f"SELECT {select_columns} FROM public.credit_card_data"

# Execute the query
cursor.execute(query)

# Fetch all rows from the result set
rows = cursor.fetchall()

# Create a DataFrame from the fetched data
df = pd.DataFrame(rows, columns=[col_name for col_name in cols])

# Close the cursor and the database connection
cursor.close()
connection.close()
print("DataFrame Generated Successfully")



Connected Successfully
DataFrame Generated Successfully


In [3]:
df.head()

Unnamed: 0,Transaction ID,Date,Day of Week,Time,Type of Card,Entry Mode,Amount,Type of Transaction,Merchant Group,Country of Transaction,Shipping Address,Country of Residence,Gender,Age,Bank,Fraud
0,#3577 209,14-Oct-20,Wednesday,19,Visa,Tap,Â£5,POS,Entertainment,United Kingdom,United Kingdom,United Kingdom,M,25.2,RBS,0
1,#3039 221,14-Oct-20,Wednesday,17,MasterCard,PIN,Â£288,POS,Services,USA,USA,USA,F,49.6,Lloyds,0
2,#2694 780,14-Oct-20,Wednesday,14,Visa,Tap,Â£5,POS,Restaurant,India,India,India,F,42.2,Barclays,0
3,#2640 960,13-Oct-20,Tuesday,14,Visa,Tap,Â£28,POS,Entertainment,United Kingdom,India,United Kingdom,F,51.0,Barclays,0
4,#2771 031,13-Oct-20,Tuesday,23,Visa,CVC,Â£91,Online,Electronics,USA,USA,United Kingdom,M,38.0,Halifax,1


In [4]:
# Reading and importing data from postgres database table
CreditCardData_df = df
CreditCardData_df.head()

Unnamed: 0,Transaction ID,Date,Day of Week,Time,Type of Card,Entry Mode,Amount,Type of Transaction,Merchant Group,Country of Transaction,Shipping Address,Country of Residence,Gender,Age,Bank,Fraud
0,#3577 209,14-Oct-20,Wednesday,19,Visa,Tap,Â£5,POS,Entertainment,United Kingdom,United Kingdom,United Kingdom,M,25.2,RBS,0
1,#3039 221,14-Oct-20,Wednesday,17,MasterCard,PIN,Â£288,POS,Services,USA,USA,USA,F,49.6,Lloyds,0
2,#2694 780,14-Oct-20,Wednesday,14,Visa,Tap,Â£5,POS,Restaurant,India,India,India,F,42.2,Barclays,0
3,#2640 960,13-Oct-20,Tuesday,14,Visa,Tap,Â£28,POS,Entertainment,United Kingdom,India,United Kingdom,F,51.0,Barclays,0
4,#2771 031,13-Oct-20,Tuesday,23,Visa,CVC,Â£91,Online,Electronics,USA,USA,United Kingdom,M,38.0,Halifax,1


---------------

## Cleaning and Preparation

--------------

In [5]:
# Figuring out the type of info in this data
CreditCardData_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 16 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Transaction ID          100000 non-null  object
 1   Date                    100000 non-null  object
 2   Day of Week             100000 non-null  object
 3   Time                    100000 non-null  int64 
 4   Type of Card            100000 non-null  object
 5   Entry Mode              100000 non-null  object
 6   Amount                  100000 non-null  object
 7   Type of Transaction     100000 non-null  object
 8   Merchant Group          100000 non-null  object
 9   Country of Transaction  100000 non-null  object
 10  Shipping Address        100000 non-null  object
 11  Country of Residence    100000 non-null  object
 12  Gender                  100000 non-null  object
 13  Age                     100000 non-null  object
 14  Bank                    100000 non-nu

In [6]:
# Number of rows and columns before cleaning
CreditCardData_df.shape

(100000, 16)

In [7]:
# Remove rows with zero values
CreditCardData_df_clean = CreditCardData_df[(CreditCardData_df['Transaction ID'] != 0) & (CreditCardData_df['Date'] != 0) & (CreditCardData_df['Day of Week'] != 0) & (CreditCardData_df['Time'] != 0) &
                                            (CreditCardData_df['Type of Card'] != 0) & (CreditCardData_df['Entry Mode'] != 0) & (CreditCardData_df['Amount'] != 0) & (CreditCardData_df['Type of Transaction'] != 0) &
                                            (CreditCardData_df['Merchant Group'] != 0) & (CreditCardData_df['Country of Transaction'] != 0) & (CreditCardData_df['Shipping Address'] != 0) &
                                            (CreditCardData_df['Country of Residence'] != 0) & (CreditCardData_df['Gender'] != 0) & (CreditCardData_df['Age'] != 0) & (CreditCardData_df['Bank'] != 0)]

In [8]:
CreditCardData_df_clean.shape

(99491, 16)

In [9]:
CreditCardData_df_clean = CreditCardData_df_clean.replace([np.inf, -np.inf], np.nan).dropna()
CreditCardData_df_clean.shape

(99491, 16)

In [10]:
# Figuring out the type of info in this data
CreditCardData_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99491 entries, 0 to 99999
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Transaction ID          99491 non-null  object
 1   Date                    99491 non-null  object
 2   Day of Week             99491 non-null  object
 3   Time                    99491 non-null  int64 
 4   Type of Card            99491 non-null  object
 5   Entry Mode              99491 non-null  object
 6   Amount                  99491 non-null  object
 7   Type of Transaction     99491 non-null  object
 8   Merchant Group          99491 non-null  object
 9   Country of Transaction  99491 non-null  object
 10  Shipping Address        99491 non-null  object
 11  Country of Residence    99491 non-null  object
 12  Gender                  99491 non-null  object
 13  Age                     99491 non-null  object
 14  Bank                    99491 non-null  object
 15  Fr

In [11]:
# Number of rows and columns after cleaning
CreditCardData_df_clean.shape

(99491, 16)

In [12]:
CreditCardData_df_clean.shape[0]

99491

In [13]:
print("We've got rid of " + str(CreditCardData_df.shape[0] - CreditCardData_df_clean.shape[0]) + " rows with NaN or nulls")

We've got rid of 509 rows with NaN or nulls


In [14]:
# Checking data for nulls
nulls = CreditCardData_df_clean.isnull()
nulls

Unnamed: 0,Transaction ID,Date,Day of Week,Time,Type of Card,Entry Mode,Amount,Type of Transaction,Merchant Group,Country of Transaction,Shipping Address,Country of Residence,Gender,Age,Bank,Fraud
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
99996,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
99997,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
99998,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [15]:
nulls.value_counts()

Transaction ID  Date   Day of Week  Time   Type of Card  Entry Mode  Amount  Type of Transaction  Merchant Group  Country of Transaction  Shipping Address  Country of Residence  Gender  Age    Bank   Fraud
False           False  False        False  False         False       False   False                False           False                   False             False                 False   False  False  False    99491
dtype: int64

In [16]:
num_nulls = CreditCardData_df_clean.isnull().sum()
num_nulls

Transaction ID            0
Date                      0
Day of Week               0
Time                      0
Type of Card              0
Entry Mode                0
Amount                    0
Type of Transaction       0
Merchant Group            0
Country of Transaction    0
Shipping Address          0
Country of Residence      0
Gender                    0
Age                       0
Bank                      0
Fraud                     0
dtype: int64

#### So no more nulls

In [17]:
CreditCardData_df_clean.head(2)

Unnamed: 0,Transaction ID,Date,Day of Week,Time,Type of Card,Entry Mode,Amount,Type of Transaction,Merchant Group,Country of Transaction,Shipping Address,Country of Residence,Gender,Age,Bank,Fraud
0,#3577 209,14-Oct-20,Wednesday,19,Visa,Tap,Â£5,POS,Entertainment,United Kingdom,United Kingdom,United Kingdom,M,25.2,RBS,0
1,#3039 221,14-Oct-20,Wednesday,17,MasterCard,PIN,Â£288,POS,Services,USA,USA,USA,F,49.6,Lloyds,0


In [18]:
# Figuring out type of data these columns hold
CreditCardData_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99491 entries, 0 to 99999
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Transaction ID          99491 non-null  object
 1   Date                    99491 non-null  object
 2   Day of Week             99491 non-null  object
 3   Time                    99491 non-null  int64 
 4   Type of Card            99491 non-null  object
 5   Entry Mode              99491 non-null  object
 6   Amount                  99491 non-null  object
 7   Type of Transaction     99491 non-null  object
 8   Merchant Group          99491 non-null  object
 9   Country of Transaction  99491 non-null  object
 10  Shipping Address        99491 non-null  object
 11  Country of Residence    99491 non-null  object
 12  Gender                  99491 non-null  object
 13  Age                     99491 non-null  object
 14  Bank                    99491 non-null  object
 15  Fr

#### Now all columns hold equally same number of data

In [19]:
# There is "#" next to each number, it should be removed 1st then converting column data type from Object to integer
CreditCardData_df_clean["Transaction ID"].head()

0    #3577 209
1    #3039 221
2    #2694 780
3    #2640 960
4    #2771 031
Name: Transaction ID, dtype: object

----------------

### 1- Fixing data in "Transaction ID" column

----------------

In [20]:
# 1st getting rid of '#' in the "Transaction ID"
CreditCardData_df_clean.loc[:, 'Transaction ID'] = CreditCardData_df_clean['Transaction ID'].str.replace('#', '')
CreditCardData_df_clean['Transaction ID'].head()

0    3577 209
1    3039 221
2    2694 780
3    2640 960
4    2771 031
Name: Transaction ID, dtype: object

In [21]:
CreditCardData_df_clean.head(2)

Unnamed: 0,Transaction ID,Date,Day of Week,Time,Type of Card,Entry Mode,Amount,Type of Transaction,Merchant Group,Country of Transaction,Shipping Address,Country of Residence,Gender,Age,Bank,Fraud
0,3577 209,14-Oct-20,Wednesday,19,Visa,Tap,Â£5,POS,Entertainment,United Kingdom,United Kingdom,United Kingdom,M,25.2,RBS,0
1,3039 221,14-Oct-20,Wednesday,17,MasterCard,PIN,Â£288,POS,Services,USA,USA,USA,F,49.6,Lloyds,0


In [22]:
CreditCardData_df_clean['Transaction ID'].dtype

dtype('O')

In [23]:
# 2nd getting rid of 'space' in the "Transaction ID"
CreditCardData_df_clean.loc[:, 'Transaction ID'] = CreditCardData_df_clean['Transaction ID'].str.replace(' ', '')
CreditCardData_df_clean['Transaction ID'].head()

0    3577209
1    3039221
2    2694780
3    2640960
4    2771031
Name: Transaction ID, dtype: object

In [24]:
# 3rd converting the data in CreditCardData_df_clean['Transaction ID'] into integer
CreditCardData_df_clean['Transaction ID'] = pd.to_numeric(CreditCardData_df_clean['Transaction ID'])
CreditCardData_df_clean['Transaction ID'].dtype

dtype('int64')

---------

### 2- Fixing data in "Date" column

_______

In [25]:
# Converting "Date" column from object to date
CreditCardData_df_clean['Date'] = pd.to_datetime(CreditCardData_df_clean['Date'], format='%d-%b-%y')
CreditCardData_df_clean['Date'].head()

0   2020-10-14
1   2020-10-14
2   2020-10-14
3   2020-10-13
4   2020-10-13
Name: Date, dtype: datetime64[ns]

In [26]:
# Now checking "Date" data type
CreditCardData_df_clean['Date'].dtypes

dtype('<M8[ns]')

--------------

### 3- Converting data type in column "Amount" from object to integer

--------------

In [27]:
# a) Getting rid of "£" 
CreditCardData_df_clean['Amount'] = CreditCardData_df_clean['Amount'].str.replace('Â£', '')
CreditCardData_df_clean['Amount'].head()

0      5
1    288
2      5
3     28
4     91
Name: Amount, dtype: object

In [28]:
# b) Converting data type in column "Amount" to Integer from Object
CreditCardData_df_clean['Amount'] = pd.to_numeric(CreditCardData_df_clean['Amount'])
CreditCardData_df_clean['Amount'].dtype

dtype('float64')

We can not convert data in column "Amount" into integer unless we get rid of non-finite and NaN values.


In [29]:
# Figuring out if there is NaN values in colum "Amount"
nan_count = CreditCardData_df_clean.isna().sum()
print(nan_count)

Transaction ID            0
Date                      0
Day of Week               0
Time                      0
Type of Card              0
Entry Mode                0
Amount                    6
Type of Transaction       0
Merchant Group            0
Country of Transaction    0
Shipping Address          0
Country of Residence      0
Gender                    0
Age                       0
Bank                      0
Fraud                     0
dtype: int64


In [30]:
# Figuring out if there is non-finite values in colum "Amount"
inf_count = CreditCardData_df_clean.isin([np.inf, -np.inf]).sum()
print(inf_count)

Transaction ID            0
Date                      0
Day of Week               0
Time                      0
Type of Card              0
Entry Mode                0
Amount                    0
Type of Transaction       0
Merchant Group            0
Country of Transaction    0
Shipping Address          0
Country of Residence      0
Gender                    0
Age                       0
Bank                      0
Fraud                     0
dtype: int64


In [31]:
# Getting rid of NA
CreditCardData_df_clean = CreditCardData_df_clean.dropna()
CreditCardData_df_clean.shape

(99485, 16)

In [32]:
CreditCardData_df_clean['Amount'] = CreditCardData_df_clean['Amount'].astype(np.int64)
CreditCardData_df_clean['Amount'].dtype

dtype('int64')

-----------

### 4- Converting data type in "Age" into Integer from Float

----------

In [33]:
# Rounding it 1st
CreditCardData_df_clean['Age'] = CreditCardData_df_clean['Age'].astype(float)
CreditCardData_df_clean['Age'] = round(CreditCardData_df_clean['Age'], 0)
CreditCardData_df_clean['Age'].head()

0    25.0
1    50.0
2    42.0
3    51.0
4    38.0
Name: Age, dtype: float64

In [34]:
# 2nd converting it
CreditCardData_df_clean['Age'] = pd.to_numeric(CreditCardData_df_clean['Age'])
CreditCardData_df_clean['Age'] .dtype

dtype('float64')

In [35]:
CreditCardData_df_clean['Age'] = CreditCardData_df_clean['Age'].astype(np.int64)
CreditCardData_df_clean['Age'].dtype

dtype('int64')

---------------------

In [36]:
# Checking all the data types
CreditCardData_df_clean.dtypes

Transaction ID                     int64
Date                      datetime64[ns]
Day of Week                       object
Time                               int64
Type of Card                      object
Entry Mode                        object
Amount                             int64
Type of Transaction               object
Merchant Group                    object
Country of Transaction            object
Shipping Address                  object
Country of Residence              object
Gender                            object
Age                                int64
Bank                              object
Fraud                             object
dtype: object

In [37]:
# Or like this
CreditCardData_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99485 entries, 0 to 99999
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Transaction ID          99485 non-null  int64         
 1   Date                    99485 non-null  datetime64[ns]
 2   Day of Week             99485 non-null  object        
 3   Time                    99485 non-null  int64         
 4   Type of Card            99485 non-null  object        
 5   Entry Mode              99485 non-null  object        
 6   Amount                  99485 non-null  int64         
 7   Type of Transaction     99485 non-null  object        
 8   Merchant Group          99485 non-null  object        
 9   Country of Transaction  99485 non-null  object        
 10  Shipping Address        99485 non-null  object        
 11  Country of Residence    99485 non-null  object        
 12  Gender                  99485 non-null  object

_________________

### Total rows were removed

In [38]:
print("We've got ridden of " + str(CreditCardData_df.shape[0] - CreditCardData_df_clean.shape[0]) + " rows with NaN or nulls")

We've got ridden of 515 rows with NaN or nulls


-------------------

### Data Conversion Done

------------------------

---------------

## Creating Machine Learning Model

--------------

+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

### Logistical Regression 1st Model before Balancing the Training Data

+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

In [39]:
CreditCardData_df_clean['Fraud'].value_counts()

0    92802
1     6683
Name: Fraud, dtype: int64

In [40]:
print('No of samples belongs to Safe Transactions:', CreditCardData_df_clean['Fraud'].value_counts()[0])
print('No of samples belongs to Fraudalent Transactions:', CreditCardData_df_clean['Fraud'].value_counts()[1])

No of samples belongs to Safe Transactions: 92802
No of samples belongs to Fraudalent Transactions: 6683


In [41]:
print( str(round(CreditCardData_df_clean['Fraud'].value_counts()[1] / CreditCardData_df_clean['Fraud'].value_counts()[0] * 100, 2)) + "% of data is fraud so it's unbalanced")

7.2% of data is fraud so it's unbalanced


##### It is clearly observed that the data is highly imbalanced. There are 92785 records belonging to one group and only 7192 records are belonging to other group.

### Split the Data into Training and Testing Sets

In [42]:
# Getting necessary data.
columns = list(CreditCardData_df_clean.columns[2:])
df_unbalanced = CreditCardData_df_clean[columns]
df_unbalanced.head()

Unnamed: 0,Day of Week,Time,Type of Card,Entry Mode,Amount,Type of Transaction,Merchant Group,Country of Transaction,Shipping Address,Country of Residence,Gender,Age,Bank,Fraud
0,Wednesday,19,Visa,Tap,5,POS,Entertainment,United Kingdom,United Kingdom,United Kingdom,M,25,RBS,0
1,Wednesday,17,MasterCard,PIN,288,POS,Services,USA,USA,USA,F,50,Lloyds,0
2,Wednesday,14,Visa,Tap,5,POS,Restaurant,India,India,India,F,42,Barclays,0
3,Tuesday,14,Visa,Tap,28,POS,Entertainment,United Kingdom,India,United Kingdom,F,51,Barclays,0
4,Tuesday,23,Visa,CVC,91,Online,Electronics,USA,USA,United Kingdom,M,38,Halifax,1


### Data Preprocessing and Encoding

In [43]:
# Encoding Categorical values
df_string = df_unbalanced.select_dtypes(include=['object'])
for i in df_string.columns:
    df_unbalanced[i] = LabelEncoder().fit_transform(df_unbalanced[i])
df_unbalanced.head()

Unnamed: 0,Day of Week,Time,Type of Card,Entry Mode,Amount,Type of Transaction,Merchant Group,Country of Transaction,Shipping Address,Country of Residence,Gender,Age,Bank,Fraud
0,3,19,1,2,5,2,3,4,5,4,2,25,7,0
1,3,17,0,1,288,2,9,3,4,3,1,50,4,0
2,3,14,1,2,5,2,8,1,2,1,1,42,0,0
3,2,14,1,2,28,2,3,4,2,4,1,51,0,0
4,2,23,1,0,91,1,2,3,4,4,2,38,3,1


### Defining the target variable

##### Creating the labels set (`y`)  from the “Fraud” column, and then create the features (`X`) DataFrame from the remaining columns.

In [44]:
# Separate the data into labels and features
# Separate the X variable, the features
features = list(df_unbalanced.columns[0:-1])
x = df_unbalanced[features]

print(x.isnull().sum())

# Separate the y variable, the labels
y = df_unbalanced['Fraud']

Day of Week               0
Time                      0
Type of Card              0
Entry Mode                0
Amount                    0
Type of Transaction       0
Merchant Group            0
Country of Transaction    0
Shipping Address          0
Country of Residence      0
Gender                    0
Age                       0
Bank                      0
dtype: int64


### Spliting Imbalanced data

In [45]:
x_train, x_test, y_train, y_test = train_test_split(x, y , 
                                                    test_size = 0.2, 
                                                    random_state=42)

print('Number of Records in Traning data:', x_train.shape[0])
print('Number of columns in Traning data:', x_train.shape[1])
print('Number of Records in Testing data:', x_test.shape[0])
print('Number of columns in Testing data:', x_test.shape[1])

Number of Records in Traning data: 79588
Number of columns in Traning data: 13
Number of Records in Testing data: 19897
Number of columns in Testing data: 13


### Model Training on Imbalanced Data

In [46]:
model = LogisticRegression()
model.fit(x_train, y_train)
y_predict = model.predict(x_test)

### Model Evaluation on Imbalanced Data
* Accuracy
* Confusion matrix
* Classification Report

#### Accuracy

In [47]:
acc  = round(100 * accuracy_score(y_test, y_predict))
print(f'Accuracy of Model on Test data: {acc}%')

Accuracy of Model on Test data: 95%


#### Confusion matrix

In [48]:
print('\nThe confusion matrix for unbalanced data is:\n')
print(confusion_matrix(y_test, y_predict))


The confusion matrix for unbalanced data is:

[[18300   222]
 [  684   691]]


#### Classification Report

In [49]:
print('\nClassification report on Unbalanced data:\n\n',classification_report(y_test, y_predict))


Classification report on Unbalanced data:

               precision    recall  f1-score   support

           0       0.96      0.99      0.98     18522
           1       0.76      0.50      0.60      1375

    accuracy                           0.95     19897
   macro avg       0.86      0.75      0.79     19897
weighted avg       0.95      0.95      0.95     19897



+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

### Logistical Regression 2nd Model after Balancing the Training Data

+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

### Data Balancing

#### Upsampling

In [50]:
# Define the oversampling method
oversampler = RandomOverSampler()

# oversample the data
x_balanced, y_balanced = oversampler.fit_resample(x_train, y_train)
print('No of samples Belongs to Safe Transactions:',y_balanced.value_counts()[0])
print('No of samples Belongs to Fraudulent Transactions:', y_balanced.value_counts()[1])

No of samples Belongs to Safe Transactions: 74280
No of samples Belongs to Fraudulent Transactions: 74280


##### It is clearly shown that the number of records belongs to Safe Transactions is 74280 and Fraudulent Transactions is also 74280. It means that the data is in balanced state.

### 2nd Model Training on Balanced Data

In [51]:
oversample_model = LogisticRegression()
oversample_model.fit(x_balanced, y_balanced)
y_balanced_predict = oversample_model.predict(x_test)

### 2nd Model Evaluation on Balanced Data
* Accuracy
* Confusion matrix
* Classification Report

#### Accuracy

In [52]:
acc  = round(100 * accuracy_score(y_test, y_balanced_predict))
print(f'Accuracy of Model on Test data: {acc}%')

Accuracy of Model on Test data: 93%


#### Confusion matrix

In [53]:
print('\nThe confusion matrix for balanced data is:\n')
print(confusion_matrix(y_test, y_balanced_predict))


The confusion matrix for balanced data is:

[[17348  1174]
 [  164  1211]]


#### Classification Report

In [54]:
print('\nClassification report on balanced data:\n\n',classification_report(y_test, y_balanced_predict))


Classification report on balanced data:

               precision    recall  f1-score   support

           0       0.99      0.94      0.96     18522
           1       0.51      0.88      0.64      1375

    accuracy                           0.93     19897
   macro avg       0.75      0.91      0.80     19897
weighted avg       0.96      0.93      0.94     19897



As it can be shown that the accuracy on imbalanced data is 95% and on balanced data after applying oversampling technique is 93%. The reason behind that the model was overfitted due to imbalanced data and predicting one class with high accuracy and other class with low accuracy which can be seen in confusion matrix. When the data is balanced, A Bias-Varience trade-off is established and the accuracy is reduced to 93% and the model is accurately predicting both classes rather than only a single class.
But we have still got weak precision and f1-score for class 1 AKA fraudulent cases. This is not acceptable therefore we have to use Random Forest which is good for detecting non-linear data points.

++++++++++++++++++++++++++

### Random Forest 3rd Model

++++++++++++++++++++++++++

In [55]:
# Define target vector
y = CreditCardData_df_clean['Fraud'].values.reshape(-1, 1)
y[:5]

array([['0'],
       ['0'],
       ['0'],
       ['0'],
       ['1']], dtype=object)

Split the data into training and testing sets.

In [56]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=78)

In [57]:
y_train.shape

(74613, 1)

In [58]:
# Create the StandardScaler instance
scaler = StandardScaler()

In [59]:
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

In [60]:
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

#### Fitting the Random Forest Model

Once the data is scaled, we create a random forest instance and train it with the training data (`X_train_scaled` and `y_train`), define `n_estimators=500` and `random_state=78`.

In [61]:
# Create the random forest classifier instance
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [62]:
# Fit the model and use .ravel()on the "y_train" data. 
rf_model = rf_model.fit(X_train_scaled, y_train.ravel())

#### Making Predictions Using the Random Forest Model

We validate the trained model by predicting fraudulent transactions by using the testing data (`X_test_scaled`).

In [63]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

#### Model Evaluation

Evaluate model's results, by using `sklearn` to calculate the confusion matrix, the accuracy score and to generate the classification report.

In [64]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0 (Safe Transactions)", "Predicted 1 (Fraud Transactions)"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [65]:
# Displaying results
print(f"Accuracy Score : {round(acc_score*100,1)}")

print("  ")

print("Confusion Matrix")
display(cm_df)

print("Classification Report")
print(classification_report(y_test, predictions))

Accuracy Score : 98.7
  
Confusion Matrix


Unnamed: 0,Predicted 0 (Safe Transactions),Predicted 1 (Fraud Transactions)
Actual 0,23191,22
Actual 1,309,1350


Classification Report
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     23213
           1       0.98      0.81      0.89      1659

    accuracy                           0.99     24872
   macro avg       0.99      0.91      0.94     24872
weighted avg       0.99      0.99      0.99     24872



### Conclusion

Now with Rondom Forest model we got pretty good results both Accuracy (98.7%), precision, recall, and f1-score. We notice that pecision and f1-score have significantly improved for class 1 (fraudulent transactions) which is critical and most important for banks and their clients.