In [1]:
# Deendencies
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter("ignore")


In [2]:
# Reading and importing data from csv file
CreditCardData_df = pd.read_csv("./Resources/CreditCardData.csv")
CreditCardData_df.head()

Unnamed: 0,Transaction ID,Date,Day of Week,Time,Type of Card,Entry Mode,Amount,Type of Transaction,Merchant Group,Country of Transaction,Shipping Address,Country of Residence,Gender,Age,Bank,Fraud
0,#3577 209,14-Oct-20,Wednesday,19,Visa,Tap,£5,POS,Entertainment,United Kingdom,United Kingdom,United Kingdom,M,25.2,RBS,0
1,#3039 221,14-Oct-20,Wednesday,17,MasterCard,PIN,£288,POS,Services,USA,USA,USA,F,49.6,Lloyds,0
2,#2694 780,14-Oct-20,Wednesday,14,Visa,Tap,£5,POS,Restaurant,India,India,India,F,42.2,Barclays,0
3,#2640 960,13-Oct-20,Tuesday,14,Visa,Tap,£28,POS,Entertainment,United Kingdom,India,United Kingdom,F,51.0,Barclays,0
4,#2771 031,13-Oct-20,Tuesday,23,Visa,CVC,£91,Online,Electronics,USA,USA,United Kingdom,M,38.0,Halifax,1


---------------

## Cleaning and Preparation

--------------

In [3]:
# Figuring out the type of info in this data
CreditCardData_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 16 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Transaction ID          100000 non-null  object 
 1   Date                    100000 non-null  object 
 2   Day of Week             100000 non-null  object 
 3   Time                    100000 non-null  int64  
 4   Type of Card            100000 non-null  object 
 5   Entry Mode              100000 non-null  object 
 6   Amount                  99994 non-null   object 
 7   Type of Transaction     100000 non-null  object 
 8   Merchant Group          99990 non-null   object 
 9   Country of Transaction  100000 non-null  object 
 10  Shipping Address        99995 non-null   object 
 11  Country of Residence    100000 non-null  object 
 12  Gender                  99996 non-null   object 
 13  Age                     100000 non-null  float64
 14  Bank                 

#### Some columns contain less number of rows than others

In [4]:
# Number of rows and columns before cleaning
CreditCardData_df.shape

(100000, 16)

In [5]:
# Getting rid of NA
CreditCardData_df_clean = CreditCardData_df.dropna()

In [6]:
# Number of rows and columns after cleaning
CreditCardData_df_clean.shape

(99977, 16)

In [7]:
CreditCardData_df_clean.shape[0]

99977

In [8]:
print("We've got rid of " + str(CreditCardData_df.shape[0] - CreditCardData_df_clean.shape[0]) + " rows with NaN or nulls")

We've got rid of 23 rows with NaN or nulls


In [9]:
# Checking data for nulls
nulls = CreditCardData_df_clean.isnull()
nulls

Unnamed: 0,Transaction ID,Date,Day of Week,Time,Type of Card,Entry Mode,Amount,Type of Transaction,Merchant Group,Country of Transaction,Shipping Address,Country of Residence,Gender,Age,Bank,Fraud
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
99996,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
99997,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
99998,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [10]:
nulls.value_counts()

Transaction ID  Date   Day of Week  Time   Type of Card  Entry Mode  Amount  Type of Transaction  Merchant Group  Country of Transaction  Shipping Address  Country of Residence  Gender  Age    Bank   Fraud
False           False  False        False  False         False       False   False                False           False                   False             False                 False   False  False  False    99977
dtype: int64

In [11]:
num_nulls = CreditCardData_df_clean.isnull().sum()
num_nulls

Transaction ID            0
Date                      0
Day of Week               0
Time                      0
Type of Card              0
Entry Mode                0
Amount                    0
Type of Transaction       0
Merchant Group            0
Country of Transaction    0
Shipping Address          0
Country of Residence      0
Gender                    0
Age                       0
Bank                      0
Fraud                     0
dtype: int64

#### So no more nulls

In [12]:
CreditCardData_df_clean.head(2)

Unnamed: 0,Transaction ID,Date,Day of Week,Time,Type of Card,Entry Mode,Amount,Type of Transaction,Merchant Group,Country of Transaction,Shipping Address,Country of Residence,Gender,Age,Bank,Fraud
0,#3577 209,14-Oct-20,Wednesday,19,Visa,Tap,£5,POS,Entertainment,United Kingdom,United Kingdom,United Kingdom,M,25.2,RBS,0
1,#3039 221,14-Oct-20,Wednesday,17,MasterCard,PIN,£288,POS,Services,USA,USA,USA,F,49.6,Lloyds,0


In [13]:
# Figuring out type of data these columns hold
CreditCardData_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99977 entries, 0 to 99999
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Transaction ID          99977 non-null  object 
 1   Date                    99977 non-null  object 
 2   Day of Week             99977 non-null  object 
 3   Time                    99977 non-null  int64  
 4   Type of Card            99977 non-null  object 
 5   Entry Mode              99977 non-null  object 
 6   Amount                  99977 non-null  object 
 7   Type of Transaction     99977 non-null  object 
 8   Merchant Group          99977 non-null  object 
 9   Country of Transaction  99977 non-null  object 
 10  Shipping Address        99977 non-null  object 
 11  Country of Residence    99977 non-null  object 
 12  Gender                  99977 non-null  object 
 13  Age                     99977 non-null  float64
 14  Bank                    99977 non-null

#### Now all columns hold equally same number of data

In [14]:
# There is "#" next to each number, it should be removed 1st then converting column data type from Object to integer
CreditCardData_df_clean["Transaction ID"].head()

0    #3577 209
1    #3039 221
2    #2694 780
3    #2640 960
4    #2771 031
Name: Transaction ID, dtype: object

----------------

### 1- Fixing data in "Transaction ID" column

----------------

In [15]:
# 1st getting rid of '#' in the "Transaction ID"
CreditCardData_df_clean.loc[:, 'Transaction ID'] = CreditCardData_df_clean['Transaction ID'].str.replace('#', '')
CreditCardData_df_clean['Transaction ID'].head()

0    3577 209
1    3039 221
2    2694 780
3    2640 960
4    2771 031
Name: Transaction ID, dtype: object

In [16]:
CreditCardData_df_clean.head(2)

Unnamed: 0,Transaction ID,Date,Day of Week,Time,Type of Card,Entry Mode,Amount,Type of Transaction,Merchant Group,Country of Transaction,Shipping Address,Country of Residence,Gender,Age,Bank,Fraud
0,3577 209,14-Oct-20,Wednesday,19,Visa,Tap,£5,POS,Entertainment,United Kingdom,United Kingdom,United Kingdom,M,25.2,RBS,0
1,3039 221,14-Oct-20,Wednesday,17,MasterCard,PIN,£288,POS,Services,USA,USA,USA,F,49.6,Lloyds,0


In [17]:
CreditCardData_df_clean['Transaction ID'].dtype

dtype('O')

In [18]:
# 2nd getting rid of 'space' in the "Transaction ID"
CreditCardData_df_clean.loc[:, 'Transaction ID'] = CreditCardData_df_clean['Transaction ID'].str.replace(' ', '')
CreditCardData_df_clean['Transaction ID'].head()

0    3577209
1    3039221
2    2694780
3    2640960
4    2771031
Name: Transaction ID, dtype: object

In [19]:
# 3rd converting the data in CreditCardData_df_clean['Transaction ID'] into integer
CreditCardData_df_clean['Transaction ID'] = pd.to_numeric(CreditCardData_df_clean['Transaction ID'])
CreditCardData_df_clean['Transaction ID'].dtype

dtype('int64')

---------

### 2- Fixing data in "Date" column

_______

In [20]:
# Converting "Date" column from object to date
CreditCardData_df_clean['Date'] = pd.to_datetime(CreditCardData_df_clean['Date'], format='%d-%b-%y')
CreditCardData_df_clean['Date'].head()

0   2020-10-14
1   2020-10-14
2   2020-10-14
3   2020-10-13
4   2020-10-13
Name: Date, dtype: datetime64[ns]

In [21]:
# Now checking "Date" data type
CreditCardData_df_clean['Date'].dtypes

dtype('<M8[ns]')

--------------

### 3- Converting data type in column "Amount" from object to integer

--------------

In [22]:
# a) Getting rid of "£" 
CreditCardData_df_clean['Amount'] = CreditCardData_df_clean['Amount'].str.replace('£', '')
CreditCardData_df_clean['Amount'].head()

0      5
1    288
2      5
3     28
4     91
Name: Amount, dtype: object

In [23]:
# b) Converting data type in column "Amount" to Integer from Object
CreditCardData_df_clean['Amount'] = pd.to_numeric(CreditCardData_df_clean['Amount'])
CreditCardData_df_clean['Amount'].dtype

dtype('int64')

-----------

### 4- Converting data type in "Age" into Integer from Float

----------

In [24]:
# Rounding it 1st
CreditCardData_df_clean['Age'] = round(CreditCardData_df_clean['Age'] , 0)
CreditCardData_df_clean['Age'].head()

0    25.0
1    50.0
2    42.0
3    51.0
4    38.0
Name: Age, dtype: float64

In [25]:
# 2nd converting it
CreditCardData_df_clean['Age'] = pd.to_numeric(CreditCardData_df_clean['Age'])
CreditCardData_df_clean['Age'] .dtype

dtype('float64')

In [26]:
CreditCardData_df_clean['Age'] = CreditCardData_df_clean['Age'].astype(np.int64)
CreditCardData_df_clean['Age'].dtype

dtype('int64')

---------------------

In [27]:
# Checking all the data types
CreditCardData_df_clean.dtypes

Transaction ID                     int64
Date                      datetime64[ns]
Day of Week                       object
Time                               int64
Type of Card                      object
Entry Mode                        object
Amount                             int64
Type of Transaction               object
Merchant Group                    object
Country of Transaction            object
Shipping Address                  object
Country of Residence              object
Gender                            object
Age                                int64
Bank                              object
Fraud                              int64
dtype: object

In [28]:
# Or like this
CreditCardData_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99977 entries, 0 to 99999
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Transaction ID          99977 non-null  int64         
 1   Date                    99977 non-null  datetime64[ns]
 2   Day of Week             99977 non-null  object        
 3   Time                    99977 non-null  int64         
 4   Type of Card            99977 non-null  object        
 5   Entry Mode              99977 non-null  object        
 6   Amount                  99977 non-null  int64         
 7   Type of Transaction     99977 non-null  object        
 8   Merchant Group          99977 non-null  object        
 9   Country of Transaction  99977 non-null  object        
 10  Shipping Address        99977 non-null  object        
 11  Country of Residence    99977 non-null  object        
 12  Gender                  99977 non-null  object

-------------------

### Data Conversion Done

------------------------

---------------

## Creating Machine Learning Model

--------------

In [29]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [30]:
CreditCardData_df_clean['Fraud'].value_counts()

0    92785
1     7192
Name: Fraud, dtype: int64

In [31]:
print('No of samples belongs to Safe Transactions:', CreditCardData_df_clean['Fraud'].value_counts()[0])
print('No of samples belongs to Fraudalent Transaction:', CreditCardData_df_clean['Fraud'].value_counts()[1])

No of samples belongs to Safe Transactions: 92785
No of samples belongs to Fraudalent Transaction: 7192


In [32]:
print( str(round(CreditCardData_df_clean['Fraud'].value_counts()[1] / CreditCardData_df_clean['Fraud'].value_counts()[0] * 100, 2)) + "% of data is fraud so it's unbalanced")

7.75% of data is fraud so it's unbalanced


##### It is clearly observed that the data is highly imbalanced. There are 92785 records belonging to one class and only 7192 records are belonging to other class.

### Split the Data into Training and Testing Sets

In [33]:
# Getting necessary data.
columns = list(CreditCardData_df_clean.columns[2:])
df_unbalanced = CreditCardData_df_clean[columns]
df_unbalanced.head()

Unnamed: 0,Day of Week,Time,Type of Card,Entry Mode,Amount,Type of Transaction,Merchant Group,Country of Transaction,Shipping Address,Country of Residence,Gender,Age,Bank,Fraud
0,Wednesday,19,Visa,Tap,5,POS,Entertainment,United Kingdom,United Kingdom,United Kingdom,M,25,RBS,0
1,Wednesday,17,MasterCard,PIN,288,POS,Services,USA,USA,USA,F,50,Lloyds,0
2,Wednesday,14,Visa,Tap,5,POS,Restaurant,India,India,India,F,42,Barclays,0
3,Tuesday,14,Visa,Tap,28,POS,Entertainment,United Kingdom,India,United Kingdom,F,51,Barclays,0
4,Tuesday,23,Visa,CVC,91,Online,Electronics,USA,USA,United Kingdom,M,38,Halifax,1


### Data Preprocessing and Encoding

In [34]:
# Encoding Categorical values
df_string = df_unbalanced.select_dtypes(include=['object'])
for i in df_string.columns:
    df_unbalanced[i] = LabelEncoder().fit_transform(df_unbalanced[i])
df_unbalanced.head()

Unnamed: 0,Day of Week,Time,Type of Card,Entry Mode,Amount,Type of Transaction,Merchant Group,Country of Transaction,Shipping Address,Country of Residence,Gender,Age,Bank,Fraud
0,3,19,1,2,5,2,2,4,4,4,1,25,7,0
1,3,17,0,1,288,2,8,3,3,3,0,50,4,0
2,3,14,1,2,5,2,7,1,1,1,0,42,0,0
3,2,14,1,2,28,2,2,4,1,4,0,51,0,0
4,2,23,1,0,91,1,1,3,3,4,1,38,3,1


### Defining the target variable

##### Creating the labels set (`y`)  from the “Fraud” column, and then create the features (`X`) DataFrame from the remaining columns.

In [35]:
# Separate the data into labels and features
# Separate the X variable, the features
features = list(df_unbalanced.columns[0:-1])
x = df_unbalanced[features]

# Separate the y variable, the labels
y = df_unbalanced['Fraud']

### Spliting Imbalanced data

In [36]:
x_train, x_test, y_train, y_test = train_test_split(x, y , 
                                                    test_size = 0.2, 
                                                    random_state=42)

print('Number of Records in Traning data:', x_train.shape[0])
print('Number of columns in Traning data:', x_train.shape[1])
print('Number of Records in Testing data:', x_test.shape[0])
print('Number of columns in Testing data:', x_test.shape[1])

Number of Records in Traning data: 79981
Number of columns in Traning data: 13
Number of Records in Testing data: 19996
Number of columns in Testing data: 13


### Model Training on Imbalanced Data

In [37]:
model = LogisticRegression()
model.fit(x_train, y_train)
y_predict = model.predict(x_test)

### Model Evaluation on Imbalanced Data
* Accuracy
* Confusion matrix
* Classification Report

#### Accuracy

In [38]:
acc  = round(100 * accuracy_score(y_test, y_predict))
print(f'Accuracy of Model on Test data: {acc}%')

Accuracy of Model on Test data: 96%


#### Confusion matrix

In [39]:
print('\nThe confusion matrix for unbalanced data is:\n')
print(confusion_matrix(y_test, y_predict))


The confusion matrix for unbalanced data is:

[[18285   250]
 [  648   813]]


#### Classification Report

In [40]:
print('\nClassification report on Unbalanced data:\n\n',classification_report(y_test, y_predict))


Classification report on Unbalanced data:

               precision    recall  f1-score   support

           0       0.97      0.99      0.98     18535
           1       0.76      0.56      0.64      1461

    accuracy                           0.96     19996
   macro avg       0.87      0.77      0.81     19996
weighted avg       0.95      0.96      0.95     19996



## Data Balancing

### Upsampling

In [41]:
from imblearn.over_sampling import RandomOverSampler
# Define the oversampling method
oversampler = RandomOverSampler()

# oversample the data
x_balanced, y_balanced = oversampler.fit_resample(x_train, y_train)
print('No of samples Belongs to Honest Persons:',y_balanced.value_counts()[0])
print('No of samples Belongs to Fraud Persons:', y_balanced.value_counts()[1])

No of samples Belongs to Honest Persons: 74250
No of samples Belongs to Fraud Persons: 74250


##### It is clearly shown that the number of records belongs to Honest Persons is 92785 and Fraud Persons is also 92785. It means that the data is in balanced state.

### Model Training on Balanced Data

In [42]:
oversample_model = LogisticRegression()
oversample_model.fit(x_balanced, y_balanced)
y_balanced_predict = oversample_model.predict(x_test)

### Model Evaluation on Balanced Data
* Accuracy
* Confusion matrix
* Classification Report

#### Accuracy

In [43]:
acc  = round(100 * accuracy_score(y_test, y_balanced_predict))
print(f'Accuracy of Model on Test data: {acc}%')

Accuracy of Model on Test data: 93%


#### Confusion matrix

In [44]:
print('\nThe confusion matrix for balanced data is:\n')
print(confusion_matrix(y_test, y_balanced_predict))


The confusion matrix for balanced data is:

[[17299  1236]
 [  195  1266]]


#### Classification Report

In [45]:
print('\nClassification report on Unbalanced data:\n\n',classification_report(y_test, y_balanced_predict))


Classification report on Unbalanced data:

               precision    recall  f1-score   support

           0       0.99      0.93      0.96     18535
           1       0.51      0.87      0.64      1461

    accuracy                           0.93     19996
   macro avg       0.75      0.90      0.80     19996
weighted avg       0.95      0.93      0.94     19996



### Conclusion

As it can be shown that the accuracy on imbalanced data is 95% and on balanced data after applying oversampling technique is 93%. The reason behind that the model was overfitted due to imbalanced data and predicting one class with high accuracy and other class with low accuracy which can be seen in confusion matrix. When the data is balanced, A Bias-Varience trade-off is established and the accuracy is reduced to 93% and the model is accurately predicting both classes rather than only a single class.