# **Loading the Dataset**

In [1]:
# Install the Kaggle command-line client into the running kernel's environment
# (-q keeps pip's own log output quiet).
# NOTE(review): the version is unpinned, so a fresh run may pull a different
# client release — consider pinning once a known-good version is confirmed.
%pip install -q kaggle

DEPRECATION: uvicorn 0.14.0 has a non-standard dependency specifier click>=7.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of uvicorn or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063


In [None]:
# Open Colab's browser upload widget; choose the kaggle.json API token downloaded
# from your Kaggle account. The file is saved to the current working directory,
# which is where the next cell copies it from.
# The upload helper lives in google.colab — the kaggle package has no `files` module.
from google.colab import files

# Assign the result instead of leaving it as the cell's last expression:
# upload() returns the uploaded file contents, and echoing them would write
# the API token into the notebook output.
_ = files.upload()

In [3]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 /root/.kaggle/kaggle.json

In [4]:
!kaggle datasets download -d ealaxi/paysim1
!unzip /content/paysim1.zip

Downloading paysim1.zip to /content
 99% 176M/178M [00:01<00:00, 96.7MB/s]
100% 178M/178M [00:01<00:00, 104MB/s] 
Archive:  /content/paysim1.zip
  inflating: PS_20174392719_1491204439457_log.csv  


# **Data Analysis**

In [5]:
# Core data-handling libraries used throughout the notebook
import pandas as pd
import numpy as np

# Show every column when a frame is displayed. 'display.max_columns' is the
# option's canonical name; spellings such as 'display.max.columns' are only
# accepted because pandas matches option names as regular expressions.
pd.set_option('display.max_columns', None)

# Load the extracted PaySim transaction log and preview the first five rows
df = pd.read_csv('PS_20174392719_1491204439457_log.csv')
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [6]:
# Count how many transactions fall into each value of the isFraud label
fraud_flag = df['isFraud']
len_not_fraud = int(fraud_flag.eq(0).sum())
len_is_fraud = int(fraud_flag.eq(1).sum())

# Display names and the matching class sizes, in the same order
labels = ['Not Fraudulent', 'Fraudulent']
arr = np.array([len_not_fraud, len_is_fraud])

for message in (
    f"Total No. of Non-Fraudulent Cases: {len_not_fraud}",
    f"Total No. Fraudulent Cases: {len_is_fraud}",
):
    print(message)

Total No. of Non-Fraudulent Cases: 6354407
Total No. Fraudulent Cases: 8213


The dataset is highly imbalanced: only about 0.13% of transactions (8,213 of 6,362,620) are tagged as fraudulent. Some workarounds can be used when working with imbalanced data, such as:

*   Synthetic Minority Oversampling Technique (SMOTE)
*   Undersampling

In [7]:
# Build the modelling frame without the nameOrig / nameDest identifier columns.
# drop() returns a new DataFrame, so `df` stays exactly as loaded and this cell
# can be re-run safely (an in-place drop raises KeyError the second time).
data = df.drop(columns=['nameOrig', 'nameDest'])

# get all categorical (object-dtype) columns in the dataframe
catCols = [col for col in data.columns if data[col].dtype=="O"]

from sklearn.preprocessing import LabelEncoder

lb_make = LabelEncoder()

# Replace each categorical column with integer codes. The encoder is refit on
# every column, so afterwards lb_make holds the mapping of the last column only.
for item in catCols:
    data[item] = lb_make.fit_transform(data[item])

In [8]:
# The function below will be used to evaluate different metrics of the algorithms used here.
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(y_test, y_pred):
    """Print classification metrics for a set of predictions.

    Prints, in order, the accuracy, precision, recall and F1 scores followed
    by the confusion matrix, each computed from the true labels and the
    predicted labels. Nothing is returned.

    Args:
        y_test: true labels.
        y_pred: labels predicted by the model for the same samples.
    """
    # (label, metric function) pairs, in the order they are printed
    report = (
        ("Accuracy Score: ", accuracy_score),
        ("Precision Score: ", precision_score),
        ("Recall Score: ", recall_score),
        ("F1 Score: ", f1_score),
        ("Confusion Matrix: ", confusion_matrix),
    )
    for label, metric in report:
        print(label, metric(y_test, y_pred))

# **Logistic Regression**
Logistic regression estimates the probability of a binary event occurring (such as voted or didn’t vote, or, here, fraudulent or not fraudulent) based on a given dataset of independent variables. Since the outcome is a probability, the predicted value is bounded between 0 and 1.

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# The target is the isFraud flag; the features are every other column
y = data['isFraud']
X = data.drop(columns='isFraud')

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=27,
)

# Fit a logistic regression with scikit-learn's default settings.
# fit() returns the estimator itself, so model1 and lr are the same fitted object.
lr = LogisticRegression()
model1 = lr.fit(X_train, y_train)

# Predict on the held-out test set and report the metrics
lr_pred = model1.predict(X_test)
evaluate_model(y_test, lr_pred)

Accuracy Score:  0.9983421923672953
Precision Score:  0.3843906510851419
Recall Score:  0.442150744119059
F1 Score:  0.4112525117213664
Confusion Matrix:  [[1587097    1475]
 [   1162     921]]


# **Random Forest Classifier**
Random Forest is a classifier that trains a number of decision trees on various subsets of the given dataset and averages their predictions to improve predictive accuracy.

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Train a small forest of 10 trees. random_state fixes the randomness used while
# building the trees so the reported metrics are reproducible from run to run
# (27 matches the seed already used for the train/test split).
rfc = RandomForestClassifier(n_estimators=10, random_state=27).fit(X_train, y_train)

# predict on test set
rfc_pred = rfc.predict(X_test)

evaluate_model(y_test, rfc_pred)

Accuracy Score:  0.9996925794719785
Precision Score:  0.9801204819277108
Recall Score:  0.7810849735957753
F1 Score:  0.8693561314453647
Confusion Matrix:  [[1588539      33]
 [    456    1627]]


# **Saving Model**

In [11]:
import joblib

# Serialise the fitted random forest to disk. dump() returns the list of files
# it wrote, which the notebook shows as this cell's output.
MODEL_PATH = "credit_fraud.pkl"
joblib.dump(rfc, MODEL_PATH)

['credit_fraud.pkl']