Downloading the dataset through the Kaggle API (requires an API key)

In [1]:
#!/bin/bash
# NOTE: inside a notebook code cell the shebang above is only a comment; the
# leading "!" on the next line is what hands the command to the shell.
# Download the fraud-detection dataset archive with the Kaggle CLI. The captured
# run output shows it being saved as fraud-detection.zip under /content.
# Presumably this relies on Kaggle API credentials already being configured in
# the environment - TODO confirm; none are set up in this notebook.
!kaggle datasets download kartik2112/fraud-detection

Dataset URL: https://www.kaggle.com/datasets/kartik2112/fraud-detection
License(s): CC0-1.0
Downloading fraud-detection.zip to /content
 99% 199M/202M [00:02<00:00, 74.2MB/s]
100% 202M/202M [00:02<00:00, 87.4MB/s]


Extracting the zip file

In [2]:
import zipfile

# Unpack every member of the downloaded archive into the extraction directory
# (no listing of the contents is printed).
with zipfile.ZipFile('/content/fraud-detection.zip', mode='r') as archive:
    archive.extractall(path='/content/extracted_files')


Loading the CSV files and analyzing the data

In [3]:
import pandas as pd
import numpy as np

# Locations of the extracted training and test CSV files.
train_path, test_path = (
    '/content/extracted_files/fraudTrain.csv',
    '/content/extracted_files/fraudTest.csv',
)


In [4]:
# Read the whole training CSV into a DataFrame; `train_path` is set in the
# previous cell. All later training-side preprocessing operates on `train_data`.
train_data = pd.read_csv(train_path)


In [5]:
# Initial analysis: column names, non-null counts and dtypes of the raw frame.
print("Training Data Info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
train_data.info()

Training Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  float64
 14

In [6]:
# Show the first five raw rows; the bare last expression is rendered by the
# notebook as a rich table.
print("\nSample Training Data:")
train_data.head(n=5)



Sample Training Data:


Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [7]:

# Count null entries per column of the raw training frame.
print("Missing values in Training Data:")
missing_per_column = train_data.isna().sum()
print(missing_per_column)


Missing values in Training Data:
Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64


In [8]:

# Tally how many transactions carry each value of the `is_fraud` label.
print("\nClass Distribution in Training Data:")
fraud_label_counts = train_data['is_fraud'].value_counts()
print(fraud_label_counts)



Class Distribution in Training Data:
is_fraud
0    1289169
1       7506
Name: count, dtype: int64


In [9]:
# Columns removed before modelling: the CSV row index, card/transaction
# identifiers, the cardholder name and address fields, and the raw unix time.
columns_to_drop = ['Unnamed: 0', 'cc_num', 'first', 'last', 'street', 'city',
                   'state', 'zip', 'unix_time', 'trans_num']
# Rebind instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell is a no-op rather than a KeyError.
train_data = train_data.drop(columns=columns_to_drop, errors='ignore')



In [10]:
# Parse the transaction timestamp strings into datetime64 values so the `.dt`
# accessor can be used for feature extraction afterwards.
parsed_timestamps = pd.to_datetime(train_data['trans_date_trans_time'])
train_data['trans_date_trans_time'] = parsed_timestamps


In [11]:
from sklearn.preprocessing import LabelEncoder

# Feature engineering: split the transaction timestamp into calendar parts.
transaction_time = train_data['trans_date_trans_time']
train_data['hour'] = transaction_time.dt.hour
train_data['day'] = transaction_time.dt.day
train_data['month'] = transaction_time.dt.month
train_data['year'] = transaction_time.dt.year

# Age is the transaction year minus the birth year (month and day are not
# considered). It is computed before the date columns are removed.
train_data['dob'] = pd.to_datetime(train_data['dob'])
train_data['age'] = train_data['year'] - train_data['dob'].dt.year
# The raw date columns are no longer needed once the features above exist.
train_data = train_data.drop(columns=['dob', 'trans_date_trans_time'])

# Encode categorical variables as integer codes. The fitted encoders are kept
# in `label_encoders` (keyed by column name) so the exact same
# string -> integer mapping can be re-applied or inverted later instead of
# being lost with a throw-away encoder.
categorical_cols = ['merchant', 'category', 'gender', 'job']
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    train_data[col] = label_encoders[col].fit_transform(train_data[col])

# Check processed data. info() prints its own report and returns None, so it
# is not wrapped in print() (that would emit an extra "None" line).
print("\nProcessed Training Data Info:")
train_data.info()


Processed Training Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 16 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   merchant    1296675 non-null  int64  
 1   category    1296675 non-null  int64  
 2   amt         1296675 non-null  float64
 3   gender      1296675 non-null  int64  
 4   lat         1296675 non-null  float64
 5   long        1296675 non-null  float64
 6   city_pop    1296675 non-null  int64  
 7   job         1296675 non-null  int64  
 8   merch_lat   1296675 non-null  float64
 9   merch_long  1296675 non-null  float64
 10  is_fraud    1296675 non-null  int64  
 11  hour        1296675 non-null  int32  
 12  day         1296675 non-null  int32  
 13  month       1296675 non-null  int32  
 14  year        1296675 non-null  int32  
 15  age         1296675 non-null  int32  
dtypes: float64(5), int32(5), int64(6)
memory usage: 133.6 MB
None


In [12]:

# Preview the first five rows after feature engineering and label encoding.
print("\nSample Processed Training Data:")
train_data.head(n=5)


Sample Processed Training Data:


Unnamed: 0,merchant,category,amt,gender,lat,long,city_pop,job,merch_lat,merch_long,is_fraud,hour,day,month,year,age
0,514,8,4.97,0,36.0788,-81.1781,3495,370,36.011293,-82.048315,0,0,1,1,2019,31
1,241,4,107.23,0,48.8878,-118.2105,149,428,49.159047,-118.186462,0,0,1,1,2019,41
2,390,0,220.11,1,42.1808,-112.262,4154,307,43.150704,-112.154481,0,0,1,1,2019,57
3,360,2,45.0,1,46.2306,-112.1138,1939,328,47.034331,-112.561071,0,0,1,1,2019,52
4,297,9,41.96,1,38.4207,-79.4629,99,116,38.674999,-78.632459,0,0,1,1,2019,33


In [13]:
# Column count of the processed frame, taken from the second entry of its shape.
num_columns = train_data.shape[1]
print(f"The number of columns in the dataset is: {num_columns}")


The number of columns in the dataset is: 16


In [14]:
# Null counts per column of the processed frame.
print("\nMissing Values in Processed Data:")
print(train_data.isna().sum())

# Width of the processed frame (features plus the target column).
print(f"\nThe number of columns in the processed dataset is: {len(train_data.columns)}")

# Target vector and feature matrix: `is_fraud` is the label, every other
# column is a feature.
y = train_data['is_fraud']
X = train_data.drop('is_fraud', axis='columns')

# Label counts of the target before any resampling.
print("\nClass Distribution in Target Variable:")
target_counts = y.value_counts()
print(target_counts)



Missing Values in Processed Data:
merchant      0
category      0
amt           0
gender        0
lat           0
long          0
city_pop      0
job           0
merch_lat     0
merch_long    0
is_fraud      0
hour          0
day           0
month         0
year          0
age           0
dtype: int64

The number of columns in the processed dataset is: 16

Class Distribution in Target Variable:
is_fraud
0    1289169
1       7506
Name: count, dtype: int64


Using SMOTE to balance the dataset, because the classes are heavily imbalanced

In [15]:
from imblearn.over_sampling import SMOTE

# Oversample the minority (fraud) class with SMOTE using a fixed seed.
# NOTE(review): X contains label-encoded columns (merchant, category, gender,
# job); SMOTE presumably treats these integer codes as ordinary numeric values
# when synthesising samples - confirm this is acceptable for the model.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X, y)
X_resampled, y_resampled = resampled

# Report the label counts after resampling.
print("\nClass Distribution After Applying SMOTE:")
resampled_counts = pd.Series(y_resampled).value_counts()
print(resampled_counts)





Class Distribution After Applying SMOTE:
is_fraud
0    1289169
1    1289169
Name: count, dtype: int64


Building the random forest model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Initialize the model: 100 trees with a fixed seed for reproducibility.
# n_jobs=-1 fits the trees on all available CPU cores; the resampled training
# set has about 2.58 million rows, so a single-core fit is needlessly slow.
# The seed is still fixed by random_state.
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# Train the model on the SMOTE-resampled data.
model.fit(X_resampled, y_resampled)

print("Model training completed.")


Preprocessing the test data for prediction

In [None]:
# Read the held-out test CSV into a DataFrame; `test_path` is set next to
# `train_path` in the path-definition cell near the top of the notebook.
test_data = pd.read_csv(test_path)


In [None]:
# Drop irrelevant columns (same list as used for the training data).
columns_to_drop = ['Unnamed: 0', 'cc_num', 'first', 'last', 'street', 'city',
                   'state', 'zip', 'unix_time', 'trans_num']
test_data = test_data.drop(columns=columns_to_drop)

# Convert trans_date_trans_time to datetime
test_data['trans_date_trans_time'] = pd.to_datetime(test_data['trans_date_trans_time'])

# Feature Engineering: Extract time-based features
test_data['hour'] = test_data['trans_date_trans_time'].dt.hour
test_data['day'] = test_data['trans_date_trans_time'].dt.day
test_data['month'] = test_data['trans_date_trans_time'].dt.month
test_data['year'] = test_data['trans_date_trans_time'].dt.year
test_data = test_data.drop(columns=['trans_date_trans_time'])

# Feature Engineering: Calculate age (transaction year minus birth year,
# matching how the training features were built).
test_data['dob'] = pd.to_datetime(test_data['dob'])
test_data['age'] = test_data['year'] - test_data['dob'].dt.year
test_data = test_data.drop(columns=['dob'])

# Encode categorical variables with the TRAINING vocabulary.
# Fitting a fresh LabelEncoder on the test set would number the categories by
# the test set's own sorted unique values, so the same merchant/job string
# could get a different integer than the one the model was trained on. The
# encoders are therefore fitted on the raw training columns (re-read from
# disk, because train_data has already been overwritten with codes) and only
# applied here.
categorical_cols = ['merchant', 'category', 'gender', 'job']
train_categories = pd.read_csv(train_path, usecols=categorical_cols)
for col in categorical_cols:
    encoder = LabelEncoder().fit(train_categories[col])
    # classes_ is the sorted label array; a label's position is its code.
    code_by_label = {label: code for code, label in enumerate(encoder.classes_)}
    codes = test_data[col].map(code_by_label)
    # Labels absent from the training data cannot be encoded consistently;
    # flag them and assign -1 rather than failing or inventing a code.
    unseen_count = int(codes.isna().sum())
    if unseen_count:
        print(f"Warning: {unseen_count} rows in '{col}' have labels not seen in training; encoded as -1.")
    test_data[col] = codes.fillna(-1).astype('int64')
del train_categories  # free the re-read string columns

# Separate features and target variable
X_test = test_data.drop(columns=['is_fraud'])
y_test = test_data['is_fraud']

# Check processed test data. info() prints its own report and returns None,
# so it is not wrapped in print() (that would emit an extra "None" line).
print("Processed Test Data Info:")
test_data.info()

In [None]:
# Preview the first five rows of the processed test frame.
print("\nSample Processed Test Data:")
test_data.head(n=5)

Predictions on the test data

In [None]:
# Score the test set with the random forest: probability of the positive
# class (second column of predict_proba) and the hard class predictions.
y_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Confusion matrix of true labels versus predicted labels.
print("Confusion Matrix:")
rf_confusion = confusion_matrix(y_test, y_pred)
print(rf_confusion)




In [None]:
# Per-class precision/recall/F1 for the random forest's hard predictions.
print("Classification Report:")
rf_report = classification_report(y_test, y_pred)
print(rf_report)

# Area under the ROC curve, computed from the positive-class probabilities.
print("ROC AUC Score:")
rf_auc = roc_auc_score(y_test, y_proba)
print(rf_auc)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Build the logistic-regression classifier and fit it on the resampled data
# in one step (fit returns the estimator itself).
# The resampled classes are already equal in size (1289169 each in the SMOTE
# output), so the class_weight='balanced' setting is kept only as configured.
# NOTE(review): the features are passed in unscaled; confirm the solver
# converges within max_iter=1000 on this data.
logreg_model = LogisticRegression(
    max_iter=1000,
    random_state=42,
    class_weight='balanced',
).fit(X_resampled, y_resampled)

print("Logistic Regression model training completed.")

In [None]:
# Score the test set with the logistic regression: positive-class
# probabilities (second column of predict_proba) and hard predictions.
y_proba_logreg = logreg_model.predict_proba(X_test)[:, 1]
y_pred_logreg = logreg_model.predict(X_test)

# Confusion matrix of true labels versus predicted labels.
print("\nConfusion Matrix for Logistic Regression:")
logreg_confusion = confusion_matrix(y_test, y_pred_logreg)
print(logreg_confusion)

In [None]:
# Per-class precision/recall/F1 for the logistic regression's hard predictions.
print("\nClassification Report for Logistic Regression:")
logreg_report = classification_report(y_test, y_pred_logreg)
print(logreg_report)

# Area under the ROC curve, computed from the positive-class probabilities.
print("\nROC AUC Score for Logistic Regression:")
logreg_auc = roc_auc_score(y_test, y_proba_logreg)
print(logreg_auc)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# ROC points for the logistic-regression probabilities and the area under them.
fpr, tpr, thresholds = roc_curve(y_test, y_proba_logreg)
roc_auc = auc(fpr, tpr)

# Draw the curve on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
# Diagonal reference line: the expected curve of a random guess.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
ax.grid(alpha=0.3)
plt.show()


In [None]:
import joblib

# Persist the fitted logistic-regression model to disk with joblib; the file
# name is relative, so it lands in the current working directory.
model_filename = 'logistic_regression_model.pkl'
joblib.dump(value=logreg_model, filename=model_filename)

print(f"Model saved to {model_filename}")
