In [3]:
# Download the fraud-detection dataset archive from Kaggle with the kaggle CLI
# (this cell fetches data; it does not install any packages).
# NOTE(review): presumably requires Kaggle API credentials in the runtime — confirm.
!kaggle datasets download -d kartik2112/fraud-detection

Dataset URL: https://www.kaggle.com/datasets/kartik2112/fraud-detection
License(s): CC0-1.0
Downloading fraud-detection.zip to /content
 91% 183M/202M [00:02<00:00, 94.3MB/s]
100% 202M/202M [00:02<00:00, 96.1MB/s]


In [4]:
#Import Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [5]:
!unzip /content/fraud-detection.zip

Archive:  /content/fraud-detection.zip
  inflating: fraudTest.csv           
  inflating: fraudTrain.csv          


In [6]:
# List /content to confirm the archive was extracted (fraudTest.csv and fraudTrain.csv expected).
!ls /content/

fraud-detection.zip  fraudTest.csv  fraudTrain.csv  sample_data


In [7]:
# Read the two CSV files extracted from the archive into DataFrames.
test_csv_path = '/content/fraudTest.csv'
train_csv_path = '/content/fraudTrain.csv'

fraud_test_df = pd.read_csv(test_csv_path)
fraud_train_df = pd.read_csv(train_csv_path)

In [8]:
# Summarise the test file: row count, column names, non-null counts and dtypes.
fraud_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             555719 non-null  int64  
 1   trans_date_trans_time  555719 non-null  object 
 2   cc_num                 555719 non-null  int64  
 3   merchant               555719 non-null  object 
 4   category               555719 non-null  object 
 5   amt                    555719 non-null  float64
 6   first                  555719 non-null  object 
 7   last                   555719 non-null  object 
 8   gender                 555719 non-null  object 
 9   street                 555719 non-null  object 
 10  city                   555719 non-null  object 
 11  state                  555719 non-null  object 
 12  zip                    555719 non-null  int64  
 13  lat                    555719 non-null  float64
 14  long                   555719 non-nu

In [9]:
# Preview the first five rows of the test file.
fraud_test_df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [10]:
# Parse the timestamp strings into datetime values and store them back on the frame
parsed_timestamps = pd.to_datetime(fraud_train_df['trans_date_trans_time'])
fraud_train_df['trans_date_trans_time'] = parsed_timestamps

# Derive calendar features: new column name -> datetime component it is read from
# (dayofweek counts from Monday = 0)
date_components = {
    'transaction_year': 'year',
    'transaction_month': 'month',
    'transaction_day': 'day',
    'transaction_hour': 'hour',
    'transaction_dayofweek': 'dayofweek',
}
for new_column, component in date_components.items():
    fraud_train_df[new_column] = getattr(parsed_timestamps.dt, component)


In [11]:
from sklearn.preprocessing import LabelEncoder

# Encode each categorical column as integer codes in a new '*_encoded' column.
# A separate LabelEncoder is fitted per column and kept in `label_encoders`: re-fitting a
# single shared encoder replaces its learned classes on every call, which would leave only
# the 'job' mapping available for decoding or for encoding other data with the same codes.
encoded_column_names = {
    'category': 'category_encoded',
    'gender': 'gender_encoded',
    'job': 'job_encoded',
}
label_encoders = {}
for source_column, encoded_column in encoded_column_names.items():
    # After the loop `label_encoder` refers to the encoder fitted on 'job', as it did before.
    label_encoder = LabelEncoder()
    fraud_train_df[encoded_column] = label_encoder.fit_transform(fraud_train_df[source_column])
    label_encoders[source_column] = label_encoder

# Display the date-related columns to verify they were added
print("\nDisplaying the dataset after extracting date features:")
display(fraud_train_df[['trans_date_trans_time', 'transaction_year', 'transaction_month',
                        'transaction_day', 'transaction_hour', 'transaction_dayofweek']].head())


Displaying the dataset after extracting date features:


Unnamed: 0,trans_date_trans_time,transaction_year,transaction_month,transaction_day,transaction_hour,transaction_dayofweek
0,2019-01-01 00:00:18,2019,1,1,0,1
1,2019-01-01 00:00:44,2019,1,1,0,1
2,2019-01-01 00:00:51,2019,1,1,0,1
3,2019-01-01 00:01:16,2019,1,1,0,1
4,2019-01-01 00:03:06,2019,1,1,0,1


In [12]:
from sklearn.preprocessing import StandardScaler

# Continuous columns that are standardised below
numerical_features = ['amt', 'lat', 'long', 'merch_lat', 'merch_long', 'city_pop']

# Learn each column's mean and standard deviation, then overwrite the columns in place
# with their standardised values.
# NOTE(review): the statistics are learned from the whole frame, before the train/test
# split performed in a later cell, so held-out rows contribute to them — confirm this is intended.
scaler = StandardScaler()
scaler.fit(fraud_train_df[numerical_features])
fraud_train_df[numerical_features] = scaler.transform(fraud_train_df[numerical_features])

# Preview the label-encoded columns next to their source columns
encoded_preview_columns = ['category', 'category_encoded', 'gender', 'gender_encoded', 'job', 'job_encoded']
print("\nDisplaying the dataset with encoded categorical variables:")
display(fraud_train_df[encoded_preview_columns].head())


Displaying the dataset with encoded categorical variables:


Unnamed: 0,category,category_encoded,gender,gender_encoded,job,job_encoded
0,misc_net,8,F,0,"Psychologist, counselling",370
1,grocery_pos,4,F,0,Special educational needs teacher,428
2,entertainment,0,M,1,Nature conservation officer,307
3,gas_transport,2,M,1,Patent attorney,328
4,misc_pos,9,M,1,Dance movement psychotherapist,116


In [13]:
from sklearn.model_selection import train_test_split

# Separate the label column from every other column
y = fraud_train_df['is_fraud']  # Target
X = fraud_train_df.drop(columns=['is_fraud'])  # Features

# Stratified 70/30 hold-out split; the fixed seed keeps the partition reproducible
split_parts = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# Preview the standardised numerical columns
print("\nDisplaying the dataset after scaling numerical features:")
scaled_preview = fraud_train_df[numerical_features].head()
display(scaled_preview)




Displaying the dataset after scaling numerical features:


Unnamed: 0,amt,lat,long,merch_lat,merch_long,city_pop
0,-0.407826,-0.48442,0.65762,-0.494354,0.593864,-0.282589
1,0.230039,2.03912,-2.03387,2.078699,-2.030341,-0.29367
2,0.934149,0.717754,-1.601537,0.902849,-1.592323,-0.280406
3,-0.158132,1.515617,-1.590766,1.662886,-1.621848,-0.287742
4,-0.177094,-0.023035,0.782279,0.026941,0.841909,-0.293835


In [14]:
# Make sure the timestamp column is datetime-typed (repeats the conversion done in an earlier cell)
fraud_train_df['trans_date_trans_time'] = pd.to_datetime(fraud_train_df['trans_date_trans_time'])

# Read each calendar component through one shared .dt accessor
timestamp_parts = fraud_train_df['trans_date_trans_time'].dt
fraud_train_df['transaction_year'] = timestamp_parts.year
fraud_train_df['transaction_month'] = timestamp_parts.month
fraud_train_df['transaction_day'] = timestamp_parts.day
fraud_train_df['transaction_hour'] = timestamp_parts.hour
fraud_train_df['transaction_dayofweek'] = timestamp_parts.dayofweek

# Show the timestamp next to the derived columns for a quick visual check
date_preview_columns = [
    'trans_date_trans_time',
    'transaction_year',
    'transaction_month',
    'transaction_day',
    'transaction_hour',
    'transaction_dayofweek',
]
print("\nDisplaying the dataset after extracting date features:")
display(fraud_train_df[date_preview_columns].head())



Displaying the dataset after extracting date features:


Unnamed: 0,trans_date_trans_time,transaction_year,transaction_month,transaction_day,transaction_hour,transaction_dayofweek
0,2019-01-01 00:00:18,2019,1,1,0,1
1,2019-01-01 00:00:44,2019,1,1,0,1
2,2019-01-01 00:00:51,2019,1,1,0,1
3,2019-01-01 00:01:16,2019,1,1,0,1
4,2019-01-01 00:03:06,2019,1,1,0,1


In [15]:
from sklearn.preprocessing import LabelEncoder

# Encode each categorical column as integer codes in a new '*_encoded' column.
# A separate LabelEncoder is fitted per column and kept in `label_encoders`: re-fitting a
# single shared encoder replaces its learned classes on every call, which would leave only
# the 'job' mapping available for decoding or for encoding other data with the same codes.
encoded_column_names = {
    'category': 'category_encoded',
    'gender': 'gender_encoded',
    'job': 'job_encoded',
}
label_encoders = {}
for source_column, encoded_column in encoded_column_names.items():
    # After the loop `label_encoder` refers to the encoder fitted on 'job', as it did before.
    label_encoder = LabelEncoder()
    fraud_train_df[encoded_column] = label_encoder.fit_transform(fraud_train_df[source_column])
    label_encoders[source_column] = label_encoder

# Displaying the first few rows to verify the encoded columns
print("\nDisplaying the dataset with encoded categorical variables:")
display(fraud_train_df[['category', 'category_encoded', 'gender', 'gender_encoded', 'job', 'job_encoded']].head())



Displaying the dataset with encoded categorical variables:


Unnamed: 0,category,category_encoded,gender,gender_encoded,job,job_encoded
0,misc_net,8,F,0,"Psychologist, counselling",370
1,grocery_pos,4,F,0,Special educational needs teacher,428
2,entertainment,0,M,1,Nature conservation officer,307
3,gas_transport,2,M,1,Patent attorney,328
4,misc_pos,9,M,1,Dance movement psychotherapist,116


Scaling Numerical Features (Step 5):
Standardize the numerical features (amt, lat, long, merch_lat, merch_long, city_pop) with StandardScaler so that each has zero mean and unit variance.

In [16]:
from sklearn.preprocessing import StandardScaler

# Continuous columns that are standardised below
numerical_features = ['amt', 'lat', 'long', 'merch_lat', 'merch_long', 'city_pop']

# Fit a fresh scaler on these columns and write the standardised values back in place.
# NOTE(review): an earlier cell already standardised the same columns in place, so this
# second fit sees standardised values and `scaler` no longer holds the raw-data
# mean/scale — confirm whether repeating the step is intended.
scaler = StandardScaler()
standardized_values = scaler.fit_transform(fraud_train_df[numerical_features])
fraud_train_df[numerical_features] = standardized_values

# Preview the standardised columns
print("\nDisplaying the dataset after scaling numerical features:")
scaled_preview = fraud_train_df[numerical_features].head()
display(scaled_preview)



Displaying the dataset after scaling numerical features:


Unnamed: 0,amt,lat,long,merch_lat,merch_long,city_pop
0,-0.407826,-0.48442,0.65762,-0.494354,0.593864,-0.282589
1,0.230039,2.03912,-2.03387,2.078699,-2.030341,-0.29367
2,0.934149,0.717754,-1.601537,0.902849,-1.592323,-0.280406
3,-0.158132,1.515617,-1.590766,1.662886,-1.621848,-0.287742
4,-0.177094,-0.023035,0.782279,0.026941,0.841909,-0.293835


In [17]:
# List every column currently on the training frame (original and engineered)
all_column_names = fraud_train_df.columns.tolist()
print(all_column_names)


['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud', 'transaction_year', 'transaction_month', 'transaction_day', 'transaction_hour', 'transaction_dayofweek', 'category_encoded', 'gender_encoded', 'job_encoded']


In [18]:
# Columns removed from the working frame at this stage
columns_to_drop = ['street']

# errors='ignore' keeps the cell re-runnable: the frame is reassigned from itself, so on a
# second run 'street' is already gone and a plain drop would raise KeyError.
fraud_train_df = fraud_train_df.drop(columns=columns_to_drop, errors='ignore')

# Display the dataset to verify that the columns have been dropped
print("\nDisplaying the dataset after dropping unnecessary columns:")
display(fraud_train_df.head())



Displaying the dataset after dropping unnecessary columns:


Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,city,...,merch_long,is_fraud,transaction_year,transaction_month,transaction_day,transaction_hour,transaction_dayofweek,category_encoded,gender_encoded,job_encoded
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,-0.407826,Jennifer,Banks,F,Moravian Falls,...,0.593864,0,2019,1,1,0,1,8,0,370
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,0.230039,Stephanie,Gill,F,Orient,...,-2.030341,0,2019,1,1,0,1,4,0,428
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,0.934149,Edward,Sanchez,M,Malad City,...,-1.592323,0,2019,1,1,0,1,0,1,307
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,-0.158132,Jeremy,White,M,Boulder,...,-1.621848,0,2019,1,1,0,1,2,1,328
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,-0.177094,Tyler,Garcia,M,Doe Hill,...,0.841909,0,2019,1,1,0,1,9,1,116


In [19]:
# Print the heading and the surviving column names, one per output line
remaining_columns = fraud_train_df.columns.tolist()
print("\nRemaining columns after dropping:", remaining_columns, sep="\n")



Remaining columns after dropping:
['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt', 'first', 'last', 'gender', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud', 'transaction_year', 'transaction_month', 'transaction_day', 'transaction_hour', 'transaction_dayofweek', 'category_encoded', 'gender_encoded', 'job_encoded']


In [20]:
from sklearn.model_selection import train_test_split

# Features are every column except the label; the label column is the target
target_column = 'is_fraud'
X = fraud_train_df.drop(columns=target_column)
y = fraud_train_df[target_column]

# Stratified 70/30 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42
)

# Report the size of each of the four pieces
for label, piece in (
    ("\nShape of training features:", X_train),
    ("Shape of training target:", y_train),
    ("Shape of test features:", X_test),
    ("Shape of test target:", y_test),
):
    print(label, piece.shape)



Shape of training features: (907672, 29)
Shape of training target: (907672,)
Shape of test features: (389003, 29)
Shape of test target: (389003,)


In [21]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns


In [22]:
from sklearn.model_selection import train_test_split

# Rebuild the feature matrix (all columns except 'is_fraud') and the target vector
y = fraud_train_df['is_fraud']
X = fraud_train_df.drop(labels='is_fraud', axis='columns')

# Same 70/30 stratified split and seed as the earlier split cell
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [26]:
# Inspect the dtype of every feature column to spot non-numeric ones
feature_dtypes = X_train.dtypes
print(feature_dtypes)


Unnamed: 0                        int64
trans_date_trans_time    datetime64[ns]
cc_num                            int64
merchant                         object
category                         object
amt                             float64
first                            object
last                             object
gender                           object
city                             object
state                            object
zip                               int64
lat                             float64
long                            float64
city_pop                        float64
job                              object
dob                              object
trans_num                        object
unix_time                         int64
merch_lat                       float64
merch_long                      float64
transaction_year                  int32
transaction_month                 int32
transaction_day                   int32
transaction_hour                  int32


In [27]:
# Columns excluded from the model inputs: the 'Unnamed: 0' and 'cc_num' integer columns
# (presumably a row index and a card identifier — confirm), the raw timestamp, the
# object-typed text columns, and the categorical columns that already have '*_encoded'
# counterparts.
columns_to_drop = ['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
                   'first', 'last', 'gender', 'city', 'state', 'job', 'dob', 'trans_num']

# errors='ignore' keeps the cell re-runnable: X_train / X_test are reassigned from
# themselves, so on a second run the columns are already gone and a plain drop would
# raise KeyError.
X_train = X_train.drop(columns=columns_to_drop, errors='ignore')
X_test = X_test.drop(columns=columns_to_drop, errors='ignore')

# Display the remaining columns to verify
print("\nRemaining columns after dropping unnecessary ones:")
print(X_train.columns.tolist())



Remaining columns after dropping unnecessary ones:
['amt', 'zip', 'lat', 'long', 'city_pop', 'unix_time', 'merch_lat', 'merch_long', 'transaction_year', 'transaction_month', 'transaction_day', 'transaction_hour', 'transaction_dayofweek', 'category_encoded', 'gender_encoded', 'job_encoded']


In [28]:
# Print the heading followed by the dtype table to confirm only numeric columns remain
print("\nData types of columns in X_train after dropping:", X_train.dtypes, sep="\n")



Data types of columns in X_train after dropping:
amt                      float64
zip                        int64
lat                      float64
long                     float64
city_pop                 float64
unix_time                  int64
merch_lat                float64
merch_long               float64
transaction_year           int32
transaction_month          int32
transaction_day            int32
transaction_hour           int32
transaction_dayofweek      int32
category_encoded           int64
gender_encoded             int64
job_encoded                int64
dtype: object


In [30]:
# Initialize and train the RandomForestClassifier (50 trees, fixed seed for reproducibility).
# n_jobs=-1 builds the trees on all available CPU cores; the seed is unchanged, so this is
# meant to affect training time only, not the fitted model.
rf_classifier = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
rf_classifier.fit(X_train, y_train)

print("Model training completed successfully!")


Model training completed successfully!
