In [30]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [31]:
# Load the datasets
train_data = pd.read_csv("fraudTrain.csv")  # Update file path if needed
test_data = pd.read_csv("fraudTest.csv")  # Update file path if needed

# Display dataset information
print("Train Data Info:")
print(train_data.info())

print("\nFirst 5 rows of Train Data:")
print(train_data.head())


Train Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201829 entries, 0 to 201828
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             201829 non-null  int64  
 1   trans_date_trans_time  201829 non-null  object 
 2   cc_num                 201829 non-null  int64  
 3   merchant               201829 non-null  object 
 4   category               201829 non-null  object 
 5   amt                    201829 non-null  float64
 6   first                  201829 non-null  object 
 7   last                   201828 non-null  object 
 8   gender                 201828 non-null  object 
 9   street                 201828 non-null  object 
 10  city                   201828 non-null  object 
 11  state                  201828 non-null  object 
 12  zip                    201828 non-null  float64
 13  lat                    201828 non-null  float64
 14  long               

In [32]:
# Check for missing values
print("\nMissing values in Train Data:")
print(train_data.isnull().sum())

print("\nMissing values in Test Data:")
print(test_data.isnull().sum())



Missing values in Train Data:
Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     1
gender                   1
street                   1
city                     1
state                    1
zip                      1
lat                      1
long                     1
city_pop                 1
job                      1
dob                      1
trans_num                1
unix_time                1
merch_lat                1
merch_long               1
is_fraud                 1
dtype: int64

Missing values in Test Data:
Unnamed: 0               0
trans_date_trans_time    1
cc_num                   1
merchant                 1
category                 1
amt                      1
first                    1
last                     1
gender                   1
street                   1
city                     1
state   

In [33]:
# Drop columns that are not useful for prediction
columns_to_drop = ["trans_date_trans_time", "first", "last", "street", "dob", "trans_num"]
train_data.drop(columns=columns_to_drop, inplace=True)
test_data.drop(columns=columns_to_drop, inplace=True)


In [34]:
# Fill missing values using forward fill method
train_data.fillna(method="ffill", inplace=True)
test_data.fillna(method="ffill", inplace=True)


  train_data.fillna(method="ffill", inplace=True)
  test_data.fillna(method="ffill", inplace=True)


In [38]:
# Encode categorical variables using LabelEncoder, handling unknown labels
label_encoders = {}
for col in ["merchant", "category", "city", "state", "job", "gender"]:
    le = LabelEncoder()
    # Convert all values to strings before fitting the LabelEncoder
    combined_values = pd.concat([train_data[col], test_data[col]], ignore_index=True).astype(str).unique()
    le.fit(combined_values)
    train_data[col] = le.transform(train_data[col].astype(str))
    test_data[col] = le.transform(test_data[col].astype(str))
    label_encoders[col] = le  # Store encoders for future use if needed

In [39]:
# Define features (X) and target (y)
X = train_data.drop("is_fraud", axis=1)  # Features
y = train_data["is_fraud"]  # Target Variable

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [41]:
# Standardize the numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Remove the 'is_fraud' column from test_data before scaling
X_test_scaled = scaler.transform(test_data.drop(columns=['is_fraud']))  # Scale the test data the same way

In [42]:
# Apply SMOTE to balance the fraud and non-fraud transactions
smote = SMOTE(sampling_strategy=1.0, random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train_scaled, y_train)

# Check new class distribution after SMOTE
print("\nClass distribution after SMOTE:")
print(y_train_sm.value_counts())



Class distribution after SMOTE:
is_fraud
0.0    160141
1.0    160141
Name: count, dtype: int64


In [43]:

# Train a Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_sm, y_train_sm)


In [44]:
# Make predictions on validation data
y_val_pred = model.predict(X_val_scaled)

# Evaluate the model
print("\nAccuracy Score:", accuracy_score(y_val, y_val_pred))
print("\nClassification Report:\n", classification_report(y_val, y_val_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_val, y_val_pred))



Accuracy Score: 0.9972006143784373

Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     40035
         1.0       0.83      0.82      0.83       331

    accuracy                           1.00     40366
   macro avg       0.92      0.91      0.91     40366
weighted avg       1.00      1.00      1.00     40366


Confusion Matrix:
 [[39980    55]
 [   58   273]]


In [45]:
# Predict fraud status for test data
test_predictions = model.predict(X_test_scaled)

# Save predictions to test dataset
test_data["is_fraud"] = test_predictions
