In [1]:
#importing necessary libraries
import pandas as pd
import zipfile
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, accuracy_score
from google.colab import files
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt


#extracting the data from zip file
with zipfile.ZipFile('/content/drive/MyDrive/data/fraudTest.csv (1).zip', 'r') as ref:
    ref.extractall()

#Reading the data into the pandas dataframe
df=pd.read_csv('/content/drive/MyDrive/data/fraudTest.csv (1).zip')




#preprocessing the data:

df = df.drop(['Unnamed: 0', 'cc_num', 'trans_num', 'street', 'unix_time'], axis=1)#dropping unnecessary columns

df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time']) #Converting date columns to datetime format
df['dob'] = pd.to_datetime(df['dob'])

#Derive new 'age' column from 'dob'
df['age'] = (pd.to_datetime('today') - df['dob']).dt.days
#Dropping original date columns after extracting useful info
df = df.drop(['trans_date_trans_time', 'dob'], axis=1)
cat_cols = ['gender', 'category', 'job', 'merchant']

#Label encoding categorical columns using loop
le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col])


#Handling class imbalance by downsampling the majority class (not fraud)
not_fraud=df[df['is_fraud']==0]
fraud=df[df['is_fraud']==1]



#Randomly sample non-fraud transactions to balance the dataset
not_fraud_sample=not_fraud.sample(n=2145)

# Combine sampled non-fraud and all fraud cases into a new balanced dataset
new=pd.concat([not_fraud_sample,fraud],axis=0)


#drooping other unecessary columns
new=new.drop(['first','last'],axis=1)
new=new.drop(['city','state'],axis=1)

#splitting the data into features(X) and label(Y) for training and testing
X=new.drop('is_fraud',axis=1)
y=new['is_fraud']

#splitting the data into train data and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)

#Training the Random forst model
model=RandomForestClassifier()
model.fit(X_train,y_train)

#making prediction and evaluating the performance
y_pred=model.predict(X_test)
print("Random Forest classification report:",classification_report(y_test,y_pred))
print("the accuracy of Random Forest model is:",accuracy_score(y_test,y_pred))

# Train a Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)




# Evaluate performance
dt_pred = dt_model.predict(X_test)
print("Decision Tree Classification Report:")
print(classification_report(y_test, dt_pred))
print("Decision Tree Accuracy:", accuracy_score(y_test, dt_pred))





Random Forest classification report:               precision    recall  f1-score   support

           0       0.93      0.96      0.94       425
           1       0.96      0.93      0.94       433

    accuracy                           0.94       858
   macro avg       0.94      0.94      0.94       858
weighted avg       0.94      0.94      0.94       858

the accuracy of Random Forest model is: 0.9440559440559441
Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.93      0.93       425
           1       0.93      0.94      0.94       433

    accuracy                           0.93       858
   macro avg       0.93      0.93      0.93       858
weighted avg       0.93      0.93      0.93       858

Decision Tree Accuracy: 0.9347319347319347
