In [30]:
import pandas as pd

# Load dataset
data = pd.read_csv('data/transactions.csv')

# Show first few rows
data.head()


Unnamed: 0,transaction_id,user_id,amount,timestamp,location,device_type,user_age_days,transaction_count_last_1_day,transaction_count_last_1_hour,average_amount_last_7_days,is_kyc_verified,location_distance_from_home_km,is_foreign_device,previous_fraud_count,label
0,f53ae382-0a93-41ea-b2d6-26b3abe88ebd,USR0930,40202.09,2025-08-30T13:12:05.794185,Hyderabad,Android,602,8,1,20025.01,False,304.63,False,3,0
1,0d805cba-704a-46f3-ae56-48e7cbff352a,USR0418,4796.59,2025-08-20T18:49:05.794213,Delhi,Android,193,29,3,44287.84,True,378.01,False,4,0
2,14b32c59-bcde-4382-abdd-52e7e66298cc,USR0478,94985.58,2025-08-25T17:48:05.794228,Hyderabad,iOS,516,29,3,17489.56,True,188.66,True,0,0
3,1beba633-81a9-4852-b4aa-f6d2c66c78e0,USR0934,44433.91,2025-08-17T18:59:05.794238,Chennai,Web,285,34,1,27704.81,False,122.35,False,1,0
4,2d25818d-aece-44d0-97ff-e84e75177c80,USR0317,54842.06,2025-08-22T20:50:05.794247,Mumbai,Web,377,29,4,9556.8,False,143.91,True,3,0


In [31]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical columns
for col in ['location', 'device_type']:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])

# Convert boolean columns to integers
data['is_kyc_verified'] = data['is_kyc_verified'].astype(int)
data['is_foreign_device'] = data['is_foreign_device'].astype(int)

# Quick check
data.head()


Unnamed: 0,transaction_id,user_id,amount,timestamp,location,device_type,user_age_days,transaction_count_last_1_day,transaction_count_last_1_hour,average_amount_last_7_days,is_kyc_verified,location_distance_from_home_km,is_foreign_device,previous_fraud_count,label
0,f53ae382-0a93-41ea-b2d6-26b3abe88ebd,USR0930,40202.09,2025-08-30T13:12:05.794185,3,0,602,8,1,20025.01,0,304.63,0,3,0
1,0d805cba-704a-46f3-ae56-48e7cbff352a,USR0418,4796.59,2025-08-20T18:49:05.794213,2,0,193,29,3,44287.84,1,378.01,0,4,0
2,14b32c59-bcde-4382-abdd-52e7e66298cc,USR0478,94985.58,2025-08-25T17:48:05.794228,3,2,516,29,3,17489.56,1,188.66,1,0,0
3,1beba633-81a9-4852-b4aa-f6d2c66c78e0,USR0934,44433.91,2025-08-17T18:59:05.794238,1,1,285,34,1,27704.81,0,122.35,0,1,0
4,2d25818d-aece-44d0-97ff-e84e75177c80,USR0317,54842.06,2025-08-22T20:50:05.794247,4,1,377,29,4,9556.8,0,143.91,1,3,0


In [32]:
# Features
X = data.drop(columns=['transaction_id', 'user_id', 'timestamp', 'label'])

# Target
y = data['label']

# Check shapes
print("Features shape:", X.shape)
print("Labels shape:", y.shape)


Features shape: (10000, 11)
Labels shape: (10000,)


In [33]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training samples:", X_train.shape[0])
print("Testing samples:", X_test.shape[0])


Training samples: 8000
Testing samples: 2000


In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Train model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)

# Evaluation
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.93      1.00      0.96      1856
           1       0.00      0.00      0.00       144

    accuracy                           0.93      2000
   macro avg       0.46      0.50      0.48      2000
weighted avg       0.86      0.93      0.89      2000

[[1854    2]
 [ 144    0]]


In [35]:
import joblib

joblib.dump(clf, 'models/supervised_model.pkl')


['models/supervised_model.pkl']

In [36]:
model = joblib.load('models/supervised_model.pkl')
print("Model loaded successfully!")


Model loaded successfully!
