<a href="https://colab.research.google.com/github/Aditya17-bot/Fraud_detection/blob/main/fraud.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install imbalanced-learn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pylab import rcParams
rcParams['figure.figsize'] = (14, 8)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE


In [None]:
# CSV files holding the pre-split train and test sets (paths are relative to
# the notebook's working directory).
train_path = 'fraudTrain.csv'
test_path  = 'fraudTest.csv'

# Read both splits, train first and then test.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Report (rows, columns) for each split and preview the first training rows.
print("Train shape:", train_df.shape)
print("Test shape :", test_df.shape)
train_df.head()


In [None]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer

# Columns fed to the preprocessing pipeline.
# NOTE(review): other cells in this notebook reference 'Amount' and 'Class'
# (capitalised) -- confirm these names match the actual CSV header.
categorical_cols = ['city', 'merchant', 'category', 'gender']  # strings
numeric_cols = ['amount', 'age']  # numbers to scale to 0-1

# Fail early, with a readable message, if the expected columns are absent.
missing_cols = [
    col for col in categorical_cols + numeric_cols
    if col not in train_df.columns
]
if missing_cols:
    raise KeyError(f"Columns missing from train_df: {missing_cols}")

# One-hot encode the categorical columns (categories unseen during fit are
# encoded as all zeros) and min-max scale the numeric columns to [0, 1].
# Columns not listed here are dropped (ColumnTransformer's default).
preprocess = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', MinMaxScaler(), numeric_cols)  # scales to [0,1]
])

# Fit on the training DataFrame only, then apply the same fitted transform to
# the test DataFrame. The transformer needs the DataFrames themselves (it
# selects columns by name), not the CSV path strings.
X_train_processed = preprocess.fit_transform(train_df)
X_test_processed = preprocess.transform(test_df)


In [None]:
# Tick labels for the class bars, in sorted class order (0 first, then 1).
LABELS = ["Normal", "Fraud"]

# Bar chart: number of training rows per class.
class_counts = train_df['Class'].value_counts().sort_index()
ax = class_counts.plot.bar(rot=0)
ax.set(
    title="Transaction Class Distribution (Train)",
    xlabel="Class",
    ylabel="Frequency",
)
ax.set_xticklabels(LABELS)
plt.show()

# Split the training rows by label for the per-class histograms below.
fraud = train_df[train_df['Class'].eq(1)]
normal = train_df[train_df['Class'].eq(0)]
print("Fraud, Normal shapes:", fraud.shape, normal.shape)

# Two stacked histograms of transaction amount sharing one x-axis.
bins = 50
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True)
fig.suptitle('Amount per transaction by class (Train)')

ax1.hist(fraud['Amount'], bins=bins)
ax1.set_title('Fraud')

ax2.hist(normal['Amount'], bins=bins)
ax2.set_title('Normal')

# Axis labels and the log y-scale go on the lower panel only; the x-limits
# also reach the upper panel because the x-axis is shared.
# NOTE(review): the 'Fraud' panel keeps a linear y-axis -- confirm intended.
ax2.set_xlabel('Amount ($)')
ax2.set_ylabel('Count')
ax2.set_yscale('log')
ax2.set_xlim((0, 20000))
plt.show()


In [None]:
# Label column; every other column is treated as a model input.
target_col = 'Class'
feature_cols = train_df.columns[train_df.columns != target_col].tolist()

# Features and integer labels for each split, using the same column list.
X_train_raw, y_train = train_df[feature_cols], train_df[target_col].astype(int)
X_test_raw, y_test = test_df[feature_cols], test_df[target_col].astype(int)

# Optional: pairwise correlation of the numeric training columns as a heatmap.
numeric_train = train_df.select_dtypes(include='number')
corr = numeric_train.corr()
corr_fig, corr_ax = plt.subplots(figsize=(18, 14))
sns.heatmap(corr, cmap='RdYlGn', center=0, ax=corr_ax)
corr_ax.set_title('Correlation (Train)')
plt.show()

# Sanity check: feature and label shapes for both splits.
print("Train features/labels:", X_train_raw.shape, y_train.shape)
print("Test  features/labels:", X_test_raw.shape, y_test.shape)


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply
# that same fitted scaler to both splits so no test information leaks in.
# NOTE(review): StandardScaler needs numeric input, and X_train_raw holds
# every non-target column -- confirm none of them are strings.
scaler = StandardScaler().fit(X_train_raw)
X_train_scaled = scaler.transform(X_train_raw)
X_test_scaled  = scaler.transform(X_test_raw)


In [None]:
# Mount Google Drive when running on Colab. Outside Colab the `google.colab`
# package is not installed, so skip the mount instead of aborting a
# "Restart & Run All" on a local kernel.
try:
    from google.colab import drive
except ImportError:
    drive = None
    print("google.colab not available; skipping Google Drive mount.")
else:
    drive.mount('/content/drive')

In [None]:
# Resample the scaled training set with SMOTE (seeded for reproducibility).
# Only the training split is resampled; the test split is left untouched.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train_scaled, y_train)
X_train_bal, y_train_bal = resampled

# Show the resampled shapes and the number of rows per class label.
print("Balanced train shapes:", X_train_bal.shape, y_train_bal.shape)
balanced_counts = np.bincount(y_train_bal)
print("Balanced class counts:", balanced_counts)


In [None]:
# Random forest settings: 200 trees, fixed seed, class-balanced sample
# weights, and all available CPU cores.
rf_params = dict(
    n_estimators=200,
    random_state=42,
    class_weight='balanced',
    n_jobs=-1,
)
rf = RandomForestClassifier(**rf_params)

# Train on the SMOTE-resampled training data.
rf.fit(X_train_bal, y_train_bal)


In [None]:
# Hard class predictions for the scaled test features.
y_pred = rf.predict(X_test_scaled)

# Text metrics: confusion matrix, per-class report, and overall accuracy.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)
report = classification_report(y_test, y_pred, digits=4)
print("\nClassification Report:\n", report)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Confusion matrix as an annotated heatmap (rows = actual, columns = predicted).
class_names = ['Normal', 'Fraud']
cm_fig, cm_ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Oranges',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=cm_ax,
)
cm_ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix: Random Forest')
plt.show()


In [None]:
# Second column of predict_proba, used here as the fraud score.
# NOTE(review): assumes rf.classes_[1] is the fraud label (1) -- confirm.
class_proba = rf.predict_proba(X_test_scaled)
y_scores = class_proba[:, 1]

# Decision cut-off: scores at or above it are labelled 1, the rest 0.
threshold = 0.50
y_pred_thr = np.greater_equal(y_scores, threshold).astype(int)

# Metrics at the chosen threshold.
cm_thr = confusion_matrix(y_test, y_pred_thr)
print("Threshold =", threshold)
print("Confusion Matrix:\n", cm_thr)
report_thr = classification_report(y_test, y_pred_thr, digits=4)
print("\nClassification Report:\n", report_thr)
