# 📊 Fraud Detection – INSAID Internship Project

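This notebook walks through a step-by-step approach to detecting fraudulent transactions with machine learning: loading and inspecting the data, rebalancing the classes with SMOTE, training a random forest classifier, and evaluating it on a held-out test set.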

In [None]:
# 1. Import Required Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from statsmodels.stats.outliers_influence import variance_inflation_factor

import warnings
# Install a blanket "ignore" filter so no warning of any category is shown in the
# cells that run after this one.
# NOTE(review): this also hides deprecation warnings from the libraries imported
# above — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# 2. Load Dataset (update DATA_PATH if needed)
# Only the first SAMPLE_ROWS rows of the CSV are read, which keeps the notebook
# quick to run while testing.
DATA_PATH = "fraud_dataset.csv"
SAMPLE_ROWS = 100_000

df = pd.read_csv(DATA_PATH, nrows=SAMPLE_ROWS)

# Preview the first five rows.
df.head()

In [None]:
# 3. Data Overview
# Dimensions as a (rows, columns) tuple.
print(df.shape)

# df.info() writes its report to stdout itself and returns None, so it is called
# bare — wrapping it in print() would append a stray "None" line to the output.
df.info()

# Summary statistics; left as the cell's last expression so the notebook renders
# the result as a table.
df.describe()

In [None]:
# 4. Check for Missing Values
# Per-column count of null entries (isna is pandas' alias for isnull).
df.isna().sum()

In [None]:
# 5. Correlation Heatmap
# Correlation is only computed over numeric data, so restrict the frame to its
# numeric/boolean columns first. Calling df.corr() directly raises on
# pandas >= 2.0 if the frame still holds string columns.
numeric_df = df.select_dtypes(include=["number", "bool"])

# Draw on an explicit Axes so the heatmap does not depend on pyplot's current figure.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")

# plt.show() keeps the Axes repr out of the cell output.
plt.show()

In [None]:
# 6. Handle Class Imbalance (SMOTE)
# Separate the target column from the predictor columns.
y = df['is_fraud']
X = df.drop(columns='is_fraud')

# Hold out 20% of the rows for testing, stratified on the label so both splits
# keep roughly the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Oversample the training split only; the test split is left untouched.
# NOTE(review): SMOTE synthesises samples from feature values, so X presumably
# has to be all-numeric here — confirm against the dataset schema.
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [None]:
# 7. Train Random Forest Model
# Fit on the resampled training data; fit() hands back the estimator itself, so
# construction and training are chained into one statement.
model = RandomForestClassifier(random_state=42).fit(X_train_res, y_train_res)

# Hard class predictions for the held-out test split.
y_pred = model.predict(X_test)

In [None]:
# 8. Evaluation Metrics
# Label-based reports first: the confusion matrix, then the classification
# report (per-class precision/recall/F1).
for label_report in (confusion_matrix, classification_report):
    print(label_report(y_test, y_pred))

# ROC AUC is computed from scores rather than hard labels: take the second
# predict_proba column as the score for each test row.
positive_scores = model.predict_proba(X_test)[:, 1]
print("ROC AUC Score:", roc_auc_score(y_test, positive_scores))

In [None]:
# 9. ROC Curve
# Score the test rows with the second predict_proba column and sweep the decision
# threshold to get the false/true positive rate pairs.
y_score = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)

# Draw on an explicit Axes so the curve never lands on a leftover figure.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="Random Forest")
# Diagonal reference: the ROC of a classifier that guesses at random.
ax.plot([0, 1], [0, 1], linestyle="--", color="grey", label="Chance")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend(loc="lower right")

# plt.show() keeps the Text(...) repr out of the cell output.
plt.show()

In [None]:
# 10. Feature Importance
# Pair each importance score from the fitted forest with its column name.
importances = model.feature_importances_
features = pd.Series(importances, index=X.columns)

# barh draws the first row at the bottom of the chart, so sort the ten largest
# values ascending to put the most important feature at the top.
top_features = features.nlargest(10).sort_values()

# Draw on an explicit Axes so the bars never land on a leftover figure.
fig, ax = plt.subplots()
top_features.plot(kind='barh', ax=ax)
ax.set_xlabel("Importance")
ax.set_title("Top 10 Important Features")

# plt.show() keeps the Text(...) repr out of the cell output.
plt.show()

## ✅ Done! Next steps: interpret the key features, recommend fraud-prevention measures, and define post-implementation monitoring in Markdown cells.