In [None]:
# Fraud_Detection_Analysis.ipynb

# Cell 1: Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')


# 💳 Fraud Detection Using Machine Learning

Welcome to the **Fraud Detection Project**. This notebook walks through the complete data science workflow to analyze transaction data and detect fraudulent activities.

### Objectives:
- Perform Exploratory Data Analysis (EDA)
- Preprocess the data
- Train machine learning models
- Evaluate performance


In [None]:
# Cell 2: Load and Merge Data
# Three CSV tables (transactions, cards, fraud labels) plus a JSON lookup of
# merchant category codes are read from the local "data/" directory.
df_tx = pd.read_csv("data/transactions_data.csv")
# NOTE(review): "cards_dat.csv" may be a typo for "cards_data.csv" - confirm
# against the files actually present in data/.
df_cards = pd.read_csv("data/cards_dat.csv")
df_labels = pd.read_csv("data/fraud_labels.csv")

with open("data/mcc_codes.json", encoding="utf-8") as f:
    mcc_map = json.load(f)


def _mcc_key(code):
    """Return `code` as a string so it can be looked up in `mcc_map`.

    JSON object keys are always strings after `json.load`, while a code column
    read by `pd.read_csv` may come back as integers or floats. Whole-number
    floats (e.g. 5411.0) are rendered without the fractional part.
    """
    if isinstance(code, (float, np.floating)) and float(code).is_integer():
        code = int(code)
    return str(code)


# Merge
# Left joins keep every transaction; rows without a matching label or card
# get NaN in the joined columns.
df = df_tx.merge(df_labels, on="transaction_id", how="left")
df = df.merge(df_cards, on="card_id", how="left")
# Normalise codes to strings first (missing codes stay missing), then look up
# the description; codes absent from the mapping become NaN.
df["mcc_desc"] = df["mcc_code"].map(_mcc_key, na_action="ignore").map(mcc_map)

df.head()


## 📊 Exploratory Data Analysis (EDA)
Let's explore the dataset using 11 different analyses.


In [None]:
# 1. Sample Preview
# Seeded so the preview is identical on every Restart & Run All, and capped at
# the frame length so a very small dataset does not raise.
df.sample(n=min(5, len(df)), random_state=42)


In [None]:
# 2. Summary Statistics
# Descriptive statistics from DataFrame.describe() with its default settings.
summary_stats = df.describe()
summary_stats


In [None]:
# 3. Missing Value Analysis
# Number of missing entries in each column.
missing_counts = df.isna().sum()
missing_counts


In [None]:
# 4. Data Types and Unique Value Counts
# One row per column of df: its dtype next to its number of distinct values.
column_dtypes = df.dtypes
distinct_counts = df.nunique()
pd.DataFrame({"Data Type": column_dtypes, "Unique Values": distinct_counts})


In [None]:
# 5. Fraud Class Distribution
# Bar chart of how many rows carry each is_fraud value.
class_counts = df["is_fraud"].value_counts()
ax = class_counts.plot.bar(title="Fraud Class Distribution")
ax.set_xlabel("Is Fraud")
ax.set_ylabel("Count")
plt.show()


In [None]:
# 6. Transaction Amount Distribution
# 50-bin histogram of the amount column with a KDE curve overlaid.
ax = sns.histplot(data=df, x="amount", bins=50, kde=True)
ax.set_title("Transaction Amount Distribution")
plt.show()


In [None]:
# 7. Correlation Matrix
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
numeric_cols = df.select_dtypes(include=[np.number])
corr = numeric_cols.corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Matrix")
plt.show()


In [None]:
# 8. Outliers: Boxplot of Amount by Fraud
# One box per is_fraud value showing the spread of amount.
ax = sns.boxplot(x="is_fraud", y="amount", data=df)
ax.set_title("Amount by Fraud Status")
plt.show()


In [None]:
# 9. Top 10 MCC Descriptions
# value_counts() sorts by frequency (descending), so the first ten entries
# are the ten most common descriptions.
mcc_counts = df["mcc_desc"].value_counts()
mcc_counts.iloc[:10]


In [None]:
# 10. Fraud Rate by Country
# Mean of is_fraud within each country_code, highest first; show the top ten.
fraud_by_country = df.groupby("country_code")["is_fraud"]
fraud_rate = fraud_by_country.mean().sort_values(ascending=False)
fraud_rate.iloc[:10]


In [None]:
# 11. Pairplot (sample)
# Plot at most 500 complete rows. Capping n at the number of available rows
# avoids the ValueError that sample() raises when asked for more rows than exist.
pair_rows = df[["amount", "is_fraud"]].dropna()
sample = pair_rows.sample(n=min(500, len(pair_rows)), random_state=42)
sns.pairplot(sample, hue="is_fraud")
plt.show()


## 🔄 Data Preprocessing and Model Training

We'll use one-hot encoding for categorical variables, split the data, and train models.


In [None]:
# Model inputs: one numeric column plus two categorical columns. Rows with a
# missing feature or a missing label are dropped before encoding.
features = ["amount", "mcc_desc", "country_code"]
df_model = df[features + ["is_fraud"]].dropna()

# One-hot encoding
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# name; fall back to the old keyword only when the new one is not accepted.
try:
    enc = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:
    enc = OneHotEncoder(sparse=False, handle_unknown="ignore")
X_cat = enc.fit_transform(df_model[["mcc_desc", "country_code"]])
# Final design matrix: the amount column followed by the one-hot columns.
X = np.hstack([df_model[["amount"]].values, X_cat])
y = df_model["is_fraud"].values

# Train-test split
# Stratify on the label so the 80/20 split keeps the same class proportions
# in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


### Model 1: Logistic Regression


In [None]:
# Fit the logistic-regression model on the training split.
lr_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Hard class predictions, and the probability column at index 1 of
# predict_proba, both on the held-out split.
y_pred_lr = lr_model.predict(X_test)
lr_proba = lr_model.predict_proba(X_test)
y_prob_lr = lr_proba[:, 1]

# ROC AUC is computed from the probabilities, not the hard predictions.
auc_lr = roc_auc_score(y_test, y_prob_lr)
print(f"Logistic Regression ROC AUC: {auc_lr:.4f}")


### Model 2: XGBoost


In [None]:
# Fit the gradient-boosted classifier on the training split.
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss")
xgb_model.fit(X_train, y_train)

# Hard class predictions, and the probability column at index 1 of
# predict_proba, both on the held-out split.
y_pred_xgb = xgb_model.predict(X_test)
xgb_proba = xgb_model.predict_proba(X_test)
y_prob_xgb = xgb_proba[:, 1]

# ROC AUC is computed from the probabilities, not the hard predictions.
auc_xgb = roc_auc_score(y_test, y_prob_xgb)
print(f"XGBoost ROC AUC: {auc_xgb:.4f}")


In [None]:
# Confusion Matrix for best model
# Choose the model with the higher test ROC AUC instead of assuming XGBoost
# won; ties go to XGBoost, which matches the previous hard-coded choice.
if auc_xgb >= auc_lr:
    best_name, best_pred = "XGBoost", y_pred_xgb
else:
    best_name, best_pred = "Logistic Regression", y_pred_lr

cm = confusion_matrix(y_test, best_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
disp.plot()
plt.title(f"Confusion Matrix - {best_name}")
plt.show()


## 🧾 Conclusion

- Successfully performed 11 types of EDA on the fraud dataset.
- Preprocessed data using encoding and splitting.
- Trained and evaluated Logistic Regression and XGBoost models.
- **XGBoost achieved a better ROC AUC score** than Logistic Regression.
- The model is capable of detecting fraudulent transactions with reasonable performance.

### 🔁 Next Steps:
- Add feature engineering (time-based, frequency-based features).
- Try oversampling or undersampling due to class imbalance.
- Use SHAP values to explain model predictions.
