# Problem Statement
* Credit card companies lose money because of fake transactions. When someone steals card details and makes purchases, it hurts both banks and customers. This project uses machine learning to find which transactions are real and which are fake. I will build a model that can automatically detect fraud and help banks stop bad transactions before they happen.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve

# Load Data

In [None]:
# Read the transactions dataset from 'creditcard.csv' in the working directory.
data=pd.read_csv('creditcard.csv')

In [None]:
# Display the loaded DataFrame.
data

In [None]:
# List the column names.
data.columns

In [None]:
# Lift the column display limit so DataFrames are shown with every column.
pd.set_option('display.max_columns',None)

In [None]:
# Display the DataFrame again, now that the column limit is removed.
data

# Basic checks

In [None]:
# (number of rows, number of columns).
data.shape

In [None]:
# Data type of each column.
data.dtypes

In [None]:
# Descriptive summary statistics per column.
data.describe()

In [None]:
# Last rows of the dataset.
data.tail()

In [None]:
# Number of rows for each value of the Class label.
data['Class'].value_counts()

# Domain Analysis
* Most of the transactions are normal, only a very small part are fraud.

* The Amount column shows how much money was spent in each transaction, we can compare fraud vs normal here.

* The Time column shows when the transaction happened, we can check if fraud happens more at some specific times.

* These simple checks give us an idea of how fraud looks in real life before making a model.

# EDA

In [None]:
# Class balance: histogram of the target label.
plt.figure(figsize=(5, 5))
sns.histplot(x='Class', data=data)

In [None]:
# Display the DataFrame for reference.
data

In [None]:
# Histogram of the transaction Amount with a KDE curve overlaid.
sns.histplot(x="Amount", data=data, kde=True)
plt.show()

In [None]:
# Density histogram (with KDE) for every column except the Class label.
feature_cols = data.drop('Class', axis=1).columns
n_grid_cols = 2
# Ceiling division so the grid always has a slot for every feature
# (15 rows x 2 columns for 30 features).
n_grid_rows = -(-len(feature_cols) // n_grid_cols)

plt.figure(figsize=(22, 36))
for plotnumber, col in enumerate(feature_cols, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, plotnumber)
    sns.histplot(data[col], kde=True, stat="density", element="step")
    plt.xlabel(col, fontsize=10)
    # stat="density" draws densities, so the axis must not be labelled "Count".
    plt.ylabel("Density", fontsize=10)

plt.tight_layout()
plt.show()

In [None]:
# Distribution of transaction Time for each class.
# Each class is normalised on its own (stat='density', common_norm=False):
# fraud is only a very small part of the rows, so on a shared count scale its
# histogram would be flattened against the axis and could not be compared.
sns.histplot(data=data, x='Time', hue='Class', kde=True, stat='density', common_norm=False)

In [None]:
# Scatter plot of Amount against Time, with points coloured by Class.
sns.scatterplot(data=data,x='Time',y='Amount',hue='Class')

In [None]:
# Class label plotted against Time (semi-transparent points).
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=data, x="Time", y="Class", alpha=0.5, ax=ax)
ax.set_title("Fraud vs Non-Fraud over Time")
plt.show()

In [None]:
# Scatter plot of Amount (y axis) for each Class value (x axis).
sns.scatterplot(data=data,x='Class',y='Amount')

In [None]:
# Scatter plot of two of the PCA features (V1 vs V2), coloured by Class.
sns.scatterplot(data=data,x='V1',y='V2',hue='Class',alpha=0.6)

### Insights
* Most of the transactions are normal and only a very small part are fraud.

* Fraud transactions are not spread everywhere, they are grouped in some regions of feature space.

* Amount and Time show different spending patterns, and frauds may appear in specific ranges or times.

* PCA features (like V1, V2) show that fraud points sometimes cluster separately, but also mix with normal ones.

# BoxPlot

In [None]:
# Outlier check for V2.
plt.figure(figsize=(8, 4))
ax = sns.boxplot(x='V2', data=data)
ax.set_title("Boxplot of  V2")

In [None]:
# Outlier check for V12.
plt.figure(figsize=(8, 4))
ax = sns.boxplot(x='V12', data=data)
ax.set_title("Boxplot of  V12")

In [None]:
# Outlier check for the transaction Amount.
plt.figure(figsize=(8, 4))
ax = sns.boxplot(x='Amount', data=data)
ax.set_title("Boxplot of Transaction Amount")


# Data Preprocessing

In [None]:
# IQR rule: Amount values more than 1.5 * IQR below Q1 or above Q3 count as outliers.
q1, q3 = (data['Amount'].quantile(q) for q in (0.25, 0.75))
iqr = q3 - q1

lower_limit = q1 - 1.5 * iqr
upper_limit = q3 + 1.5 * iqr
print(lower_limit, upper_limit, sep="\n")

In [None]:
# Rows whose Amount is above upper_limit or below lower_limit (IQR outliers).
outlier=data.loc[(data['Amount']>upper_limit)|(data['Amount']<lower_limit)]

In [None]:
# Number of outlier rows (and columns).
outlier.shape

In [None]:
# Display the full DataFrame; the outlier rows were only selected, not removed.
data

In [None]:
# Count of missing values in each column.
data.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
data.duplicated().sum()

In [None]:
# Remove exact duplicate rows. The explicit copy makes `data` an independent
# DataFrame rather than a filtered selection of the original one, so adding
# columns to it afterwards cannot trigger pandas' SettingWithCopyWarning.
data=data.drop_duplicates().copy()

In [None]:
# Re-count duplicate rows after dropping them.
data.duplicated().sum()

# Feature Engineering

In [None]:
# Hour within a 24-hour cycle: Time modulo 86400, integer-divided by 3600.
# NOTE(review): assumes Time is measured in seconds - confirm against the dataset description.
data['Hour'] = (data['Time'] % 86400) // 3600

In [None]:
# Log-transformed amount; log1p is defined for Amount == 0.
data['Amount_log'] = np.log1p(data['Amount'])   # log(Amount+1)

### Scaling 

In [None]:
from sklearn.preprocessing import StandardScaler

# One scaler per feature: re-fitting a single shared scaler on the second
# column overwrites the statistics learned for the first, leaving no fitted
# scaler that could transform new Amount values later.
# NOTE(review): both scalers are fitted on the full dataset, before the
# train/test split, so test-set statistics leak into the scaled features.
# Consider fitting them on the training rows only.
amount_scaler = StandardScaler()
hour_scaler = StandardScaler()

# on Amount (standardise the log-transformed amount)
data['Amount_scaled'] = amount_scaler.fit_transform(data[['Amount_log']])

# On Hour (replaces the raw hour with its standardised value)
data['Hour'] = hour_scaler.fit_transform(data[['Hour']])

# Backward-compatible alias: this cell used to leave `scaler` fitted on Hour.
scaler = hour_scaler

In [None]:
# Display the DataFrame including the engineered and scaled columns.
data

# Split x and y

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix (x) and target (y).
# Besides the label, drop the columns that the engineered features replace:
#   Amount, Amount_log -> represented by Amount_scaled
#   Time               -> represented by Hour
# Keeping them would feed unscaled and redundant (collinear) inputs to the models.
superseded_cols = ['Time', 'Amount', 'Amount_log']
x = data.drop(columns=['Class'] + superseded_cols)
y = data['Class']

# Train-Test Split

In [None]:
# Hold out 20% of the rows as the test set, with a fixed seed for reproducibility.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42,stratify=y)  # stratify=y keeps the class 0 / class 1 proportions the same in the train and test sets

In [None]:
# Class counts in the training set before resampling.
print("Before SMOTE:", y_train.value_counts())

In [None]:
# Apply SMOTE only on training data; x_test / y_test are not resampled here.

from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
x_train_res, y_train_res = smote.fit_resample(x_train, y_train)

In [None]:
# Class counts in the resampled training set.
print("After SMOTE:", y_train_res.value_counts())

# Model Training and Evaluation

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression fitted on the SMOTE-resampled training data.
model=LogisticRegression(max_iter=1000,random_state=42)
model.fit(x_train_res,y_train_res)

In [None]:
# Prediction on the test data: second column of predict_proba
# (the probability of class 1 when the labels are 0/1).
y_probs = model.predict_proba(x_test)[:, 1]

In [None]:
# Custom decision threshold: probabilities at or above it are labelled 1.
threshold = 0.5
y_pred = (y_probs >= threshold).astype(int)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report,confusion_matrix,roc_auc_score

# Logistic Regression evaluation on the test set.
# Prints the same header and metric set as the Decision Tree and Random Forest
# cells so the three models can be compared line by line.
print("Logistic Regression Metrics")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("F1-score:", f1_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
# ROC-AUC is computed from the probabilities, not the thresholded labels;
# reuse y_probs instead of calling predict_proba a second time.
print("ROC-AUC:", roc_auc_score(y_test, y_probs))

In [None]:
# Decision Tree
# -------------------------------
# Depth-limited tree trained on the SMOTE-resampled training data.
dt_model = DecisionTreeClassifier(
    max_depth=6,
    class_weight='balanced',
    random_state=42,
).fit(x_train_res, y_train_res)

# Class-1 probabilities, then hard labels at the shared threshold
y_probs_dt = dt_model.predict_proba(x_test)[:, 1]
y_pred_dt = (y_probs_dt >= threshold).astype(int)

# Metrics
print("Decision Tree Metrics")
for report in (confusion_matrix, classification_report):
    print(report(y_test, y_pred_dt))
for label, score in (
    ("F1-score:", f1_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
):
    print(label, score(y_test, y_pred_dt))
print("ROC-AUC:", roc_auc_score(y_test, y_probs_dt))

In [None]:
 Random Forest (optimized)
# -------------------------------
rf_model = RandomForestClassifier(
    n_estimators=50,     # fewer trees → faster
    max_depth=12,        # limit depth
    max_samples=200000,  # subset per tree → faster
    random_state=42, 
    class_weight='balanced',
    n_jobs=-1            # parallelize
)
rf_model.fit(x_train_res, y_train_res)

# Probabilities
y_probs_rf = rf_model.predict_proba(x_test)[:, 1]

# Threshold
y_pred_rf = (y_probs_rf >= threshold).astype(int)

# Metrics
print("Random Forest Metrics")
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))
print("F1-score:", f1_score(y_test, y_pred_rf))
print("Precision:", precision_score(y_test, y_pred_rf))
print("Recall:", recall_score(y_test, y_pred_rf))
print("ROC-AUC:", roc_auc_score(y_test, y_probs_rf))



In [None]:
 -------------------------------
# Optional: Precision-Recall Curve
# -------------------------------
precisions, recalls, thresholds = precision_recall_curve(y_test, y_probs_rf)
plt.plot(thresholds, precisions[:-1], label='Precision')
plt.plot(thresholds, recalls[:-1], label='Recall')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.title('Random Forest Precision-Recall vs Threshold')
plt.legend()
plt.show()