In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read creditcard.csv (path is relative to the notebook's working directory) into a DataFrame
df = pd.read_csv("creditcard.csv")
# Preview the first rows to check that the file parsed as expected
df.head()

# Train test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix: every column except the "Class" target
x = df.drop(columns=["Class"])

# Target vector: the "Class" column
y = df["Class"]

In [None]:
# Split into train and test sets 80:20.
# stratify=y keeps the proportion of each Class value the same in both sets,
# consistent with the stratified split used for the models later in this notebook.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, test_size=0.2, random_state=100, stratify=y
)

# Feature Scaling

In [None]:
# Standardization method
from sklearn.preprocessing import StandardScaler

In [None]:
# Create the StandardScaler; it is fitted on the training 'Amount' column in the next cell
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training 'Amount' column and replace that column with
# its standardized values.
# assign() returns a new DataFrame instead of writing into the frame returned by
# train_test_split, which pandas may flag as a copy of a slice
# (SettingWithCopyWarning) when a column is assigned in place.
x_train = x_train.assign(
    Amount=scaler.fit_transform(x_train[['Amount']]).ravel()  # (n, 1) -> (n,)
)

# Scaling the test set

In [None]:
# Standardize the test 'Amount' column with the scaler already fitted on the
# training data (transform only, no re-fitting on test data).
# assign() returns a new DataFrame instead of writing into the frame returned by
# train_test_split, which pandas may flag as a copy of a slice
# (SettingWithCopyWarning) when a column is assigned in place.
x_test = x_test.assign(
    Amount=scaler.transform(x_test[['Amount']]).ravel()  # (n, 1) -> (n,)
)

In [None]:
# Preview the first rows of the test features after 'Amount' has been scaled
x_test.head()

# Checking the skewness 

In [None]:
# Column labels of the training features; reused below for the skewness plots
# and for assigning the power-transformed values back
cols = x_train.columns
cols

In [None]:
# Only the first 20 features are drawn, to keep the figure readable
cols_to_plot = cols[:20]

# One histogram (with KDE overlay) per feature on a 5 x 4 grid; each panel's
# title reports the sample skewness of that feature in the training data
skew_fig = plt.figure(figsize=(15, 20))
for slot, col in enumerate(cols_to_plot, start=1):
    panel = skew_fig.add_subplot(5, 4, slot)
    sns.histplot(x_train[col], kde=True, ax=panel)
    panel.set_title(f"{col} (Skewness: {x_train[col].skew():.2f})")

skew_fig.tight_layout()
plt.show()

# Mitigating Skewness with Power Transformer

In [None]:
# importing Power Transformer
from sklearn.preprocessing import PowerTransformer

In [None]:
# Yeo-Johnson power transformer; the output is standardized and the transformer
# is allowed to operate in place (copy=False)
pt = PowerTransformer(
    method='yeo-johnson',
    standardize=True,
    copy=False,
)

In [None]:
# Fit the power transformer on the training features and transform them.
# The result is wrapped in a new DataFrame (same column labels and row index)
# rather than assigned into the existing frame, which avoids in-place mutation
# of a frame that pandas may flag as a copy of a slice (SettingWithCopyWarning).
x_train = pd.DataFrame(
    pt.fit_transform(x_train),
    columns=cols,
    index=x_train.index,
)

In [None]:
# Apply the power transformer fitted on the training data to the test features
# (transform only, no re-fitting on test data).
# The result is wrapped in a new DataFrame (same column labels and row index)
# rather than assigned into the existing frame, which avoids in-place mutation
# of a frame that pandas may flag as a copy of a slice (SettingWithCopyWarning).
x_test = pd.DataFrame(
    pt.transform(x_test),
    columns=cols,
    index=x_test.index,
)

In [None]:
# Re-plot the feature distributions to inspect skewness after the power transform.
max_plots = min(30, len(cols))  # the 6 x 5 grid below holds at most 30 panels

fig = plt.figure(figsize=(17, 20))
for i, col in enumerate(cols[:max_plots], 1):
    ax = fig.add_subplot(6, 5, i)
    sns.histplot(x_train[col], kde=True, ax=ax)
    ax.set_title(f"{col} (Skew: {x_train[col].skew():.2f})")

# The layout is computed once, after every panel exists: running tight_layout
# on each loop iteration repeats the same work without changing the final figure.
fig.tight_layout()
plt.show()

# Random Forest Classifier

In [None]:
# Random Forest Model without SMOTE 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Target labels and feature matrix, rebuilt from the raw DataFrame.
# NOTE(review): because x is rebuilt from df, the scaled / power-transformed
# x_train and x_test from the cells above are not what the models below are
# trained on - confirm this is intended.
y = df["Class"]
x = df.drop(columns="Class")

In [None]:
# 80/20 split stratified on the target; this rebinds the x_train / x_test /
# y_train / y_test names produced by the earlier split
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Baseline random forest (to be trained on the un-resampled training split)
rand_forest = RandomForestClassifier(
    n_estimators=100,       # number of trees in the forest
    min_samples_split=10,   # a node needs at least 10 samples to be split
    max_features='sqrt',    # features considered at each split
    n_jobs=-1,              # use all available processors
    random_state=42,        # fixed seed for repeatable results
)

In [None]:
# Fit the forest on the training split (x_train, y_train)
rand_forest.fit(x_train, y_train)

In [None]:
# Predict the class of every row in the test split
y_pred = rand_forest.predict(x_test)

# Confusion matrix: computed once, then reused for the printout and the heatmap
cnf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cnf_matrix)

# Per-class precision / recall / F1
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Same confusion matrix as an annotated heatmap
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g')
plt.title('Confusion Matrix Heatmap')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()

# Handling Imbalanced Data

In [None]:
# using SMOTE to balance the dataset 

from imblearn.over_sampling import SMOTE

# Suppress warnings for the rest of the session.
# NOTE(review): "ignore" with no category or module filter hides every warning
# raised by later cells - consider narrowing the filter to specific categories.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# SMOTE oversampler; random_state is fixed so the resampling is repeatable
sm = SMOTE(random_state = 42)

In [None]:
# Apply SMOTE to generate synthetic samples from the TRAINING split only.
# Resampling the full dataset (x, y) would place the test rows - and synthetic
# points interpolated from them - into the training data, so scores measured on
# x_test would be inflated by data leakage.

x_train_s, y_train_s = sm.fit_resample(x_train, y_train)

In [None]:
# Class counts in the training labels (y_train), for comparison with the resampled labels

y_train.value_counts()

In [None]:
# Class counts in the SMOTE-resampled labels (y_train_s)

y_train_s.value_counts()

# Random Forest Model with SMOTE 

In [None]:
# Fresh random forest with the same hyperparameters as the baseline model,
# to be trained on the SMOTE-resampled data
rand_forest = RandomForestClassifier(
    n_estimators=100,       # number of trees in the forest
    min_samples_split=10,   # a node needs at least 10 samples to be split
    max_features='sqrt',    # features considered at each split
    n_jobs=-1,              # use all available processors
    random_state=42,        # fixed seed for repeatable results
)

In [None]:
# Fit the forest on the SMOTE-resampled data (x_train_s, y_train_s)
rand_forest.fit(x_train_s, y_train_s)

In [None]:
# Predict the class of every row in x_test with the SMOTE-trained forest
y_pred = rand_forest.predict(x_test)

# Confusion matrix rendered as an annotated heatmap; sns.heatmap returns the
# Axes it drew on, which is then used to set the title and axis labels
cnf_matrix = confusion_matrix(y_test, y_pred)
heatmap_ax = sns.heatmap(
    pd.DataFrame(cnf_matrix),
    annot=True,
    cmap="YlGnBu",
    fmt='g',
)
heatmap_ax.set_title('Confusion Matrix Heatmap')
heatmap_ax.set_ylabel('Actual Label')
heatmap_ax.set_xlabel('Predicted Label')
plt.show()

# Per-class precision / recall / F1
print("Classification Report:")
print(classification_report(y_test, y_pred))

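Random Forest with SMOTE gives us a perfect classification report, but the model may not be usable: a result this perfect is suspicious and usually points to overfitting or data leakage (for example, test rows that were also present in the resampled training data) rather than genuinely perfect performance.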