# In [None]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
import matplotlib.pyplot as plt

# Load the card-transaction dataset from the public CSV hosted on GitHub
fraud = pd.read_csv("https://raw.githubusercontent.com/data-bootcamp-v4/data/main/card_transdata.csv")

# Preview the first rows. A bare `fraud.head()` expression is only rendered
# when it is the last statement of a notebook cell; more code follows here,
# so the result was silently discarded. Printing makes the preview visible.
print(fraud.head())

# Step 1: Count how many rows fall into each value of the 'fraud' column
target = fraud['fraud'].value_counts()
print(target)

# Bar chart of the class counts; bars are positioned at the class values,
# and ticks 0 and 1 are relabelled as 'Legit' and 'Fraud'
plt.bar(x=target.index, height=target.values)
plt.xticks(ticks=[0, 1], labels=['Legit', 'Fraud'])
plt.xlabel('Fraud')
plt.ylabel('Count')
plt.title('Distribution of Target Variable (Fraud)')
plt.show()

# Reading the chart: if one bar (typically 0 = legit) dwarfs the other,
# the dataset is imbalanced and the resampling steps below are relevant.

# Step 2: Separate the predictors from the label
y = fraud['fraud']               # Target variable
X = fraud.drop(columns='fraud')  # Features: every column except the target

# Hold out 20% of the rows for testing; stratify on y and fix the seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Step 3: Fit a class-weighted logistic regression on the training split
model = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    random_state=42,
).fit(X_train, y_train)

# Step 4: Predict labels for the held-out test split
y_pred = model.predict(X_test)

# Classification report followed by the confusion matrix
print("Logistic Regression Model Evaluation (Class Balanced):")
print(classification_report(y_true=y_test, y_pred=y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

# Overall accuracy, shown to four decimal places
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy Score: {accuracy:.4f}")

# Step 5: Balance the training split by oversampling the minority class.
# Only the training data is resampled; the test split is left untouched.
oversampler = RandomOverSampler(
    sampling_strategy='minority',
    random_state=42,
)
X_train_over, y_train_over = oversampler.fit_resample(X_train, y_train)

# Fit the same logistic-regression configuration on the oversampled data.
# NOTE(review): after resampling the classes should already be even, so
# class_weight='balanced' likely has no effect here - confirm.
model_oversample = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    random_state=42,
).fit(X_train_over, y_train_over)

# Evaluate on the original (non-resampled) test split
y_pred_over = model_oversample.predict(X_test)
print("\nLogistic Regression Model Evaluation (Oversampled):")
print(classification_report(y_true=y_test, y_pred=y_pred_over))
print("Confusion Matrix:")
print(confusion_matrix(y_true=y_test, y_pred=y_pred_over))
accuracy_over = accuracy_score(y_true=y_test, y_pred=y_pred_over)
print(f"Accuracy Score (Oversample): {accuracy_over:.4f}")

# Step 6: Balance the training split by undersampling the majority class.
# Only the training data is resampled; the test split is left untouched.
undersampler = RandomUnderSampler(
    sampling_strategy='majority',
    random_state=42,
)
X_train_under, y_train_under = undersampler.fit_resample(X_train, y_train)

# Fit the same logistic-regression configuration on the undersampled data.
# NOTE(review): after resampling the classes should already be even, so
# class_weight='balanced' likely has no effect here - confirm.
model_undersample = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    random_state=42,
).fit(X_train_under, y_train_under)

# Evaluate on the original (non-resampled) test split
y_pred_under = model_undersample.predict(X_test)
print("\nLogistic Regression Model Evaluation (Undersampled):")
print(classification_report(y_true=y_test, y_pred=y_pred_under))
print("Confusion Matrix:")
print(confusion_matrix(y_true=y_test, y_pred=y_pred_under))
accuracy_under = accuracy_score(y_true=y_test, y_pred=y_pred_under)
print(f"Accuracy Score (Undersample): {accuracy_under:.4f}")

# Step 7: Balance the training split with SMOTE, which creates synthetic
# examples for the minority class. The test split is left untouched.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Fit the same logistic-regression configuration on the SMOTE-resampled data.
# NOTE(review): after resampling the classes should already be even, so
# class_weight='balanced' likely has no effect here - confirm.
model_smote = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    random_state=42,
).fit(X_train_smote, y_train_smote)

# Evaluate on the original (non-resampled) test split
y_pred_smote = model_smote.predict(X_test)
print("\nLogistic Regression Model Evaluation (SMOTE):")
print(classification_report(y_true=y_test, y_pred=y_pred_smote))
print("Confusion Matrix:")
print(confusion_matrix(y_true=y_test, y_pred=y_pred_smote))
accuracy_smote = accuracy_score(y_true=y_test, y_pred=y_pred_smote)
print(f"Accuracy Score (SMOTE): {accuracy_smote:.4f}")
