In [None]:
# 1. Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score, roc_curve, 
    precision_recall_curve, recall_score, precision_score
)

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


# 2. Load data
ccf = pd.read_csv('credit_card_fraud.csv')

# 3. Data Exploration
# Results are printed explicitly: a bare expression is only echoed when it is
# the last statement of a notebook cell, and never when run as a script.
print(ccf.head())
ccf.info()  # info() prints its own summary, so no print() is needed
print(ccf.describe())
print(ccf['is_fraud'].value_counts(normalize=True))

# Visualize class imbalance
sns.countplot(x='is_fraud', data=ccf)
plt.title('Fraudulent vs Non-Fraudulent Transactions')
plt.show()

# 4. Data Preprocessing


def _completed_years(born, when):
    """Return whole calendar years elapsed from *born* to *when*.

    Both arguments are datetime64 Series aligned on the same index. Rows where
    either value is NaT come out as NaN.
    """
    birthday_passed = (when.dt.month > born.dt.month) | (
        (when.dt.month == born.dt.month) & (when.dt.day >= born.dt.day)
    )
    # Subtract one year when this year's birthday has not happened yet.
    return when.dt.year - born.dt.year - (~birthday_passed).astype(int)


# Parse the date columns so the .dt accessors below work.
# NOTE: no existence check is performed; a missing column raises KeyError.
ccf['trans_date_trans_time'] = pd.to_datetime(ccf['trans_date_trans_time'])
ccf['dob'] = pd.to_datetime(ccf['dob'])

# Feature engineering: Age (completed years at transaction time).
# Dividing the day difference by 365 ignores leap days and over-counts the age
# in the days just before a birthday, so calendar fields are compared instead.
ccf['age'] = _completed_years(ccf['dob'], ccf['trans_date_trans_time'])

# Feature engineering: Transaction hour
ccf['trans_hour'] = ccf['trans_date_trans_time'].dt.hour

# Encode categorical variables as integer codes; columns absent from the
# frame are skipped.
cat_cols = ['merchant', 'category', 'city', 'state', 'job']
for col in cat_cols:
    if col in ccf.columns:
        ccf[col] = LabelEncoder().fit_transform(ccf[col])

# Drop columns that leak information or are not useful for modeling
# (the two date columns have been replaced by 'age' and 'trans_hour').
ccf = ccf.drop(columns=['trans_date_trans_time', 'dob', 'trans_num'])


# --- Missing values are imputed below, after the train/test split ---
from sklearn.impute import SimpleImputer

# Target column on one side, every remaining column as features on the other
y = ccf['is_fraud']
X = ccf.drop(columns=['is_fraud'])

# 5. Prepare data for modeling
# Hold out 20% for testing; stratifying on y preserves the class balance
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Learn the column means from the training split only, then fill both splits
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)
# (Categorical columns could be imputed separately, but here every column is
# numeric once the label encoding above has run.)

# Standardize features using statistics from the training split only
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 6. Model Training

# Baseline: logistic regression, fitted on the standardized features
lr = LogisticRegression(
    max_iter=1000, class_weight='balanced', random_state=42
).fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)
# Second predict_proba column, i.e. the probability of lr.classes_[1]
y_proba_lr = lr.predict_proba(X_test_scaled)[:, 1]

# Main model: random forest, fitted on the imputed but unscaled features
rf = RandomForestClassifier(
    n_estimators=100, class_weight='balanced', random_state=42
).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
# Second predict_proba column, i.e. the probability of rf.classes_[1]
y_proba_rf = rf.predict_proba(X_test)[:, 1]



# Classification reports (logistic regression first, then random forest)
for heading, predicted in (
    ("Logistic Regression Classification Report:", y_pred_lr),
    ("Random Forest Classification Report:", y_pred_rf),
):
    print(heading)
    print(classification_report(y_test, predicted, digits=4))

# Confusion matrices, in the same model order
for heading, predicted in (
    ("Logistic Regression Confusion Matrix:", y_pred_lr),
    ("Random Forest Confusion Matrix:", y_pred_rf),
):
    print(heading)
    print(confusion_matrix(y_test, predicted))

# ROC-AUC, computed from the second predict_proba column of each model
roc_auc_lr, roc_auc_rf = (
    roc_auc_score(y_test, scores) for scores in (y_proba_lr, y_proba_rf)
)

print(f"Logistic Regression ROC-AUC: {roc_auc_lr:.4f}")
print(f"Random Forest ROC-AUC: {roc_auc_rf:.4f}")
