# We will build our AI model in this notebook

In [9]:
# Import the important libraries
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

## Steps we will do in this notebook
- Load Cleaned Dataset
- Train Test Split
- Define and train models

### Load the data

In [3]:
df = pd.read_csv("creditcard_cleaned.csv")

### Split features and target

In [6]:
x = df.drop("Class", axis=1)
y = df["Class"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42, stratify=y)

### Feature Scaling

In [7]:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

### Model Training
- Linear Regression(as classifier)
- Logistic Regression
- Decision Tree
- Random Forest
- Then we will create a summary dictionary of all models and their accuracy

### Linear Regression

In [10]:
linear_model = LinearRegression()
linear_model.fit(x_train_scaled, y_train)

# Predict & convert to binary
linear_pred = np.where(linear_model.predict(x_test_scaled) > 0.5, 1, 0)
linear_accuracy = accuracy_score(y_test, linear_pred)

print("🔹 Linear Regression as Classifier")
print("Accuracy:", linear_accuracy)
print("AUC Score:", roc_auc_score(y_test, linear_pred))

🔹 Linear Regression as Classifier
Accuracy: 0.9990510270654989
AUC Score: 0.7499207936476504


### Logistic Regression

In [11]:
logistic_model = LogisticRegression(max_iter=1000, random_state=42)
logistic_model.fit(x_train_scaled, y_train)

logistic_pred = logistic_model.predict(x_test_scaled)
logistic_accuracy = accuracy_score(y_test, logistic_pred)

print("🔹 Logistic Regression")
print("Accuracy:", logistic_accuracy)
print("AUC Score:", roc_auc_score(y_test, logistic_pred))

🔹 Logistic Regression
Accuracy: 0.9993080405685929
AUC Score: 0.8311707936476505


### Decision Tree

In [12]:
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(x_train, y_train)

tree_pred = tree_model.predict(x_test)
tree_accuracy = accuracy_score(y_test, tree_pred)

print("🔹 Decision Tree")
print("Accuracy:", tree_accuracy)
print("AUC Score:", roc_auc_score(y_test, tree_pred))

🔹 Decision Tree
Accuracy: 0.9992091892212491
AUC Score: 0.9060024801489079


### Random Forest

In [16]:
forest_model = RandomForestClassifier(random_state=42)
forest_model.fit(x_train, y_train)

forest_pred = forest_model.predict(x_test)
forest_accuracy = accuracy_score(y_test, forest_pred)

print("🔹 Random Forest")
print("Accuracy:", forest_accuracy)
print("AUC Score:", roc_auc_score(y_test, forest_pred))

🔹 Random Forest
Accuracy: 0.9996243648800933
AUC Score: 0.9062103968238253


## Cross Validation and Hyperparameter Tuning

#### Overall Random Forest is better model for this 

In [13]:
# Define models
model_dict = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42)
}

# Hyperparameter space
search_space = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs'],
    'penalty': ['l1', 'l2'],
    'n_estimators': [100, 200],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'criterion': ['gini', 'entropy']
}

# Models needing scaling
scale_models = ['Logistic Regression']

# Storage
results = []
model_store = {}

# Filter hyperparams per model
def filter_hyperparameter(model, space):
    valid_keys = model.get_params().keys()
    param_grid = {k: v for k, v in space.items() if k in valid_keys}
    
    model_name = type(model).__name__
    if model_name == 'LogisticRegression' and 'penalty' in param_grid and 'solver' in param_grid:
        param_combinations = []
        for penalty in param_grid['penalty']:
            for solver in param_grid['solver']:
                if penalty == 'l1' and solver == 'lbfgs':
                    continue
                combo = {'penalty': [penalty], 'solver': [solver]}
                for key in param_grid:
                    if key not in combo:
                        combo[key] = param_grid[key]
                param_combinations.append(combo)
        return param_combinations[0] if param_combinations else param_grid
    return param_grid

# Training loop
for name, model in model_dict.items():
    print(f"\n🔍 Tuning {name}...")

    if name in scale_models:
        pipe = Pipeline([
            ('scaler', StandardScaler()),
            ('clf', model)
        ])
        model_for_grid = pipe
        param_grid = {f'clf__{k}': v for k, v in filter_hyperparameter(model, search_space).items()}
    else:
        model_for_grid = model
        param_grid = filter_hyperparameter(model, search_space)

    cv_folds = 5 if name != 'Random Forest' else 3
    n_jobs_param = -1 if name != 'Random Forest' else 8

    grid = GridSearchCV(
        estimator=model_for_grid,
        param_grid=param_grid,
        cv=cv_folds,
        scoring='accuracy',
        n_jobs=n_jobs_param,
        verbose=1
    )

    grid.fit(x_train, y_train)
    best_model = grid.best_estimator_
    model_store[name] = best_model

    y_pred = best_model.predict(x_test)

    if hasattr(best_model, "predict_proba"):
        y_proba = best_model.predict_proba(x_test)[:, 1]
    elif hasattr(best_model, "decision_function"):
        y_proba = best_model.decision_function(x_test)
    else:
        y_proba = None

    acc = accuracy_score(y_test, y_pred)
    f1 = classification_report(y_test, y_pred, output_dict=True)['weighted avg']['f1-score']
    auc = roc_auc_score(y_test, y_proba) if y_proba is not None else "N/A"

    print(f"\n📊 Evaluation for {name}")
    print("Best Params:", grid.best_params_)
    print("Accuracy:", f"{acc*100:.4f}%")
    print("AUC Score:", auc)
    print("Classification Report:\n", classification_report(y_test, y_pred))

    results.append({
        'Model': name,
        'Accuracy': round(acc, 4),
        'F1-Score': round(f1, 4),
        'AUC': round(auc, 4) if y_proba is not None else "N/A",
        'Best Params': grid.best_params_
    })

# Summary
print("\n" + "="*60)
print("FINAL RESULTS SUMMARY")
print("="*60)
for r in results:
    print(f"{r['Model']}: Accuracy={r['Accuracy']}, F1={r['F1-Score']}, AUC={r['AUC']}")
    print(f"Best Params: {r['Best Params']}")
    print("-" * 50)

# Select and save best model (by F1-score)
best_result = max(results, key=lambda x: x['F1-Score'])
best_model_name = best_result['Model']
print(f"\n🏆 Best Model Selected: {best_model_name}")
best_model = model_store[best_model_name]

# Save with pickle
with open("best_model.pkl", "wb") as f:
    pickle.dump(best_model, f)
print("✅ Best model saved to 'best_model.pkl'")


🔍 Tuning Logistic Regression...
Fitting 5 folds for each of 4 candidates, totalling 20 fits

📊 Evaluation for Logistic Regression
Best Params: {'clf__C': 10, 'clf__penalty': 'l1', 'clf__solver': 'liblinear'}
Accuracy: 99.9308%
AUC Score: 0.9904365755133562
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     50501
           1       0.87      0.66      0.75        80

    accuracy                           1.00     50581
   macro avg       0.93      0.83      0.88     50581
weighted avg       1.00      1.00      1.00     50581


🔍 Tuning Random Forest...
Fitting 3 folds for each of 144 candidates, totalling 432 fits

📊 Evaluation for Random Forest
Best Params: {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Accuracy: 99.9644%
AUC Score: 0.9680446921843131
Classification Report:
               precision    recall  f1-score   support

           0    