In [10]:
import pandas as pd
import numpy as np
import os
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [19]:
# Set LOKY_MAX_CPU_COUNT environment variable
# os.environ['LOKY_MAX_CPU_COUNT'] = '4'  # Replace '4' with the desired number of CPUs
os.environ.pop('LOKY_MAX_CPU_COUNT', None)

In [11]:
# Step 1: Load the dataset
data = pd.read_csv('creditcard.csv')

In [12]:
# Step 2: Data preprocessing
X = data.drop('Class', axis=1)
y = data['Class']

In [13]:
# Step 3: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Step 4: Normalize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [20]:
# Step 5: Handle class imbalance using oversampling (SMOTE)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

In [28]:
# Step 6: Train the model (Logistic Regression)
# model = LogisticRegression()
start_time = time.time()

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_resampled, y_train_resampled)

end_time = time.time()
training_time = end_time - start_time

# Print training time
print("Training completed in {:.2f} seconds.".format(training_time))

Training completed in 609.93 seconds.


In [29]:
# Step 7: Evaluate the model
y_pred = model.predict(X_test_scaled)
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[56856     8]
 [   15    83]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.91      0.85      0.88        98

    accuracy                           1.00     56962
   macro avg       0.96      0.92      0.94     56962
weighted avg       1.00      1.00      1.00     56962



In [23]:
# Step 6: Model Training (with verbose and time tracking)
start_time = time.time()

model = LogisticRegression()
model.fit(X_train_resampled, y_train_resampled)

end_time = time.time()
training_time = end_time - start_time

# Print training time
print("Training completed in {:.2f} seconds.".format(training_time))

Training completed in 7.41 seconds.


In [24]:
# Step 7: Evaluate the model
y_pred = model.predict(X_test_scaled)
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[55425  1439]
 [    8    90]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.99     56864
           1       0.06      0.92      0.11        98

    accuracy                           0.97     56962
   macro avg       0.53      0.95      0.55     56962
weighted avg       1.00      0.97      0.99     56962

