In [None]:
import pandas as pd  

In [None]:
# Load the dataset: read "creditcard.csv" (path relative to the notebook's
# working directory) into the DataFrame `df` that the later cells read from.
df = pd.read_csv("creditcard.csv")

In [None]:
# Exploratory Data Analysis (EDA)
# Preview the first rows of the DataFrame
df.head()

In [None]:
# Display the shape of the dataframe as a (n_rows, n_columns) tuple
df.shape

In [None]:
# Display dataframe info: column names, dtypes, non-null counts and memory usage
df.info()

In [None]:
# Statistical summary (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

In [None]:
# Count the rows for each 'Class' value to quantify the class imbalance
# (the plot of this distribution is drawn in the next cell)
df['Class'].value_counts()

In [None]:
# Bar chart of how many rows fall into each 'Class' value
import matplotlib.pyplot as plt
import seaborn as sns

# countplot returns the Axes it drew on, so the title can be set on it directly
ax = sns.countplot(data=df, x='Class')
ax.set_title("Fraud vs Non-Fraud Count")
plt.show()

In [None]:
# Separate the target column ('Class') from the feature columns
y = df['Class']
X = df.drop(columns='Class')

In [None]:
# Train-Test Split
# Hold out 20% of the rows for testing. `stratify=y` keeps the proportion of each
# 'Class' value the same in the train and test partitions, which matters for an
# imbalanced target and matches the stratified split used for the Random Forest
# section later in this notebook.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Feature Scaling
# Learn the per-feature mean/std from the training rows only, then apply that
# same transformation to both partitions.
from sklearn.preprocessing import StandardScaler
scalar = StandardScaler().fit(X_train)
X_train_scaled = scalar.transform(X_train)
X_test_scaled = scalar.transform(X_test)

In [None]:
# Handling Class Imbalance with SMOTE
# Oversample the scaled training data only; the test set is not resampled.
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X=X_train_scaled, y=y_train)

In [None]:
# Train Logistic Regression Model
from sklearn.linear_model import LogisticRegression

# Build the classifier and fit it on the SMOTE-balanced training data
# (fit() returns the fitted estimator, so construction and fitting are chained)
model = LogisticRegression(max_iter=1000).fit(X_train_balanced, y_train_balanced)

# Hard class labels for the scaled test set
y_pred = model.predict(X_test_scaled)

# Predicted probability for the second class column of predict_proba
y_prob = model.predict_proba(X_test_scaled)[:, 1]

print("Model Training completed!")

In [None]:
# Evaluate the Model
# Reports threshold-based metrics from the hard predictions (y_pred) and
# threshold-independent metrics from the predicted probabilities (y_prob).
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report,
    roc_auc_score, average_precision_score,
)
import seaborn as sns
import matplotlib.pyplot as plt

# 1. Confusion Matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n",cm)

# 2. Visualizing Confusion Matrix
plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# 3. Classification Metrics at the default decision threshold
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

# 4. Ranking metrics computed from the positive-class probabilities. Accuracy is
#    dominated by the majority class on an imbalanced target, so these give a
#    threshold-independent view of how well the model separates the classes.
print("ROC AUC:", roc_auc_score(y_test, y_prob))
print("Average Precision (PR AUC):", average_precision_score(y_test, y_prob))

# 5. Classification Report
print("Classification Report:\n",classification_report(y_test, y_pred))

In [None]:
# # Model 2: Random Forest Classifier + SMOTE
# Import Required Libraries
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

In [None]:
# Split into training and testing sets (80/20), stratified on the target.
# NOTE: this rebinds X_train / X_test / y_train / y_test from the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Apply SMOTE to the training partition only (the test set stays untouched).
# sampling_strategy=0.1 requests a minority-to-majority ratio of 0.1 after resampling.
smote = SMOTE(random_state=42, sampling_strategy=0.1)
X_train_smote, y_train_smote = smote.fit_resample(X=X_train, y=y_train)

In [None]:
# Train Random Forest Model (with class weights) on the SMOTE-resampled data
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)
# Fit on the oversampled training set produced by the SMOTE cell so that this
# model matches its "Random Forest + SMOTE" label; fitting on X_train / y_train
# would leave the resampled data unused by this model.
rf.fit(X_train_smote, y_train_smote)

In [None]:
# Make predictions on test data
# (rebinds y_pred, which previously held the logistic-regression predictions)
y_pred = rf.predict(X_test)


In [None]:
# Evaluate the Model
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n")
print(cm)

# Scalar metrics, printed in a fixed order as "<label> <value>"
rf_metrics = (
    ("\nAccuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
)
for label, metric_fn in rf_metrics:
    print(label, metric_fn(y_test, y_pred))

print("Classification Report:\n")
print(classification_report(y_test, y_pred))

In [None]:
# Plotting the Confusion Matrix as an annotated heatmap
import matplotlib.pyplot as plt
import seaborn as sns

# Explicit figure/axes so every label is set on the axes that holds the heatmap
fig, ax = plt.subplots(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title("Confusion Matrix - Random Forest + SMOTE")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
# Building a Final Optimized Model
# 1. Importing Libraries for HyperParameter Tuning
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier  

In [None]:
# 2. Defining the HyperParameter Grid
# Candidate values for each RandomForestClassifier argument explored by the search.
param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[10, 15, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    bootstrap=[True],
    class_weight=['balanced'],
)

In [None]:
# 3. Creating the RandomizedSearchCV Model
# Samples n_iter=5 parameter combinations from param_grid and scores each one
# with 2-fold cross-validation using the F1 score.
rf_random = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_grid,
    n_iter=5,
    cv=2,
    scoring='f1',
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# 4. Fitting the RandomizedSearchCV Model (Training)
# NOTE(review): the search is fitted on data that was already SMOTE-resampled, so
# the validation part of each cross-validation fold also contains synthetic
# samples; the CV F1 may therefore be optimistic compared with the untouched
# test set. Consider resampling inside an imblearn Pipeline instead — TODO confirm.
rf_random.fit(X_train_smote, y_train_smote)

In [None]:
# See best parameters: the sampled combination that the search selected
best_params = rf_random.best_params_
print("Best Parameters:", best_params)

In [None]:
# Building Final Model with Best Parameters
# best_estimator_ is the estimator the search refit with the selected parameters
# (available because RandomizedSearchCV's `refit` option is left at its default).
best_rf = rf_random.best_estimator_

In [None]:
# Predicting on test data with the tuned model (X_test was not resampled)
y_pred_best = best_rf.predict(X_test)

In [None]:
# Evaluation of the tuned model on the test set.
# Each entry pairs the printed label with the metric function applied to
# (y_test, y_pred_best); the order matches the printed report.
for label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
    ("confusion_matrix:\n", confusion_matrix),
):
    print(label, metric_fn(y_test, y_pred_best))

In [None]:
import joblib   # library to save the model

In [None]:
# Save the best model to "fraud_model.pkl" (path relative to the working directory)
joblib.dump(best_rf, "fraud_model.pkl")

In [None]:
# Load the model back from disk to check that the saved artifact works.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code — only
# load model files from a trusted source (here, the file saved by this notebook).
import joblib
loaded_model = joblib.load("fraud_model.pkl")
# Make a prediction using the loaded model.
# Selecting the first test row with a list indexer keeps it a one-row DataFrame,
# so the column names travel with the sample. Converting the row to a bare NumPy
# array (.values.reshape(1, -1)) drops them, and scikit-learn then warns that X
# has no valid feature names when the model was fitted on a DataFrame.
sample = X_test.iloc[[0]]
loaded_model.predict(sample)