<a href="https://colab.research.google.com/github/AJ-AYUSHMAN/CodSoft-Internship-Project-3/blob/main/CREDIT_CARD_FRAUD_DETECTION_MODEL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, roc_auc_score
from imblearn.over_sampling import SMOTE

# Load the dataset
file_path = '/content/creditcard.csv'
data = pd.read_csv(file_path, on_bad_lines='warn')

# Display the first few rows of the dataset
print(data.head())

# Check for missing values
print(data.isnull().sum())

# Drop rows with missing values in 'Class'
data.dropna(subset=['Class'], inplace=True)

# Display summary statistics
print(data.describe())

# Plot the class distribution
fig = px.histogram(data, x='Class', title='Class Distribution')
fig.show()

# Separate features and target variable
X = data.drop(columns=['Class'])
y = data['Class']

# Handle missing values in features
# Fill numeric columns with mean
X.fillna(X.mean(), inplace=True)

# Normalize the transaction data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Handle class imbalance using SMOTE (Synthetic Minority Over-sampling Technique)
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_scaled, y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Train a Logistic Regression model with increased max_iter
log_reg = LogisticRegression(random_state=42, max_iter=10000)
log_reg.fit(X_train, y_train)

# Train a Random Forest model
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)

# Evaluate the Logistic Regression model
y_pred_log_reg = log_reg.predict(X_test)
y_prob_log_reg = log_reg.predict_proba(X_test)[:, 1]
print("Logistic Regression Performance:")
print(classification_report(y_test, y_pred_log_reg))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log_reg))

# Evaluate the Random Forest model
y_pred_rf = rf_clf.predict(X_test)
y_prob_rf = rf_clf.predict_proba(X_test)[:, 1]
print("Random Forest Performance:")
print(classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

# Compare Logistic Regression and Random Forest using Precision, Recall, and F1-score
precision_log_reg = precision_score(y_test, y_pred_log_reg)
recall_log_reg = recall_score(y_test, y_pred_log_reg)
f1_log_reg = f1_score(y_test, y_pred_log_reg)

precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)

print(f"Logistic Regression - Precision: {precision_log_reg}, Recall: {recall_log_reg}, F1-score: {f1_log_reg}")
print(f"Random Forest - Precision: {precision_rf}, Recall: {recall_rf}, F1-score: {f1_rf}")

# Plotting the comparison using Plotly
metrics = ['Precision', 'Recall', 'F1-score']
log_reg_scores = [precision_log_reg, recall_log_reg, f1_log_reg]
rf_scores = [precision_rf, recall_rf, f1_rf]

fig = make_subplots(rows=1, cols=2, subplot_titles=('Logistic Regression', 'Random Forest'))

fig.add_trace(go.Bar(x=metrics, y=log_reg_scores, name='Logistic Regression'), row=1, col=1)
fig.add_trace(go.Bar(x=metrics, y=rf_scores, name='Random Forest'), row=1, col=2)

fig.update_layout(title_text='Performance Comparison', barmode='group')
fig.show()

# Confusion Matrix Visualization using Plotly
conf_matrix_log_reg = confusion_matrix(y_test, y_pred_log_reg)
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)

fig = make_subplots(rows=1, cols=2, subplot_titles=('Logistic Regression Confusion Matrix', 'Random Forest Confusion Matrix'))

trace1 = go.Heatmap(z=conf_matrix_log_reg, x=['Predicted 0', 'Predicted 1'], y=['Actual 0', 'Actual 1'], colorscale='Blues')
trace2 = go.Heatmap(z=conf_matrix_rf, x=['Predicted 0', 'Predicted 1'], y=['Actual 0', 'Actual 1'], colorscale='Blues')

fig.add_trace(trace1, row=1, col=1)
fig.add_trace(trace2, row=1, col=2)

fig.update_layout(title_text='Confusion Matrices')
fig.show()

# ROC Curve and AUC Score
fpr_log_reg, tpr_log_reg, _ = roc_curve(y_test, y_prob_log_reg)
roc_auc_log_reg = roc_auc_score(y_test, y_prob_log_reg)

fpr_rf, tpr_rf, _ = roc_curve(y_test, y_prob_rf)
roc_auc_rf = roc_auc_score(y_test, y_prob_rf)

fig = go.Figure()

fig.add_trace(go.Scatter(x=fpr_log_reg, y=tpr_log_reg, mode='lines', name=f'Logistic Regression (AUC = {roc_auc_log_reg:.2f})'))
fig.add_trace(go.Scatter(x=fpr_rf, y=tpr_rf, mode='lines', name=f'Random Forest (AUC = {roc_auc_rf:.2f})'))

fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Random', line=dict(dash='dash')))

fig.update_layout(title='ROC Curve', xaxis_title='False Positive Rate', yaxis_title='True Positive Rate', showlegend=True)
fig.show()

# Probability Prediction Histogram
fig = make_subplots(rows=1, cols=2, subplot_titles=('Logistic Regression Probability', 'Random Forest Probability'))

trace1 = go.Histogram(x=y_prob_log_reg, nbinsx=50, name='Logistic Regression')
trace2 = go.Histogram(x=y_prob_rf, nbinsx=50, name='Random Forest')

fig.add_trace(trace1, row=1, col=1)
fig.add_trace(trace2, row=1, col=2)

fig.update_layout(title_text='Predicted Probabilities', barmode='overlay')
fig.show()

# Display the classification report as a table
def classification_report_to_dataframe(report):
    report_data = []
    lines = report.split('\n')
    for line in lines[2:-3]:
        row = {}
        row_data = line.split()
        if len(row_data) >= 4:  # Check if there are enough elements
            row['class'] = row_data[0]
            row['precision'] = float(row_data[1])
            row['recall'] = float(row_data[2])
            row['f1_score'] = float(row_data[3])
            if len(row_data) > 4: # Check if support is present
                row['support'] = int(row_data[4])
        report_data.append(row)
    dataframe = pd.DataFrame.from_dict(report_data)
    return dataframe

log_reg_report_df = classification_report_to_dataframe(classification_report(y_test, y_pred_log_reg))
rf_report_df = classification_report_to_dataframe(classification_report(y_test, y_pred_rf))

fig = make_subplots(rows=1, cols=2, subplot_titles=('Logistic Regression Report', 'Random Forest Report'),
                    specs=[[{"type": "table"}, {"type": "table"}]])  # Specify subplot types as 'table'

trace1 = go.Table(header=dict(values=list(log_reg_report_df.columns)),
                  cells=dict(values=[log_reg_report_df[col].tolist() for col in log_reg_report_df.columns]))
trace2 = go.Table(header=dict(values=list(rf_report_df.columns)),
                  cells=dict(values=[rf_report_df[col].tolist() for col in rf_report_df.columns]))

fig.add_trace(trace1, row=1, col=1)
fig.add_trace(trace2, row=1, col=2)

fig.update_layout(title_text='Classification Reports')
fig.show()


Skipping line 1987: expected 31 fields, saw 42



   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

Logistic Regression Performance:
              precision    recall  f1-score   support

         0.0       0.93      0.98      0.95     53408
         1.0       0.98      0.93      0.95     53334

    accuracy                           0.95    106742
   macro avg       0.95      0.95      0.95    106742
weighted avg       0.95      0.95      0.95    106742

Confusion Matrix:
 [[52249  1159]
 [ 3968 49366]]
Random Forest Performance:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     53408
         1.0       1.00      1.00      1.00     53334

    accuracy                           1.00    106742
   macro avg       1.00      1.00      1.00    106742
weighted avg       1.00      1.00      1.00    106742

Confusion Matrix:
 [[53398    10]
 [    0 53334]]
Logistic Regression - Precision: 0.9770608609599208, Recall: 0.9256009299883752, F1-score: 0.9506349955227762
Random Forest - Precision: 0.9998125374925015, Recall: 1.0, F1-score: 0.9999