# 1. Data Load and Selection

In [16]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, roc_curve
import plotly.express as px
import plotly.graph_objects as go

# Load the dataset
file_path = 'fraudTest.csv'  # Replace with your dataset path
dataset = pd.read_csv(file_path)

dataset = dataset.sample(n=4000, random_state=42)  # Randomly sample 4,000 rows

# 2. Descriptive Statistics

In [17]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Round descriptive statistics to two decimal points
descriptive_stats = dataset.describe().T.round(2)
descriptive_stats['missing_values'] = dataset.isnull().sum()
descriptive_stats['unique_values'] = dataset.nunique()

# Display descriptive statistics as a table
fig_table = go.Figure(
    data=[
        go.Table(
            header=dict(values=["Metric", "Count", "Mean", "Std", "Min", "25%", "50%", "75%", "Max", "Missing", "Unique"],
                        fill_color='paleturquoise', align='left'),
            cells=dict(values=[descriptive_stats.index,
                               descriptive_stats['count'],
                               descriptive_stats['mean'],
                               descriptive_stats['std'],
                               descriptive_stats['min'],
                               descriptive_stats['25%'],
                               descriptive_stats['50%'],
                               descriptive_stats['75%'],
                               descriptive_stats['max'],
                               descriptive_stats['missing_values'],
                               descriptive_stats['unique_values']],
                       fill_color='lavender', align='left')
        )
    ]
)
fig_table.update_layout(title_text="Descriptive Statistics", title_x=0.5)
fig_table.show()

# Numerical columns for visualization
numerical_columns = dataset.select_dtypes(include=['float64', 'int64']).columns

# Create subplots for boxplots and histograms
rows = len(numerical_columns)
fig = make_subplots(
    rows=rows, cols=2,
    subplot_titles=[f"Boxplot - {col}" for col in numerical_columns] + [f"Histogram - {col}" for col in numerical_columns],
    vertical_spacing=0.05
)

for i, column in enumerate(numerical_columns, start=1):
    # Add boxplot
    fig.add_trace(
        go.Box(y=dataset[column], name=f"Boxplot - {column}", boxmean=True),
        row=i, col=1
    )
    # Add histogram
    fig.add_trace(
        go.Histogram(x=dataset[column], name=f"Histogram - {column}", nbinsx=30),
        row=i, col=2
    )

# Update layout
fig.update_layout(
    title_text="Boxplots and Histograms for Numerical Features",
    title_x=0.5,
    height=300 * rows,  # Adjust height based on the number of rows
    showlegend=False
)
fig.show()


# 3. Data Preprocessing



1.   Dropping Unnecessary colunns
2.   One Hot Coding
3.   Scaling for standardization
4.   SMOTE for data imbalance
5.   Stratified K-cross validation

Model Application

*   Logistic Regression
*   Random Forrest
*   Gradient Boosting






In [18]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import plotly.figure_factory as ff

# Drop unnecessary columns
columns_to_drop = ['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'first', 'last', 'street',
                   'city', 'state', 'zip', 'dob', 'trans_num']
dataset = dataset.drop(columns=columns_to_drop)

# Convert categorical columns to numeric using one-hot encoding
categorical_columns = ['merchant', 'category', 'gender', 'job']
dataset = pd.get_dummies(dataset, columns=categorical_columns, drop_first=True)

# Plot initial target distribution with Plotly
target_distribution = dataset['is_fraud'].value_counts().reset_index()
target_distribution.columns = ['is_fraud', 'count']

fig = px.bar(
    target_distribution,
    x='is_fraud',
    y='count',
    labels={'is_fraud': 'Class', 'count': 'Count'},
    title='Initial Target Distribution',
    color='is_fraud',
    color_discrete_map={0: 'blue', 1: 'red'}
)
fig.show()

# Separate features and target
X = dataset.drop(columns=['is_fraud'])
y = dataset['is_fraud']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE to handle imbalance
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Plot class distribution after SMOTE
class_distribution = pd.Series(y_train_resampled).value_counts().reset_index()
class_distribution.columns = ['is_fraud', 'count']

fig = px.bar(
    class_distribution,
    x='is_fraud',
    y='count',
    labels={'is_fraud': 'Class', 'count': 'Count'},
    title='Class Distribution After SMOTE',
    color='is_fraud',
    color_discrete_map={0: 'blue', 1: 'red'}
)
fig.show()

# Define models
models = {
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# Cross-validation setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Train and evaluate models
roc_curves = []
evaluation_metrics = []
confusion_matrices = {}

for model_name, model in models.items():
    print(f"Training {model_name}...")

    # Cross-validation scores
    cv_scores = cross_val_score(model, X_train_resampled, y_train_resampled, cv=cv, scoring='roc_auc')
    print(f"{model_name} - Cross-Validated AUC: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

    # Fit the model and evaluate on the test set
    model.fit(X_train_resampled, y_train_resampled)
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1] if hasattr(model, "predict_proba") else model.decision_function(X_test_scaled)

    # Store metrics
    report = classification_report(y_test, y_pred, output_dict=True)
    precision = report['1']['precision']
    recall = report['1']['recall']
    f1_score = report['1']['f1-score']
    accuracy = accuracy_score(y_test, y_pred)
    evaluation_metrics.append({'model': model_name, 'precision': precision, 'recall': recall, 'accuracy': accuracy, 'f1_score': f1_score})

    print(f"\n{model_name} - Classification Report:\n", classification_report(y_test, y_pred))
    print(f"{model_name} - Accuracy Score: {accuracy:.4f}")
    print(f"{model_name} - Test AUC: {roc_auc_score(y_test, y_proba):.4f}")

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    confusion_matrices[model_name] = cm

    # Generate ROC Curve
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_curves.append({'model': model_name, 'fpr': fpr, 'tpr': tpr})

# Plot ROC curves for all models
fig = go.Figure()
for roc in roc_curves:
    fig.add_trace(go.Scatter(x=roc['fpr'], y=roc['tpr'], mode='lines', name=roc['model']))
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Random (Baseline)', line=dict(dash='dash')))
fig.update_layout(
    title='ROC Curves for All Models',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    template='plotly_white'
)
fig.show()

# Plot Confusion Matrices for all models
for model_name, cm in confusion_matrices.items():
    fig_cm = ff.create_annotated_heatmap(
        z=cm,
        x=['Non-Fraud (0)', 'Fraud (1)'],
        y=['Non-Fraud (0)', 'Fraud (1)'],
        annotation_text=cm,
        colorscale='Blues',
        showscale=True
    )
    fig_cm.update_layout(
        title=f'Confusion Matrix - {model_name}',
        xaxis_title='Predicted Label',
        yaxis_title='True Label',
        template='plotly_white'
    )
    fig_cm.show()

# Plot Precision, Recall, and Accuracy
metrics_df = pd.DataFrame(evaluation_metrics)

fig_metrics = go.Figure()
fig_metrics.add_trace(go.Bar(x=metrics_df['model'], y=metrics_df['precision'], name='Precision', marker_color='blue'))
fig_metrics.add_trace(go.Bar(x=metrics_df['model'], y=metrics_df['recall'], name='Recall', marker_color='red'))
fig_metrics.add_trace(go.Bar(x=metrics_df['model'], y=metrics_df['accuracy'], name='Accuracy', marker_color='green'))

fig_metrics.update_layout(
    title='Precision, Recall, and Accuracy for All Models',
    xaxis_title='Models',
    yaxis_title='Score',
    barmode='group',
    template='plotly_white'
)
fig_metrics.show()


Training Logistic Regression...
Logistic Regression - Cross-Validated AUC: 0.9985 ± 0.0006

Logistic Regression - Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       798
           1       0.00      0.00      0.00         2

    accuracy                           0.99       800
   macro avg       0.50      0.50      0.50       800
weighted avg       0.99      0.99      0.99       800

Logistic Regression - Accuracy Score: 0.9938
Logistic Regression - Test AUC: 0.3484
Training Random Forest...
Random Forest - Cross-Validated AUC: 1.0000 ± 0.0000



Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.




Random Forest - Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       798
           1       0.00      0.00      0.00         2

    accuracy                           1.00       800
   macro avg       0.50      0.50      0.50       800
weighted avg       1.00      1.00      1.00       800

Random Forest - Accuracy Score: 0.9975
Random Forest - Test AUC: 0.7231
Training Gradient Boosting...
Gradient Boosting - Cross-Validated AUC: 1.0000 ± 0.0000

Gradient Boosting - Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       798
           1       1.00      0.50      0.67         2

    accuracy                           1.00       800
   macro avg       1.00      0.75      0.83       800
weighted avg       1.00      1.00      1.00       800

Gradient Boosting - Accuracy Score: 0.9988
Gradient Boosting - Test AUC: 0.6169


In [19]:
!pip install dash --quiet
!pip install dash-bootstrap-components --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m52.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m228.0/228.0 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m222.5/222.5 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [20]:
from dash import Dash, dcc, html, Input, Output
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff

# Initialize Dash app
app = Dash(__name__)
app.title = "Fraud Detection Dashboard"

# Use values from the evaluation
metrics_df = pd.DataFrame(evaluation_metrics)
confusion_matrices = confusion_matrices  # Confusion matrices from your code
roc_curves = roc_curves  # ROC curves from your code

# Define unified font style
FONT_STYLE = {
    'font-family': 'Arial, sans-serif',
    'color': '#333',
    'font-size': '18px'
}

# Dashboard Layout
app.layout = html.Div(style={'font-family': 'Arial, sans-serif'}, children=[
    # Title with backdrop
    html.Div(
        style={'background-color': '#003f5c', 'padding': '20px', 'border-radius': '10px'},
        children=[
            html.H1("Fraud Detection Dashboard",
                    style={**FONT_STYLE, 'color': 'white', 'text-align': 'center', 'margin-bottom': '5px'}),
            html.H4("Analyze and Compare Model Performance",
                    style={**FONT_STYLE, 'color': 'white', 'text-align': 'center', 'margin-top': '5px'})
        ]
    ),

    # Dropdown for model selection
    html.Div(
        style={'background-color': '#f4f4f4', 'padding': '10px', 'border-radius': '10px', 'margin-top': '20px'},
        children=[
            html.Label("Select Model or Compare All:", style={**FONT_STYLE, 'font-weight': 'bold'}),
            dcc.Dropdown(
                id='model-dropdown',
                options=[{'label': model, 'value': model} for model in metrics_df['model']] +
                        [{'label': 'Compare All', 'value': 'Compare All'}],
                value='Logistic Regression',  # Default selection
                style={'width': '50%', 'margin': 'auto'}
            )
        ]
    ),

    # Metrics visualization
    html.Div([
        dcc.Graph(id='metrics-bar-chart'),
    ], style={'margin-top': '20px'}),

    # Confusion Matrix
    html.Div([
        dcc.Graph(id='confusion-matrix'),
    ], style={'margin-top': '20px'}),

    # ROC Curve
    html.Div([
        dcc.Graph(id='roc-curve'),
    ], style={'margin-top': '20px'}),

    # Export Options
    html.Div(
        style={'background-color': '#f4f4f4', 'padding': '10px', 'border-radius': '10px', 'margin-top': '20px'},
        children=[
            html.Label("Export Results:", style={**FONT_STYLE, 'font-weight': 'bold'}),
            html.Div([
                html.Button("Export as CSV", id="export-csv-btn", style={**FONT_STYLE, 'margin-right': '10px'}),
                html.Button("Export as JSON", id="export-json-btn", style={**FONT_STYLE, 'margin-right': '10px'}),
                html.Button("Export as Excel", id="export-excel-btn", style={**FONT_STYLE}),
            ], style={'text-align': 'center'}),
            dcc.Download(id="download-file")
        ]
    )
])

# Callbacks for Interactivity
@app.callback(
    [Output('metrics-bar-chart', 'figure'),
     Output('confusion-matrix', 'figure'),
     Output('roc-curve', 'figure')],
    [Input('model-dropdown', 'value')]
)
def update_visualizations(selected_model):
    if selected_model == 'Compare All':
        # Metrics Bar Chart
        metrics_fig = go.Figure()
        for _, row in metrics_df.iterrows():
            metrics_fig.add_trace(go.Bar(
                x=['Precision', 'Recall', 'Accuracy'],
                y=[row['precision'], row['recall'], row['accuracy']],
                name=row['model']
            ))
        metrics_fig.update_layout(title="Comparison of Precision, Recall, and Accuracy Across Models",
                                  barmode='group',
                                  xaxis_title="Metrics",
                                  yaxis_title="Score",
                                  template="plotly_white")

        # Placeholder for confusion matrix and ROC curve
        cm_fig = go.Figure().update_layout(title="Confusion Matrix Not Available in Compare Mode",
                                           template="plotly_white")
        roc_fig = go.Figure().update_layout(title="ROC Curve Not Available in Compare Mode",
                                            template="plotly_white")
        return metrics_fig, cm_fig, roc_fig
    else:
        # Single Model Analysis
        selected_metrics = metrics_df[metrics_df['model'] == selected_model].iloc[0]
        metrics_fig = go.Figure([
            go.Bar(name='Precision', x=['Precision'], y=[selected_metrics['precision']], marker_color='blue'),
            go.Bar(name='Recall', x=['Recall'], y=[selected_metrics['recall']], marker_color='red'),
            go.Bar(name='Accuracy', x=['Accuracy'], y=[selected_metrics['accuracy']], marker_color='green')
        ])
        metrics_fig.update_layout(title=f"{selected_model} - Precision, Recall, Accuracy",
                                  barmode='group',
                                  xaxis_title="Metrics",
                                  yaxis_title="Score",
                                  template="plotly_white")

        # Confusion Matrix
        cm = confusion_matrices[selected_model]
        cm_fig = ff.create_annotated_heatmap(
            z=cm,
            x=['Non-Fraud (0)', 'Fraud (1)'],
            y=['Non-Fraud (0)', 'Fraud (1)'],
            annotation_text=cm,
            colorscale='Blues',
            showscale=True
        )
        cm_fig.update_layout(title=f"{selected_model} - Confusion Matrix",
                             xaxis_title="Predicted Label",
                             yaxis_title="True Label",
                             template="plotly_white")

        # ROC Curve
        roc_data = next(roc for roc in roc_curves if roc['model'] == selected_model)
        roc_fig = go.Figure()
        roc_fig.add_trace(go.Scatter(x=roc_data['fpr'], y=roc_data['tpr'], mode='lines', name='ROC Curve'))
        roc_fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Random Baseline', line=dict(dash='dash')))
        roc_fig.update_layout(title=f"{selected_model} - ROC Curve",
                              xaxis_title="False Positive Rate",
                              yaxis_title="True Positive Rate",
                              template="plotly_white")

        return metrics_fig, cm_fig, roc_fig

# Callback for Export Options
@app.callback(
    Output("download-file", "data"),
    [Input("export-csv-btn", "n_clicks"),
     Input("export-json-btn", "n_clicks"),
     Input("export-excel-btn", "n_clicks")],
    prevent_initial_call=True
)
def export_file(n_clicks_csv, n_clicks_json, n_clicks_excel):
    ctx = dash.callback_context
    if not ctx.triggered:
        raise dash.exceptions.PreventUpdate

    button_id = ctx.triggered[0]["prop_id"].split(".")[0]
    if button_id == "export-csv-btn":
        return dcc.send_data_frame(metrics_df.to_csv, "model_metrics.csv")
    elif button_id == "export-json-btn":
        return dict(content=metrics_df.to_json(), filename="model_metrics.json")
    elif button_id == "export-excel-btn":
        return dcc.send_data_frame(metrics_df.to_excel, "model_metrics.xlsx", sheet_name="Metrics")

# Run the App
if __name__ == '__main__':
    app.run_server(debug=True)



<IPython.core.display.Javascript object>

To determine the best model, we need to evaluate both the metrics and the context of the task. In this case, the goal is fraud detection, which typically prioritizes identifying the minority class (is_fraud = 1) effectively.

Key Observations:
1. Logistic Regression:
Cross-Validated AUC: 0.9985 ± 0.0006 (Very high).
Test AUC: 0.3484 (Very low, indicating poor performance on the minority class).
Classification Report:
Precision, Recall, and F1-score for the minority class (is_fraud = 1) are 0, indicating it failed to identify any fraudulent transactions.
2. Random Forest:
Cross-Validated AUC: 1.0000 ± 0.0000 (Perfect on training data).
Test AUC: 0.7231 (Better than Logistic Regression but still suboptimal).
Classification Report:
Same issue as Logistic Regression: fails to identify the minority class (is_fraud = 1).
3. Gradient Boosting:
Cross-Validated AUC: 1.0000 ± 0.0000 (Perfect on training data).
Test AUC: 0.6169 (Lower than Random Forest but better handling of the minority class).
Classification Report:
Achieved Precision = 1.00 and Recall = 0.50 for the minority class.
F1-score for the minority class is 0.67, which is the best among all models.
Why Gradient Boosting is the Best Model:
Recall for Minority Class (is_fraud = 1):

Gradient Boosting correctly identified 50% of the fraudulent transactions, while the other models identified none.
F1-Score for Minority Class:

Gradient Boosting achieved an F1-score of 0.67, which balances Precision and Recall. This is critical for fraud detection, where the goal is to identify fraudulent transactions without overwhelming false positives.
Practical Consideration:

Fraudulent transactions are rare but costly, so it is better to catch some fraudulent transactions (even with false positives) than to miss all of them.
Recommendations:
Gradient Boosting is the best model because it shows the highest effectiveness in detecting the minority class.
Suggestions for Improvement:
Adjust Model Threshold: Experiment with thresholds to balance Precision and Recall for the Gradient Boosting model.
Resampling or Cost-Sensitive Techniques: Explore advanced sampling techniques or algorithms that handle class imbalance better (e.g., weighted loss functions).
Feature Engineering: Include additional features (e.g., interaction terms or domain-specific indicators) to improve model performance.