<a href="https://colab.research.google.com/github/Arishiine/Colab-Fraud-Project/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# prompt: build 3 models to predict fraud

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

# --- 1. Data Loading and Preprocessing ---

# Load the data from the provided CSV file
try:
    df = pd.read_csv('/content/synthetic_fraud_dataset.csv')
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: '/content/synthetic_fraud_dataset.csv' not found. Please upload your data file.")
    # Exit or handle the error appropriately if the file is not found
    exit() # Or raise an exception, or create dummy data as before if that's desired


# Inspect the columns to identify the target column and feature types
print("Dataset columns:", df.columns.tolist())

# Assuming the target column is named 'Fraud_Label' based on the dataset inspection
target_column = 'Fraud_Label'
if target_column not in df.columns:
    print(f"Error: Target column '{target_column}' not found in the dataset.")
    # Attempt to guess the target column or exit
    # For now, let's assume 'Fraud_Label' is the correct one and exit if not found
    exit()


# Separate features (X) and target (y)
X = df.drop(target_column, axis=1)
y = df[target_column]

# Identify numerical and categorical features (adjust based on your dataset)
# This part will need adjustment based on the actual column types in your CSV
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

print("Numerical features:", numerical_features.tolist())
print("Categorical features:", categorical_features.tolist())


# Create preprocessing pipelines for numerical and categorical features
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Create a column transformer to apply the transformations
# Only include transformers if there are features of that type
transformers = []
if len(numerical_features) > 0:
    transformers.append(('num', numerical_transformer, numerical_features))
if len(categorical_features) > 0:
    transformers.append(('cat', categorical_transformer, categorical_features))

if len(transformers) == 0:
    print("Error: No numerical or categorical features found for preprocessing.")
    exit()

preprocessor = ColumnTransformer(transformers=transformers)


# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y) # Stratify for imbalanced data

# --- 2. Model Selection and Training ---

# Model 1: Logistic Regression
model_lr = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', LogisticRegression(solver='liblinear', random_state=42, class_weight='balanced'))]) # Use class_weight='balanced' for imbalanced data

print("Training Logistic Regression...")
model_lr.fit(X_train, y_train)
print("Logistic Regression training complete.")

# Model 2: Random Forest
model_rf = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'))]) # Use class_weight='balanced' for imbalanced data

print("Training Random Forest...")
model_rf.fit(X_train, y_train)
print("Random Forest training complete.")

# Model 3: LightGBM
# LightGBM can handle categorical features directly if they are encoded as integers
# For simplicity here, we'll let the preprocessor handle encoding.
model_lgb = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier', lgb.LGBMClassifier(random_state=42, objective='binary', metric='auc', is_unbalance=True))]) # Use is_unbalance=True for imbalanced data

print("Training LightGBM...")
model_lgb.fit(X_train, y_train)
print("LightGBM training complete.")

# --- 4. Evaluation ---

print("\n--- Model Evaluation ---")

models = {
    "Logistic Regression": model_lr,
    "Random Forest": model_rf,
    "LightGBM": model_lgb
}

for name, model in models.items():
    print(f"\nEvaluating {name}:")

    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1] # Probability of the positive class

    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    auc_score = roc_auc_score(y_test, y_prob)
    print(f"ROC AUC Score: {auc_score:.4f}")

# --- 5. Comparison (Implicit in the evaluation output) ---
# You can now compare the metrics printed for each model to see which performs best.
# For fraud detection, Recall (the ability to find all positive samples) and
# Precision (the accuracy of positive predictions) are often key, along with AUC.

Dataset loaded successfully.
Dataset columns: ['Transaction_ID', 'User_ID', 'Transaction_Amount', 'Transaction_Type', 'Timestamp', 'Account_Balance', 'Device_Type', 'Location', 'Merchant_Category', 'IP_Address_Flag', 'Previous_Fraudulent_Activity', 'Daily_Transaction_Count', 'Avg_Transaction_Amount_7d', 'Failed_Transaction_Count_7d', 'Card_Type', 'Card_Age', 'Transaction_Distance', 'Authentication_Method', 'Risk_Score', 'Is_Weekend', 'Fraud_Label']
Numerical features: ['Transaction_Amount', 'Account_Balance', 'IP_Address_Flag', 'Previous_Fraudulent_Activity', 'Daily_Transaction_Count', 'Avg_Transaction_Amount_7d', 'Failed_Transaction_Count_7d', 'Card_Age', 'Transaction_Distance', 'Risk_Score', 'Is_Weekend']
Categorical features: ['Transaction_ID', 'User_ID', 'Transaction_Type', 'Timestamp', 'Device_Type', 'Location', 'Merchant_Category', 'Card_Type', 'Authentication_Method']
Training Logistic Regression...
Logistic Regression training complete.
Training Random Forest...
Random Forest t



[LightGBM] [Info] Number of positive: 12050, number of negative: 25450
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005049 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1595
[LightGBM] [Info] Number of data points in the train set: 37500, number of used features: 36
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.321333 -> initscore=-0.747651
[LightGBM] [Info] Start training from score -0.747651
LightGBM training complete.

--- Model Evaluation ---

Evaluating Logistic Regression:
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.82      0.84      8483
           1       0.66      0.73      0.69      4017

    accuracy                           0.79     12500
   macro avg       0.76      0.78      0.77     12500
weighted avg       0.80      0.79      0.79     12500

Confu



Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8483
           1       1.00      1.00      1.00      4017

    accuracy                           1.00     12500
   macro avg       1.00      1.00      1.00     12500
weighted avg       1.00      1.00      1.00     12500

Confusion Matrix:
[[8480    3]
 [   9 4008]]
ROC AUC Score: 1.0000




# Task
Use the data-clean, feature engineer, and train three models (Logistic Regression, Random Forest, and LightGBM) on the dataset "fraud_detection_dataset.csv". Predict fraud and write a Python application for deployment.

## Data cleaning and feature engineering

### Subtask:
Address the `KeyError` by correcting the target column name and perform any necessary data cleaning and feature engineering steps based on the actual dataset columns.


**Reasoning**:
The traceback indicates a KeyError because the column 'is_fraud' was not found in the DataFrame. I need to inspect the DataFrame `df` to identify the correct target column name and the feature types. Since the file was not found, a dummy dataset was created, which does not have a column named 'is_fraud'. The error occurs when trying to drop 'is_fraud'. I need to examine the dummy dataset's columns and correct the code to reflect the dummy dataset's structure.



## Model Saving

Now I will save the trained models and the preprocessor using `joblib` so they can be used later for making predictions on new data without retraining.

In [2]:
import joblib
import os

# Create a directory to save the models if it doesn't exist
if not os.path.exists('saved_models'):
    os.makedirs('saved_models')

# Save the preprocessor
joblib.dump(preprocessor, 'saved_models/preprocessor.joblib')

# Save each trained model
joblib.dump(model_lr, 'saved_models/logistic_regression_model.joblib')
joblib.dump(model_rf, 'saved_models/random_forest_model.joblib')
joblib.dump(model_lgb, 'saved_models/lightgbm_model.joblib')

print("Models and preprocessor saved to 'saved_models' directory.")

Models and preprocessor saved to 'saved_models' directory.


## Streamlit Application for Deployment

Here is a Python script for a basic Streamlit application that loads the saved models and preprocessor and allows you to input transaction data to get fraud predictions.

**To run this Streamlit app:**

1.  Make sure you have Streamlit installed (`pip install streamlit`).
2.  Save the code below as a Python file (e.g., `fraud_app.py`) in the same directory where you saved the `saved_models` folder.
3.  Open your terminal or command prompt, navigate to that directory, and run the command: `streamlit run fraud_app.py`

In [7]:
import streamlit as st
import pandas as pd
import joblib
import os

# Load the saved preprocessor and models
@st.cache_resource # Cache the resource to avoid reloading on every rerun
def load_models():
    try:
        preprocessor = joblib.load('saved_models/preprocessor.joblib')
        model_lr = joblib.load('saved_models/logistic_regression_model.joblib')
        model_rf = joblib.load('saved_models/random_forest_model.joblib')
        model_lgb = joblib.load('saved_models/lightgbm_model.joblib')
        st.success("Models and preprocessor loaded successfully.")
        return preprocessor, model_lr, model_rf, model_lgb
    except FileNotFoundError:
        st.error("Error: Saved models or preprocessor not found. Please ensure 'saved_models' directory and its contents exist.")
        return None, None, None, None

preprocessor, model_lr, model_rf, model_lgb = load_models()

# Set up the Streamlit app title and description
st.title("Fraud Detection Application")
st.write("Upload a CSV file with transaction data or enter details manually to predict fraud.")

# Option to upload a CSV file
st.header("Upload Transaction Data (CSV)")
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

if uploaded_file is not None:
    try:
        new_data_df = pd.read_csv(uploaded_file)
        st.write("Uploaded Data:")
        st.dataframe(new_data_df)

        # Select model for prediction
        model_choice_upload = st.selectbox(
            "Select Model for Prediction (Uploaded Data):",
            ("Random Forest", "Logistic Regression", "LightGBM")
        )

        if st.button("Predict Fraud (Uploaded Data)"):
            if preprocessor and model_lr and model_rf and model_lgb:
                # Make predictions
                if model_choice_upload == "Logistic Regression":
                    model = model_lr
                elif model_choice_upload == "Random Forest":
                    model = model_rf
                else:
                    model = model_lgb

                try:
                    # Ensure the uploaded data has the same columns as the training data (excluding target)
                    # You might need more robust column handling here depending on your data
                    # For simplicity, assuming column order and names match
                    processed_data = preprocessor.transform(new_data_df)
                    predictions = model.predict(processed_data)
                    probabilities = model.predict_proba(processed_data)[:, 1]

                    results_df = new_data_df.copy()
                    results_df['Predicted_Fraud'] = predictions
                    results_df['Probability_Fraud'] = probabilities

                    st.write("\nPrediction Results (Uploaded Data):")
                    st.dataframe(results_df)

                except Exception as e:
                    st.error(f"Error during prediction: {e}")
                    st.write("Please ensure the uploaded CSV has the correct format and columns matching the training data.")
            else:
                st.warning("Models not loaded. Please check the 'saved_models' directory.")

# Option to enter data manually (example for a few key features)
st.header("Enter Transaction Details Manually")

# You will need to add input fields for each feature your model expects
# This is a simplified example with only a few features
# Get the list of expected features from the preprocessor
if preprocessor:
    # Assuming the preprocessor's fitted transformers can give us feature names
    # This might require inspecting the fitted preprocessor object based on its type
    # For ColumnTransformer, you can access the columns used by each transformer
    try:
        # This is a simplified way to get feature names, may need adjustment
        # You may need to access the feature names differently based on your specific preprocessor setup
        # For example, if using ColumnTransformer with named transformers, you might need to iterate
        # through the transformers and get their feature names.
        # As a temporary fix to unblock, let's define a dummy numerical_features list
        # assuming the original dataset columns are still available in the environment
        # In a real Streamlit app, you'd need a more robust way to get feature names from the loaded preprocessor
        try:
          # Attempt to get feature names from the preprocessor if it's a ColumnTransformer
          if isinstance(preprocessor, ColumnTransformer):
              input_features = []
              for name, transformer, features in preprocessor.transformers_:
                  if hasattr(transformer, 'get_feature_names_out'):
                      input_features.extend(transformer.get_feature_names_out(features))
                  else:
                      # Fallback if the transformer doesn't have get_feature_names_out
                      input_features.extend(features)
          else:
             st.warning("Preprocessor type not recognized for automatic feature name extraction. Please manually define input fields.")
             input_features = [] # Fallback
        except Exception as e:
             st.warning(f"Could not automatically get feature names from preprocessor: {e}")
             st.write("Please manually define the input fields for your features.")
             input_features = [] # Fallback to empty list if feature names can't be extracted

        manual_input_data = {}
        st.write("Please enter values for the following features:")
        # Create input fields dynamically based on identified features
        for feature in input_features:
            # You would need to add more sophisticated input types based on feature dtype
            manual_input_data[feature] = st.text_input(f"{feature}:", "")

        # Add a button to predict with manual input
        if st.button("Predict Fraud (Manual Input)"):
            if preprocessor and model_lr and model_rf and model_lgb:
                # Convert manual input to a DataFrame
                try:
                    # Convert input values to appropriate types if necessary
                    # This is a basic conversion, you might need more specific handling
                    manual_input_processed = {}
                    for feature, value in manual_input_data.items():
                        # Attempt to infer type or use a default (e.g., string)
                        try:
                            # Try converting to float
                            manual_input_processed[feature] = [float(value)]
                        except ValueError:
                            # If conversion to float fails, keep as string
                            manual_input_processed[feature] = [value]

                    manual_input_df = pd.DataFrame(manual_input_processed)

                    st.write("Manual Input Data:")
                    st.dataframe(manual_input_df)

                    # Select model for prediction
                    model_choice_manual = st.selectbox(
                        "Select Model for Prediction (Manual Input):",
                        ("Random Forest", "Logistic Regression", "LightGBM"),
                        key='manual_model_choice' # Add a unique key
                    )

                    # Make predictions
                    if model_choice_manual == "Logistic Regression":
                        model = model_lr
                    elif model_choice_manual == "Random Forest":
                        model = model_rf
                    else:
                        model = model_lgb

                    try:
                         processed_data_manual = preprocessor.transform(manual_input_df)
                         predictions_manual = model.predict(processed_data_manual)
                         probabilities_manual = model.predict_proba(processed_data_manual)[:, 1]

                         st.write("\nPrediction Result (Manual Input):")
                         st.write(f"Predicted Fraud: {'Yes' if predictions_manual[0] == 1 else 'No'}")
                         st.write(f"Probability of Fraud: {probabilities_manual[0]:.4f}")

                    except Exception as e:
                         st.error(f"Error during prediction: {e}")
                         st.write("Please ensure the manual input data matches the expected format and features.")

                except Exception as e:
                    st.error(f"Error creating DataFrame from manual input: {e}")
            else:
                 st.warning("Models not loaded. Please check the 'saved_models' directory.")

else:
    st.info("Models are being loaded...")

st.markdown("---")
st.write("Developed by Your Name/Team") # Optional: Add your name or team name

SyntaxError: expected 'except' or 'finally' block (ipython-input-7-2505689984.py, line 74)