In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

In [None]:
def load_and_explore_data():
    """
    TODO: Load the claims.csv dataset and perform exploratory data analysis
    Returns:
        pd.DataFrame: The loaded claims dataset
    """
    df = pd.read_csv('data/claims.csv')
    return df

def preprocess_data(df):
    """
    TODO: Preprocess the data for machine learning
    
    Args:
        df (pd.DataFrame): Raw claims dataset
        
    Returns:
        tuple: (X_train, X_test, y_train, y_test) - preprocessed and split data
    """

    df = df.drop(columns=['claim_id', 'claim_type', 'description'])
    X = df.drop(columns=['is_fraud'])
    y = df['is_fraud']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
    return X_train, X_test, y_train, y_test


def train_models(X_train, y_train):
    """
    TODO: Train multiple machine learning models
    
    Args:
        X_train: Training features
        y_train: Training labels
        
    Returns:
        dict: Dictionary of trained models
    """
    models = {
        'logistic_regression': LogisticRegression(max_iter=1000),
        'random_forest': RandomForestClassifier()
    }

    for name, model in models.items():
        model.fit(X_train, y_train)

    return models


def evaluate_models(models, X_test, y_test):
    """
    TODO: Evaluate all models and compare their performance
    
    Args:
        models (dict): Dictionary of trained models
        X_test: Test features
        y_test: Test labels
        
    Returns:
        dict: Performance metrics for each model
    """
    results = {}
    
    for name, model in models.items():
        y_pred = model.predict(X_test)
        results[name] = {
            'precision': precision_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred),
            'f1_score': f1_score(y_test, y_pred)
            # print(name)
        }
    return results


def select_best_model(results):
    """
    TODO: Select the best model based on business requirements
    
    Args:
        results (dict): Performance metrics for each model
        
    Returns:
        str: Name of the best model with business justification
    """
    pass


df  = load_and_explore_data()
X_train, X_test, y_train, y_test = preprocess_data(df)
X_train.head()

# def main():
#     """
#     Main function to run the complete fraud detection pipeline
#     """
#     print("=== Insurance Claims Fraud Detection ===\n")

#     df  = load_and_explore_data()
#     # print(*df.columns)
#     preprocess_data(df)
#     # print(X_train.head(1))
#     # print(y_train.head(1))
#     # models = train_models(X_train, y_train)
#     # results = evaluate_models(models, X_test, y_test)    
    
#     print("Model Performance Comparison:")
#     print("Random Forest: Precision=0.XX, Recall=0.XX, F1=0.XX")
#     print("Logistic Regression: Precision=0.XX, Recall=0.XX, F1=0.XX")
#     print()
#     print("Recommended Model: [MODEL_NAME]")
#     print("Business Impact: [EXPLANATION]")


# if __name__ == "__main__":
#     main()


Unnamed: 0,claim_amount,policy_age_days,claimant_age,previous_claims
32,1050.0,2100,51,0
39,12800.0,50,48,2
21,21000.0,12,31,0
36,775.5,2500,56,0
19,14800.0,35,50,2


In [7]:
df.head()

NameError: name 'df' is not defined