In [1]:
import os
import zipfile
import numpy as np
import pandas as pd
import sys

from pathlib import Path

project_root = Path.cwd().parent
sys.path.append(str(project_root))

from src.Insurance_Fraud.logger.logger import logger
from src.Insurance_Fraud.constants import *
from src.Insurance_Fraud.utils.common import read_yaml, create_directories

In [15]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class ModelTrainerConfig:
    root_dir: Path
    train_data_path: Path
    test_data_path: Path
    model_name: str
    learning_rate: float
    max_depth: int
    n_estimators: int
    subsample: float
    target_column: str

In [12]:

class ConfigurationManager:
    def __init__(
        self,
        config_file_path = Path(CONFIG_FILE_PATH),
        params_file_path = Path(PARAMS_FILE_PATH),
        schema_file_path = Path(SCHEMA_FILE_PATH)
    ):
        self.config = read_yaml(Path(config_file_path))
        self.params = read_yaml(Path(params_file_path))
        self.schema = read_yaml(Path(schema_file_path))

        logger.info(f"Schema loaded: {self.schema}")  # Log the schema for debugging

        self.config['data_validation']['unzip_data_dir'] = Path("C:/Users/Arpit Kadam/Desktop/Insurance-Fraud-Detection/artifacts/data_ingestion/Insurance_Claims.csv")

        create_directories([self.config['artifacts_root']])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config = self.config['model_trainer']
        params = self.params['GradientBoostingClassifier']
        
        # Check if 'TARGET_COLUMN' exists in schema
        if 'TARGET_COLUMN' not in self.schema:
            logger.error("TARGET_COLUMN not found in schema")
            raise KeyError("TARGET_COLUMN not found in schema")

        schema = self.schema['TARGET_COLUMN']

        create_directories([config['root_dir']])

        model_trainer_config = ModelTrainerConfig(
            root_dir = config['root_dir'],
            train_data_path = config['train_data_path'],
            test_data_path = config['test_data_path'],
            model_name = config['model_name'],
            learning_rate = params['learning_rate'],
            max_depth = params['max_depth'],
            n_estimators = params['n_estimators'],
            subsample = params['subsample'],
            target_column = schema['TARGET_COLUMN']
        )

        logger.info(f"Model Trainer Config: {model_trainer_config}")
        return model_trainer_config


In [6]:
import joblib
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier


In [13]:
class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig):
        self.config = config

    def train(self):
        train_data = pd.read_csv(self.config.train_data_path)
        test_data = pd.read_csv(self.config.test_data_path)

        train_x = train_data.drop([self.config.target_column], axis=1)
        train_y = train_data[self.config.target_column]

        test_x = test_data.drop([self.config.target_column], axis=1)
        test_y = test_data[self.config.target_column]

        model = GradientBoostingClassifier(
            learning_rate=self.config.learning_rate,
            max_depth=self.config.max_depth,
            n_estimators=self.config.n_estimators,
            subsample=self.config.subsample
        )

        model.fit(train_x, train_y)

        joblib.dump(model, os.path.join(self.config.root_dir, self.config.model_name))

        logger.info(f"Model trained and saved to {self.config.root_dir}/{self.config.model_name}")

        return model

In [14]:
try:
    config = ConfigurationManager(config_file_path=Path("C:/Users/Arpit Kadam/Desktop/Insurance-Fraud-Detection/config/config.yaml"),
                                    params_file_path=Path("C:/Users/Arpit Kadam/Desktop/Insurance-Fraud-Detection/params.yaml"),
                                    schema_file_path=Path("C:/Users/Arpit Kadam/Desktop/Insurance-Fraud-Detection/schema.yaml"))
    model_trainer_config = config.get_model_trainer_config()
    model_trainer = ModelTrainer(config=model_trainer_config)
    model = model_trainer.train()
    logger.info(f"Model training completed successfully")
except Exception as e:
    logger.error(f"Error in model training: {e}")
    raise e


[2025-01-22 01:39:56,364: INFO: common: Attempting to read YAML file from: C:\Users\Arpit Kadam\Desktop\Insurance-Fraud-Detection\config\config.yaml]
[2025-01-22 01:39:56,375: INFO: common: Attempting to read YAML file from: C:\Users\Arpit Kadam\Desktop\Insurance-Fraud-Detection\params.yaml]
[2025-01-22 01:39:56,378: INFO: common: Attempting to read YAML file from: C:\Users\Arpit Kadam\Desktop\Insurance-Fraud-Detection\schema.yaml]
[2025-01-22 01:39:56,388: INFO: 74876685: Schema loaded: {'COLUMNS': {'months_as_customer': 'int64', 'age': 'int64', 'policy_number': 'int64', 'policy_bind_date': 'object', 'policy_state': 'object', 'policy_csl': 'object', 'policy_deductable': 'int64', 'policy_annual_premium': 'float64', 'umbrella_limit': 'int64', 'insured_zip': 'int64', 'insured_sex': 'object', 'insured_education_level': 'object', 'insured_occupation': 'object', 'insured_hobbies': 'object', 'insured_relationship': 'object', 'capital-gains': 'int64', 'capital-loss': 'int64', 'incident_date':

KeyError: 'TARGET_COLUMN'