In [22]:
import os
import zipfile
import numpy as np
import pandas as pd
import sys

In [23]:
from pathlib import Path

# Put the parent of the current working directory on the import path so the
# `src.Insurance_Fraud` package imported below can be resolved.
# NOTE(review): assumes the notebook is launched from a direct sub-folder of
# the repository root -- confirm.
project_root = Path.cwd().parent

# Guarded so that re-running this cell does not keep appending duplicates.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from src.Insurance_Fraud.constants import *
from src.Insurance_Fraud.utils.common import read_yaml, create_directories

In [4]:
# Machine-specific absolute location of the ingested claims CSV; it has to be
# adjusted when the notebook is run from a different checkout.
claims_csv_path = Path("C:/Users/Arpit Kadam/Desktop/Insurance-Fraud-Detection/artifacts/data_ingestion/Insurance_Claims.csv")
df = pd.read_csv(claims_csv_path)

# Last expression of the cell: rendered as a preview of the first rows.
df.head()


Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported
0,328,48,521585,2014-10-17,OH,250/500,1000,1406.91,0,466132,...,2,YES,71610,6510,13020,52080,Saab,92x,2004,Y
1,228,42,342868,2006-06-27,IN,250/500,2000,1197.22,5000000,468176,...,0,?,5070,780,780,3510,Mercedes,E400,2007,Y
2,134,29,687698,2000-09-06,OH,100/300,2000,1413.14,5000000,430632,...,3,NO,34650,7700,3850,23100,Dodge,RAM,2007,N
3,256,41,227811,1990-05-25,IL,250/500,2000,1415.74,6000000,608117,...,2,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y
4,228,44,367455,2014-06-06,IL,500/1000,1000,1583.91,6000000,610706,...,1,NO,6500,1300,650,4550,Accura,RSX,2009,N


In [5]:
# Print every column name next to its pandas dtype, one pair per line.
for column_name, column_dtype in df.dtypes.items():
    print(f"{column_name}: {column_dtype}")


months_as_customer: int64
age: int64
policy_number: int64
policy_bind_date: object
policy_state: object
policy_csl: object
policy_deductable: int64
policy_annual_premium: float64
umbrella_limit: int64
insured_zip: int64
insured_sex: object
insured_education_level: object
insured_occupation: object
insured_hobbies: object
insured_relationship: object
capital-gains: int64
capital-loss: int64
incident_date: object
incident_type: object
collision_type: object
incident_severity: object
authorities_contacted: object
incident_state: object
incident_city: object
incident_location: object
incident_hour_of_the_day: int64
number_of_vehicles_involved: int64
property_damage: object
bodily_injuries: int64
witnesses: int64
police_report_available: object
total_claim_amount: int64
injury_claim: int64
property_claim: int64
vehicle_claim: int64
auto_make: object
auto_model: object
auto_year: int64
fraud_reported: object


In [24]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable bundle of settings consumed by the data-validation step.

    Instances are built by ``ConfigurationManager.get_data_validation_config``
    and read by ``DataValidation``. ``frozen=True`` makes attribute assignment
    after construction raise ``dataclasses.FrozenInstanceError``.
    """
    # Output directory of this stage; ConfigurationManager passes the same
    # value to create_directories before building the config object.
    root_dir: Path
    # Location handed to pd.read_csv by DataValidation. In this notebook it is
    # set to the CSV file itself, despite the "dir" in the name.
    unzip_data_dir: Path
    # Path of the text report written by DataValidation.validate_all_columns.
    STATUS_FILE: str
    # Mapping of column name -> expected dtype, taken from the 'COLUMNS'
    # section of the schema file.
    all_schema: dict

In [25]:
from src.Insurance_Fraud.constants import *
from src.Insurance_Fraud.utils.common import read_yaml, create_directories

In [27]:
from pathlib import Path

class ConfigurationManager:
    """Load the project's YAML files and build per-stage configuration objects.

    The constructor reads the config, params and schema files through
    ``read_yaml`` and passes ``artifacts_root`` from the config to
    ``create_directories``.

    Parameters
    ----------
    config_file_path, params_file_path, schema_file_path
        Locations of the three YAML files. The defaults come from the
        project's constants module.
    data_file_path
        Optional location of the CSV to validate. It replaces
        ``data_validation.unzip_data_dir`` from the config file. When omitted,
        the legacy path in ``_DEFAULT_DATA_FILE`` is used so that existing
        calls behave exactly as before.
    """

    # Legacy, machine-specific default kept only for backward compatibility.
    # Pass `data_file_path` explicitly when running on any other machine.
    _DEFAULT_DATA_FILE = Path("C:/Users/Arpit Kadam/Desktop/Insurance-Fraud-Detection/artifacts/data_ingestion/Insurance_Claims.csv")

    def __init__(
        self,
        config_file_path = Path(CONFIG_FILE_PATH),
        params_file_path = Path(PARAMS_FILE_PATH),
        schema_file_path = Path(SCHEMA_FILE_PATH),
        data_file_path = None,
    ):
        self.config = read_yaml(Path(config_file_path))
        self.params = read_yaml(Path(params_file_path))
        self.schema = read_yaml(Path(schema_file_path))

        # The value from config.yaml is deliberately overridden (as in the
        # original code); the override is now caller-controllable instead of
        # being a literal buried in the constructor.
        if data_file_path is None:
            data_file_path = self._DEFAULT_DATA_FILE
        self.config['data_validation']['unzip_data_dir'] = Path(data_file_path)

        create_directories([self.config['artifacts_root']])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the ``DataValidationConfig`` for the data-validation stage.

        Passes the stage's ``root_dir`` to ``create_directories`` and pairs the
        ``data_validation`` section of the config with the ``COLUMNS`` section
        of the schema.
        """
        config = self.config['data_validation']
        schema = self.schema['COLUMNS']

        create_directories([config['root_dir']])

        return DataValidationConfig(
            root_dir=config['root_dir'],
            unzip_data_dir=config['unzip_data_dir'],
            STATUS_FILE=config['STATUS_FILE'],
            all_schema=schema,
        )

In [36]:
class DataValidation:
    """Check the ingested CSV against the expected column/dtype schema.

    ``config`` must expose ``unzip_data_dir`` (what ``pd.read_csv`` is given),
    ``STATUS_FILE`` (path of the text report) and ``all_schema`` (mapping of
    column name -> expected dtype).
    """

    # The annotation is quoted so this class can be defined even when
    # DataValidationConfig has not been defined yet in the running kernel.
    def __init__(self, config: "DataValidationConfig"):
        self.config = config

    def validate_all_columns(self) -> bool:
        """Validate every schema column and write a report to ``STATUS_FILE``.

        A column fails when it is absent from the CSV or when its pandas dtype
        differs from the dtype named in the schema. Each check is also printed
        to stdout.

        The report is written in a single pass. It starts with the summary
        sections (status, head, info, describe, shape) in their previous
        layout and ends with a "Column Checks" section. Previously the file
        was opened in ``'w'`` mode twice, so the per-column results were
        overwritten by the summary and never reached disk.

        Returns
        -------
        bool
            ``True`` when all schema columns are present with matching dtypes.

        Raises
        ------
        Exception
            Any error raised while reading the CSV or writing the report is
            printed and re-raised unchanged.
        """
        import io  # local import keeps this cell runnable on its own

        try:
            df = pd.read_csv(self.config.unzip_data_dir)

            validation_status = True
            column_report = []
            for column, expected_dtype in self.config.all_schema.items():
                if column in df.columns:
                    actual_dtype = df[column].dtype
                    check_line = f"Checking column: {column}, Expected: {expected_dtype}, Found: {actual_dtype}"
                    print(check_line)
                    column_report.append(check_line)
                    if actual_dtype != expected_dtype:
                        validation_status = False
                        column_report.append(
                            f"Column {column} has incorrect dtype. Expected: {expected_dtype}, Found: {actual_dtype}"
                        )
                else:
                    validation_status = False
                    column_report.append(f"Column {column} is missing in the data.")

            if validation_status:
                column_report.append("All columns are valid and have correct data types.")

            # DataFrame.info() prints and returns None, so str(df.info()) used
            # to put the literal "None" in the report. Capture the text instead.
            info_buffer = io.StringIO()
            df.info(buf=info_buffer)

            separator = "****************************************************"
            sections = [
                ("Validation Status", str(validation_status)),
                ("Data Description", str(df.head())),
                ("Data Info", info_buffer.getvalue().rstrip("\n")),
                ("Data Describe", str(df.describe())),
                ("Data Shape", str(df.shape)),
                ("Column Checks", "\n".join(column_report)),
            ]

            with open(self.config.STATUS_FILE, 'w') as f:
                for title, body in sections:
                    f.write(f"{title}\n{separator}\n{body}\n{separator}\n")

            return validation_status
        except Exception as e:
            print(f"An error occurred: {e}")
            # Bare raise re-raises the active exception unchanged.
            raise
        


In [37]:
# Machine-specific absolute locations of the three YAML files, keyed by the
# ConfigurationManager keyword argument each one feeds.
yaml_locations = {
    "config_file_path": Path("C:/Users/Arpit Kadam/Desktop/Insurance-Fraud-Detection/config/config.yaml"),
    "params_file_path": Path("C:/Users/Arpit Kadam/Desktop/Insurance-Fraud-Detection/params.yaml"),
    "schema_file_path": Path("C:/Users/Arpit Kadam/Desktop/Insurance-Fraud-Detection/schema.yaml"),
}

# Run the validation stage end to end: load config -> build stage config ->
# validate columns -> show the overall result.
config = ConfigurationManager(**yaml_locations)
data_validation_config = config.get_data_validation_config()
data_validation = DataValidation(config=data_validation_config)
validation_status = data_validation.validate_all_columns()
print(f"Data validation status: {validation_status}")

[2025-01-21 22:42:49,200: INFO: common: Attempting to read YAML file from: C:\Users\Arpit Kadam\Desktop\Insurance-Fraud-Detection\config\config.yaml]
[2025-01-21 22:42:49,204: INFO: common: Attempting to read YAML file from: C:\Users\Arpit Kadam\Desktop\Insurance-Fraud-Detection\params.yaml]
[2025-01-21 22:42:49,207: INFO: common: Attempting to read YAML file from: C:\Users\Arpit Kadam\Desktop\Insurance-Fraud-Detection\schema.yaml]
[2025-01-21 22:42:49,215: INFO: common: Directory created: artifacts]
[2025-01-21 22:42:49,219: INFO: common: Directory created: artifacts/data_validation]
Checking column: months_as_customer, Expected: int64, Found: int64
Checking column: age, Expected: int64, Found: int64
Checking column: policy_number, Expected: int64, Found: int64
Checking column: policy_bind_date, Expected: object, Found: object
Checking column: policy_state, Expected: object, Found: object
Checking column: policy_csl, Expected: object, Found: object
Checking column: policy_deductable, 