In [16]:
import os
import zipfile
import numpy as np
import pandas as pd
import sys

from pathlib import Path

# Make the parent of the working directory importable so the `src.*` imports
# below resolve. Guarded so that re-running this cell does not keep appending
# the same entry to sys.path.
project_root = Path.cwd().parent
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from src.Insurance_Fraud.logger.logger import logger
from src.Insurance_Fraud.constants import *
from src.Insurance_Fraud.utils.common import read_yaml, create_directories

In [3]:
# Load the ingested claims CSV. The location is derived from `project_root`
# (defined in the first cell) rather than a user-specific absolute path, so the
# notebook runs on any checkout of the repository.
data = pd.read_csv(project_root / "artifacts" / "data_ingestion" / "insurance_claims.csv")

data.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported
0,328,48,521585,2014-10-17,OH,250/500,1000,1406.91,0,466132,...,2,YES,71610,6510,13020,52080,Saab,92x,2004,Y
1,228,42,342868,2006-06-27,IN,250/500,2000,1197.22,5000000,468176,...,0,?,5070,780,780,3510,Mercedes,E400,2007,Y
2,134,29,687698,2000-09-06,OH,100/300,2000,1413.14,5000000,430632,...,3,NO,34650,7700,3850,23100,Dodge,RAM,2007,N
3,256,41,227811,1990-05-25,IL,250/500,2000,1415.74,6000000,608117,...,2,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y
4,228,44,367455,2014-06-06,IL,500/1000,1000,1583.91,6000000,610706,...,1,NO,6500,1300,650,4550,Accura,RSX,2009,N


In [19]:
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Work on a copy so the frame loaded above stays intact and this cell can be re-run.
df = data.copy()

# --- Missing values -------------------------------------------------------
# Column groups are captured before '?' is turned into NaN.
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = df.select_dtypes(include=['object']).columns
df = df.replace('?', np.nan)

# Assign the filled column back explicitly: `df[col].fillna(..., inplace=True)`
# is chained assignment, which pandas warns about and which has no effect once
# Copy-on-Write is enabled.
for col in categorical_columns:
    df[col] = df[col].fillna(df[col].mode()[0])

for col in numerical_columns:
    df[col] = df[col].fillna(df[col].median())

# --- Derived features -----------------------------------------------------
# NOTE(review): the premium column is named "annual" yet is multiplied by 12 —
# confirm the intended unit.
premium_times_12 = df['policy_annual_premium'] * 12
df['loss_ratio'] = df['total_claim_amount'] / premium_times_12
df['profitability'] = premium_times_12 - df['total_claim_amount']

# 2025 is a fixed reference year for the vehicle age.
df['vehicle_age'] = 2025 - df['auto_year']
df = df.drop(columns='auto_year')

# Split each date into year / month / day parts and drop the raw column.
incident_date = pd.to_datetime(df['incident_date'])
df['incident_year'] = incident_date.dt.year
df['incident_month'] = incident_date.dt.month
df['incident_day'] = incident_date.dt.day
df = df.drop(columns='incident_date')

# Unparseable bind dates become NaT (errors='coerce').
policy_bind_date = pd.to_datetime(df['policy_bind_date'], errors='coerce')
df['policy_bind_year'] = policy_bind_date.dt.year
df['policy_bind_month'] = policy_bind_date.dt.month
df['policy_bind_day'] = policy_bind_date.dt.day
df = df.drop(columns='policy_bind_date')

# --- Encoding -------------------------------------------------------------
# One-hot encode these columns (first level dropped); the dummy columns are
# appended at the end of the frame and the source column is removed.
one_hot_columns = [
    'incident_type',
    'collision_type',
    'authorities_contacted',
    'incident_severity',
    'policy_csl',
]
for col in one_hot_columns:
    dummies = pd.get_dummies(df[col], drop_first=True).astype(int)
    df = pd.concat([df.drop(columns=col), dummies], axis=1)

df['age_group'] = pd.cut(df['age'], bins=[0, 25, 35, 45, 55, 100], labels=['18-25', '26-35', '36-45', '46-55', '55+'])

# Label-encode every remaining object column (this includes the target,
# `fraud_reported`, which is compared against 0 / 1 below), then the age buckets.
encoder = LabelEncoder()
object_columns = df.select_dtypes(include=['object']).columns
df[object_columns] = df[object_columns].apply(encoder.fit_transform)
df['age_group'] = encoder.fit_transform(df['age_group'])

# --- Class balancing ------------------------------------------------------
# NOTE(review): the minority class is oversampled with replacement *before*
# the train/test split, so copies of the same row can land on both sides of
# the split and inflate test metrics. Consider splitting first and balancing
# only the training part.
majority_class = df[df['fraud_reported'] == 0]
minority_class = df[df['fraud_reported'] == 1]

# Undersample the majority class down to 500 rows (without replacement).
majority_downsampled = resample(majority_class,
                                replace=False,
                                n_samples=500,
                                random_state=42)

# Oversample the minority class up to 500 rows (with replacement).
minority_oversampled = resample(minority_class,
                                replace=True,
                                n_samples=500,
                                random_state=42)

# Combine both classes and shuffle the rows.
balanced_df = pd.concat([majority_downsampled, minority_oversampled])
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# --- Split and scale ------------------------------------------------------
x = balanced_df.drop('fraud_reported', axis=1)
y = balanced_df['fraud_reported']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# The scaler is fitted on the training features only and then applied to the test features.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

print(x_train_scaled.shape)
print(x_test_scaled.shape)
print(y_train.shape)
print(y_test.shape)


(800, 54)
(200, 54)
(800,)
(200,)


In [13]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataTransformationConfig:
    """Settings handed to the data-transformation stage."""

    # Directory taken from the `root_dir` entry of the `data_transformation` config section.
    root_dir: Path
    # Path taken from the `data_path` entry of the `data_transformation` config section.
    data_path: Path

In [14]:
from src.Insurance_Fraud.constants import *
from src.Insurance_Fraud.utils.common import read_yaml, create_directories

In [15]:
class ConfigurationManager:
    """Load the project's YAML files and build stage-specific config objects.

    The loaded configuration is used exactly as read from disk; no section is
    overridden with machine-specific paths.
    """

    def __init__(
        self,
        config_file_path = Path(CONFIG_FILE_PATH),
        params_file_path = Path(PARAMS_FILE_PATH),
        schema_file_path = Path(SCHEMA_FILE_PATH)
    ):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_file_path: Location of the main configuration YAML.
            params_file_path: Location of the parameters YAML.
            schema_file_path: Location of the schema YAML.
        """
        self.config = read_yaml(Path(config_file_path))
        self.params = read_yaml(Path(params_file_path))
        self.schema = read_yaml(Path(schema_file_path))

        create_directories([self.config['artifacts_root']])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config for the data-transformation stage.

        Creates the stage's `root_dir` as a side effect.

        Returns:
            A `DataTransformationConfig` whose fields are `Path` objects built
            from the `data_transformation` section of the configuration.
        """
        config = self.config['data_transformation']
        create_directories([config['root_dir']])

        # The dataclass annotates both fields as Path, so convert the raw YAML values.
        return DataTransformationConfig(
            root_dir=Path(config['root_dir']),
            data_path=Path(config['data_path'])
        )


In [22]:
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
import pandas as pd

In [25]:
class DataTransformation:
    """Turn the ingested claims CSV into balanced train / test CSV files.

    Attributes:
        config: Stage configuration. `config.data_path` is read as the input
            CSV and `config.root_dir` receives `train.csv` and `test.csv`.
    """

    def __init__(self, config: DataTransformationConfig):
        self.config = config

    def initiate_data_transformation(self):
        """Clean, encode, balance and split the data, then write it to disk.

        Steps, in order:
          1. Read the CSV at `self.config.data_path` (a relative path is
             resolved against the current working directory).
          2. Replace '?' with NaN; fill object columns with their mode and
             numeric columns with their median.
          3. Add `loss_ratio`, `profitability`, `vehicle_age` and year / month /
             day parts of the incident and policy-bind dates.
          4. One-hot encode selected columns, bucket `age`, label-encode the
             remaining object columns.
          5. Resample both classes to 500 rows each, shuffle, split 75 / 25.
          6. Write `train.csv` and `test.csv` into `self.config.root_dir`.

        Raises:
            FileNotFoundError: If the input CSV does not exist.
        """
        # Read the input from the stage configuration instead of a hard-coded,
        # user-specific absolute path.
        df = pd.read_csv(self.config.data_path)

        # --- Missing values ---
        # Column groups are captured before '?' is turned into NaN.
        numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
        categorical_columns = df.select_dtypes(include=['object']).columns
        df = df.replace('?', np.nan)

        # Assign the result back explicitly: `df[col].fillna(..., inplace=True)`
        # is chained assignment, which pandas warns about and which has no
        # effect once Copy-on-Write is enabled.
        for col in categorical_columns:
            df[col] = df[col].fillna(df[col].mode()[0])

        for col in numerical_columns:
            df[col] = df[col].fillna(df[col].median())

        # --- Derived features ---
        # NOTE(review): the premium column is named "annual" yet is multiplied
        # by 12 — confirm the intended unit.
        premium_times_12 = df['policy_annual_premium'] * 12
        df['loss_ratio'] = df['total_claim_amount'] / premium_times_12
        df['profitability'] = premium_times_12 - df['total_claim_amount']

        # 2025 is a fixed reference year for the vehicle age.
        df['vehicle_age'] = 2025 - df['auto_year']
        df = df.drop(columns='auto_year')

        # Split each date into year / month / day parts and drop the raw column.
        incident_date = pd.to_datetime(df['incident_date'])
        df['incident_year'] = incident_date.dt.year
        df['incident_month'] = incident_date.dt.month
        df['incident_day'] = incident_date.dt.day
        df = df.drop(columns='incident_date')

        # Unparseable bind dates become NaT (errors='coerce').
        policy_bind_date = pd.to_datetime(df['policy_bind_date'], errors='coerce')
        df['policy_bind_year'] = policy_bind_date.dt.year
        df['policy_bind_month'] = policy_bind_date.dt.month
        df['policy_bind_day'] = policy_bind_date.dt.day
        df = df.drop(columns='policy_bind_date')

        # --- Encoding ---
        # One-hot encode these columns (first level dropped); dummy columns are
        # appended at the end of the frame and the source column is removed.
        one_hot_columns = [
            'incident_type',
            'collision_type',
            'authorities_contacted',
            'incident_severity',
            'policy_csl',
        ]
        for col in one_hot_columns:
            dummies = pd.get_dummies(df[col], drop_first=True).astype(int)
            df = pd.concat([df.drop(columns=col), dummies], axis=1)

        df['age_group'] = pd.cut(df['age'], bins=[0, 25, 35, 45, 55, 100], labels=['18-25', '26-35', '36-45', '46-55', '55+'])

        # Label-encode every remaining object column (this includes the target,
        # `fraud_reported`, compared against 0 / 1 below), then the age buckets.
        encoder = LabelEncoder()
        object_columns = df.select_dtypes(include=['object']).columns
        df[object_columns] = df[object_columns].apply(encoder.fit_transform)
        df['age_group'] = encoder.fit_transform(df['age_group'])

        # --- Class balancing ---
        # NOTE(review): the minority class is oversampled with replacement
        # *before* the train/test split, so copies of the same row can end up
        # in both files. Consider splitting first and balancing only the
        # training part.
        majority_class = df[df['fraud_reported'] == 0]
        minority_class = df[df['fraud_reported'] == 1]

        # Undersample the majority class down to 500 rows (without replacement).
        majority_downsampled = resample(majority_class,
                                        replace=False,
                                        n_samples=500,
                                        random_state=42)

        # Oversample the minority class up to 500 rows (with replacement).
        minority_oversampled = resample(minority_class,
                                        replace=True,
                                        n_samples=500,
                                        random_state=42)

        # Combine both classes and shuffle the rows.
        balanced_df = pd.concat([majority_downsampled, minority_oversampled])
        balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

        # --- Split and persist ---
        train, test = train_test_split(balanced_df, test_size=0.25, random_state=42)

        train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

        logger.info(f"Data transformation completed successfully and saved to {self.config.root_dir}")
        logger.info(f"Train data shape: {train.shape}")
        logger.info(f"Test data shape: {test.shape}")

        print(train.shape)
        print(test.shape)


In [27]:
# Run the data-transformation stage end to end. The YAML locations are derived
# from `project_root` (defined in the first cell) rather than user-specific
# absolute paths.
try:
    config = ConfigurationManager(config_file_path=project_root / "config" / "config.yaml",
                                    params_file_path=project_root / "params.yaml",
                                    schema_file_path=project_root / "schema.yaml")
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    try:
        data_transformation.initiate_data_transformation()
    except FileNotFoundError as e:
        # Raised when the stage's input file is missing.
        print(f"Error: Input data file not found. Please check the file path: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

except Exception as e:
    # Failures while loading the configuration or constructing the stage.
    print(f"An unexpected error occurred: {e}")

[2025-01-22 00:32:08,440: INFO: common: Attempting to read YAML file from: C:\Users\Arpit Kadam\Desktop\Insurance-Fraud-Detection\config\config.yaml]
[2025-01-22 00:32:08,448: INFO: common: Attempting to read YAML file from: C:\Users\Arpit Kadam\Desktop\Insurance-Fraud-Detection\params.yaml]
[2025-01-22 00:32:08,453: INFO: common: Attempting to read YAML file from: C:\Users\Arpit Kadam\Desktop\Insurance-Fraud-Detection\schema.yaml]
[2025-01-22 00:32:08,470: INFO: common: Directory created: artifacts]
[2025-01-22 00:32:08,476: INFO: common: Directory created: artifacts/data_transformation]


[2025-01-22 00:32:08,645: INFO: 4132114995: Data transformation completed successfully and saved to artifacts/data_transformation]
[2025-01-22 00:32:08,647: INFO: 4132114995: Train data shape: (750, 55)]
[2025-01-22 00:32:08,649: INFO: 4132114995: Test data shape: (250, 55)]
(750, 55)
(250, 55)


In [43]:
# Read the status file written under artifacts/data_validation. The location is
# derived from `project_root` (defined in the first cell) rather than a
# user-specific absolute path.
status_file = project_root / "artifacts" / "data_validation" / "STATUS.txt"
with open(status_file, "r") as f:
    # Take the section after the first asterisk separator, then its second
    # space-separated token.
    status = f.read().split("****************************************************")[1].split(" ")[1]

# The token can carry surrounding whitespace (e.g. a trailing newline), so it
# is stripped before being echoed and compared.
status = status.strip()
print(status)
if status == "True":
    print("Data Transformation Stage Completed")
else:
    print("Data Transformation Stage Failed")


True

Data Transformation Stage Completed
