In [45]:
import os

# Directory the kernel is currently running in.
path= os.getcwd()

# If that directory path ends with "notebooks", step one level up so the
# relative paths used further down (e.g. "artifacts/...") are resolved from
# the parent directory. Presumably this is also what makes the `src` package
# importable — verify against the project layout.
# NOTE(review): `endswith` also matches names such as "old_notebooks" — confirm
# that is acceptable.
if path.endswith('notebooks'):
    os.chdir("../")

In [46]:
# import dependencies
from pathlib import Path
from urllib.parse import parse_qs, urlparse

import gdown
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from dotenv import load_dotenv

from src.Home_Premium_Prediction.constants import CONFIG_FILE_PATH
from src.Home_Premium_Prediction.utils import read_yaml, create_directories

In [47]:
# Remove pandas' display limits so every column and every row is rendered.
# NOTE(review): with `display.max_rows` set to None, displaying a large frame
# in full prints all of its rows — keep using .head()/.sample() on big frames.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [48]:
# Load variables from a .env file into the process environment (python-dotenv);
# the os.getenv(...) lookups in the next cell read from that environment.
load_dotenv()

True

In [49]:
# File names of the train/test CSVs, taken from the environment.
# os.getenv returns None when a variable is not set, so either constant may be
# None if TRAIN_DATA_FILE / TEST_DATA_FILE is missing.
TRAIN_FILE_NAME = os.getenv("TRAIN_DATA_FILE")
TEST_FILE_NAME = os.getenv("TEST_DATA_FILE")

In [None]:
class DataIngestionConfig:
    """Plain holder for the settings used by the data-ingestion step.

    Every constructor argument is stored unchanged on the instance under the
    same name:

    * ``data_ingestion_dir`` - directory for the ingestion step.
    * ``train_data_url`` / ``test_data_url`` - where each dataset is fetched from.
    * ``train_data_path`` / ``test_data_path`` - directory each dataset is saved in.
    """

    def __init__(self, data_ingestion_dir, train_data_url, test_data_url, train_data_path, test_data_path):
        self.data_ingestion_dir = data_ingestion_dir
        # Remote sources.
        self.train_data_url, self.test_data_url = train_data_url, test_data_url
        # Local destinations.
        self.train_data_path, self.test_data_path = train_data_path, test_data_path


class DataIngestionConfigManager:
    """Reads the project YAML configuration and exposes the data-ingestion part of it."""

    def __init__(self, config_file=CONFIG_FILE_PATH):
        """Parse ``config_file`` and create the ``artifacts_root`` directory.

        Note that ``self.config_file`` ends up holding the parsed content
        returned by ``read_yaml``, not the path that was passed in.
        """
        parsed = read_yaml(config_file)
        self.config_file = parsed
        create_directories([parsed['artifacts_root']])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Create the ingestion directory and return the ``data_ingestion`` settings.

        Returns:
            A ``DataIngestionConfig`` filled from the ``data_ingestion`` section
            of the parsed configuration.
        """
        section = self.config_file['data_ingestion']
        create_directories([section['data_ingestion_dir']])

        # Each key of the YAML section maps onto the constructor argument of the same name.
        field_names = (
            'data_ingestion_dir',
            'train_data_url',
            'test_data_url',
            'train_data_path',
            'test_data_path',
        )
        return DataIngestionConfig(**{name: section[name] for name in field_names})


class DataIngestion:
    """Downloads the train/test CSV files from Google Drive when they are not
    already on disk, then loads both into pandas DataFrames."""

    def __init__(self, config: "DataIngestionConfig"):
        # The annotation is a string so this cell does not depend on the
        # config class having been defined first.
        self.config = config

    @staticmethod
    def _extract_file_id(url: str) -> str:
        """Return the Google Drive file id contained in ``url``.

        Handles the link shapes below; the last one is what the previous
        ``url.split('/')[-1]`` logic supported, so existing configs keep working:

        * ``...?id=<ID>`` (e.g. ``/uc?id=<ID>`` or ``/open?id=<ID>``)
        * ``.../d/<ID>/view?usp=sharing`` (standard share link)
        * ``.../<ID>`` or a bare ``<ID>``

        Raises:
            ValueError: if no id can be found in ``url``.
        """
        parsed = urlparse(url.strip())

        id_values = parse_qs(parsed.query).get("id")
        if id_values:
            return id_values[0]

        segments = [part for part in parsed.path.split("/") if part]
        # In share links the id is the segment right after ".../d/".
        if "d" in segments[:-1]:
            return segments[segments.index("d") + 1]
        if segments:
            return segments[-1]
        raise ValueError(f"Could not determine a Google Drive file id from URL: {url!r}")

    def _download_if_needed(self, url: str, output_dir: str, file_name: str):
        """Make sure ``output_dir/file_name`` exists, downloading it if necessary.

        Args:
            url: Google Drive link (or bare file id) of the file.
            output_dir: directory the file is stored in; created when missing.
            file_name: name of the file inside ``output_dir``.

        Returns:
            The path of the local file.

        Raises:
            ValueError: if ``file_name`` is empty/None or ``url`` holds no file id.
            FileNotFoundError: if the download did not produce the file.
        """
        if not file_name:
            # An unset environment variable yields None; an empty name would make
            # the directory itself look like an already-downloaded file.
            raise ValueError(
                "file_name is empty or None; check that TRAIN_DATA_FILE / "
                "TEST_DATA_FILE are set in the environment (.env)."
            )

        os.makedirs(output_dir, exist_ok=True)
        file_path = os.path.join(output_dir, file_name)

        if os.path.exists(file_path):
            print(f"{file_name} already exists at {file_path}")
            return file_path

        file_id = self._extract_file_id(url)
        download_url = f"https://drive.google.com/uc?id={file_id}"
        print(f"Downloading {file_name} from {download_url}...")
        gdown.download(download_url, file_path, quiet=False)

        # NOTE(review): depending on the gdown version a failed download may
        # return without raising, so confirm the file is really there before
        # reporting success.
        if not os.path.exists(file_path):
            raise FileNotFoundError(
                f"Download of {file_name} from {download_url} did not produce {file_path}"
            )
        print(f"Saved to: {file_path}")
        return file_path

    def download_data(self):
        """Ensure both datasets are on disk and load them.

        Returns:
            ``(train_data, test_data)`` as pandas DataFrames read with
            ``pd.read_csv``.
        """
        train_path = self._download_if_needed(
            url=self.config.train_data_url,
            output_dir=self.config.train_data_path,
            file_name=TRAIN_FILE_NAME
        )
        test_path = self._download_if_needed(
            url=self.config.test_data_url,
            output_dir=self.config.test_data_path,
            file_name=TEST_FILE_NAME
        )

        train_data = pd.read_csv(train_path)
        test_data = pd.read_csv(test_path)
        print("Both datasets loaded successfully.")
        return train_data, test_data


# MAIN
# Runs the ingestion step end to end. The guard only lets this run when
# __name__ is "__main__" (which it evidently was for the recorded run, see the
# output below).
if __name__ == "__main__":
    # Parse the YAML config (this also creates the artifacts root directory).
    config_manager = DataIngestionConfigManager(config_file=CONFIG_FILE_PATH)
    # Create the ingestion directory and collect the ingestion settings.
    data_ingestion_config = config_manager.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    # Download whichever CSV is missing, then load both as DataFrames.
    train_data, test_data = data_ingestion.download_data()

created directory at: artifacts
created directory at: artifacts/data_ingestion
home_insurance_train.csv already exists at artifacts/data_ingestion/raw_data\home_insurance_train.csv
home_insurance_test.csv already exists at artifacts/data_ingestion/raw_data\home_insurance_test.csv
Both datasets loaded successfully.


In [51]:
# The ingestion cell already loaded this CSV into `train_data` with the same
# pd.read_csv defaults, so work on an independent copy of it instead of parsing
# the 1M-row file again from a hard-coded path.
df = train_data.copy()

df.head()

Unnamed: 0,uuid,quote_id,sale_flag,property_type,year_built,number_of_bedrooms,number_of_bathrooms,building_value,contents_value,coverage_level,number_of_occupants,has_smoke_alarms,has_burglar_alarm,pets_present,flood_risk_score,fire_risk_score,crime_rate_score,broker_name,policy_term,previous_claims_count,distance_to_fire_station,has_security_cameras,ownership_status,energy_efficiency_rating,pcd,lat,long,Premium
0,97692477-4e79-4239-b68d-f1e75e8f9727,Q0000000,0,Semi-Detached,1906,6,3,475369.0,112585.0,Bronze,4,1,0,0,0.49,0.36,0.06,BrokerB,12,3,18.44,0,Vacant,B,N4 3BH,51.566409,-0.119991,1740.49
1,a1f0165c-4f0c-465d-bf8e-5998a57e75f5,Q0000001,1,Terraced,1946,4,2,112530.0,7451.0,Bronze,3,1,0,1,0.08,0.33,0.2,BrokerC,36,2,4.42,0,Vacant,A,BT342PL,54.154194,-6.313653,677.71
2,403f057a-b3ed-439b-a8e0-67134879c9a9,Q0000002,0,Flat,1926,5,2,337412.0,40202.0,Bronze,3,1,0,1,0.32,0.49,0.27,BrokerA,36,5,21.04,0,Owner-occupied,A,SO506JS,50.977035,-1.328926,1441.61
3,698d2f13-2123-4fec-a6fa-52c857245a70,Q0000003,0,Detached,1953,1,3,496935.0,105234.0,Silver,1,1,0,0,0.69,0.31,0.37,BrokerA,24,5,18.18,0,Owner-occupied,E,NR302SA,52.602705,1.727722,1957.38
4,d4c386f9-cfb6-4be4-a28d-4da57c35b3fc,Q0000004,0,Terraced,1910,4,1,457227.0,67499.0,Bronze,6,0,0,0,0.87,0.74,0.32,BrokerA,24,1,3.81,0,Vacant,B,ST4 2QE,53.000934,-2.166444,1543.64


In [52]:
# Structural summary of the frame: number of entries, per-column non-null
# counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 28 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   uuid                      1000000 non-null  object 
 1   quote_id                  1000000 non-null  object 
 2   sale_flag                 1000000 non-null  int64  
 3   property_type             1000000 non-null  object 
 4   year_built                1000000 non-null  int64  
 5   number_of_bedrooms        1000000 non-null  int64  
 6   number_of_bathrooms       1000000 non-null  int64  
 7   building_value            1000000 non-null  float64
 8   contents_value            1000000 non-null  float64
 9   coverage_level            1000000 non-null  object 
 10  number_of_occupants       1000000 non-null  int64  
 11  has_smoke_alarms          1000000 non-null  int64  
 12  has_burglar_alarm         1000000 non-null  int64  
 13  pets_present              10

In [54]:
# Per-column dtypes only (the df.info() output above also lists them).
df.dtypes

uuid                         object
quote_id                     object
sale_flag                     int64
property_type                object
year_built                    int64
number_of_bedrooms            int64
number_of_bathrooms           int64
building_value              float64
contents_value              float64
coverage_level               object
number_of_occupants           int64
has_smoke_alarms              int64
has_burglar_alarm             int64
pets_present                  int64
flood_risk_score            float64
fire_risk_score             float64
crime_rate_score            float64
broker_name                  object
policy_term                   int64
previous_claims_count         int64
distance_to_fire_station    float64
has_security_cameras          int64
ownership_status             object
energy_efficiency_rating     object
pcd                          object
lat                         float64
long                        float64
Premium                     