In [1]:
import os


In [2]:
os.chdir('../')

In [3]:
from pathlib import Path
from dataclasses import dataclass
@dataclass(frozen=True)
class DataIngestionConfig:
    root_dir:Path
    source_URL:str
    local_dir:Path
    split_dir:Path
    


In [4]:
from src.Loan_defaulter.constants import *
from src.Loan_defaulter.utils.common import read_yaml,create_directories

In [5]:
class ConfigurationManager:
    def __init__(
            self,
            config_filepath=CONFIG_FILE_PATH,
            params_filepath=PARAMS_FILE_PATH

    ):
        self.config=read_yaml(config_filepath)
        self.params=read_yaml(params_filepath)
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self)->DataIngestionConfig:
        config=self.config.data_ingestion
       
        create_directories([config.root_dir])
        create_directories([config.split_dir])
        data_ingestion_config=DataIngestionConfig(
            root_dir=config.root_dir,
            source_URL=config.source_URL,
            local_dir=config.local_dir,
            split_dir=config.split_dir
        )
        return data_ingestion_config

In [6]:
import os
import urllib.request as request 
import gdown
from src.Loan_defaulter import logger
import zipfile
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split

In [7]:
class DataIngestion:
    def __init__(self,config:DataIngestionConfig):
        self.config = config

    def download_file(self)->str:
        try:
            dataset_url=self.config.source_URL
            local_dir=self.config.local_dir
            os.makedirs('artifacts/data_ingestion',exist_ok=True)
            logger.info(f'Downloading data from {dataset_url} into file {local_dir}')
            file_id=dataset_url.split('/')[-2]
            prefix='http://drive.google.com/uc?/export=download&id='
            gdown.download(prefix+file_id,local_dir)
            logger.info(f'Download data from {dataset_url} into file {local_dir}')
        except Exception as e:
            raise e
    def split_data(self):
        try:
            logger.info('Splitting data into train test split')
            df=pd.read_csv(self.config.local_dir)
            df.drop(columns=['ID','year'],inplace=True)
            df['loan_with_property']=df['property_value'].apply(lambda x: 0 if pd.isna(x) else 1)
            df['property_value']=df['property_value'].apply(lambda x: 0 if pd.isna(x) else x)
            # droping ltv as it is a calculated using loan amount by total property value
            df.drop(columns=['LTV'],inplace=True)
            split_dir=self.config.split_dir
            
            X=df.drop(columns=['Status'])
            y=df['Status']
            X_train,X_temp,y_train,y_temp=train_test_split(X,y,test_size=0.3,random_state=42)      
            X_val,X_test,y_val,y_test=train_test_split(X_temp,y_temp,test_size=0.5,random_state=42)
            X_train.to_csv(os.path.join(split_dir,'X_train.csv'))
            X_val.to_csv(os.path.join(split_dir,'X_val.csv'))
            X_test.to_csv(os.path.join(split_dir,'X_test.csv'))
            y_train.to_csv(os.path.join(split_dir,'y_train.csv'))
            y_val.to_csv(os.path.join(split_dir,'y_val.csv'))
            y_test.to_csv(os.path.join(split_dir,'y_test.csv'))
            logger.info(f'Data split into train test split and save in {split_dir}')
        except Exception as e:
            raise e

        
        
    
    

In [8]:
try:
    config=ConfigurationManager()
    data_ingestion_config=config.get_data_ingestion_config()
    data_ingestion=DataIngestion(config=data_ingestion_config)
    data_ingestion.download_file()
    data_ingestion.split_data()
except Exception as e:
    raise e

[2024-12-25 11:40:53,262:INFO:common:yaml file: config\config.yaml loaded successfully]
[2024-12-25 11:40:53,263:INFO:common:yaml file: params.yaml loaded successfully]
[2024-12-25 11:40:53,265:INFO:common:created directory at: artifacts]
[2024-12-25 11:40:53,267:INFO:common:created directory at: artifacts/data_ingestion]
[2024-12-25 11:40:53,269:INFO:common:created directory at: artifacts/data_ingestion/split_data]
[2024-12-25 11:40:53,270:INFO:2880705593:Downloading data from https://drive.google.com/file/d/1iPSSu3l8bYz36l9oEic3RhgW-EgF_oN9/view?usp=sharing into file artifacts/data_ingestion/loan_default.csv]


Downloading...
From: http://drive.google.com/uc?/export=download&id=1iPSSu3l8bYz36l9oEic3RhgW-EgF_oN9
To: c:\Users\bisht\OneDrive\Desktop\New folder\artifacts\data_ingestion\loan_default.csv
100%|██████████| 28.5M/28.5M [00:03<00:00, 8.96MB/s]

[2024-12-25 11:41:01,425:INFO:2880705593:Download data from https://drive.google.com/file/d/1iPSSu3l8bYz36l9oEic3RhgW-EgF_oN9/view?usp=sharing into file artifacts/data_ingestion/loan_default.csv]
[2024-12-25 11:41:01,426:INFO:2880705593:Splitting data into train test split]





UnboundLocalError: local variable 'df' referenced before assignment