In [1]:
import os
# Show the kernel's current working directory before it is changed.
%pwd

'c:\\Users\\bisht\\OneDrive\\Desktop\\Personal_project\\research'

In [2]:
# Step up from the notebook folder to the project root so that `src.*`
# imports and the relative `artifacts/...` paths resolve.
# Guarded on the folder name so that re-running this cell does not keep
# climbing one directory higher on every execution.
if os.path.basename(os.getcwd()) == 'research':
    os.chdir('../')

In [3]:
# Confirm the working directory after the chdir above.
%pwd

'c:\\Users\\bisht\\OneDrive\\Desktop\\Personal_project'

In [4]:
from dataclasses import dataclass
from pathlib import Path
@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable set of locations consumed by the data-ingestion stage.

    Instances are frozen: attributes cannot be reassigned after construction.
    """

    # Directory created before ingestion starts.
    root_dir: Path
    # Sharing URL the dataset archive is downloaded from.
    source_URL: str
    # File the downloaded archive is written to.
    local_data_file: Path
    # Directory the archive is extracted into.
    unzip_dir: Path
    # Directory receiving the transaction train/test CSV files.
    transaction_dir: Path
    # Directory receiving the credit-score train/test CSV files.
    credit_score_dir: Path
    # Directory receiving the pickled scalers and label encoder.
    preprocess_dir: Path


In [5]:
from src.Banking_System.constants import *
from src.Banking_System.utils.common import read_yaml,create_directories


In [6]:
class ConfigurationManager:
    """Load the project YAML files and build per-stage configuration objects.

    On construction both YAML files are read and the directory named by
    ``artifacts_root`` in the main config is created.
    """

    # Keys copied verbatim from the ``data_ingestion`` config section.
    _INGESTION_FIELDS = (
        "root_dir",
        "source_URL",
        "local_data_file",
        "unzip_dir",
        "transaction_dir",
        "credit_score_dir",
        "preprocess_dir",
    )

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Read the config and params files, then create the artifacts root.

        Args:
            config_filepath: Location of the main YAML configuration.
            params_filepath: Location of the YAML parameters file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Create the ingestion root directory and return its configuration.

        Returns:
            A ``DataIngestionConfig`` whose fields hold the values of the
            ``data_ingestion`` section of the loaded config, unmodified.
        """
        section = self.config.data_ingestion
        create_directories([section.root_dir])

        field_values = {
            field_name: getattr(section, field_name)
            for field_name in self._INGESTION_FIELDS
        }
        return DataIngestionConfig(**field_values)

In [7]:
import os
import urllib.request as request
import zipfile
import gdown
from src.Banking_System import logger
from src.Banking_System.utils.common import get_size,save_object
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder,StandardScaler,LabelEncoder
import pickle
from sklearn.model_selection import train_test_split

In [8]:
class DataIngestion:
    """Download, unpack and preprocess the raw datasets for the project.

    The three public steps are meant to be called in order:
    ``download_file`` -> ``extract_zip_file`` -> ``data_preprocess``.
    """

    # Query prefix for a direct Google Drive download; the file id is appended.
    DRIVE_DOWNLOAD_PREFIX = 'https://drive.google.com/uc?export=download&id='
    # Upper bound on rows drawn from each class when balancing transactions.
    SAMPLES_PER_CLASS = 8168
    # Seed shared by the class sampling and both train/test splits.
    RANDOM_STATE = 22

    def __init__(self, config: "DataIngestionConfig"):
        """Store the ingestion configuration.

        Args:
            config: Object exposing ``source_URL``, ``local_data_file``,
                ``unzip_dir``, ``transaction_dir``, ``credit_score_dir`` and
                ``preprocess_dir``.
        """
        self.config = config

    @staticmethod
    def _drive_download_url(share_url: str) -> str:
        """Turn a Drive sharing link into a direct-download URL.

        The file id is taken to be the second-to-last ``/``-separated
        segment, as in ``https://drive.google.com/file/d/<id>/view?...``.
        """
        file_id = share_url.split("/")[-2]
        return DataIngestion.DRIVE_DOWNLOAD_PREFIX + file_id

    def download_file(self) -> str:
        """Fetch the dataset archive from ``config.source_URL``.

        The parent directory of ``config.local_data_file`` is created if
        needed. Errors raised while downloading propagate unchanged.

        Returns:
            The path the archive was written to.

        Raises:
            RuntimeError: If ``gdown`` reports failure by returning ``None``.
        """
        dataset_url = self.config.source_URL
        zip_download_dir = str(self.config.local_data_file)

        # Create the directory the archive actually goes into, rather than a
        # hard-coded location that may not match the configuration.
        target_dir = os.path.dirname(zip_download_dir)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        logger.info(f"Downloading data from {dataset_url} into file {zip_download_dir}")

        downloaded = gdown.download(self._drive_download_url(dataset_url), zip_download_dir)
        if downloaded is None:
            # Avoid logging success for a download that did not happen.
            raise RuntimeError(f"Failed to download data from {dataset_url}")

        logger.info(f"Downloaded data from {dataset_url} into file {zip_download_dir}")
        return zip_download_dir

    def extract_zip_file(self):
        """Extract ``config.local_data_file`` into ``config.unzip_dir``.

        The target directory is created when missing. Returns ``None``.
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)

    def data_preprocess(self):
        """Encode, scale and split both datasets, then persist the results.

        Reads the two CSV files from ``<unzip_dir>/data``, writes train/test
        CSVs to ``credit_score_dir`` and ``transaction_dir``, and pickles the
        two fitted scalers and the label encoder into ``preprocess_dir``.

        Raises:
            ValueError: If either fraud class has no rows left after the
                balance filter, so no balanced sample can be drawn.
        """
        for directory in (
            self.config.transaction_dir,
            self.config.credit_score_dir,
            self.config.preprocess_dir,
        ):
            create_directories([directory])

        # Join path components individually: a literal 'data\...' contains
        # invalid escape sequences and only resolves on Windows.
        data_dir = os.path.join(self.config.unzip_dir, 'data')
        credit_raw = pd.read_csv(
            os.path.join(data_dir, 'Credit Score Classification Dataset.csv')
        )
        transactions_raw = pd.read_csv(
            os.path.join(data_dir, 'PS_20174392719_1491204439457_log.csv')
        )

        # --- credit-score data -------------------------------------------
        credit = pd.get_dummies(
            credit_raw,
            columns=['Gender', 'Marital Status', 'Home Ownership'],
            drop_first=True,
        )
        education_encoder = OrdinalEncoder(
            categories=[[
                'High School Diploma',
                "Associate's Degree",
                "Bachelor's Degree",
                "Master's Degree",
                'Doctorate',
            ]]
        )
        credit['Education'] = education_encoder.fit_transform(credit_raw[['Education']])

        label_encoder = LabelEncoder()
        credit['Credit Score'] = label_encoder.fit_transform(credit['Credit Score'])

        # NOTE(review): both scalers are fitted on the full data before the
        # train/test split, so test rows influence the scaling statistics.
        credit_scaler = StandardScaler()
        credit[['Income', 'Age']] = credit_scaler.fit_transform(credit[['Income', 'Age']])

        # --- transaction data --------------------------------------------
        transactions = transactions_raw[['amount', 'oldbalanceOrg', 'newbalanceOrig', 'isFraud']]
        # Keep only rows where the originating balance covers the amount.
        transactions = transactions[transactions['oldbalanceOrg'] >= transactions['amount']]

        non_fraud = transactions[transactions['isFraud'] == 0]
        fraud = transactions[transactions['isFraud'] == 1]

        # Cap the sample size at what each class can supply; sampling without
        # replacement raises if more rows are requested than exist.
        n_per_class = min(self.SAMPLES_PER_CLASS, len(non_fraud), len(fraud))
        if n_per_class == 0:
            raise ValueError(
                "Cannot balance transaction data: at least one fraud class is empty"
            )

        # Seeded so that repeated runs select the same rows.
        balanced = pd.concat(
            [
                non_fraud.sample(n_per_class, replace=False, random_state=self.RANDOM_STATE),
                fraud.sample(n_per_class, replace=False, random_state=self.RANDOM_STATE),
            ],
            axis=0,
        )

        amount_columns = ['amount', 'oldbalanceOrg', 'newbalanceOrig']
        transaction_scaler = StandardScaler()
        balanced[amount_columns] = transaction_scaler.fit_transform(balanced[amount_columns])

        # --- split and persist -------------------------------------------
        credit_train, credit_test = train_test_split(
            credit, random_state=self.RANDOM_STATE, test_size=0.2
        )
        transaction_train, transaction_test = train_test_split(
            balanced, random_state=self.RANDOM_STATE, test_size=0.2
        )

        # NOTE(review): to_csv keeps its default of writing the row index, so
        # each file gains an unnamed leading column when read back. Left
        # unchanged because later stages may already account for it.
        credit_train.to_csv(os.path.join(self.config.credit_score_dir, 'credit_train.csv'))
        credit_test.to_csv(os.path.join(self.config.credit_score_dir, 'credit_test.csv'))
        transaction_train.to_csv(os.path.join(self.config.transaction_dir, 'transaction_train.csv'))
        transaction_test.to_csv(os.path.join(self.config.transaction_dir, 'transaction_test.csv'))

        save_object(os.path.join(self.config.preprocess_dir, 'creditscaler.pkl'), credit_scaler)
        save_object(os.path.join(self.config.preprocess_dir, 'transactionscaler.pkl'), transaction_scaler)
        save_object(os.path.join(self.config.preprocess_dir, 'encodelabelcredit.pkl'), label_encoder)
        logger.info("PreProcessing Completed")


        



In [9]:
# Run the ingestion stage end to end: build the configuration, download the
# archive, extract it, then preprocess and persist the train/test splits.
try:
    config=ConfigurationManager()
    data_ingestion_config=config.get_data_ingestion_config()
    data_ingestion=DataIngestion(config=data_ingestion_config)
    data_ingestion.download_file()
    data_ingestion.extract_zip_file()
    data_ingestion.data_preprocess()
except Exception as e:
    # Record the failure in the project log, then re-raise with a bare
    # `raise` so the original traceback is preserved.
    logger.exception(e)
    raise

[2024-03-28 10:39:54,946: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-03-28 10:39:54,949: INFO: common: yaml file: params.yaml loaded successfully]
[2024-03-28 10:39:54,951: INFO: common: created directory at: artifacts]
[2024-03-28 10:39:54,952: INFO: common: created directory at: artifacts/data_ingestion]
[2024-03-28 10:39:54,954: INFO: 1782276900: Downloading data from https://drive.google.com/file/d/1VvR9L9WoUiy5vrvBYIvDlrxfq9IbYe-4/view?usp=sharing into file artifacts/data_ingestion/data.zip]


Downloading...
From (original): https://drive.google.com/uc?/export=download&id=1VvR9L9WoUiy5vrvBYIvDlrxfq9IbYe-4
From (redirected): https://drive.google.com/uc?%2Fexport=download&id=1VvR9L9WoUiy5vrvBYIvDlrxfq9IbYe-4&confirm=t&uuid=7c94f0b0-8420-4825-bd58-fd3e196419a3
To: c:\Users\bisht\OneDrive\Desktop\Personal_project\artifacts\data_ingestion\data.zip
100%|██████████| 1.14G/1.14G [01:46<00:00, 10.7MB/s]


[2024-03-28 10:41:45,743: INFO: 1782276900: Downloaded data from https://drive.google.com/file/d/1VvR9L9WoUiy5vrvBYIvDlrxfq9IbYe-4/view?usp=sharing into file artifacts/data_ingestion/data.zip]
[2024-03-28 10:41:58,693: INFO: common: created directory at: artifacts/data_ingestion/transaction_data]
[2024-03-28 10:41:58,694: INFO: common: created directory at: artifacts/data_ingestion/credit_score_data]
[2024-03-28 10:41:58,695: INFO: common: created directory at: artifacts/data_ingestion/preprocess]
[2024-03-28 10:42:13,038: INFO: 1782276900: PreProcessing Completed]


In [3]:
import pandas as pd
# Read the credit-score test split written by the preprocessing step.
# The path is relative to the project root (the working directory after the
# earlier chdir) instead of an absolute, user-specific location.
df=pd.read_csv('artifacts/data_ingestion/credit_score_data/credit_test.csv')
# Preview only the first rows rather than rendering the whole frame.
df.head(10)

Unnamed: 0.1,Unnamed: 0,Age,Income,Education,Number of Children,Credit Score,Gender_Male,Marital Status_Single,Home Ownership_Rented
0,25,-0.470408,-1.198004,1.0,0,0,True,True,True
1,30,-0.943702,0.65624,3.0,2,1,True,False,False
2,75,-0.115438,-0.270882,2.0,2,1,False,False,False
3,68,-0.825379,-0.502662,2.0,0,0,True,True,True
4,46,1.659415,1.428842,0.0,0,1,True,False,False
5,31,-0.352085,-0.116361,4.0,1,1,False,False,False
6,151,1.067798,0.192679,4.0,1,1,False,False,False
7,154,-0.588732,-0.966223,0.0,0,0,True,True,True
8,13,-0.707055,-0.888963,1.0,0,0,True,True,True
9,50,0.594504,0.269939,3.0,0,1,True,True,False


In [4]:
# NOTE(review): the saved output of this cell shows raw, unencoded columns
# (Gender, Marital Status, Home Ownership as text), which does not match the
# encoded frame loaded by the preceding cell. `df` was presumably reassigned
# by a cell that is no longer in the notebook; confirm with Restart & Run All.
df

Unnamed: 0,Age,Gender,Income,Education,Marital Status,Number of Children,Home Ownership,Credit Score
0,25,Female,50000,Bachelor's Degree,Single,0,Rented,High
1,30,Male,100000,Master's Degree,Married,2,Owned,High
2,35,Female,75000,Doctorate,Married,1,Owned,High
3,40,Male,125000,High School Diploma,Single,0,Owned,High
4,45,Female,100000,Bachelor's Degree,Married,3,Owned,High
...,...,...,...,...,...,...,...,...
159,29,Female,27500,High School Diploma,Single,0,Rented,Low
160,34,Male,47500,Associate's Degree,Single,0,Rented,Average
161,39,Female,62500,Bachelor's Degree,Married,2,Owned,High
162,44,Male,87500,Master's Degree,Single,0,Owned,High


In [6]:
# One-hot encode the three categorical columns, dropping the first level of
# each so that every binary category is represented by a single column.
pd.get_dummies(
    df.loc[:, ['Home Ownership', 'Gender', 'Marital Status']],
    drop_first=True,
)

Unnamed: 0,Home Ownership_Rented,Gender_Male,Marital Status_Single
0,True,False,True
1,False,True,False
2,False,False,False
3,False,True,True
4,False,False,False
...,...,...,...
159,True,False,True
160,True,True,True
161,False,False,False
162,False,True,True
