In [1]:
!pwd

/home/techie/Desktop/general/end-to-end-liver-project/research


In [2]:
import os

os.chdir('../')
!pwd

/home/techie/Desktop/general/end-to-end-liver-project


In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataTransformationConfig:
    root_dir: Path
    dataset: Path

In [4]:
from liver.constants import *
from liver.utils.common import create_directories, read_yaml

In [5]:
class ConfigurationManager:
    def __init__(self,
                 config=CONFIG_FILE_PATH):
        
        self.config_file = read_yaml(config)
        
    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config_file.data_transformation
        
        create_directories([config.root_dir])
        
        data_transformation_config = DataTransformationConfig(
            root_dir= config.root_dir,
            dataset= config.dataset
        )
        
        return data_transformation_config

In [6]:
from liver import logger
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from sklearn.model_selection import train_test_split
import os

In [7]:
class DataTransformation:
    def __init__(self,
                 config:DataTransformationConfig) -> None:
        self.config = config
        
    def cleaning(self) -> pd.DataFrame:
        df = pd.read_csv(self.config.dataset)
        # print(df.head())
        # print(df.isnull().sum())
        df = df.dropna()  ## drop null values
        # print(df.isnull().sum())
        logger.info(f"Null values removed: {df.isnull().sum()}")
        
        df.replace(to_replace={'Female':0, 'Male':1}, inplace=True) ## label encoding
        logger.info("labeled gander columns")
        
        return df
    
    def handle_imbalanced(self, df):
        X = df.drop(['Dataset'], axis=1)
        y = df['Dataset']

        logger.info('Original dataset shape %s' % Counter(y))
        
        os =  RandomOverSampler()
        X, y = os.fit_resample(X, y)

        logger.info('Resampled dataset shape %s' % Counter(y))
        df = X
        df['Dataset'] = y
        
        return df
    
    def train_test_splits(self, df):
        train, test = train_test_split(df)
        train.to_csv(os.path.join(self.config.root_dir, "train.csv"),index = False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"),index = False)
        logger.info("Splited data into training and test sets")
        logger.info(f'train shape: {train.shape}')
        logger.info(f'test shape: {test.shape}')

In [8]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data = data_transformation.cleaning()
    data = data_transformation.handle_imbalanced(df=data)
    data_transformation.train_test_splits(df=data)
except Exception as e:
    raise e

[2023-10-20 19:48:02,887: INFO: common: yaml file: config/config.yaml loaded successfully]
[2023-10-20 19:48:02,890: INFO: common: created directory at: artifacts/data_transformation]
[2023-10-20 19:48:02,906: INFO: 254018392: Null values removed: Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    0
Dataset                       0
dtype: int64]
[2023-10-20 19:48:02,910: INFO: 254018392: labeled gander columns]
[2023-10-20 19:48:02,915: INFO: 254018392: Original dataset shape Counter({1: 414, 2: 165})]
[2023-10-20 19:48:02,928: INFO: 254018392: Resampled dataset shape Counter({1: 414, 2: 414})]
[2023-10-20 19:48:02,956: INFO: 254018392: Splited data into training and test sets]
[2023-10-20 19:48:02,959: INFO: 254018392: trai