In [1]:
from dataclasses import dataclass 
from restr_rating.logger import logging
from restr_rating.exception import RatingException 
from restr_rating.entity import config_entity, artifact_entity
from typing import Optional 
import pandas as pd 
import numpy as np 
import os, sys 
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import LabelEncoder 
from restr_rating import utils
from restr_rating.config import TARGET_COLUMN, ENCODE_EXCLUDE_COLUMN

class DataTransformation:
    
    def __init__(self, data_transformation_config:config_entity.DataTransformationConfig,
                        data_ingestion_artifact:artifact_entity.DataIngestionArtifact):
        
        try:
            logging.info(f"{'>'*30} Initiated Data Transformation {'<'*30}")
            self.data_transformation_config = data_transformation_config
            self.data_ingestion_artifact = data_ingestion_artifact
        except Exception as e:
            raise RatingException(e, sys)

    

In [2]:
def Encode(df):
    # Initialize the LabelEncoder object
        le = LabelEncoder()
        
        # Iterate through columns in the DataFrame
        for column in df.columns[~df.columns.isin(ENCODE_EXCLUDE_COLUMN)]:
            # Fit and transform the categorical column using LabelEncoder
            df[column] = le.fit_transform(df[column])
        # Return the encoded DataFrame
        return df

In [3]:
train_file = pd.read_csv("/config/workspace/artifact/02_16_2023__08_26_48/data_ingestion/dataset/train.csv")
test_file = pd.read_csv("/config/workspace/artifact/02_16_2023__08_26_48/data_ingestion/dataset/test.csv")

In [7]:
train_file.shape

(32989, 14)

In [6]:
train_file.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32989 entries, 0 to 32988
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   address       32989 non-null  int64  
 1   name          32989 non-null  int64  
 2   online_order  32989 non-null  int64  
 3   book_table    32989 non-null  int64  
 4   rate          32989 non-null  float64
 5   votes         32989 non-null  int64  
 6   location      32989 non-null  int64  
 7   rest_type     32989 non-null  int64  
 8   cuisines      32989 non-null  int64  
 9   cost          32989 non-null  int64  
 10  reviews_list  32989 non-null  int64  
 11  menu_item     32989 non-null  int64  
 12  type          32989 non-null  int64  
 13  city          32989 non-null  int64  
dtypes: float64(1), int64(13)
memory usage: 3.5 MB


In [4]:
transformed_train_file = Encode(train_file)
transformed_test_file = Encode(test_file)

In [5]:
transformed_train_file

Unnamed: 0,address,name,online_order,book_table,rate,votes,location,rest_type,cuisines,cost,reviews_list,menu_item,type,city
0,6875,6105,1,0,3.6,23,18,24,1648,800,12157,6886,2,17
1,7725,6191,1,0,3.2,8,88,56,1075,400,6154,1576,4,29
2,4753,4869,1,1,3.8,360,35,71,977,1300,3763,2806,2,15
3,2157,670,0,0,3.1,7,3,73,1232,250,4230,6886,2,12
4,3092,4333,1,1,4.3,686,27,24,1460,1000,13511,6886,4,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32984,7478,4524,1,0,4.0,121,9,73,1996,900,10694,6886,4,6
32985,1720,6175,1,0,2.9,24,19,24,1552,600,130,6886,4,8
32986,762,1724,0,1,4.3,2741,8,7,963,1300,2759,6886,2,27
32987,1374,1213,1,1,4.0,199,29,73,1989,700,13659,3100,2,2


In [5]:
from restr_rating.entity.config_entity import DataTransformationConfig
from restr_rating.entity.config_entity import TrainingPipelineConfig

In [6]:
training_pipeline_config = TrainingPipelineConfig()
data_transformation_config = DataTransformationConfig(training_pipeline_config)

In [12]:
data_transformation_config.transformed_train_path

'/config/workspace/artifact/02_16_2023__08_38_44/data_transformation/transformed/train.npz'

In [7]:
utils.save_numpy_array_data(df=transformed_train_file, file_path=data_transformation_config.transformed_train_path)

In [None]:
utils.save_numpy_array_data(file_path=data_transformation_config.transformed_train_path, df=transformed_train_file)
utils.save_numpy_array_data(file_path=data_transformation_config.transformed_test_path, df=transformed_test_file)

In [None]:
def initiate_data_transformation(self)->artifact_entity.DataTransformationArtifact:
        try: 
            # reading the training and testing files
            logging.info("Reading train and test files in data_transformation.py")
            train_df = pd.read_csv(self.data_ingestion_artifact.train_file_path)
            test_df = pd.read_csv(self.data_ingestion_artifact.test_file_path)

            # transforming training and testing dataset
            logging.info(f"Transforming train and test dataframes")
            transformed_train_path = self.Encode(train_df)
            transformed_test_path = self.Encode(test_df)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Initialize a LabelEncoder object
le = LabelEncoder()

# Iterate through columns in the DataFrame
for column in df.columns[~df.columns.isin(ENCODE_EXCLUDE_COLUMN)]:
    # Check if the column contains categorical data
    if df[column].dtype == 'object':
        # Fit and transform the categorical column using LabelEncoder
        df[column] = le.fit_transform(df[column])
        
# Return the encoded DataFrame
return df

In [None]:

    def Encode(df:pd.DataFrame):
    # Initialize the LabelEncoder object
        le = LabelEncoder()
        
        # Iterate through columns in the DataFrame
        for column in df.columns[~df.columns.isin(ENCODE_EXCLUDE_COLUMN)]:
            # Fit and transform the categorical column using LabelEncoder
            df[column] = le.fit_transform(df[column])
        # Return the encoded DataFrame
        return df

In [5]:
def save_numpy_array_data(file_path:str):
    dir_path = os.path.dirname(file_path)
    os.makedirs(dir_path, exist_ok=True)
    print(dir_path)

In [6]:
save_numpy_array_data("/config/workspace/artifact/02_17_2023__02_06_20/data_transformation/transformed/train.npz")


/config/workspace/artifact/02_17_2023__02_06_20/data_transformation/transformed
