In [1]:
from dataclasses import dataclass 
from restr_rating.logger import logging
from restr_rating.exception import RatingException 
from restr_rating.entity import config_entity, artifact_entity
from typing import Optional 
import pandas as pd 
import numpy as np 
import os, sys 
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import LabelEncoder 
from restr_rating import utils
from restr_rating.config import TARGET_COLUMN, ENCODE_EXCLUDE_COLUMN

In [40]:
class DataTransformation:
    """
    Transformation stage of the rating pipeline.

    Holds the stage configuration and the ingestion artifact, and provides
    `Encode` to turn DataFrame columns into integer codes.
    """

    def __init__(self, data_transformation_config:config_entity.DataTransformationConfig,
                        data_ingestion_artifact:artifact_entity.DataIngestionArtifact):
        """
        Store the configuration and the ingestion artifact for later use.

        Args:
            data_transformation_config: configuration object for this stage.
            data_ingestion_artifact: artifact produced by the ingestion stage.

        Raises:
            RatingException: wraps any exception raised during initialisation.
        """
        try:
            logging.info(f"{'>'*30} Initiated Data Transformation {'<'*30}")
            self.data_transformation_config = data_transformation_config
            self.data_ingestion_artifact = data_ingestion_artifact
        except Exception as e:
            raise RatingException(e, sys)

    def Encode (self, df:pd.DataFrame) -> pd.DataFrame:
        """
        Replace every column not listed in ENCODE_EXCLUDE_COLUMN with integer codes.

        Codes come from `Series.factorize()`, so they are numbered in order of
        first appearance within `df` (missing values become -1 by pandas' default).
        NOTE: the mapping is derived from `df` alone, so codes produced by two
        separate calls (e.g. train and test) are not comparable with each other.

        Args:
            df: frame to encode. It is left untouched; a copy is encoded.

        Returns:
            A new DataFrame with the same shape, index and column order as `df`.

        Raises:
            RatingException: wraps any exception raised while encoding.
        """
        try:
            # Work on a copy so the caller's frame keeps its raw values.
            encoded_df = df.copy()
            columns_to_encode = encoded_df.columns[~encoded_df.columns.isin(ENCODE_EXCLUDE_COLUMN)]
            for column in columns_to_encode:
                encoded_df[column] = encoded_df[column].factorize()[0]
            return encoded_df
        except Exception as e:
            raise RatingException(e, sys)

In [1]:
from restr_rating.logger import logging
from restr_rating.exception import RatingException
from restr_rating.utils import get_collection_as_dataframe
from restr_rating.components.data_ingestion import DataIngestion
from restr_rating.entity import config_entity, artifact_entity
from restr_rating.components.data_validation import DataValidation
from restr_rating.components.data_transformation import DataTransformation

# Shared pipeline configuration; every stage config below is derived from it.
training_pipeline_config = config_entity.TrainingPipelineConfig()

# --- Stage 1: data ingestion ---
data_ingestion_config = config_entity.DataIngestionConfig(
    training_pipeline_config=training_pipeline_config,
)
print(data_ingestion_config.to_dict())
data_ingestion = DataIngestion(data_ingestion_config=data_ingestion_config)
data_ingestion_artifact = data_ingestion.initiate_data_ingestion()

# --- Stage 2: data validation ---
# Only the component is constructed here; actually running it is left disabled:
# data_validation_artifact = data_validation.initiate_data_validation()
data_validation_config = config_entity.DataValidationConfig(
    training_pipeline_config=training_pipeline_config,
)
data_validation = DataValidation(
    data_validation_config=data_validation_config,
    data_ingestion_artifact=data_ingestion_artifact,
)

{'database_name': 'restr_ratings', 'collection_name': 'ratings', 'data_ingestion_dir': '/config/workspace/artifact/02_18_2023__11_26_43/data_ingestion', 'feature_store_dir': '/config/workspace/artifact/02_18_2023__11_26_43/data_ingestion/feature_store/ratings.csv', 'train_file_name': '/config/workspace/artifact/02_18_2023__11_26_43/data_ingestion/dataset/train.csv', 'test_file_name': '/config/workspace/artifact/02_18_2023__11_26_43/data_ingestion/dataset/test.csv', 'test_size': 0.2}


In [3]:
from restr_rating.entity.config_entity import DataTransformationConfig
# Build the transformation-stage config from the pipeline config created in an earlier cell.
data_transformation_config = DataTransformationConfig(training_pipeline_config)

In [5]:
# Instantiate the transformation component with its config and the ingestion artifact
# produced by an earlier cell.
data_transformation = DataTransformation(data_transformation_config, data_ingestion_artifact)

In [12]:
# Read the splits from the paths recorded by the ingestion step instead of a
# hard-coded artifact directory: that directory is timestamped per run, so a
# literal path stops working as soon as the pipeline is executed again.
train_df = pd.read_csv(data_ingestion_artifact.train_file_path)
test_df = pd.read_csv(data_ingestion_artifact.test_file_path)


In [13]:
# Keep only the columns that will be encoded: everything except ENCODE_EXCLUDE_COLUMN.
en_train_df, en_test_df = (
    frame.drop(columns=ENCODE_EXCLUDE_COLUMN) for frame in (train_df, test_df)
)

In [16]:
# Remove the two free-text columns from the train features, then preview the result.
en_train_df = en_train_df.drop(columns=['address', 'reviews_list'])
en_train_df.head()

Unnamed: 0,name,online_order,book_table,location,rest_type,cuisines,menu_item,type,city
0,Urban Spice,Yes,No,Ejipura,Casual Dining,"North Indian, Chinese, Continental",menu_not_availabel,Delivery,Koramangala 5th Block
1,Visantoe,Yes,No,Whitefield,Kiosk,Fast Food,"['Chicken Ghee Roast Meal', 'Grilled Chicken M...",Dine-out,Whitefield
2,Sherlock'S Pub,Yes,Yes,Kalyan Nagar,"Pub, Casual Dining","Continental, North Indian, Chinese, Italian","['Egg Bhurji', 'Spicy Chicken Wings', 'Chicken...",Delivery,Kammanahalli
3,Bhukkad,No,No,Bannerghatta Road,Quick Bites,"Healthy Food, Fast Food",menu_not_availabel,Delivery,JP Nagar
4,Pulimunchi,Yes,Yes,Indiranagar,Casual Dining,"Mangalorean, Seafood",menu_not_availabel,Dine-out,Indiranagar


In [17]:
# Remove the two free-text columns from the test features, then preview the result.
en_test_df = en_test_df.drop(columns=['address', 'reviews_list'])
en_test_df.head()

Unnamed: 0,name,online_order,book_table,location,rest_type,cuisines,menu_item,type,city
0,King'S Appetite,Yes,No,Electronic City,"Food Court, Quick Bites","North Indian, Chinese",menu_not_availabel,Delivery,Electronic City
1,Cafe 221B,Yes,Yes,BTM,Cafe,"Cafe, Beverages",menu_not_availabel,Cafes,Koramangala 6th Block
2,Wangs Kitchen,Yes,No,HSR,Casual Dining,Chinese,menu_not_availabel,Delivery,Bellandur
3,Late Night,Yes,No,Kumaraswamy Layout,Quick Bites,"North Indian, Chinese",menu_not_availabel,Delivery,JP Nagar
4,Ibaco,Yes,No,Brigade Road,Dessert Parlor,Desserts,"['Almond Crunch Ice Cream [95 grams]', 'Carame...",Desserts,Lavelle Road


In [68]:
# Integer-encode the train frame first, then the test frame; each call builds
# its own mapping from the frame it receives.
train_encode, test_encode = (
    data_transformation.Encode(df=frame) for frame in (en_train_df, en_test_df)
)

In [69]:
train_encode

Unnamed: 0,name,online_order,book_table,location,rest_type,cuisines,menu_item,type,city
0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,1,1,1,1,1
2,2,0,1,2,2,2,2,0,2
3,3,1,0,3,3,3,0,0,3
4,4,0,1,4,0,4,0,1,4
...,...,...,...,...,...,...,...,...,...
32984,2099,0,0,13,3,990,0,1,18
32985,4290,0,0,20,0,87,0,1,14
32986,1375,1,1,37,19,709,0,0,9
32987,2488,0,1,8,3,154,932,0,8


In [70]:
test_encode

Unnamed: 0,name,online_order,book_table,location,rest_type,cuisines,menu_item,type,city
0,0,0,0,0,0,0,0,0,0
1,1,0,1,1,1,1,0,1,1
2,2,0,0,2,2,2,0,0,2
3,3,0,0,3,3,0,0,0,3
4,4,0,0,4,4,3,1,2,4
...,...,...,...,...,...,...,...,...,...
8243,3956,0,0,17,3,47,2014,0,6
8244,3957,0,0,2,2,1656,0,0,18
8245,3958,0,0,11,2,1243,0,0,16
8246,438,0,0,11,3,290,0,0,23


In [1]:
import pandas as pd

# define base, train, and test dataframes
base_df = pd.read_csv('zomato_cleaned.csv')

train_df =pd.read_csv("/config/workspace/artifact/02_18_2023__11_26_43/data_ingestion/dataset/train.csv")
test_df = pd.read_csv("/config/workspace/artifact/02_18_2023__11_26_43/data_ingestion/dataset/test.csv")

# Collect the distinct values of every column of the full dataset; they are
# used later as the shared category list when encoding train and test.
# Missing values are dropped first because pd.Categorical rejects null
# categories (NaN cells are then encoded as -1 instead of raising).
unique_values = {
    column: base_df[column].dropna().unique()
    for column in base_df.columns
}




In [2]:
# Encode the text (object-dtype) columns of both splits against the shared
# category lists, so a given value maps to the same code in train and test.
# Columns are walked in train_df's order; for each one train is handled before test.
for col_name in train_df.columns:
    for frame in (train_df, test_df):
        if frame[col_name].dtype == 'object':
            shared_categories = unique_values[col_name]
            frame[col_name] = pd.Categorical(frame[col_name], categories=shared_categories).codes



In [5]:
print(type(train_df))


<class 'pandas.core.frame.DataFrame'>


In [10]:
train_df.reset_index(drop=True)

Unnamed: 0,address,name,online_order,book_table,rate,votes,location,rest_type,cuisines,cost,reviews_list,menu_item,type,city
0,6895,5318,0,1,3.6,23,46,0,119,800,11934,0,2,17
1,3312,2831,0,1,3.2,8,25,21,63,400,4212,8145,4,29
2,6592,2677,0,0,3.8,360,68,27,1937,1300,10904,3883,2,15
3,1054,995,1,1,3.1,7,11,2,475,250,1131,0,2,13
4,5160,4205,0,0,4.3,686,28,0,244,1000,7405,0,4,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32984,3348,2861,0,1,4.0,121,55,2,1217,900,4254,0,4,5
32985,4742,2596,0,1,2.9,24,15,0,5,600,6863,0,4,8
32986,2327,2050,1,0,4.3,2741,32,33,746,1300,19615,0,2,27
32987,607,583,0,0,4.0,199,8,2,130,700,673,248,2,1


: 

In [2]:
pd.concat(pd.read_csv('/config/workspace/zomato_cleaned.csv', chunksize = 5000))

Unnamed: 0,address,name,online_order,book_table,rate,votes,location,rest_type,cuisines,cost,reviews_list,menu_item,type,city
0,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1,775,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",menu_not_availabel,Buffet,Banashankari
1,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1,787,Banashankari,Casual Dining,"Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",menu_not_availabel,Buffet,Banashankari
2,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8,918,Banashankari,"Cafe, Casual Dining","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",menu_not_availabel,Buffet,Banashankari
3,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7,88,Banashankari,Quick Bites,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",menu_not_availabel,Buffet,Banashankari
4,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8,166,Basavanagudi,Casual Dining,"North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",menu_not_availabel,Buffet,Banashankari
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41232,"136, SAP Labs India, KIADB Export Promotion In...",The Farm House Bar N Grill,No,No,3.7,34,Whitefield,"Casual Dining, Bar","North Indian, Continental",800,"[('Rated 4.0', 'RATED\n Ambience- Big and spa...",menu_not_availabel,Pubs and bars,Whitefield
41233,"139/C1, Next To GR Tech Park, Pattandur Agraha...",Bhagini,No,No,2.5,81,Whitefield,"Casual Dining, Bar","Andhra, South Indian, Chinese, North Indian",800,"[('Rated 4.0', 'RATED\n A fine place to chill...",menu_not_availabel,Pubs and bars,Whitefield
41234,"Four Points by Sheraton Bengaluru, 43/3, White...",Best Brews - Four Points By Sheraton Bengaluru...,No,No,3.6,27,Whitefield,Bar,Continental,1500,"[('Rated 5.0', ""RATED\n Food and service are ...",menu_not_availabel,Pubs and bars,Whitefield
41235,Sheraton Grand Bengaluru Whitefield Hotel & Co...,Chime - Sheraton Grand Bengaluru Whitefield Ho...,No,Yes,4.3,236,"ITPL Main Road, Whitefield",Bar,Finger Food,2500,"[('Rated 4.0', 'RATED\n Nice and friendly pla...",menu_not_availabel,Pubs and bars,Whitefield


In [4]:
pd.read_csv('zomato_cleaned.csv')

Unnamed: 0,address,name,online_order,book_table,rate,votes,location,rest_type,cuisines,cost,reviews_list,menu_item,type,city
0,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1,775,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",menu_not_availabel,Buffet,Banashankari
1,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1,787,Banashankari,Casual Dining,"Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",menu_not_availabel,Buffet,Banashankari
2,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8,918,Banashankari,"Cafe, Casual Dining","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",menu_not_availabel,Buffet,Banashankari
3,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7,88,Banashankari,Quick Bites,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",menu_not_availabel,Buffet,Banashankari
4,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8,166,Basavanagudi,Casual Dining,"North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",menu_not_availabel,Buffet,Banashankari
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41232,"136, SAP Labs India, KIADB Export Promotion In...",The Farm House Bar N Grill,No,No,3.7,34,Whitefield,"Casual Dining, Bar","North Indian, Continental",800,"[('Rated 4.0', 'RATED\n Ambience- Big and spa...",menu_not_availabel,Pubs and bars,Whitefield
41233,"139/C1, Next To GR Tech Park, Pattandur Agraha...",Bhagini,No,No,2.5,81,Whitefield,"Casual Dining, Bar","Andhra, South Indian, Chinese, North Indian",800,"[('Rated 4.0', 'RATED\n A fine place to chill...",menu_not_availabel,Pubs and bars,Whitefield
41234,"Four Points by Sheraton Bengaluru, 43/3, White...",Best Brews - Four Points By Sheraton Bengaluru...,No,No,3.6,27,Whitefield,Bar,Continental,1500,"[('Rated 5.0', ""RATED\n Food and service are ...",menu_not_availabel,Pubs and bars,Whitefield
41235,Sheraton Grand Bengaluru Whitefield Hotel & Co...,Chime - Sheraton Grand Bengaluru Whitefield Ho...,No,Yes,4.3,236,"ITPL Main Road, Whitefield",Bar,Finger Food,2500,"[('Rated 4.0', 'RATED\n Nice and friendly pla...",menu_not_availabel,Pubs and bars,Whitefield


: 

In [3]:

                

    def initiate_data_transformation(self)->artifact_entity.DataTransformationArtifact:
        """
        Read the ingested train/test CSVs, integer-encode their feature columns,
        save both frames via `utils.save_numpy_array_data` and return the artifact.

        The code mapping is learned from the train split only and re-used for
        the test split, so a given category has the same integer in both files.
        Test values that never occur in train are encoded as -1.

        Returns:
            DataTransformationArtifact holding the transformed train/test paths.

        Raises:
            RatingException: wraps any exception raised during the transformation.
        """
        try: 
            # reading the training and testing files (in 5000-row chunks)
            logging.info("Reading train and test files in data_transformation.py")
            test_df = pd.concat(pd.read_csv(self.data_ingestion_artifact.test_file_path, chunksize = 5000))
            train_df = pd.concat(pd.read_csv(self.data_ingestion_artifact.train_file_path, chunksize = 5000))
            logging.info("train and test file read")
            
            # selecting input feature for encoding
            en_train_df = train_df.drop(ENCODE_EXCLUDE_COLUMN, axis=1)
            en_test_df = test_df.drop(ENCODE_EXCLUDE_COLUMN, axis=1)

            # Record, per column, the train categories in first-appearance order.
            # This is the same order factorize() uses inside Encode, so position i
            # here corresponds to code i in the encoded train frame. It is captured
            # before calling Encode because Encode may overwrite the frame it is given.
            train_categories = {
                column: en_train_df[column].factorize()[1]
                for column in en_train_df.columns
            }
            train_encode = self.Encode(df=en_train_df)

            # Encode test with the train mapping instead of factorizing it on its
            # own; an independent factorize would number the categories by their
            # order of appearance in test, giving codes that disagree with train.
            test_encode = pd.DataFrame(
                {
                    column: pd.Categorical(
                        en_test_df[column], categories=train_categories[column]
                    ).codes.astype("int64")
                    for column in en_test_df.columns
                },
                index=en_test_df.index,
            )

            train_df = pd.concat([train_df[ENCODE_EXCLUDE_COLUMN], train_encode], axis=1)
            test_df = pd.concat([test_df[ENCODE_EXCLUDE_COLUMN], test_encode], axis=1)

            # saving into numpy array
            logging.info("Saving the transformed dataframe into numpy array")
            logging.info(f"file path: {self.data_transformation_config.transformed_train_path}")
            utils.save_numpy_array_data(file_path=self.data_transformation_config.transformed_train_path, df=train_df)
            utils.save_numpy_array_data(file_path=self.data_transformation_config.transformed_test_path, df=test_df)

            # creating the artifacts
            data_transformation_artifact = artifact_entity.DataTransformationArtifact(
                                            transformed_train_path= self.data_transformation_config.transformed_train_path, 
                                            transformed_test_path = self.data_transformation_config.transformed_test_path,
                                            )
            
            logging.info("Data Transformation Done")
            return data_transformation_artifact
        
        except Exception as e:
            raise RatingException(e, sys)


IndentationError: unexpected indent (3969155388.py, line 4)

In [2]:
from restr_rating import utils

In [2]:
# Load the transformed train array written by the 02_18_2023__01_53_23 run.
train_arr = utils.load_numpy_array("/config/workspace/artifact/02_18_2023__01_53_23/data_transformation/transformed/train.npz")

In [3]:
# Load the transformed train array written by the later 02_18_2023__02_02_48 run;
# the cells below inspect this one.
train_arr_2 =utils.load_numpy_array("/config/workspace/artifact/02_18_2023__02_02_48/data_transformation/transformed/train.npz")

In [4]:
train_arr_2[0][0]

3.6

In [5]:
# Column 0 is taken as the target vector; all remaining columns are the features.
y_train = train_arr_2[:, 0]
x_train = train_arr_2[:, 1:]

In [6]:
y_train.shape

(32989,)

In [9]:
x_train.shape

(32989, 11)

In [7]:
y_train

array([3.6, 3.2, 3.8, ..., 4.3, 4. , 3.6])

In [8]:
x_train[0]

array([8.000e+02, 2.300e+01, 5.318e+03, 0.000e+00, 1.000e+00, 4.600e+01,
       0.000e+00, 1.190e+02, 0.000e+00, 2.000e+00, 1.700e+01])

In [10]:
import pandas as pd

In [12]:
# Read the cleaned dataset in 5000-row chunks, stitch the chunks back together
# and discard the two free-text columns.
base_chunks = pd.read_csv('/config/workspace/zomato_cleaned.csv', chunksize=5000)
base_df = pd.concat(base_chunks).drop(columns=['address', 'reviews_list'])

In [13]:
base_df

Unnamed: 0,name,online_order,book_table,rate,votes,location,rest_type,cuisines,cost,menu_item,type,city
0,Jalsa,Yes,Yes,4.1,775,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800,menu_not_availabel,Buffet,Banashankari
1,Spice Elephant,Yes,No,4.1,787,Banashankari,Casual Dining,"Chinese, North Indian, Thai",800,menu_not_availabel,Buffet,Banashankari
2,San Churro Cafe,Yes,No,3.8,918,Banashankari,"Cafe, Casual Dining","Cafe, Mexican, Italian",800,menu_not_availabel,Buffet,Banashankari
3,Addhuri Udupi Bhojana,No,No,3.7,88,Banashankari,Quick Bites,"South Indian, North Indian",300,menu_not_availabel,Buffet,Banashankari
4,Grand Village,No,No,3.8,166,Basavanagudi,Casual Dining,"North Indian, Rajasthani",600,menu_not_availabel,Buffet,Banashankari
...,...,...,...,...,...,...,...,...,...,...,...,...
41232,The Farm House Bar N Grill,No,No,3.7,34,Whitefield,"Casual Dining, Bar","North Indian, Continental",800,menu_not_availabel,Pubs and bars,Whitefield
41233,Bhagini,No,No,2.5,81,Whitefield,"Casual Dining, Bar","Andhra, South Indian, Chinese, North Indian",800,menu_not_availabel,Pubs and bars,Whitefield
41234,Best Brews - Four Points By Sheraton Bengaluru...,No,No,3.6,27,Whitefield,Bar,Continental,1500,menu_not_availabel,Pubs and bars,Whitefield
41235,Chime - Sheraton Grand Bengaluru Whitefield Ho...,No,Yes,4.3,236,"ITPL Main Road, Whitefield",Bar,Finger Food,2500,menu_not_availabel,Pubs and bars,Whitefield


: 