In [34]:
import pandas as pd 
import numpy as np 
import dill
from sklearn.metrics import r2_score

In [35]:
base_df = pd.read_csv('zomato_cleaned.csv')

In [36]:
from restr_rating import utils
from restr_rating.exception import RatingException
from restr_rating.logger import logging

In [39]:
model = utils.load_object("/config/workspace/artifact/02_20_2023__08_50_21/model_trainer/model/model.pkl")

In [3]:
transformer = utils.load_object("/config/workspace/artifact/02_19_2023__10_11_56/data_transformation/transformer/transformer.pkl")

In [7]:
data = utils.load_numpy_array("/config/workspace/artifact/02_19_2023__10_11_56/data_transformation/transformed/train.npz")

In [10]:
data[0]

array([3.600e+00, 8.000e+02, 2.300e+01, 5.318e+03, 0.000e+00, 1.000e+00,
       4.600e+01, 0.000e+00, 1.190e+02, 0.000e+00, 2.000e+00, 1.700e+01])

In [40]:
def encode_categorical_variables(base: pd.DataFrame, train_df: pd.DataFrame, test_df: pd.DataFrame) -> (pd.DataFrame, pd.DataFrame):
    # fit .factorize() method on base_df
        try:
            unique_values = {}
            encoded_base = pd.DataFrame()
            for column in base.columns:
                unique_values[column] = base[column].unique()

            # transform categorical variables in train_df and test_df using unique values
            for column in train_df.columns:
                if train_df[column].dtype == 'object':
                    train_df[column] = pd.Categorical(train_df[column], categories=unique_values[column]).codes
                if test_df[column].dtype == 'object':
                    test_df[column] = pd.Categorical(test_df[column], categories=unique_values[column]).codes
                else:
                    encoded_base[column] = base[column]

            # utils.save_encoding_to_dill(unique_values=unique_values, encoded_base=base, file_path=file_path)
            return train_df, test_df
        except Exception as e:
            raise RatingException(e, sys)

In [41]:
base_file = pd.concat(pd.read_csv('/config/workspace/zomato_cleaned.csv', chunksize = 5000)).drop(['address', 'reviews_list'], axis=1)
train_file = pd.concat(pd.read_csv("/config/workspace/artifact/02_19_2023__11_59_43/data_ingestion/dataset/train.csv", chunksize = 5000)).drop(['address', 'reviews_list'], axis=1)
test_file = pd.concat(pd.read_csv("/config/workspace/artifact/02_19_2023__11_59_43/data_ingestion/dataset/test.csv", chunksize = 5000)).drop(['address', 'reviews_list'], axis=1)

In [42]:
train_df, test_df = encode_categorical_variables(base=base_file, test_df=test_file, train_df=train_file)

In [7]:
train_df

Unnamed: 0,name,online_order,book_table,rate,votes,location,rest_type,cuisines,cost,menu_item,type,city
0,5318,0,1,3.6,23,46,0,119,800,0,2,17
1,2831,0,1,3.2,8,25,21,63,400,8145,4,29
2,2677,0,0,3.8,360,68,27,1937,1300,3883,2,15
3,995,1,1,3.1,7,11,2,475,250,0,2,13
4,4205,0,0,4.3,686,28,0,244,1000,0,4,11
...,...,...,...,...,...,...,...,...,...,...,...,...
32984,2861,0,1,4.0,121,55,2,1217,900,0,4,5
32985,2596,0,1,2.9,24,15,0,5,600,0,4,8
32986,2050,1,0,4.3,2741,32,33,746,1300,0,2,27
32987,583,0,0,4.0,199,8,2,130,700,248,2,1


In [18]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8248 entries, 0 to 8247
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          8248 non-null   int16  
 1   online_order  8248 non-null   int8   
 2   book_table    8248 non-null   int8   
 3   rate          8248 non-null   float64
 4   votes         8248 non-null   int64  
 5   location      8248 non-null   int8   
 6   rest_type     8248 non-null   int8   
 7   cuisines      8248 non-null   int16  
 8   cost          8248 non-null   int64  
 9   menu_item     8248 non-null   int16  
 10  type          8248 non-null   int8   
 11  city          8248 non-null   int8   
dtypes: float64(1), int16(3), int64(2), int8(6)
memory usage: 290.1 KB


In [24]:
# loading the model
model = utils.load_object("/config/workspace/artifact/02_20_2023__07_52_27/model_trainer/model/model.pkl")

In [21]:
def save_numpy_array_data(file_path:str, df:pd.DataFrame):
    """
    Save Pandas DataFrame data as a NumPy array to file
    file_path: str location of file to save
    df: pd.DataFrame data to save
    """
    try:
        dir_path = os.path.dirname(file_path)
        os.makedirs(dir_path, exist_ok=True)
        array = df.to_numpy()  # Convert DataFrame to NumPy array
        with open(file_path, "wb") as file_obj:
            np.save(file_obj, array)
    except Exception as e:
        raise RatingException(e, sys)



In [43]:
def load_numpy_array(file_path:str)->np.array:
    """
    load numpy array data from file 
    file_path: str location of file to load
    return np.array data loaded
    """
    try:
        with open(file_path,"rb") as file_obj:
            return np.load(file_obj, allow_pickle=True)
    except Exception as e:
        raise RatingException(e, sys)

In [13]:
test_data = load_numpy_array("/config/workspace/artifact/02_20_2023__08_50_21/data_transformation/transformed/test.npz")

In [44]:
train_data = load_numpy_array("/config/workspace/artifact/02_20_2023__08_50_21/data_transformation/transformed/train.npz")

In [20]:
test_data.shape

(8248, 12)

In [None]:

y_hat_train = model.predict(x_train)
r2_train_score = r2_score(y_true=y_train, y_pred=y_hat_train)
logging.info(f"r2 score for the train dattase:[{r2_train_score}]")

y_hat_test = model.predict(x_test)
r2_test_score = r2_score(y_true=y_test, y_pred=y_hat_test)
logging.info(f"r2 score for the test dattase:[{r2_test_score}]")


In [45]:
x_test, y_test = test_data[:,1:], test_data[:,0]
x_train , y_train = train_data[:,1:], train_data[:,0]

In [21]:
x_test.shape

(8248, 11)

In [17]:
np.unique(y_test)

array([2. , 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3. , 3.1, 3.2,
       3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4. , 4.1, 4.2, 4.3, 4.4, 4.5,
       4.6, 4.7, 4.8, 4.9])

In [46]:
y_hat_test = model.predict(x_test)
r2_score(y_true=y_test, y_pred=y_hat_test)

0.923191398984183

In [48]:
y_hat_train = model.predict(x_train)
r2_score(y_true=y_train, y_pred=y_hat_train)

1.0

: 

In [None]:
train_df

In [29]:
x_train

NameError: name 'x_train' is not defined

In [28]:
x_train, y_train = train_df[:,1:], train_df[:,0]

InvalidIndexError: (slice(None, None, None), slice(1, None, None))