# **1. Importing Libraries** #

In [181]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import LabelEncoder

# **2. Data Preparation Pipeline** #

1. **Selecting the following features** as the initial part of preparation:<br> 
        1. online_ordering?.<br> 
        2. table_bookings?.<br> 
        3. location.<br> 
        4. rest_type.<br> 
        5. cuisines.<br> 
        6. approx_cost(for two people).<br> 
        7. type.<br> 
        8. listed_in(city).<br>
        Then rename them to match the column names used when fitting the model.
        
2. Preparing **Cost** Column. <br> 
3. Creating  **Count of Cuisines** and **Count of Services** columns. <br> 
4. Generalizing the **location**, **rest_type**, and **listed_in(city)** columns by keeping the top 10 most frequent values and replacing the rest with **'Other'**. <br> 
5. Create columns for each cuisine and Set the value to **1** if the cuisine is in the **'cuisines' list** for each restaurant. <br>
6. drop **'rest_type','listed_in(city)','Count_Cuisines_list','Count_service_list','location'** columns. <br>
7. Encoding data for categorical features.
8. Filling NaN values with the median value.
9. Scaling the dataset. 

In [182]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder,StandardScaler

In [183]:
# Load the raw Zomato dataset from 'zomato.csv' into a pandas DataFrame
zomato_df = pd.read_csv('zomato.csv')
# Preview the first 2 rows of the dataset
zomato_df.head(2)

Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari


In [184]:
# Keep only the restaurants whose 'rate' is the placeholder 'NEW'; these are the rows scored later.
# .copy() makes the result an independent frame, so adding columns to it later does not
# trigger pandas' SettingWithCopyWarning (boolean indexing alone returns a slice of the original).
zomato_df = zomato_df[zomato_df['rate'] == 'NEW'].copy()

In [185]:
def split_on_commas(value):
    """
    Split a comma-separated string into its parts.

    Parameters:
    value: The value to split. Anything that is not a ``str`` (e.g. NaN or None)
        is treated as "no entries".

    Returns:
    list: The pieces produced by ``str.split(',')``, or an empty list for non-string input.
        Pieces are not stripped, so text after a comma keeps its leading space.
    """
    # Guard clause: missing / non-text values contribute no items
    if not isinstance(value, str):
        return []
    return value.split(',')

In [186]:
# Load the saved "top values" collection for each generalised feature from '<feature>_top_values.pkl'
# and store it under the key '<feature>_top_list'.
# NOTE(review): pickle.load can execute arbitrary code - only load .pkl files from a trusted source.
Top_features_lists = {}
for feature in ('location', 'city', 'rest_type'):
    with open(f'{feature}_top_values.pkl', 'rb') as file:
        Top_features_lists[f'{feature}_top_list'] = pickle.load(file)
# Top_features_lists now holds three entries: 'location_top_list', 'city_top_list' and 'rest_type_top_list'.

In [187]:
class DataPreparation:
    """
    Turn raw Zomato rows into the numeric feature matrix stored in ``df_encoded``.

    Usage: instantiate with the raw DataFrame, call ``create_features()``, then read
    ``df_encoded``. ``create_features`` relies on the notebook-level ``split_on_commas``
    helper and ``Top_features_lists`` dictionary, and reads ``top_cuisines.csv``.
    """

    # Fixed codes for the Yes/No columns. A LabelEncoder fitted on the data being scored
    # assigns codes from whatever values happen to be present (a column containing only
    # 'Yes' would be encoded as 0); an explicit mapping is independent of the data.
    _YES_NO_CODES = {'No': 0, 'Yes': 1}

    def __init__(self, df):
        """Store the DataFrame to prepare; it is not modified until ``create_features`` runs."""
        self.df = df

    def clean_cost(self):
        """Convert the ``cost_per_two`` column of ``self.df`` to floats, in place."""
        def clean_cost_value(value):
            '''
            Convert a cost value that may be a string with thousands separators
            (e.g. '10,000') into a float.

            Parameters:
            value (str or numeric): The value to be cleaned, which may contain commas.

            Returns:
            float: The numeric value. A missing value (NaN) comes back as NaN.

            Raises:
            ValueError: If the text left after removing commas is not a valid number.
            '''
            # str() lets strings, ints and floats share one path; float('nan') is NaN,
            # so missing values need no separate branch.
            return float(str(value).replace(',', ''))

        self.df['cost_per_two'] = self.df['cost_per_two'].apply(clean_cost_value)

    def count_list(self, input_list):
        """
        Count the number of elements in a list.

        Parameters:
        input_list (list): The input list to count elements from.

        Returns:
        int: The count of elements in the list.
        """
        return len(input_list)

    @staticmethod
    def _has_cuisine(cuisines, cuisine):
        """Return 1 if ``cuisine`` occurs in the ``cuisines`` text, else 0 (non-strings such as NaN give 0)."""
        return 1 if isinstance(cuisines, str) and cuisine in cuisines else 0

    @staticmethod
    def _encode_yes_no(column):
        """Map a 'Yes'/'No' Series to 1/0; any other value becomes NaN."""
        return column.map(DataPreparation._YES_NO_CODES)

    def create_features(self):
        """
        Run the full preparation pipeline.

        Replaces ``self.df`` with the selected/renamed/engineered frame and stores the
        encoded, imputed and scaled result in ``self.df_encoded``.
        """
        # Step 1: Select features (drop everything not used) and rename them
        self.df = (
            self.df
            .drop(columns=['url', 'address', 'phone', 'dish_liked', 'reviews_list',
                           'menu_item', 'name', 'rate', 'votes'])
            .rename(columns={"online_order": "online_ordering?", "book_table": "table_bookings?",
                             'listed_in(type)': 'type', 'approx_cost(for two people)': 'cost_per_two',
                             'listed_in(city)': 'city'})
        )

        # Step 2: Clean Cost Column
        self.clean_cost()

        # Table of cuisines read from disk; its 'Cuisine' column is used in step 5
        top_cuisines = pd.read_csv('top_cuisines.csv', index_col='Unnamed: 0')

        # Step 3: Create Count of Cuisines and Count of Services Columns
        self.df['Count_Cuisines_list'] = self.df['cuisines'].apply(split_on_commas)
        self.df['cuisines_count'] = self.df['Count_Cuisines_list'].apply(self.count_list)

        self.df['Count_service_list'] = self.df['rest_type'].apply(split_on_commas)
        self.df['service_type_count'] = self.df['Count_service_list'].apply(self.count_list)

        # Step 4: Generalize Location, Rest_type, city Columns
        for column_name in ('city', 'location', 'rest_type'):
            self.df = self.Top_n_either_value(self.df, column_name,
                                              Top_features_lists[column_name + '_top_list'])

        # Step 5: Create one 0/1 column per cuisine, using the first 10 entries of the table
        for cuisine in list(top_cuisines['Cuisine'][:10]):
            self.df[cuisine] = self.df['cuisines'].apply(
                lambda cuisines, cuisine=cuisine: self._has_cuisine(cuisines, cuisine))

        # Step 6: Drop the source columns that have been replaced by engineered ones
        self.df = self.df.drop(columns=['cuisines', 'rest_type', 'city', 'Count_Cuisines_list',
                                        'Count_service_list', 'location'])

        # Step 7: Encoding Data for Categorical Features
        # dtype=int yields 0/1 indicator columns directly, whatever the pandas default is.
        columns_to_encode = ['type', 'city_new', 'rest_type_new', 'location_new']
        self.df_encoded = pd.get_dummies(self.df, columns=columns_to_encode, dtype=int)

        for column in ['online_ordering?', 'table_bookings?']:
            self.df_encoded[column] = self._encode_yes_no(self.df_encoded[column])

        # Step 8: Fill NA Values with Median
        # NOTE(review): the medians come from the data being scored, not the training data - confirm.
        self.df_encoded = self.df_encoded.fillna(self.df_encoded.median(numeric_only=True))

        # Step 9: Normalize the numerical features using StandardScaler
        # NOTE(review): the scaler is fitted on the data being scored; if the model was trained
        # with a different scaler fit, load and reuse that fitted scaler here instead.
        numerical_columns = ['cost_per_two', 'cuisines_count', 'service_type_count']
        scaler = StandardScaler()
        self.df_encoded[numerical_columns] = scaler.fit_transform(self.df_encoded[numerical_columns])

    @staticmethod
    def Top_n_either_value(df, column_name, top_values_list, new_value='Other'):
        """
        Generalize a DataFrame column by keeping the values found in ``top_values_list``
        and replacing every other value with ``new_value``.

        Parameters:
            df (DataFrame): The DataFrame containing the column to be generalized.
            column_name (str): The name of the column to be generalized.
            top_values_list (list): A list of the top values to keep.
            new_value (str): The value to replace all other values with (default is 'Other').

        Returns:
            DataFrame: The same ``df`` object, with a new ``<column_name>_new`` column added;
            the original column is left unchanged.
        """
        df[column_name + '_new'] = df[column_name].apply(
            lambda x: x if x in top_values_list else new_value)
        return df

In [188]:
# Usage:
# Wrap the DataFrame of 'NEW'-rated restaurants in the preparation class
data_prep = DataPreparation(zomato_df)

# Run the full feature pipeline; this populates data_prep.df_encoded
data_prep.create_features()

# Keep a handle on the encoded feature matrix for the prediction step
prepared_df = data_prep.df_encoded

In [189]:
# Preview the first rows of the prepared feature matrix
prepared_df.head()

Unnamed: 0,online_ordering?,table_bookings?,cost_per_two,cuisines_count,service_type_count,North Indian,Chinese,South Indian,Fast Food,Biryani,...,location_new_Bannerghatta Road,location_new_Bellandur,location_new_HSR,location_new_Indiranagar,location_new_JP Nagar,location_new_Jayanagar,location_new_Koramangala 5th Block,location_new_Marathahalli,location_new_Other,location_new_Whitefield
72,0,0,-0.947178,-1.111121,-0.421461,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
75,0,0,0.219508,-0.312243,2.339922,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
110,0,0,-0.780508,0.486636,2.339922,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
130,0,0,-0.780508,0.486636,2.339922,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
131,0,0,-1.113847,-1.111121,-0.421461,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [190]:
# List the feature columns produced by the preparation step
prepared_df.columns

Index(['online_ordering?', 'table_bookings?', 'cost_per_two', 'cuisines_count',
       'service_type_count', 'North Indian', 'Chinese', 'South Indian',
       'Fast Food', 'Biryani', 'Continental', 'Desserts', 'Cafe', 'Beverages',
       'Italian', 'type_Buffet', 'type_Cafes', 'type_Delivery',
       'type_Desserts', 'type_Dine-out', 'type_Drinks & nightlife',
       'type_Pubs and bars', 'city_new_BTM', 'city_new_HSR',
       'city_new_Indiranagar', 'city_new_JP Nagar', 'city_new_Jayanagar',
       'city_new_Koramangala 4th Block', 'city_new_Koramangala 5th Block',
       'city_new_Koramangala 6th Block', 'city_new_Koramangala 7th Block',
       'city_new_Marathahalli', 'city_new_Other', 'rest_type_new_Bakery',
       'rest_type_new_Beverage Shop', 'rest_type_new_Cafe',
       'rest_type_new_Casual Dining', 'rest_type_new_Casual Dining, Bar',
       'rest_type_new_Delivery', 'rest_type_new_Dessert Parlor',
       'rest_type_new_Food Court', 'rest_type_new_Other',
       'rest_type_new

# **2. Prediction** #

## **2.1. Importing Model** ##

In [191]:
# Load the saved classifier from 'AdaBoost_no_votes.joblib'
# (presumably an AdaBoost model trained without the 'votes' feature - confirm against the training notebook).
# NOTE(review): joblib.load unpickles the file, so only load model files from a trusted source.
model = joblib.load('AdaBoost_no_votes.joblib')

# One-hot encoding only creates columns for categories present in the rows being scored, so the
# prepared frame can lack (or reorder) columns the model was fitted on. When the model records its
# training column names, align to them: missing indicator columns are filled with 0.
expected_columns = getattr(model, 'feature_names_in_', None)
if expected_columns is None:
    model_input = prepared_df
else:
    model_input = prepared_df.reindex(columns=expected_columns, fill_value=0)

New_restaurants_predictions = model.predict(model_input)

## **2.2. Predicting** ##

In [192]:
# Add the predictions as a new 'Success Prediction' column.
# assign() returns a new DataFrame rather than writing into zomato_df, which avoids pandas'
# SettingWithCopyWarning when zomato_df is a filtered slice of the original data. It also
# raises if the number of predictions does not match the number of rows.
zomato_df = zomato_df.assign(**{'Success Prediction': New_restaurants_predictions})

# Saving the updated DataFrame to a CSV file named 'NEW Restaurants_Success_predicted.csv'
zomato_df.to_csv('NEW Restaurants_Success_predicted.csv')

In [193]:
# Count how many restaurants fall into each predicted class
zomato_df['Success Prediction'].value_counts()

Success Prediction
1    2162
0      46
Name: count, dtype: int64