In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
from datetime import datetime, timedelta
import warnings
import seaborn as sns
warnings.simplefilter('ignore')

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
pd.set_option('display.max_columns', 200) # show more columns
RANDOM_SEED = 27 # makes the test repeatable
current_date = pd.to_datetime('2021/03/15') # current date

# 1. Loading the dataset

The datasets contain information about restaurants. This information needs to be used to build a regression model capable of predicting the rating of a restaurant. 
The first data frame, df_train ('main_task.csv'), contains information about 40000 restaurants and will be used for training and validating the model.
The second data frame, df_test ('kaggle_task.csv'), has information about 10000 restaurants whose ratings need to be predicted.
Following information can be found in the data frames:
1. Restaurant_id - restaurant identification number;
2. City - city in which restaurant is located;
3. Cuisine Style - tags for food that is served in the restaurant;
4. Ranking - rank of a restaurant compared to the other restaurants in the city;
5. Rating - restaurant rating according to TripAdvisor; the dependent variable and the value that needs to be predicted (1 to 5);
6. Price Range - range of prices in a restaurant(category);
7. Number of Reviews — Number of Reviews;
8. Reviews - two reviews displayed on the website and their respective dates;
9. URL_TA — URL on TripAdvisor;
10. ID_TA — identifier of the restaurant in TripAdvisor's database.

In [2]:
# Directory that holds the competition CSV files.
DATA_DIR = '/kaggle/input/sf-dst-restaurant-rating/'
# Build every path from DATA_DIR so loading does not depend on the kernel's
# current working directory.
df_train = pd.read_csv(os.path.join(DATA_DIR, 'main_task.csv'))  # dataset for training
df_test = pd.read_csv(os.path.join(DATA_DIR, 'kaggle_task.csv'))  # dataset whose ratings are predicted
sample_submission = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))  # submission dataset

FileNotFoundError: [Errno 2] File /kaggle/input/sf-dst-restaurant-rating//main_task.csv does not exist: '/kaggle/input/sf-dst-restaurant-rating//main_task.csv'

In [None]:
# For the correct processing of features, combine the train and test sets into one dataset

df_train['sample'] = 1  # train
df_test['sample'] = 0  # test
# Rating is the value to predict, so in the test set it is just filled with 0
df_test['Rating'] = 0

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# pd.concat keeps the same row order (test rows first) and column order.
df = pd.concat([df_test, df_train], sort=False).reset_index(
    drop=True)  # combine sets

# 1.1 Basic look at the dataset


In [None]:
df.head()

In [None]:
# renaming the columns so they are easier to read
df.columns = ['id','city', 'cuisine', 'ranking',
              'prices', 'number_of_reviews', 'reviews', 'url',
              'website_id', 'sample', 'rating']

In [None]:
# at the moment url is not used, so it is dropped at the start
df.drop(['url'], inplace=True, axis=1)

In [None]:
# general look at the number of unique values in df
df.nunique(dropna=False)

In [None]:
df.info()

It appears that the column id has repeated values, which might be duplicates.
The columns cuisine, prices and number_of_reviews have 9283, 13886 and 2543 NaN values respectively.
Most columns have the dtype object, with only ranking, rating and number_of_reviews being float64.

# 1.1 Restaurant ID

In [None]:
df.id = df.id.apply(lambda x: x[3:]) # getting rid of the id_ part

In [None]:
df.loc[df.id == df.id.value_counts().index[0]] # displaying the restaurants that share id

It appears that the rows are not duplicated, so the restaurants sharing an id are just chain restaurants.

In [None]:
# Flag restaurants whose id occurs more than once, in case it is useful later
id_counts = df.id.value_counts()
chain_restaurant = id_counts[id_counts > 1].index.tolist()
# 1.0 for rows whose id is shared with another row, 0.0 otherwise
df['chain_restaurant'] = df.id.isin(chain_restaurant).astype(float)

# 1.2 Cities

In [None]:
# check if there are any repetitions/alternative notations for city names
df.city.value_counts()

In [None]:
plt.figure(figsize = (15, 5))
sns.countplot(df['city'], order=df['city'].value_counts().index)
plt.title('Cities Distribution\n', fontsize=15)
plt.xticks(rotation=90)
plt.ylabel('Frequency')
plt.xlabel('City')

cities = df.city.unique() # creating list with all the cities

# 1.3 Cuisine

In [None]:
df.cuisine[0], type(df.cuisine[0])

It appears that the data in this column is a string, arranged to look like a list.

In [None]:
def separation(data):
    '''Split a list-looking string into its individual elements.

       Square brackets and single quotes are removed, the text is
       lower-cased and split on commas, and every element is stripped of
       surrounding whitespace.

       INPUT: a string such as "['European', 'French']"
       OUTPUT: a list of strings such as ['european', 'french']'''
    cleaned = data
    for unwanted in ('[', ']', '\''):
        cleaned = cleaned.replace(unwanted, '')
    return [piece.strip() for piece in cleaned.lower().split(',')]

In [None]:
# Flag the rows whose cuisine is missing (1.0) before the gaps are filled in;
# every other row gets 0.0
df['filled_cuisine'] = df.cuisine.isna().astype(float)

In [None]:
# Fill in the empty lines and separates the strings, making lists of tags of cuisines
df.cuisine = df.cuisine.fillna('other_cuisine').apply(separation)
cuisine_exploded = df.explode('cuisine')
pd.DataFrame(cuisine_exploded.cuisine.value_counts()) # counts down how often each cousine is seen

In [None]:
# creating a dictionary with most common cuisine in each city
popular_cuisine_dict = {}
for city in cities:
    most_popular_cuisine = cuisine_exploded[
        cuisine_exploded['city'] == city
    ].cuisine.value_counts().index[0]
    popular_cuisine_dict[city] = most_popular_cuisine
popular_cuisine_dict

In [None]:
# maps dictionary with common cuisine on the dataframe
df['most_popular_cuisine'] = df.city.map(popular_cuisine_dict)

In [None]:
# Counts how many cuisine styles a restaurant offers.
# Restaurants whose tags were originally missing hold a single placeholder
# tag, so their count is reset to 0.
df['cuisine_count'] = df.cuisine.apply(len)
# Assign through one df.loc call: the chained form
# df['cuisine_count'].loc[mask] = 0 writes to an intermediate Series, which
# triggers SettingWithCopyWarning and is not guaranteed to update df.
df.loc[df.filled_cuisine == 1, 'cuisine_count'] = 0

# 1.4 Ranking

In [None]:
plt.figure(figsize = (15, 5))
sns.boxplot(df.ranking.values)
plt.title('Ranking boxplot\n', fontsize = 15) # boxplot to see the distribution of values of ranking
plt.xlabel('Ranking')

In [None]:
plt.figure(figsize=(15, 5))
sns.distplot(df.ranking.values, bins=25) # How frequent the ranks are
plt.title('Ranking distribution\n', fontsize = 15)
plt.xlabel('Ranking')
plt.ylabel('Frequency')

In [None]:
df['ranking'].describe()

From the graph, it is seen that distribution 

In [None]:
plt.figure(figsize=(15, 5))
for city in (df['city'].value_counts())[0:10].index:
    sns.distplot(df['ranking'][df['city'] == city], kde=False, label=city)

    
plt.legend(prop={'size': 10})
plt.title('Ranking Distribution among cities\n', fontsize=15)
plt.xlabel('Ranking')
plt.ylabel('Quantity (frequency)')

From the graph above it is clear that all the cities use the same distribution, so ranking can be normalised

In [None]:
df['ranking_norm'] = df['ranking']/df.city.map(dict(df['city'].value_counts()))

In [None]:
# quick look at the distribution of normalised ranking
plt.figure(figsize=(15, 5))
sns.distplot(df.ranking_norm.values, bins=25)
plt.title('Ranking Distribution\n', fontsize=15)
plt.xlabel('Ranking')
plt.ylabel('Quantity (frequency)')

In [None]:
df['ranking_norm'].describe()

In [None]:
sns.boxplot(df.ranking_norm.values)

# 1.5 Reviews and number of reviews

In [None]:
plt.figure(figsize = (15, 5))
sns.boxplot(df.number_of_reviews.values)
df.number_of_reviews.describe()

In [None]:
df.reviews[0], type(df.reviews[0])

As with cuisine, the reviews column appears to be a string mimicking a list. 
The dates of the reviews are also attached.

In [None]:
# Search template for a reviews cell shaped like a nested list: "[[...], [...]]".
# Written as a raw string so the backslashes reach the regex engine unchanged;
# in a normal string literal '\[' is an invalid escape sequence.
lrx = re.compile(r'\[\[.*\]\]')
# Lets a bare `nan` name resolve to None when a reviews string is eval()'d, and
# serves as the fallback value for a missing date.
# NOTE(review): presumably some reviews strings contain bare `nan` tokens — confirm.
nan = None

In [None]:
def date_extraction(row):
    '''Extract the two review dates from a row's reviews cell.

    Meant to be used as df.apply(date_extraction, axis=1).

    INPUT: one row of the dataset; row['reviews'] is parsed only when it is a
    string that matches the `lrx` template, i.e. looks like
    [[texts...], [dates...]].
    OUTPUT: the same row with two added fields:
        first_date - the second entry of the dates list, NaT when absent
        last_date  - the first entry of the dates list, NaT when absent
    A date that does not match MM/DD/YYYY becomes NaT instead of raising.'''

    date_format = '%m/%d/%Y'

    def to_date(value):
        # One strict parser for both dates: previously first_date was parsed
        # without format/errors, so a single malformed value aborted the
        # whole df.apply while last_date was silently coerced.
        parsed = pd.to_datetime(value, format=date_format, errors='coerce')
        return pd.NaT if parsed is None else parsed

    cell = row['reviews']
    aux_list = [[], []]  # fallback for missing/unparsable cells: no texts, no dates
    if type(cell) == str and lrx.fullmatch(cell):  # compare with the search template
        # SECURITY NOTE(review): eval() executes whatever expression the cell
        # contains. Acceptable only while the CSV is trusted; for untrusted
        # data switch to a literal-only parser.
        aux_list = eval(cell)  # transform into a list

    dates = aux_list[1]
    row['first_date'] = to_date(dates[1]) if len(dates) > 1 else pd.NaT
    row['last_date'] = to_date(dates[0]) if len(dates) > 0 else pd.NaT

    return row


df = df.apply(date_extraction, axis=1)

In [None]:
# find a diffderence between date of the first review and the last one
# add this information into a new column

df['time_difference'] = df['last_date'] - df['first_date']

# call the function and get difference in days
df['time_difference'] = df['time_difference'].apply(
    lambda x: abs(x.days)
)

In [None]:
display(df['time_difference'].describe())
sns.boxplot(df['time_difference'].values)

It seems that most reviews have been left over the span of the last 4 months or so.

In [None]:
df['days_since_last_review'] = current_date - df['last_date']
df['days_since_last_review'] = df['days_since_last_review'].apply(
    lambda x: x.days
)

In [None]:
display(df['days_since_last_review'].describe())
sns.boxplot(df['days_since_last_review'].values)

In [None]:
# marking the lines that had their values filled in
df['last_date_filled'] = df[df.last_date.isna()].last_date.apply(lambda x: 1)
df['last_date_filled'] = df['last_date_filled'].fillna(0)
df['first_date_filled'] = df[df.first_date.isna()].first_date.apply(lambda x: 1)
df['first_date_filled'] = df['first_date_filled'].fillna(0)
 
df['last_date'] = df['last_date'].fillna(0)
df['first_date'] = df['first_date'].fillna(0)

In [None]:
df['number_of_reviews'] = df.apply(
    lambda row: 1 if np.isnan(row['number_of_reviews']) and
    (row['last_date'] == 0 or row['first_date'] == 0)
    else row['number_of_reviews'], axis=1
)

In [None]:
df['number_of_reviews'].isna().value_counts()

In [None]:
df['days_since_last_review'] = df['days_since_last_review'].fillna(0)
df['time_difference'] = df['time_difference'].fillna(0)

# 1.6 Price Range

In [None]:
df['prices'].value_counts(dropna=False)

In [None]:
# price range is hard to read using the original notation, so
# it is converted into categorical data
def price_range_sort(price):
    '''Translate the dollar-sign price notation into a readable label.

    INPUT: a price range value ('$', '$$ - $$$' or '$$$$')
    OUTPUT: 'budget', 'medium' or 'expensive' respectively; any other value
    is returned unchanged'''
    labels = (
        ('$$ - $$$', 'medium'),
        ('$', 'budget'),
        ('$$$$', 'expensive'),
    )
    for notation, label in labels:
        if price == notation:
            return label
    return price


# Flag the rows whose price range is missing (1.0) before it is filled in;
# every other row gets 0.0
df['filled_budget'] = df.prices.isna().astype(float)
# Replace the gaps with a placeholder label, then relabel the '$' notation
prices_filled = df.prices.fillna('unspecified_budget')
df.prices = prices_filled.apply(price_range_sort)

In [None]:
plt.figure(figsize=(15, 10))
sns.boxplot(x = 'prices', y = 'rating',
           data = df)

In [None]:
plt.figure(figsize=(15, 10))
sns.boxplot(x = 'prices', y = 'number_of_reviews',
           data = df)

# 1.7 Website ID

In [None]:
# Only a few restaurants share a website id; flag them with 1.0 (others 0.0)
web_id_counts = df.website_id.value_counts()
shared_web = web_id_counts[web_id_counts > 1].index.to_list()
df['shared_web'] = df.website_id.isin(shared_web).astype(float)

# 1.8 Dependent Variable - Rating

In [None]:
df.loc[df['sample'] == 1].rating.describe()

There are no empty values for the rating, create a graph to see the distribution:

In [None]:
plt.figure(figsize=(15, 5))
sns.distplot(df.loc[df['sample'] == 1].rating.values, bins=25)
plt.title('Rating Distribution\n', fontsize=15)
plt.xlabel('Rating')
plt.ylabel('Quantity (frequency)')

In [None]:
sns.boxplot(df.loc[df['sample'] == 1].rating.values)

The values of rating are mostly in range from 3.5 to 4.5, with anything below 2 being an outlier. It seems that most restaurants have a rating above 3, which should be an average.

It appears that the data in this column is not continuous. Let's check it by using unique():

In [None]:
df.rating.unique()

Data is not continuous, and rating takes the values which are multiples of 0.5. This can be used to increase the accuracy of the model.

# 2 New Data
To increase the accuracy of the model, more data is added.

# 2.1 Population density

In [None]:
# [population in millions of people, area in km squared], one pair per city,
# listed in the same order as `cities`.
data = [[2.2, 105], [1, 188], [8.9, 1706],
         [3.6, 891], [1.5, 310], [0.3, 41.4], 
         [1.4, 181], [0.4, 368], [1.9, 414], 
         [2.9, 1287], [1.7, 101], [3.3, 607], 
         [1.2, 117], [0.2, 32.6], [0.4, 88], 
         [1.8, 517], [1.8, 525], [0.8, 86.4], 
         [0.9, 219], [0.5, 48], [1.9, 755], 
         [0.5,100], [1.3, 496], [0.7, 454], 
         [0.7,213], [0.5, 120], [0.2, 16], 
         [0.3, 163], [0.7, 39], [0.125, 51.5], 
         [0.8, 327]]
# NOTE(review): the pairing is positional, so it is only right while
# df.city.unique() yields the cities in the order this list assumes. That
# order depends on how the train and test sets were concatenated. Check the
# dictionary printed below against real figures, or key the data by city name.
assert len(data) == len(cities), 'expected one [population, area] pair per city'
city_dict = dict(zip(cities, data))
city_dict

In [None]:
# adds the data to the data frame
df['population_and_area'] = df.city.map(city_dict)

Creating a new variable for population density. It appears that keeping area and population has a positive impact on MAE, so they are kept.

In [None]:
# using the data from the list, calculates the populaion density
df['population'] = df['population_and_area'].apply(lambda x: x[0])
df['area'] = df['population_and_area'].apply(lambda x: x[1])
df['population_density'] = df['population_and_area'].apply(lambda x: x[0]/x[1])
df.drop(['population_and_area'], inplace=True, axis=1)

# 2.2 Capitals
It is possible that restaurants in a capital have their ratings inflated, or that they are better due to
the larger number of tourists. To check that, a new column will be created.

In [None]:
# Flags paired with `cities` by position; presumably 1 marks a capital city
# and 0 any other city — confirm.
# NOTE(review): the next cell rebuilds capital_dict from a different list and
# reassigns df['in_capital'], so the values set here are overwritten. Confirm
# which list is the intended one and remove the other.
capital = [1, 1, 1, 0, 1, 0, 1, 
           1, 1, 1, 1, 1, 0, 0, 
           0, 1, 1, 1, 1, 1, 1, 
           1, 0, 0, 0, 0, 1, 
           1, 1, 1, 1]

capital_dict = {}
for i in range(len(cities)):
    capital_dict[cities[i]] = capital[i] 
# Bare expression in the middle of a cell: evaluated but not displayed.
capital_dict
df['in_capital'] = df.city.map(capital_dict)

In [None]:
# Flags paired with `cities` by position; presumably 1 marks a capital city
# and 0 any other city — confirm.
# NOTE(review): an earlier cell assigns df['in_capital'] from a different
# list; the assignment below is the one that takes effect.
capital = [1, 1, 1, 1, 0, 0,
          0, 1, 1, 1, 0, 1,
          1, 1, 1, 1, 1, 1,
          1, 0, 0, 1, 1, 1,
          1, 0, 0, 1, 1, 1, 0]

# Fail loudly if the list and the cities fall out of step; zip() alone would
# silently truncate to the shorter of the two.
assert len(capital) == len(cities), 'expected one capital flag per city'
capital_dict = dict(zip(cities, capital))
df['in_capital'] = df.city.map(capital_dict)

# 3 Preparing the dataset for the model

# 3.1 Correlations
Checking the correlations between the columns, to see if any of them need to be dropped.

In [None]:
# Correlate the numeric (and boolean) columns only. pandas >= 2.0 no longer
# drops text columns silently, so df.corr() on the full frame raises there.
correlations = df.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(10, 10))
sns.set(font_scale=1)
sns.heatmap(correlations)

A few values have very high or very low correlation. For this model, I have decided to treat the absolute value of the correlation of 0.7 or higher as high. Let's display the rows which have correlations that might be outside the boundary:

In [None]:
correlations['ranking'].sort_values()

In [None]:
correlations['cuisine_count'].sort_values()

In [None]:
correlations['time_difference'].sort_values()

In [None]:
correlations['population_density'].sort_values()

In [None]:
correlations['rating'].sort_values()

It appears that certain parameters have a fairly high correlation between them, but none of them were over the threshold of 0.7.

# 3.2 Creating dummies
Creating dummy variables so the model can use them for prediction:

In [None]:
df.head(2)

In [None]:
city_dummies = pd.get_dummies(df.city)
# One row per (restaurant, tag) is one-hot encoded, then summed back to one
# row per restaurant. explode() replaces the much slower
# df.cuisine.apply(pd.Series).stack(), and groupby(level=0).sum() replaces
# sum(level=0), which was removed in pandas 2.0.
cuisine_dummies = pd.get_dummies(df.cuisine.explode()).groupby(level=0).sum()
prices_dummies = pd.get_dummies(df.prices)

In [None]:
# A cell displays only its last bare expression, so the three shapes are
# returned together as one tuple: (city, cuisine, prices).
np.shape(city_dummies), np.shape(cuisine_dummies), np.shape(prices_dummies)

# 3.3 Removing columns from the model dataframe
Certain columns cannot be used by the model, so they are added to the list 'to_drop' and then dropped.
Columns that had a negative impact on MAE are added to the 'negative_impact' list and then dropped.

In [None]:
df_model = df.loc[df['sample'] == 1]

In [None]:
df_model.info()

Dropping columns that the model can not use for prediction. Columns 'filled_cuisine', 'filled_budget', 'chain_restaurant', 'shared_web', 'last_date_filled', 'first_date_filled' appear to have a negative impact on MAE, therefore are dropped.

In [None]:
to_drop = ['id', 'city', 'cuisine', 'prices',
           'reviews', 'website_id', 'most_popular_cuisine',
          'first_date', 'last_date', 'sample']
negative_impact = [ 'filled_cuisine', 'filled_budget',
                   'chain_restaurant', 'shared_web',
                   'last_date_filled', 'first_date_filled']

The dropping lines are separated so the model can be tested with the columns that have negative impact on the model.

In [None]:
df_model = df_model.drop(to_drop, axis=1)
df_model = df_model.drop(negative_impact, axis=1)

In [None]:
np.shape(df_model)

In [None]:
# merging dummies with the dataframe made for the model training
df_model = df_model.merge(city_dummies,
                          left_index=True, right_index=True)
df_model = df_model.merge(cuisine_dummies,
                          left_index=True, right_index=True)
df_model = df_model.merge(prices_dummies,
                          left_index=True, right_index=True)

In [None]:
np.shape(df_model)

In [None]:
df_model.head()

# 4. Model
X - independent variables, the data set with the information about the restaurants;
y - dependent variable, the rating of a restaurant, which the model is trying to predict

In [None]:
X = df_model.drop(['rating'], axis = 1)
y = df_model['rating']

In [None]:
# X and y train are used for training the model
# X and y test are used for validation of the model
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.25,
                                                    random_state = RANDOM_SEED)

In [None]:
# Importing:
from sklearn.ensemble import RandomForestRegressor # for creating and training a model
from sklearn import metrics # for assessing model accuracy

In [None]:
# Creating the model. The forest is seeded with RANDOM_SEED so that training,
# and therefore the reported MAE, is repeatable between runs.
regr = RandomForestRegressor(n_estimators=100, random_state=RANDOM_SEED)

# Training the model
regr.fit(X_train, y_train)

# Validating the model by predicting rating for X_test
# Saving the values into y_pred
y_pred = regr.predict(X_test)

In 1.8 it is shown that rating is not continuous, so to increase the accuracy the predicted values are rounded to multiples of 0.5. This appears to have a very positive impact on MAE, reducing it by 0.02 or so:

In [None]:
def round_rating_pred(rating_pred):
    '''Snap a predicted rating onto the rating scale, which moves in steps of 0.5.

    INPUT: a value of the rating
    OUTPUT: for predictions above 1.25, the nearest multiple of 0.5, capped
    at 5.0, with a value exactly halfway between two steps going to the
    lower one. Predictions in (0.5, 1.25] give 1.0 and anything up to 0.5
    gives 0.0.'''
    # (upper bound, rounded value) pairs in ascending order. The bound between
    # 1.0 and 1.5 is 1.25, in line with every other step; it used to be 1.5,
    # which sent predictions in (1.25, 1.5] to 1.0 instead of 1.5.
    steps = (
        (0.5, 0.0),
        (1.25, 1.0),
        (1.75, 1.5),
        (2.25, 2.0),
        (2.75, 2.5),
        (3.25, 3.0),
        (3.75, 3.5),
        (4.25, 4.0),
        (4.75, 4.5),
    )
    for upper_bound, rounded in steps:
        if rating_pred <= upper_bound:
            return rounded
    return 5.0

for i in range(len(y_pred)):
    y_pred[i] = round_rating_pred(y_pred[i])

In [None]:
# Compares the y_pred and y_test, showing the mean absolute error
# between the two values
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))

In [None]:
# Shows which values have the most impact on the prediction
plt.rcParams['figure.figsize'] = (10,10)
feat_importances = pd.Series(regr.feature_importances_, index=X.columns)
feat_importances.nlargest(15).plot(kind='barh')

# 5. Applying the model and submitting the result
Repeating the process done with the training DF and submitting the result.


In [None]:
sample_submission

In [None]:
final_df = df.loc[df['sample'] == 0]

In [None]:
final_df = final_df.drop('rating', axis = 1)

In [None]:
final_df = final_df.drop(to_drop, axis=1)
final_df = final_df.drop(negative_impact, axis=1)

In [None]:
np.shape(final_df)

In [None]:
final_df = final_df.merge(city_dummies, left_index=True, right_index=True)
final_df = final_df.merge(cuisine_dummies, left_index=True, right_index=True)
final_df = final_df.merge(prices_dummies, left_index=True, right_index=True)

In [None]:
np.shape(final_df)

In [None]:
predict_submission = regr.predict(final_df)
for i in range(len(predict_submission)):
    predict_submission[i] = round_rating_pred(predict_submission[i])

In [None]:
predict_submission

In [None]:
sample_submission['Rating'] = predict_submission

In [None]:
sample_submission

# Analysis of my work
## General look
There were plenty of expected and unexpected results. Feature engineering proved to be the hardest aspect of this task, and along the way I discovered some interesting methods for working with the given data. Certain things came up from random mistakes (e.g. keeping both the ranking and normalised ranking columns).
## Potential Improvements
Over the course of the task, I didn't manage to use the entirety of the data given - I wish I could have used the most popular cuisine per city, or made a simple analysis of the reviews, but I did not have enough time. I would also like to improve the visual presentation of the code, making it more structured, and potentially adding classes, but I was not confident enough in my own ability to work with them, so I decided to stick to conventional code and conventional functions. The code could also be improved by adding a few extra functions, and the accuracy of the model could possibly be improved by dropping some columns that have little to no impact on the MAE. Decreasing the boundary for correlations might also increase the accuracy of the model.
## Things I Liked About My Model
My initial goal was to break the barrier of 0.2, which I managed to do, ending up with an MAE of 0.1753. One of the most unexpected things was the effect rounding had on MAE, changing it by 0.02, which was one of the biggest jumps in this value. I also did not expect the status of a capital to have any effect on MAE, but apparently it was more important than I initially expected. During one of the tests, I forgot to drop the ranking column, and I found that keeping both ranking and normalised ranking helped the model to increase its accuracy.

In [None]:
sample_submission.to_csv('submission.csv', index=False)