# Import

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline

# Загружаем специальный удобный инструмент для разделения датасета:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

import collections
import re
import datetime
from datetime import datetime 
import json

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found there.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [None]:
# Fix the random seed (passed as random_state to the split and to the model)
# and record the installed package versions:
RANDOM_SEED = 42
# IPython shell escape: writes the output of `pip freeze` to requirements.txt
!pip freeze > requirements.txt

In [None]:
def round_of_rating(number):
    """Round ``number`` to the nearest multiple of 0.5.

    The value is doubled, rounded with ``np.round`` and halved again, so it
    works element-wise on NumPy arrays as well as on plain numbers.
    """
    doubled = number * 2
    return np.round(doubled) / 2

# DATA

In [None]:
# Load the competition files and the auxiliary world-cities dataset.
# DATA_DIR already ends with '/', so file names are appended without a
# leading slash (the previous mix produced '//' in two of the paths).
DATA_DIR = '/kaggle/input/sf-dst-restaurant-rating/'
df_train = pd.read_csv(DATA_DIR + 'main_task.csv')
df_test = pd.read_csv(DATA_DIR + 'kaggle_task.csv')
world_cities = pd.read_csv('/kaggle/input/worldcities/worldcities.csv')
sample_submission = pd.read_csv(DATA_DIR + 'sample_submission.csv')

In [None]:
# Column summary of the train set, followed by its first five rows
df_train.info()
df_train.head(5)

In [None]:
# Column summary of the test set, followed by its first five rows
df_test.info()
df_test.head(5)

In [None]:
# Column summary of the sample submission, followed by its first five rows
sample_submission.info()
sample_submission.head(5)

In [None]:
# Combine train and test into one frame with a common structure:
df_train['sample'] = 1  # Marking train
df_test['sample'] = 0  # Marking test
df_test['Rating'] = 0  # Add Rating to test, filling it with zeroes

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported replacement. Test rows come first, then train
# rows, and the index is rebuilt so that it is unique across both parts.
data = pd.concat([df_test, df_train], sort=False).reset_index(drop=True)  # Combine

In [None]:
# Column summary of the combined frame, followed by five random rows
data.info()
data.sample(5)

In [None]:
# First five rows of the auxiliary world-cities dataset
world_cities.head(5)

# Cleaning and Prepping Data


## 1. Working with NAN 


# ***Number of Reviews***

There are NaN values in 'Number of Reviews', but some of those rows still have review text in the 'Reviews' column:

In [None]:
# Rows with a missing review count whose 'Reviews' cell is not the empty placeholder '[[], []]'
data[(data['Number of Reviews'].isna()) & (data['Reviews'] !='[[], []]')].head()

In [None]:
# Replace NaN's in 'Number of Reviews' with the mean value of the
# restaurant's city, but only in rows whose 'Reviews' cell differs from the
# empty placeholder '[[], []]' (rows where 'Reviews' itself is NaN also
# differ from it and are therefore filled as well).

cities = data['City'].unique().tolist()  # cities list

# Mean of the known review counts per city, broadcast back to every row.
# One groupby replaces the per-city Python loop and gives the same values.
city_mean_reviews = data.groupby('City')['Number of Reviews'].transform('mean')
count_is_missing = data['Number of Reviews'].isna()
has_review_text = data['Reviews'] != '[[], []]'
data['Number of Reviews'] = data['Number of Reviews'].mask(
    count_is_missing & has_review_text,
    city_mean_reviews,
)

# Replace the remaining NaN's with zeroes and make the column integer
# (fractional means are truncated). The result is assigned back rather than
# calling fillna(inplace=True) on a column selection: with pandas
# Copy-on-Write that call modifies a temporary object and leaves `data`
# unchanged.
data['Number of Reviews'] = data['Number of Reviews'].fillna(0).astype(int)


In [None]:
# Show 'Number of Reviews' for the rows where 'Reviews' itself is NaN,
# to check what the imputation above produced for them:
data['Number of Reviews'][data.Reviews.isna()]

# *Reviews*

In [None]:
# Fill NaN's with '[[], []]', the same placeholder string the 'Reviews'
# column is compared against in the 'Number of Reviews' imputation
data['Reviews'] = data['Reviews'].fillna('[[], []]')

# ***Cuisine Style***

In [None]:
# Fill NaN's with the stringified one-element list "['other']" (lower-case)
data['Cuisine Style'] = data['Cuisine Style'].fillna("['other']")


# ***Price Range***

In [None]:
# Fill NaN's with zeroes.
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection: with pandas Copy-on-Write that call modifies a temporary
# object and leaves `data` unchanged.
data['Price Range'] = data['Price Range'].fillna(0)

In [None]:
# Check if we missed some NaN's (compare the non-null counts per column):
data.info()

# **2. Working with features**

**Restaurant_id**

In [None]:
# How many rows share each Restaurant_id value
data.Restaurant_id.value_counts()

In [None]:
# Maybe _id depends on ranking, so it will be usefull to make it numeric: 
def change_id(x):
    """Strip every 'id_' occurrence from the text form of ``x``.

    Returns the stripped string when 'id_' occurs in ``str(x)``; otherwise
    returns ``x`` itself, unchanged and in its original type.
    """
    text = str(x)
    if 'id_' not in text:
        return x
    return text.replace('id_', '')
    
# Strip the 'id_' prefix from every id and convert the column to numbers
data['Restaurant_id'] = pd.to_numeric(data['Restaurant_id'].apply(change_id))


**Cuisine Style**

In [None]:
# Making correct lists: extract the cuisine names from the stringified list.
# The pattern is a raw string, so the regex escapes are passed to `re`
# verbatim instead of relying on Python tolerating invalid string escapes
# (which emits a warning on recent Python versions).
# A match is a run of word characters that may be broken by whitespace in at
# most two places, so it needs at least three word characters and covers
# names of up to three words.
data['Cuisine Style'] = data['Cuisine Style'].apply(
    lambda x: re.findall(r'\w+\s*\w+\s*\w+', str(x))
)
data['Cuisine Style'].sample(5)

**Price Range**

In [None]:
# Encode the price labels as ordinal numbers; any value that is not a key of
# the dictionary is kept as it is.
price_dict = dict(zip(('$', '$$ - $$$', '$$$$'), (1, 2, 3)))
data['Price Range'] = data['Price Range'].map(
    lambda price: price_dict.get(price, price)
)



In [None]:
# Pick out the dates:
def _split_review_dates(reviews):
    """Return the list of date strings held in one 'Reviews' cell.

    The cell is treated as the text '[[...], [...]]': the outer brackets are
    cut off, the part after '], [' is taken, its first and last characters
    are removed and the remainder is split on "', '". A NaN cell gives [0].
    """
    if pd.isna(reviews):
        return [0]
    without_outer_brackets = reviews[2:-2]
    second_part = without_outer_brackets.split('], [')[1]
    return second_part[1:-1].split("', '")


data['Review_date'] = data.Reviews.apply(_split_review_dates)




# ***New Features***

In [None]:
# Columns available at this point
data.columns

In [None]:
# Set of all distinct cuisine names found in any restaurant's list:
cuisines = {
    cuisine
    for styles in data['Cuisine Style']
    for cuisine in styles
}


In [None]:
# Frequency of occurrence: cuisine name -> number of restaurants listing it.
# Every known cuisine starts at zero ...
type_cousine = dict.fromkeys(cuisines, 0)

# ... and is incremented once for each restaurant list it appears in.
for styles in data['Cuisine Style']:
    for cuisine in styles:
        type_cousine[cuisine] += 1

In [None]:
# Show the cuisine -> count dictionary
type_cousine

In [None]:
# The top cuisines: those counted more than 3000 times
top_cuisine = [name for name, count in type_cousine.items() if count > 3000]
top_cuisine

In [None]:
# With this function we will find out if our restaurant has the popular cuisine:

def most_popular_cuisine(x):
    """Return 1 if ``x`` contains any entry of ``top_cuisine``, else None.

    ``top_cuisine`` is read from module scope at call time.
    """
    if any(element in x for element in top_cuisine):
        return 1
    return None
            

# Create the new column: 1 where the restaurant's cuisine list contains a
# top cuisine, missing otherwise (the function returns None in that case)
data['most_popular_cuisine'] = data['Cuisine Style'].apply(most_popular_cuisine)

In [None]:
# Restaurants without a top cuisine have a missing value here; turn it into 0.
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection: with pandas Copy-on-Write that call modifies a temporary
# object and leaves `data` unchanged.
data['most_popular_cuisine'] = data['most_popular_cuisine'].fillna(0)

In [None]:
# New feature: the number of cuisines listed for a restaurant
data['cuisine_counts'] = data['Cuisine Style'].apply(len)

**Review dates**

In [None]:
# Here we will find how many days have gone from the last review.

# Latest review date per restaurant.
# The dates are parsed BEFORE the maximum is taken: max() on the raw strings
# compares them character by character, which picks the wrong date unless
# the text format happens to sort chronologically. Strings that cannot be
# parsed (including the empty string produced for restaurants without
# reviews) become NaT and are ignored by max().
review_dates = pd.to_datetime(data['Review_date'].explode(), errors='coerce')
data['max_date'] = review_dates.groupby(level=0).max()

# New column: whole days between now and the latest review. Restaurants
# without a usable date get the mean of the others; the result is truncated
# to an integer. The filled column is assigned back rather than filled in
# place through attribute access, which does not update `data` under pandas
# Copy-on-Write.
days_since_review = (datetime.now() - data['max_date']).dt.days
data['days_ago'] = days_since_review.fillna(days_since_review.mean()).astype(int)

# The helper columns are no longer needed
data.drop(['Review_date', 'max_date'], axis=1, inplace=True, errors='ignore')


**Population of the City**

In [None]:
# Cities for which a population value is needed; every city that occurs in
# data['City'] must be listed here.
city_names = [
    'London', 'Paris', 'Madrid', 'Barcelona', 'Berlin', 'Milan',
    'Rome', 'Prague', 'Lisbon', 'Vienna', 'Amsterdam', 'Brussels',
    'Hamburg', 'Munich', 'Lyon', 'Stockholm', 'Budapest', 'Warsaw',
    'Dublin', 'Copenhagen', 'Athens', 'Edinburgh', 'Zurich', 'Oporto',
    'Geneva', 'Krakow', 'Oslo', 'Helsinki', 'Bratislava',
    'Luxembourg', 'Ljubljana',
]

# Entering values from dataset: when several rows share a city name the
# largest population is used; a name with no matching row yields NaN.
population_dict = {
    c: world_cities.population[world_cities.city == c].max()
    for c in city_names
}

# Fill missing values from Google.
# The previous literals (402.762, 214.349, 780.000) used '.' as a thousands
# separator, which Python reads as a decimal point, so these cities ended up
# with a population of a few hundred.
# NOTE(review): assumes world_cities.population is expressed in persons - confirm.
population_dict['Zurich'] = 402762
population_dict['Oporto'] = 214349
population_dict['Krakow'] = 780000

# Create new column. A plain dictionary lookup per row raised KeyError for an
# unknown city; keep that behaviour explicit, because Series.map would
# otherwise insert NaN silently.
unknown_cities = set(data['City'].unique()) - set(population_dict)
if unknown_cities:
    raise KeyError(
        'No population value for: {}'.format(sorted(unknown_cities, key=str))
    )
data['population'] = data['City'].map(population_dict)


**City**

In [None]:
# One indicator column per city, appended to the right of the frame
city_name = pd.get_dummies(data['City'])
data = pd.concat([data, city_name], axis=1)

In [None]:
# Column summary after the new features were added
data.info()

# EDA

In [None]:
# Correlation heatmap of the numeric features, computed on train rows only
corr_columns = [
    'Ranking', 'Price Range', 'Number of Reviews', 'Rating',
    'most_popular_cuisine', 'cuisine_counts', 'days_ago', 'population',
]
is_train_row = data['sample'] == 1
correlation = data.loc[is_train_row, corr_columns].corr()
plt.figure(figsize=(20, 10))
sns.heatmap(correlation, annot=True, cmap='coolwarm')

There is a strong correlation between population and Ranking, and between Price Range and cuisine_counts.
We need to deal with it (maybe drop one of the features later).

In [None]:
# Build Ranking plot: histogram of 'Ranking' over the whole train set.
# The figure size is set globally and therefore also applies to later plots.
plt.rcParams['figure.figsize'] = (10,7)
df_train['Ranking'].hist(bins=100)

In [None]:
# Number of restaurants per city in the train set, as a horizontal bar chart
df_train['City'].value_counts(ascending=True).plot(kind='barh')

In [None]:
# Histogram of 'Ranking' for the London restaurants of the train set:
in_london = df_train['City'] == 'London'
df_train.loc[in_london, 'Ranking'].hist(bins=100)

In [None]:
# 'Ranking' histograms of the ten cities with the most restaurants,
# drawn on top of each other:
top_cities = df_train['City'].value_counts().iloc[:10].index
for city in top_cities:
    df_train.loc[df_train['City'] == city, 'Ranking'].hist(bins=100)
plt.show()

It seems like we have a normal distribution of 'Ranking' in each City, so it needs to be standardized. 

### The distribution of target variable

In [None]:
# Number of train restaurants per 'Rating' value, as a horizontal bar chart
df_train['Rating'].value_counts(ascending=True).plot(kind='barh')

In [None]:
# Standardize 'Ranking' within each city: subtract the city's mean and
# divide by the city's standard deviation.
ranking_by_city = data.groupby('City')['Ranking']
mn = ranking_by_city.mean()
st = ranking_by_city.std()

city_of_row = data['City']
centered = data['Ranking'] - city_of_row.map(mn)
data['Std_Ranking'] = centered / city_of_row.map(st)

# Data Preprocessing

In [None]:
# Column summary before the columns not used by the model are dropped
data.info()

In [None]:
# Remove the columns that are not passed to the model; names that are not
# present in the frame are skipped silently.
columns_to_drop = [
    'Restaurant_id', 'Cuisine Style', 'Reviews', 'ID_TA',
    'City', 'Ranking', 'URL_TA',
]
data.drop(columns=columns_to_drop, inplace=True, errors='ignore')

In [None]:
# Now separate the combined frame back into its train and test parts;
# the 'sample' marker column is dropped from both.
train_data = data[data['sample'] == 1].drop(columns=['sample'])
test_data = data[data['sample'] == 0].drop(columns=['sample'])

# Target vector and feature matrix of the train part
y = train_data['Rating'].values
X = train_data.drop(columns=['Rating'])

**Перед тем как отправлять наши данные на обучение, разделим данные на еще один тест и трейн, для валидации. 
Это поможет нам проверить, как хорошо наша модель работает, до отправки submissiona на kaggle.**

In [None]:
# Use train_test_split to split the train data once more;
# 20% of the rows are held out for validation (the test_size parameter)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

In [None]:
# Check the shapes of all the parts
test_data.shape, train_data.shape, X.shape, X_train.shape, X_test.shape

# Model 
Сам ML

In [None]:
# Import the required libraries:
from sklearn.ensemble import RandomForestRegressor # tool for creating and training the model
from sklearn import metrics # tools for evaluating the model's accuracy

In [None]:
# Create the model (DO NOT TOUCH THE SETTINGS)
model = RandomForestRegressor(n_estimators=100, verbose=1, n_jobs=-1, random_state=RANDOM_SEED)

In [None]:
# Train the model on the training part of the split
model.fit(X_train, y_train)

# Use the trained model to predict the restaurant ratings of the held-out validation part.
# The predictions, rounded to the nearest 0.5, are stored in y_pred
y_pred = round_of_rating(model.predict(X_test))

In [None]:
# Compare the predicted values (y_pred) with the actual ones (y_test) and see how much they differ on average.
# The metric is the Mean Absolute Error (MAE): the mean absolute deviation of the predictions from the actual values.
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))

In [None]:
# RandomForestRegressor exposes the importance of every feature;
# plot the 15 most important ones, labelled with the column names of X
plt.rcParams['figure.figsize'] = (10,10)
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(15).plot(kind='barh')

# Submission
Если всё устраивает — готовим Submission на Kaggle

In [None]:
# Ten random rows of the test part
test_data.sample(10)

In [None]:
# Drop the 'Rating' column (added to the test set as zeroes) so that the
# test features have the same columns as X
test_data = test_data.drop(['Rating'], axis=1)

In [None]:
# Show the submission template
sample_submission

In [None]:
# Predict the ratings of the test part and round them to the nearest 0.5
predict_submission = round_of_rating(model.predict(test_data))

In [None]:
# Show the predicted ratings
predict_submission

In [None]:
# Put the predictions into the submission template, save it as
# 'submission.csv' (without the index) and show the first ten rows
sample_submission['Rating'] = predict_submission
sample_submission.to_csv('submission.csv', index=False)
sample_submission.head(10)