In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import seaborn as sns
from matplotlib import pyplot as plt

# Needed for correct operation in Jupyter Notebook: figures are rendered inline in the cell output
%matplotlib inline
# Needed for correct display of plots in a dark theme: switch to matplotlib's 'default' style
plt.style.use('default')

In [2]:
# Read the raw reviews table from a path relative to the notebook's working directory.
hotels = pd.read_csv('data/hotels.csv')
# Preview the first rows to see the available columns and sample values.
hotels.head()

Unnamed: 0,hotel_address,additional_number_of_scoring,review_date,average_score,hotel_name,reviewer_nationality,negative_review,review_total_negative_word_counts,total_number_of_reviews,positive_review,review_total_positive_word_counts,total_number_of_reviews_reviewer_has_given,reviewer_score,tags,days_since_review,lat,lng
0,Stratton Street Mayfair Westminster Borough Lo...,581,2/19/2016,8.4,The May Fair Hotel,United Kingdom,Leaving,3,1994,Staff were amazing,4,7,10.0,"[' Leisure trip ', ' Couple ', ' Studio Suite ...",531 day,51.507894,-0.143671
1,130 134 Southampton Row Camden London WC1B 5AF...,299,1/12/2017,8.3,Mercure London Bloomsbury Hotel,United Kingdom,poor breakfast,3,1361,location,2,14,6.3,"[' Business trip ', ' Couple ', ' Standard Dou...",203 day,51.521009,-0.123097
2,151 bis Rue de Rennes 6th arr 75006 Paris France,32,10/18/2016,8.9,Legend Saint Germain by Elegancia,China,No kettle in room,6,406,No Positive,0,14,7.5,"[' Leisure trip ', ' Solo traveler ', ' Modern...",289 day,48.845377,2.325643
3,216 Avenue Jean Jaures 19th arr 75019 Paris Fr...,34,9/22/2015,7.5,Mercure Paris 19 Philharmonie La Villette,United Kingdom,No Negative,0,607,Friendly staff quiet comfortable room spotles...,11,8,10.0,"[' Leisure trip ', ' Solo traveler ', ' Standa...",681 day,48.888697,2.39454
4,Molenwerf 1 1014 AG Amsterdam Netherlands,914,3/5/2016,8.5,Golden Tulip Amsterdam West,Poland,Torn sheets,4,7586,The staff was very friendly and helpful Break...,20,10,9.6,"[' Business trip ', ' Couple ', ' Standard Dou...",516 day,52.385601,4.84706


In [4]:
# Summarise column dtypes and non-null counts to see which columns hold text and which have gaps.
hotels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 386803 entries, 0 to 386802
Data columns (total 17 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   hotel_address                               386803 non-null  object 
 1   additional_number_of_scoring                386803 non-null  int64  
 2   review_date                                 386803 non-null  object 
 3   average_score                               386803 non-null  float64
 4   hotel_name                                  386803 non-null  object 
 5   reviewer_nationality                        386803 non-null  object 
 6   negative_review                             386803 non-null  object 
 7   review_total_negative_word_counts           386803 non-null  int64  
 8   total_number_of_reviews                     386803 non-null  int64  
 9   positive_review                             386803 non-null  object 
 

### Грубая очистка (*)

In [5]:
# Rough cleaning: build a working frame without the free-text / string-valued
# columns (addresses, names, review texts, tags, date strings) so the baseline
# model below is fitted on the remaining columns only.
# `drop` returns a new DataFrame, so `hotels` itself is left untouched.
dropped_columns = [
    'hotel_address',
    'review_date',
    'hotel_name',
    'reviewer_nationality',
    'negative_review',
    'positive_review',
    'tags',
    'days_since_review',
]
data_dig = hotels.drop(columns=dropped_columns)

In [6]:
# Discard every row that still contains at least one missing value.
# (Alternative the author left disabled: fill the gaps with 0 instead of dropping rows.)
data_dig = data_dig.dropna(axis='index')

### Разделение набора данных (*)

In [7]:
# y is the target (the score given by the reviewer); X is every other remaining column.
y = data_dig['reviewer_score']
X = data_dig.drop(columns=['reviewer_score'])

# Hold out 25% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)
print('done')

done


### Создание и обучение модели (*)

In [8]:
# Baseline model: a random forest of 100 trees fitted on the training split.
# random_state is fixed because the forest is stochastic (bootstrap sampling and
# feature sub-sampling); without a seed every Restart & Run All would yield a
# slightly different model and a different MAPE, even though the train/test
# split itself is already seeded with 42.
regr = RandomForestRegressor(n_estimators=100, random_state=42)
regr.fit(X_train, y_train)

# Predictions on the held-out rows, consumed by the evaluation cell.
y_pred = regr.predict(X_test)
print('done')

done


### Оценка качества модели (*)

In [9]:
# Report the mean absolute percentage error of the held-out predictions.
# The value is printed as a raw fraction (e.g. 0.14), not scaled to percent.
print('MAPE:', metrics.mean_absolute_percentage_error(y_test, y_pred))

MAPE: 0.1412770804654202


In [10]:
# The baseline working frame is no longer needed; unbind it so its memory can be reclaimed.
del data_dig

###  Задание 4.2

Сколько уникальных названий отелей представлено в наборе данных?

In [11]:
# Task 4.2: number of distinct hotel names in the dataset.
hotels['hotel_name'].nunique()

1492

###  Задание 4.3

* Когда был оставлен самый свежий отзыв? Введите ответ в формате yyyy-mm-dd.



* Когда был оставлен самый первый отзыв? Введите ответ в формате yyyy-mm-dd.

In [13]:
# Task 4.3: parse the review dates and show the latest and the earliest one.
# `assign` returns a new frame, so the string dates in `hotels` are left as they are.
df_dt = hotels.assign(review_date=pd.to_datetime(hotels['review_date']))
review_dates = df_dt['review_date']
display(review_dates.max(), review_dates.min())

Timestamp('2017-08-03 00:00:00')

Timestamp('2015-08-04 00:00:00')

###  Задание 4.4

Сколько уникальных тегов представлено в наборе данных?

In [14]:
# Task 4.4: count distinct tags.
# Each `tags` cell is the text form of a list, e.g. "[' Leisure trip ', ' Couple ']".
# The outer brackets are removed, the text is split on ', ', and the surrounding
# quotes and spaces are stripped from every piece before it goes into the set.
df_tag = hotels.copy()
tagset = {
    raw_tag.strip("' ")
    for tags_repr in df_tag.tags
    for raw_tag in tags_repr.lstrip('[').rstrip(']').split(', ')
}
print(len(tagset))

2368


###  Задание 4.5

Какой тег представлен в наибольшем числе отзывов?

In [15]:
# Task 4.5: count how many times each tag occurs and report the most frequent one.
# Tags are extracted exactly as in the unique-tag cell: drop the outer brackets,
# split on ', ', then strip the surrounding quotes and spaces.
tag_dict = {}
for tags_repr in df_tag.tags:
    for raw_tag in tags_repr.lstrip('[').rstrip(']').split(', '):
        tag_name = raw_tag.strip("' ")
        tag_dict[tag_name] = tag_dict.get(tag_name, 0) + 1

# `max` returns the first key reaching the highest count, i.e. the same winner a
# manual "strictly greater" scan in insertion order would pick; the defaults
# cover an empty dictionary.
max_tag = max(tag_dict, key=tag_dict.get, default='')
max_count = tag_dict.get(max_tag, 0)
print(max_tag)

Leisure trip


###  Задание 4.6

Из тегов выясните, на сколько ночей чаще всего останавливаются путешественники в отелях.

In [16]:
# Task 4.6: list every "Stayed N night(s)" tag with its count so the most
# common length of stay can be read off the output.
for tag_name, tag_count in tag_dict.items():
    if not str(tag_name).startswith('Stayed'):
        continue
    print(f'{tag_name}: {tag_count}')

Stayed 2 nights: 100263
Stayed 1 night: 145373
Stayed 3 nights: 72000
Stayed 6 nights: 7399
Stayed 4 nights: 35748
Stayed 5 nights: 15611
Stayed 8 nights: 1910
Stayed 7 nights: 5549
Stayed 10 nights: 663
Stayed 14 nights: 184
Stayed 19 nights: 23
Stayed 13 nights: 174
Stayed 9 nights: 966
Stayed 17 nights: 27
Stayed 11 nights: 306
Stayed 27 nights: 10
Stayed 12 nights: 217
Stayed 15 nights: 87
Stayed 21 nights: 19
Stayed 18 nights: 24
Stayed 16 nights: 38
Stayed 26 nights: 6
Stayed 24 nights: 5
Stayed 30 nights: 10
Stayed 22 nights: 8
Stayed 23 nights: 6
Stayed 28 nights: 7
Stayed 20 nights: 17
Stayed 25 nights: 4
Stayed 29 nights: 3
