### Imports and loading data

In [6]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Load the raw hotel-review data from a path relative to the working directory.
raw_reviews_path = "../raw_data/dataset_1.csv"
dataset_1 = pd.read_csv(raw_reviews_path)

### Inspecting data

In [2]:
# Preview the first rows to see the available columns and some sample values.
dataset_1.head()

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,8/3/2017,7.7,Hotel Arena,Russia,I am so angry that i made this post available...,397,1403,Only the park outside of the hotel was beauti...,11,7,2.9,"[' Leisure trip ', ' Couple ', ' Duplex Double...",0 days,52.360576,4.915968
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,8/3/2017,7.7,Hotel Arena,Ireland,No Negative,0,1403,No real complaints the hotel was great great ...,105,7,7.5,"[' Leisure trip ', ' Couple ', ' Duplex Double...",0 days,52.360576,4.915968
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/31/2017,7.7,Hotel Arena,Australia,Rooms are nice but for elderly a bit difficul...,42,1403,Location was good and staff were ok It is cut...,21,9,7.1,"[' Leisure trip ', ' Family with young childre...",3 days,52.360576,4.915968
3,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/31/2017,7.7,Hotel Arena,United Kingdom,My room was dirty and I was afraid to walk ba...,210,1403,Great location in nice surroundings the bar a...,26,1,3.8,"[' Leisure trip ', ' Solo traveler ', ' Duplex...",3 days,52.360576,4.915968
4,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/24/2017,7.7,Hotel Arena,New Zealand,You When I booked with your company on line y...,140,1403,Amazing location and building Romantic setting,8,3,6.7,"[' Leisure trip ', ' Couple ', ' Suite ', ' St...",10 days,52.360576,4.915968


In [3]:
# Summarise column names, non-null counts and dtypes.
dataset_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 515738 entries, 0 to 515737
Data columns (total 17 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   Hotel_Address                               515738 non-null  object 
 1   Additional_Number_of_Scoring                515738 non-null  int64  
 2   Review_Date                                 515738 non-null  object 
 3   Average_Score                               515738 non-null  float64
 4   Hotel_Name                                  515738 non-null  object 
 5   Reviewer_Nationality                        515738 non-null  object 
 6   Negative_Review                             515738 non-null  object 
 7   Review_Total_Negative_Word_Counts           515738 non-null  int64  
 8   Total_Number_of_Reviews                     515738 non-null  int64  
 9   Positive_Review                             515738 non-null  object 
 

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
dataset_1.describe()

Unnamed: 0,Additional_Number_of_Scoring,Average_Score,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,lat,lng
count,515738.0,515738.0,515738.0,515738.0,515738.0,515738.0,515738.0,512470.0,512470.0
mean,498.081836,8.397487,18.53945,2743.743944,17.776458,7.166001,8.395077,49.442439,2.823803
std,500.538467,0.548048,29.690831,2317.464868,21.804185,11.040228,1.637856,3.466325,4.579425
min,1.0,5.2,0.0,43.0,0.0,1.0,2.5,41.328376,-0.369758
25%,169.0,8.1,2.0,1161.0,5.0,1.0,7.5,48.214662,-0.143372
50%,341.0,8.4,9.0,2134.0,11.0,3.0,8.8,51.499981,0.010607
75%,660.0,8.8,23.0,3613.0,22.0,8.0,9.6,51.516288,4.834443
max,2682.0,9.8,408.0,16670.0,395.0,355.0,10.0,52.400181,16.429233


### Dropping redundant columns

In [5]:
# Keep only the two free-text review columns and the reviewer's score.
# The explicit .copy() makes the subset an independent frame, so the column
# assignments in the following cleaning steps do not raise
# SettingWithCopyWarning.
dataset_1 = dataset_1[['Negative_Review', 'Positive_Review', 'Reviewer_Score']].copy()

### Cleaning and merging negative and positive reviews

In [6]:
# "No Negative" / "No Positive" are placeholders for an empty review part
# (see the sample rows above); blank them so they do not leak into the text.
dataset_1['Negative_Review'] = dataset_1['Negative_Review'].replace("No Negative", "")
dataset_1['Positive_Review'] = dataset_1['Positive_Review'].replace("No Positive", "")

# Join both parts into one text column, then discard the two source columns.
merged_text = dataset_1['Negative_Review'] + " " + dataset_1['Positive_Review']
dataset_1 = (
    dataset_1
    .assign(Review_Text=merged_text)
    .drop(columns=['Negative_Review', 'Positive_Review'])
)

### Lower case

In [7]:
# Normalise case so that "Room" and "room" are treated as the same token later on.
dataset_1['Review_Text'] = dataset_1['Review_Text'].map(str.lower)

### Remove numbers

In [8]:
def remove_numbers(text):
    """Return ``text`` with every digit character removed.

    The string is scanned character by character: any character for which
    ``str.isdigit()`` is true is dropped, and all others are kept in their
    original order (so the whitespace around a removed number stays in place).
    """
    kept_chars = [char for char in text if not char.isdigit()]
    return ''.join(kept_chars)

# Strip digits from every review, one string at a time.
dataset_1['Review_Text'] = dataset_1['Review_Text'].map(remove_numbers)

### Remove punctuation

In [9]:
# Series.replace(char, '') only swaps cells whose *entire* value equals `char`,
# so it never removes punctuation that sits inside a review. Delete the
# characters inside each string instead: the translation table maps every
# character of string.punctuation to "removed".
punctuation_table = str.maketrans('', '', string.punctuation)
dataset_1['Review_Text'] = dataset_1['Review_Text'].str.translate(punctuation_table)

### Remove stopwords

In [10]:
# NOTE(review): this presumably needs the NLTK stopwords corpus and tokenizer
# data to be available locally — confirm for fresh environments.
stop_words = set(stopwords.words('english'))

# Tokenise each review and keep only the tokens that are not English stopwords;
# the 'reviews' column holds one list of tokens per row after this step.
dataset_1['reviews'] = dataset_1['Review_Text'].map(
    lambda text: [token for token in word_tokenize(text) if token not in stop_words]
)

### Lemmatizing

In [11]:
lemmatizer = WordNetLemmatizer()


def lemmatizing(text):
    """Lemmatize a sequence of tokens and join them into a single string.

    Despite its name, ``text`` is iterated as a collection of word tokens
    (the token lists stored in the 'reviews' column). Each token is passed
    through ``lemmatizer.lemmatize`` and the results are joined with single
    spaces.
    """
    return ' '.join(lemmatizer.lemmatize(token) for token in text)


# Replace each token list with its lemmatized, space-joined string.
dataset_1['reviews'] = dataset_1['reviews'].map(lemmatizing)

### Custom scaling and rounding

In [None]:
# Rescale the reviewer score from the 0-10 range to 0-1 and round to two
# decimals (e.g. 2.9 -> 0.29, 7.5 -> 0.75). A plain copy of the column would
# leave the scores unscaled, contradicting both this section's title and the
# values shown in the final check.
dataset_1['review_score'] = (dataset_1['Reviewer_Score'] / 10).round(2)

# The intermediate text column and the unscaled score are no longer needed.
dataset_1 = dataset_1.drop(columns=['Review_Text', 'Reviewer_Score'])

In [13]:
# Final check: preview the first rows of the cleaned frame.
dataset_1.head()

Unnamed: 0,reviews,review_score
0,angry made post available via possible site us...,0.29
1,real complaint hotel great great location surr...,0.75
2,room nice elderly bit difficult room two story...,0.71
3,room dirty afraid walk barefoot floor looked c...,0.38
4,booked company line showed picture room though...,0.67


In [16]:
# Display the cleaned frame; the notebook output abbreviates it to the first and last rows.
dataset_1

Unnamed: 0,reviews,review_score
0,angry made post available via possible site us...,0.29
1,real complaint hotel great great location surr...,0.75
2,room nice elderly bit difficult room two story...,0.71
3,room dirty afraid walk barefoot floor looked c...,0.38
4,booked company line showed picture room though...,0.67
...,...,...
515733,trolly staff help take luggage room location,0.70
515734,hotel look like surely breakfast ok got earlie...,0.58
515735,ac useless hot week vienna gave hot air,0.25
515736,room enormous really comfortable believe famil...,0.88


### Exporting to csv

In [14]:
# Persist the cleaned reviews. The DataFrame index is written as well (pandas
# default), so it shows up as an unnamed first column when the file is re-read.
clean_reviews_path = '../raw_data/clean_dataset_1.csv'
dataset_1.to_csv(clean_reviews_path)