### Imports and loading data

In [3]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

dataset_3 = pd.read_csv("../raw_data/dataset_3.csv")

### Inspecting data

In [4]:
dataset_3.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [5]:
dataset_3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20491 entries, 0 to 20490
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  20491 non-null  object
 1   Rating  20491 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 320.3+ KB


In [6]:
dataset_3.describe()

Unnamed: 0,Rating
count,20491.0
mean,3.952223
std,1.23303
min,1.0
25%,3.0
50%,4.0
75%,5.0
max,5.0


### Lower case

In [7]:
dataset_3['Review'] = dataset_3['Review'].apply(lambda x: x.lower())

### Remove numbers

In [8]:
def remove_numbers(text):
    text = ''.join(word for word in text if not word.isdigit())
    return text

dataset_3['Review'] = dataset_3['Review'].apply(remove_numbers)

### Remove punctuation

In [9]:
for punctuation in string.punctuation:
    dataset_3['Review'] = dataset_3['Review'].replace(punctuation, '') 

### Remove stopwords

In [11]:
stop_words = set(stopwords.words('english')) 
dataset_3['reviews'] = dataset_3['Review'].map(word_tokenize)
dataset_3['reviews'] = dataset_3['reviews'].map(lambda x: [w for w in x if not w in stop_words])

### Lemmatizing

In [12]:
lemmatizer = WordNetLemmatizer()

def lemmatizing(text):
    lemmatized = [lemmatizer.lemmatize(word) for word in text]
    text = lemmatized
    text = ' '.join(word for word in text)
    return text

dataset_3['reviews'] = dataset_3['reviews'].apply(lemmatizing)

### Custom scaling and rounding

In [13]:
dataset_3['review_score'] = dataset_3['Rating'] / 5

dataset_3 = dataset_3.drop(columns=['Rating', 'Review'])

dataset_3['review_score'] = dataset_3['review_score'].round(decimals=2)

In [14]:
dataset_3.head()

Unnamed: 0,reviews,review_score
0,nice hotel expensive parking got good deal sta...,0.8
1,ok nothing special charge diamond member hilto...,0.4
2,nice room * experience hotel monaco seattle go...,0.6
3,"unique , great stay , wonderful time hotel mon...",1.0
4,"great stay great stay , went seahawk game awes...",1.0


### Exporting to csv

In [15]:
dataset_3.to_csv('../raw_data/clean_dataset_3.csv')

### Merging dataset_3 with dataset_1 and exporting to csv

In [16]:
dataset_1 = pd.read_csv("../raw_data/clean_dataset_1.csv")

In [24]:
dataset_merged = dataset_1.append(dataset_3, ignore_index=True)
dataset_merged = dataset_merged.drop(columns="Unnamed: 0")

In [25]:
dataset_merged.to_csv('../raw_data/clean_dataset_merged.csv')