In [None]:
# Presumably the random seed for any stochastic steps (judging by its name);
# it is not referenced in the cells shown here - TODO confirm where it is used.
RANDOM_STATE = 1

# Imports

In [13]:
import pandas as pd
import time
import numpy as np
import pickle
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_colwidth',300)

from nltk.corpus import stopwords
import re
import nltk
from nltk import tokenize
from nltk.stem import WordNetLemmatizer

import math

import spacy

import pyprojroot.here as here

# Import datasets from drive

Note: for brevity, the data has already been preprocessed and cleaned.

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the dataset
# paths used in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Location of the raw review CSV on the mounted Drive.
data_path = '/content/drive/MyDrive/BT4222/data/raw/mbs_total.csv'
# Load the raw CSV into `df`, the frame the cleaning cells below add columns to.
df = pd.read_csv(data_path)
# Preview the first 10 rows.
df.head(10)

'/content/drive/MyDrive/BT4222/data/raw/crowne-plaza.csv'

# Clean Data

## Clean Ratings column

In [19]:
def rating_clean(rating1, rating2):
    """Coalesce two rating values, preferring the first.

    Returns ``rating1`` unless it is NaN, in which case ``rating2`` is
    returned unchanged (it may itself be NaN).
    """
    return rating2 if math.isnan(rating1) else rating1

# Build the unified `rating` column by coalescing rating1/rating2 on each row.
df["rating"] = df.apply(
    lambda record: rating_clean(record["rating1"], record["rating2"]),
    axis=1,
)

def valid_rating(rating):
    """Return True when ``rating`` is an actual number and False when it is NaN."""
    return not math.isnan(rating)

# Flag, per row, whether the coalesced rating is a real number (not NaN).
df['valid_rating'] = df.apply(
    lambda record: valid_rating(record['rating']),
    axis=1,
)

def classify_rating(rating):
    """Map a numeric rating to a sentiment label.

    Args:
        rating: Numeric rating; may be NaN (or None) when no rating exists.

    Returns:
        "Positive" for ratings >= 4, "Negative" for ratings <= 2, "Neutral"
        for anything in between, and None when the rating is missing.
    """
    # NaN fails both comparisons below, so without this guard a missing
    # rating would silently fall through and be labelled "Neutral".
    if rating is None or math.isnan(rating):
        return None
    if rating >= 4:
        return "Positive"
    if rating <= 2:
        return "Negative"
    return "Neutral"

# Derive the sentiment label for every row, then display the class distribution.
df["label"] = df.apply(
    lambda record: classify_rating(record["rating"]),
    axis=1,
)
df["label"].value_counts()

Positive    5099
Neutral      383
Negative     261
Name: label, dtype: int64

## Clean review text

In [None]:
# Resources used by preprocess_text() below.
# Fetch the NLTK stopwords corpus, then keep the English stop words in a set
# so the per-token membership test is cheap.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
# spaCy pipeline used for lemmatization; the parser and NER components are disabled.
spacy_lemmatizer = spacy.load('en_core_web_sm', disable=['parser','ner'])

In [None]:
def preprocess_text(title, review):
    """Build a cleaned, lemmatized string from a review's title and body.

    Args:
        title: Review title; a missing value (None or NaN) is skipped.
        review: Review body; a missing value (None or NaN) is skipped.

    Returns:
        A space-separated string of lower-cased lemmas with English stop
        words removed.
    """
    # A missing cell arrives as None/NaN; str() would turn it into the literal
    # word "none"/"nan", which would survive cleaning as a bogus token.
    present = [
        str(value)
        for value in (title, review)
        if not (value is None or (isinstance(value, float) and math.isnan(value)))
    ]
    text = " ".join(present)
    # lower text
    text = text.lower()
    # Remove newline characters. The literal two-character sequence "\n" must be
    # replaced before the lone-backslash replacement, otherwise a stray "n"
    # would be left glued to the following word.
    text = text.replace('\\n',' ').replace('\n', ' ').replace('\t', ' ').replace('\r', ' ').replace('\\', ' ')
    # Remove punctuation and numbers
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Collapse multiple spaces and trim the ends so the lemmatizer is not fed
    # a leading/trailing blank.
    text = re.sub(r'\s+', ' ', text).strip()
    # lemmatize
    lemmas = [token.lemma_ for token in spacy_lemmatizer(text)]
    # Remove stop words
    # tokenization done below, so no need to do it here.
    return ' '.join(lemma for lemma in lemmas if lemma not in stop_words)

df["cleaned_review"] = df.apply(lambda row: preprocess_text(row['review_title'], row['review_text']), axis = 1)

## Clean raw title, review text, and date

In [41]:
# combine raw title and review
def combine(r):
    """Join a row's raw review title and review text with a single space.

    Args:
        r: Row-like mapping with 'review_title' and 'review_text' entries.

    Returns:
        The title and text joined by one space. A missing field (None or NaN)
        is left out instead of being rendered as the literal "None"/"nan".
    """
    present = []
    for column in ('review_title', 'review_text'):
        value = r[column]
        # pandas represents an empty CSV cell as float NaN.
        if value is None or (isinstance(value, float) and math.isnan(value)):
            continue
        present.append(str(value))
    return " ".join(present)

# Store the raw (uncleaned) title + text string for every row.
df["combined_review"] = df.apply(combine, axis=1)

# Keep the piece after the first ':' in `date_of_stay` as the raw date string.
df['date'] = df['date_of_stay'].str.split(':').str[1]
# Parse that string column into datetimes. This must read from `df`: the name
# `data` used here before is not defined by any preceding cell, so the line
# raised NameError on a fresh kernel.
df["date"] = pd.to_datetime(df["date"])

## Specify Covid

In [48]:
from datetime import date, timedelta, datetime

In [56]:
# Boundaries used by get_period(): the start is inclusive, the end is exclusive.
covid_start = datetime(2020, 1, 29, 0, 0)
covid_end = datetime(2022, 4, 1, 0, 0, 0)


def get_period(t):
    """Label a timestamp relative to the covid_start / covid_end window.

    Returns None for a null timestamp, "PreCovid" when ``t`` is earlier than
    ``covid_start``, "PostCovid" when ``t`` is at or after ``covid_end``, and
    "Covid" otherwise.
    """
    if pd.isnull(t):
        return None
    if t < covid_start:
        return "PreCovid"
    if t >= covid_end:
        return "PostCovid"
    return "Covid"

In [57]:
# Label each row's stay date with its period. This operates on `df`: the name
# `data` used here before is not defined by any preceding cell, so the cell
# raised NameError on a fresh kernel and `df` never received the column.
df["covid"] = df.apply(lambda row: get_period(row["date"]), axis=1)

In [None]:
# Preview the first 10 rows of the frame after the cleaning steps above.
df.head(10)

# Export Cleaned Data

In [None]:
# Write the cleaned frame to Drive as CSV.
# NOTE(review): `index` is left at its default, so the row index is written as
# an extra leading column - confirm downstream readers expect that.
df.to_csv("/content/drive/MyDrive/BT4222/data/cleaned/cleaned_mbs_reviews.csv")