# Sentiment Analysis with ML

# Notebook Set-up

In [None]:
# Attach Google Drive to the Colab runtime; the data and result folders used below live there
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point, force_remount=True)

Mounted at /content/drive


In [None]:
# CD to correct folder
# `!cd ...` only changes directory inside a short-lived subshell, so the
# kernel's working directory was never changed. os.chdir changes it for the
# kernel itself.
import os
import sys

THESIS_DIR = '/content/drive/MyDrive/Colab_Notebooks/Thesis/'
os.chdir(THESIS_DIR)

# Initialize path
# Guarded so that re-running the cell does not add duplicate entries
if THESIS_DIR not in sys.path:
    sys.path.append(THESIS_DIR)

In [None]:
pip install -U matplotlib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Import general packages
import pandas as pd
import numpy as np

# Cleaning and pre-processing
import re
import nltk
from nltk import sent_tokenize
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download("stopwords")

# Visualization
from tabulate import tabulate
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid", palette="pastel")

# Machine Learning
from sklearn.metrics import classification_report
#from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import plot_confusion_matrix

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Functions
class color:
    """ANSI escape sequences used to style text printed in cell output.

    Put one of the codes in front of a string and ``END`` after it to
    switch the styling off again.
    """

    # Foreground colours
    PURPLE, CYAN, DARKCYAN = '\033[95m', '\033[96m', '\033[36m'
    BLUE, GREEN = '\033[94m', '\033[92m'
    YELLOW, RED = '\033[93m', '\033[91m'

    # Text attributes
    BOLD, UNDERLINE = '\033[1m', '\033[4m'

    # Reset to the terminal default
    END = '\033[0m'

# Load Data

In [None]:
# ---------- LOAD DATA ----------
import pickle
from pprint import pprint

# Define path
path = "/content/drive/MyDrive/Colab_Notebooks/Thesis/Data/"

# Print aspect categories
# The file is opened in a `with` block so the handle is closed right after
# loading instead of being left open for the rest of the session.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load pickles that were produced by this project.
with open(path + "reviews_per_aspect.pkl", "rb") as aspect_file:
    aspect_dict = pickle.load(aspect_file)
pprint({aspect: len(reviews) for (aspect, reviews) in aspect_dict.items()})

# Save data in dataframe
df_euans_total = pd.read_excel(path + 'reviews_per_aspect.xlsx')

# Preview of data
df_euans_total.head(3)

{'Access': 4751,
 'Access Statement': 276,
 'Accessibility': 6666,
 'Accessibility Guide': 457,
 'Accessible Performances': 294,
 'Anything else you wish to tell us?': 3401,
 'Assistance dog facilities': 548,
 'Awards List': 494,
 'COVID Precautions': 6666,
 'Overview': 5624,
 'Staff': 11311,
 'Toilets': 11291,
 'Transport & Parking': 4851,
 'Venue Manager responded to this review': 100}


Unnamed: 0,Aspect,Rating,Review,City,Country,Venue
0,Overview,5.0,Dobbies garden center has a large range of ite...,Perth,United Kingdom,https:||www.euansguide.com|venues|dobbies-gard...
1,Transport & Parking,5.0,"There is disabled parking close to the doors, ...",Perth,United Kingdom,https:||www.euansguide.com|venues|dobbies-gard...
2,Access,5.0,There is a lift and there is also a cafe where...,Perth,United Kingdom,https:||www.euansguide.com|venues|dobbies-gard...


# Cleaning Data


In [None]:
# --------- CLEAN DATA ---------
print(color.BOLD + "---> START CLEANING" + color.END)
# Create a real copy of the data so the loaded frame is never modified
df = df_euans_total.copy()

# Rename and drop columns
df = df.drop(columns=["City", "Country"])
df = df.rename(columns={"Review": "Text"})

# Drop irrelevant aspects
irrelevant_aspects = [
    'Anything else you wish to tell us?',
    'Venue Manager responded to this review',
    'COVID Precautions',
    'Accessibility Guide',
    'Awards List',
    'Access Statement',
]
df = df[~df['Aspect'].isin(irrelevant_aspects)]

# Drop NaN
df = df[df["Text"].notna()]

# Some reviews contain: "A description about the access has not been added for this venue."
# These have a rating <=0.0
# Remove no description reviews
# .copy() makes the filtered result an independent frame, so the column
# assignments below do not raise SettingWithCopyWarning
df = df[df["Rating"] > 0.0].copy()

# Remove review if sentence count == 0
df["SentenceCount"] = df["Text"].apply(lambda text: len(sent_tokenize(text)))
df = df[df["SentenceCount"] != 0].copy()

# Take only the venue name: fifth '|'-separated part of the stored URL,
# with the last '-'-separated token dropped
df["Venue"] = df["Venue"].apply(lambda url: ' '.join(url.split('|')[4].split("-")[:-1]))

# Rating into Sentiment: ratings above 3.0 are positive, everything else negative
df["Sentiment"] = df["Rating"].map(lambda score: 'positive' if score > 3.0 else 'negative')
df['Label'] = df["Sentiment"].map({'positive': 1, 'negative': 0})

print("---> DONE CLEANING")
df.head(3)

[1m---> START CLEANING[0m
---> DONE CLEANING


Unnamed: 0,Aspect,Rating,Text,Venue,SentenceCount,Sentiment,Label
0,Overview,5.0,Dobbies garden center has a large range of ite...,dobbies garden centre perth,1,positive,1
1,Transport & Parking,5.0,"There is disabled parking close to the doors, ...",dobbies garden centre perth,2,positive,1
2,Access,5.0,There is a lift and there is also a cafe where...,dobbies garden centre perth,2,positive,1


# Exploratory Data Analysis

In [None]:
#----------EDA----------
print(color.BOLD + "---> START EDA" + color.END)
# Plot aspect distributions
review_counts = df['Aspect'].value_counts()
plt.figure(figsize=(8,4))
# The data vectors are passed as keywords: seaborn >= 0.12 no longer accepts
# x and y positionally
sns.barplot(x=review_counts.index, y=review_counts.values, alpha=0.8)
plt.ylabel('Number of Reviews', fontsize=12)
plt.xlabel('Aspect', fontsize=12)
plt.ylim(0, 10000)

print(color.BOLD + "Review EDA:" + color.END)
review_metrics = [["Total nr. of reviews:", review_counts.sum()],
                  ["Max nr. of reviews:", review_counts.max()],
                  ["Min nr. of reviews:", review_counts.min()],
                  ["Average nr. of reviews:", review_counts.mean()]]

# Header names shared by both tables in this cell
col_names = ["Metric", "Count"]

# Display table
print(tabulate(review_metrics, headers=col_names))

# Count nr. of sentences per review
print(color.BOLD + "Sentence EDA:" + color.END)
sentence_metrics = [["Total nr. of sentences:", df["SentenceCount"].sum()],
                    ["Max nr. of sentences:", df["SentenceCount"].max()],
                    ["Min nr. of sentences:", df["SentenceCount"].min()],
                    ["Average nr. of sentences:", df["SentenceCount"].mean()]]

# Display table
print(tabulate(sentence_metrics, headers=col_names))

# Mean sentence count of the reviews belonging to each aspect
sentence_counts = {
    aspect: df.loc[df["Aspect"] == aspect, "SentenceCount"].mean()
    for aspect in df["Aspect"].unique()
}

print(color.BOLD + "Average nr. of sentences per aspect" + color.END)
pprint(sentence_counts)

[1m---> START EDA[0m
[1mReview EDA:[0m
Metric                     Count
-----------------------  -------
Total nr. of reviews:    40024
Max nr. of reviews:       8921
Min nr. of reviews:       6724
Average nr. of reviews:   8004.8
[1mSentence EDA:[0m
Metric                            Count
-------------------------  ------------
Total nr. of sentences:    118947
Max nr. of sentences:          57
Min nr. of sentences:           1
Average nr. of sentences:       2.97189
[1mAverage nr. of sentences per aspect[0m
{'Access': 3.715902322811197,
 'Overview': 4.437731196054254,
 'Staff': 1.7686777920410783,
 'Toilets': 2.344735276621059,
 'Transport & Parking': 2.2722723944349523}




ImportError: ignored

<Figure size 576x288 with 1 Axes>

In [None]:
#----------EDA----------
# How many reviews each venue received
review_count_per_venue = df['Venue'].value_counts()

print(color.BOLD + "Review EDA:" + color.END)

# Header names for the summary table
col_names = ["Metric", "Count"]

# One row per summary statistic of the per-venue review counts
review_metrics_per_venue = [
    ["Total nr. of reviews:", int(review_count_per_venue.sum())],
    ["Max nr. of reviews:", review_count_per_venue.max()],
    ["Min nr. of reviews:", review_count_per_venue.min()],
    ["Average nr. of reviews:", review_count_per_venue.mean()],
]

# Display table
print(tabulate(review_metrics_per_venue, headers=col_names))

[1mReview EDA:[0m
Metric                         Count
-----------------------  -----------
Total nr. of reviews:    40024
Max nr. of reviews:        275
Min nr. of reviews:          1
Average nr. of reviews:      6.24107


In [None]:
# ---------- EDA ----------

# Count Ratings
print(color.BOLD + "Nr. of Reviews per Rating:" + color.END)
rating_counts = df['Rating'].value_counts()
plt.figure(figsize=(8,4))
# x and y are passed as keywords: seaborn >= 0.12 no longer accepts them positionally
ax = sns.barplot(x=rating_counts.index, y=rating_counts.values, alpha=0.8)
plt.ylabel('Number of Reviews', fontsize=12)
plt.xlabel('Rating', fontsize=12)
plt.ylim(0, 16000)
plt.show();

# Count Sentiments
print(color.BOLD + "Nr. of Reviews per Sentiment:" + color.END)
sentiment_count = df['Sentiment'].value_counts()
plt.figure(figsize=(8,4))
ax = sns.barplot(x=sentiment_count.index, y=sentiment_count.values, alpha=0.8)
plt.ylabel('Number of Reviews', fontsize=12)
plt.xlabel('Sentiment', fontsize=12)
plt.ylim(0, 35000)
plt.show();

[1mNr. of Reviews per Rating:[0m




ImportError: ignored

<Figure size 576x288 with 1 Axes>

[1mNr. of Reviews per Sentiment:[0m




ImportError: ignored

<Figure size 576x288 with 1 Axes>

# Split Data

In [None]:
import sklearn
from sklearn.model_selection import train_test_split

# One seed for both the shuffle and the validation split, so that a fresh
# kernel reproduces the same train / validation / test partition
SEED = 2020

# Shuffle data (seeded; an unseeded shuffle gives a different split on every run)
df_shuffle = df.sample(frac=1, random_state=SEED)

# Train data: the first 80% of the shuffled rows
n_train = int(df.shape[0]*0.8)
train_data = df_shuffle.iloc[:n_train]

X = train_data.Text.values
y = train_data.Label.values

# Hold out 10% of the training rows for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=SEED)

# Test data: the remaining 20% of the shuffled rows
test_data = df_shuffle.iloc[n_train:]

# Keep important columns
test_data = test_data[['Aspect', 'Text', 'Label']]

# Collects one classification report per model, keyed by model name
results = dict()

In [None]:
# Display the training split (80% of the shuffled, cleaned reviews)
train_data

Unnamed: 0,Aspect,Rating,Text,Venue,SentenceCount,Sentiment,Label
3708,Toilets,4.0,"Again, very spacious. The only issue is that t...",ikea glasgow,2,positive,1
37071,Overview,4.5,We had a lovely visit to the Bluebell in Helps...,the bluebell peterborough,4,positive,1
51429,Toilets,3.0,One large accessible toilet off cafe area.,chapter cardiff,1,negative,0
6177,Staff,4.0,Staff were helpful.,marks spencer outlet edinburgh,1,positive,1
19713,Staff,4.0,"Helpful enough, but there was no one obvious u...",royal west of england academy bristol,1,positive,1
...,...,...,...,...,...,...,...
29984,Access,5.0,Entrance level with automatic door. Lift to al...,travelodge morecambe hotel arndale centre,6,positive,1
4482,Toilets,3.5,There were accessible toilets at each entrance...,royal botanic garden edinburgh,1,positive,1
39809,Access,4.5,Paths around the park are smooth and mostly le...,queen elizabeth olympic park london,7,positive,1
56172,Transport & Parking,5.0,There is a pay-and-display car park Just acros...,ada meze kitchen newport,2,positive,1


In [None]:
# Save files
# The two DataFrame splits are written as they are
train_data.to_csv(path + 'train_data.csv')
test_data.to_csv(path + 'test_data.csv')

# The array splits are wrapped in a DataFrame first so they can be written with to_csv
for split_name, split_values in (('X_train', X_train), ('X_val', X_val),
                                 ('y_train', y_train), ('y_val', y_val)):
    pd.DataFrame(split_values).to_csv(path + split_name + '.csv')

# Sentiment Analysis

## Baseline Model: Machine Learning

In [None]:
# Pre-processing
from functools import lru_cache


@lru_cache(maxsize=None)
def _removable_stopwords():
    """Return the English stop words to drop, i.e. all of them except "not" and "can".

    The NLTK list is loaded once and cached as a set, instead of being
    re-read and linearly searched for every single word.
    """
    return frozenset(stopwords.words('english')) - {'not', 'can'}


def text_preprocessing(s):
    """
    Normalise one review text for the bag-of-words models.

    - Lowercase the sentence
    - Change "'t" to "not"
    - Remove "@name" mentions
    - Isolate and remove punctuations except "?"
    - Remove other special characters
    - Remove stop words except "not" and "can"
    - Remove trailing whitespace

    :param s: raw text (str)
    :return: cleaned text (str)
    """
    s = s.lower()
    # Change 't to 'not'
    s = re.sub(r"\'t", " not", s)
    # Remove @name
    s = re.sub(r'(@.*?)[\s]', ' ', s)
    # Isolate and remove punctuations except '?'
    s = re.sub(r'([\'\"\.\(\)\!\?\\\/\,])', r' \1 ', s)
    s = re.sub(r'[^\w\s\?]', ' ', s)
    # Remove some special characters
    s = re.sub(r'([\;\:\|•«\n])', ' ', s)
    # Remove stopwords except 'not' and 'can'
    removable = _removable_stopwords()
    s = " ".join(word for word in s.split() if word not in removable)
    # Remove trailing whitespace
    s = re.sub(r'\s+', ' ', s).strip()

    return s

In [None]:
# Clean every training and validation review (author's timing note: 3.5 min)
X_train_preprocessed = np.array(list(map(text_preprocessing, X_train)))
X_val_preprocessed = np.array(list(map(text_preprocessing, X_val)))

In [None]:
%%time
# MultinomialNB, alpha=1.0
print(color.BOLD + "Naive Bayes (tf-idf)" + color.END)
nb = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB(alpha=1.0))
                ])
nb.fit(X_train_preprocessed, y_train)
y_pred = nb.predict(X_val_preprocessed)
#plot_confusion_matrix(nb, X_val_preprocessed, y_val)

metrics_nb = classification_report(y_val, y_pred, output_dict=True)
results["NB(tfidf)"] = metrics_nb
pprint(metrics_nb)

[1mNaive Bayes (tf-idf)[0m
{'0': {'f1-score': 0.01188707280832095,
       'precision': 1.0,
       'recall': 0.005979073243647235,
       'support': 669},
 '1': {'f1-score': 0.8839644041179551,
       'precision': 0.792057535959975,
       'recall': 1.0,
       'support': 2533},
 'accuracy': 0.792317301686446,
 'macro avg': {'f1-score': 0.447925738463138,
               'precision': 0.8960287679799874,
               'recall': 0.5029895366218237,
               'support': 3202},
 'weighted avg': {'f1-score': 0.7017596150342119,
                  'precision': 0.8355033537122475,
                  'recall': 0.792317301686446,
                  'support': 3202}}
CPU times: user 2.14 s, sys: 47.6 ms, total: 2.19 s
Wall time: 2.64 s


In [None]:
%%time
# SVM, kernel=linear
print(color.BOLD + "Support Vector Machines (tf-idf)" + color.END)
clf = Pipeline([
                ('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', svm.SVC(kernel='linear'))
                ])
clf.fit(X_train_preprocessed, y_train)
y_pred = clf.predict(X_val_preprocessed)
#plot_confusion_matrix(clf, X_val_preprocessed, y_val)

metrics_svm = classification_report(y_val, y_pred, output_dict=True)
results["SVM(tfidf)"] = metrics_svm
pprint(metrics_svm)

[1mSupport Vector Machines (tf-idf)[0m
{'0': {'f1-score': 0.29820627802690586,
       'precision': 0.5964125560538116,
       'recall': 0.19880418535127056,
       'support': 669},
 '1': {'f1-score': 0.8864296081277214,
       'precision': 0.8200738502853306,
       'recall': 0.9644690090801421,
       'support': 2533},
 'accuracy': 0.8044971892567145,
 'macro avg': {'f1-score': 0.5923179430773137,
               'precision': 0.7082432031695711,
               'recall': 0.5816365972157064,
               'support': 3202},
 'weighted avg': {'f1-score': 0.7635309798212112,
                  'precision': 0.7733438671994823,
                  'recall': 0.8044971892567145,
                  'support': 3202}}
CPU times: user 2min 41s, sys: 379 ms, total: 2min 41s
Wall time: 2min 40s


In [None]:
%%time
# LogReg
print(color.BOLD + "Logistic Regression (tf-idf)" + color.END)
logreg = Pipeline([
                ('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5, solver='sag'))
                ])
logreg.fit(X_train_preprocessed, y_train)
y_pred = logreg.predict(X_val_preprocessed)
#plot_confusion_matrix(logreg, X_val_preprocessed, y_val)

metrics_logreg = classification_report(y_val, y_pred, output_dict=True)
results["LogReg(tfidf)"] = metrics_logreg
pprint(metrics_logreg)

[1mLogistic Regression (tf-idf)[0m
{'0': {'f1-score': 0.4065934065934066,
       'precision': 0.428099173553719,
       'recall': 0.38714499252615847,
       'support': 669},
 '1': {'f1-score': 0.8526315789473684,
       'precision': 0.8421255294570659,
       'recall': 0.8634030793525463,
       'support': 2533},
 'accuracy': 0.763897564022486,
 'macro avg': {'f1-score': 0.6296124927703874,
               'precision': 0.6351123515053925,
               'recall': 0.6252740359393524,
               'support': 3202},
 'weighted avg': {'f1-score': 0.7594399682962751,
                  'precision': 0.7556222090013073,
                  'recall': 0.763897564022486,
                  'support': 3202}}
CPU times: user 2.91 s, sys: 14 ms, total: 2.92 s
Wall time: 2.9 s




The SVM model reaches the highest validation accuracy of the three tf-idf models, so we use SVM as the baseline and check whether Doc2Vec features can improve it.

In [None]:
# Pipeline with Doc2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

def label_sentences(corpus, label_type):
    """
    Wrap every document of a corpus in a gensim TaggedDocument, as Doc2Vec
    needs a tag for each document.

    Each document is split on whitespace into tokens and receives the single
    tag "<label_type>_<i>", where "i" is the position of the document in the
    corpus.
    :param corpus: iterable of document strings
    :param label_type: string prefix of the tags
    :return: list of TaggedDocument objects, in corpus order
    """
    return [
        TaggedDocument(document.split(), [label_type + '_' + str(position)])
        for position, document in enumerate(corpus)
    ]

def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """
    Collect the document vectors of a trained Doc2Vec model into one matrix.

    :param model: Trained Doc2Vec model; its document vectors are read from
                  ``model.dv`` (gensim >= 4) or, when that attribute is
                  missing, from the older ``model.docvecs``
    :param corpus_size: Number of documents to fetch
    :param vectors_size: Size of the embedding vectors
    :param vectors_type: Tag prefix of the documents; document i is looked up
                         under the tag "<vectors_type>_<i>"
    :return: numpy array of shape (corpus_size, vectors_size), row i holding
             the vector of document i
    """
    # gensim 4 renamed `docvecs` to `dv`; prefer the new name and fall back to
    # the old one so the function works with both generations
    doc_vectors = model.dv if hasattr(model, 'dv') else model.docvecs

    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(corpus_size):
        vectors[i] = doc_vectors[vectors_type + '_' + str(i)]
    return vectors

def doc2vec_preprocessing(text):
    """
    Normalise a text before it is tokenised for Doc2Vec.

    Steps, in order: cast to str and lowercase, turn newlines into spaces,
    drop the "(translated by google)" marker, remove URLs, expand "n't" and
    "'re" (straight or curly apostrophe), strip all remaining punctuation and
    collapse runs of spaces.

    :param text: input text; any object is accepted and converted with str()
    :return: cleaned text (str); a leading or trailing single space may remain
    """
    text = str(text).lower()
    # Newlines become a space so that words on adjacent lines are not fused
    text = text.replace("\n", " ")
    text = text.replace("(translated by google)", '')        # Remove (translated by google)
    # URLs are removed while "://" is still present, i.e. before the
    # punctuation is stripped; this also catches a URL at the end of the text
    text = re.sub(r"https?://\S+", "", text)
    # Contractions, with either the straight (') or the curly (’) apostrophe
    text = re.sub(r"n['’]t\b", " not", text)                  # Change n't to not
    text = re.sub(r"['’]re\b", " are", text)                  # Change 're to are
    # Remove punctuation (this also removes any remaining apostrophes)
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse multiple spaces last, after every removal that can create them
    text = re.sub(" +", " ", text)

    return text

In [None]:
%%time
# Vectorization Doc2Vec
from sklearn import utils
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

X_train_doc2vec = [doc2vec_preprocessing(text) for text in X_train_preprocessed]
X_val_doc2vec = [doc2vec_preprocessing(text) for text in X_val_preprocessed]

print(color.BOLD + "Support Vector Machines with Doc2Vec" + color.END)
X_train_label = label_sentences(X_train_doc2vec, 'Train')
X_val_label = label_sentences(X_val_doc2vec, 'Val')
all_data = X_train_label + X_val_label

model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=1, alpha=0.065, min_alpha=0.065)
model_dbow.build_vocab([x for x in tqdm(all_data)])

for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha
    
train_vectors_dbow = get_vectors(model_dbow, len(X_train_label), 300, 'Train')
val_vectors_dbow = get_vectors(model_dbow, len(X_val_label), 300, 'Val')

[1mSupport Vector Machines with Doc2Vec[0m


100%|██████████| 32019/32019 [00:00<00:00, 2944150.38it/s]
100%|██████████| 32019/32019 [00:00<00:00, 2324812.09it/s]
100%|██████████| 32019/32019 [00:00<00:00, 2775428.20it/s]
100%|██████████| 32019/32019 [00:00<00:00, 2884456.71it/s]
100%|██████████| 32019/32019 [00:00<00:00, 2537840.05it/s]
100%|██████████| 32019/32019 [00:00<00:00, 2556877.23it/s]
100%|██████████| 32019/32019 [00:00<00:00, 2994168.06it/s]
100%|██████████| 32019/32019 [00:00<00:00, 2359365.08it/s]
100%|██████████| 32019/32019 [00:00<00:00, 2199397.65it/s]
100%|██████████| 32019/32019 [00:00<00:00, 2391548.75it/s]
100%|██████████| 32019/32019 [00:00<00:00, 2514038.45it/s]
100%|██████████| 32019/32019 [00:00<00:00, 2380315.84it/s]
100%|██████████| 32019/32019 [00:00<00:00, 2554688.50it/s]
100%|██████████| 32019/32019 [00:00<00:00, 2911723.43it/s]
100%|██████████| 32019/32019 [00:00<00:00, 2461417.86it/s]
100%|██████████| 32019/32019 [00:00<00:00, 2231076.52it/s]
100%|██████████| 32019/32019 [00:00<00:00, 2521922.55it/

CPU times: user 2min 27s, sys: 25 s, total: 2min 52s
Wall time: 1min 36s


In [None]:
# SVM, kernel=linear
print(color.BOLD + "Support Vector Machines with Doc2Vec" + color.END)

# Linear-kernel SVM fitted on the Doc2Vec document vectors
clf = SVC(kernel='linear').fit(train_vectors_dbow, y_train)
y_pred = clf.predict(val_vectors_dbow)

# Validation report as a nested dict, also stored in `results` under the model name
metrics_svm_doc2vec = classification_report(y_val, y_pred, output_dict=True)
results["SVM(doc2vec)"] = metrics_svm_doc2vec
pprint(metrics_svm_doc2vec)

[1mSupport Vector Machines with Doc2Vec[0m




{'0': {'f1-score': 0.0, 'precision': 0.0, 'recall': 0.0, 'support': 669},
 '1': {'f1-score': 0.8833478639930253,
       'precision': 0.7910680824484697,
       'recall': 1.0,
       'support': 2533},
 'accuracy': 0.7910680824484697,
 'macro avg': {'f1-score': 0.44167393199651267,
               'precision': 0.39553404122423486,
               'recall': 0.5,
               'support': 3202},
 'weighted avg': {'f1-score': 0.6987883009039142,
                  'precision': 0.6257887110686988,
                  'recall': 0.7910680824484697,
                  'support': 3202}}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


ImportError: ignored

<Figure size 432x288 with 2 Axes>

In [None]:
model_names = ['NB (tfidf)', 'SVM (tf-idf)', 'LogReg (tf-idf)', 'SVM (doc2vec)']

# One row per model, one column per entry of its classification report
results_df = pd.DataFrame(results).transpose()

# The context manager writes and closes the workbook on exit;
# ExcelWriter.save() no longer exists in pandas 2.x
with pd.ExcelWriter('/content/drive/MyDrive/Colab_Notebooks/Thesis/Results/results_from_validation.xlsx') as writer:
    results_df.to_excel(writer, sheet_name="sentiment_analysis")