In [1]:
import re
import string
import numpy as np
import contractions
from num2words import num2words
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

import joblib

from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd

import tqdm
from tqdm import tqdm

In [2]:
# Shared NLP resources, built once and reused by clean_text on every call.
# Snowball stemmer configured for English; reduces tokens to their stems.
stemmer = SnowballStemmer('english')
# English stop words kept in a set so membership checks are constant-time.
stop_words = set(stopwords.words('english'))

# Vader text cleaning

In [3]:
def clean_text_vader(text):
    """Clean a raw review for the VADER analyser while keeping letter case.

    Steps, in order: expand contractions, drop HTML tags, collapse runs of
    whitespace, map curly quotes to ASCII quotes, strip ASCII punctuation,
    spell out stand-alone integers, then strip punctuation once more.

    Args:
        text: Raw review string.

    Returns:
        The cleaned string. Case is left untouched; no stop-word removal or
        stemming is applied.
    """
    # Expand contractions
    text = contractions.fix(text)

    # Remove HTML tags
    text = re.sub('<[^<]+?>', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)

    # Replace special characters with their ASCII equivalent
    text = text.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')

    # Build the punctuation-stripping table once; it is applied twice below.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    # Remove punctuation
    text = text.translate(strip_punctuation)

    # Replace numbers with their written form.
    # isdecimal() is used instead of isdigit(): isdigit() is also True for
    # characters such as '²' or '①' that int() cannot parse, which would
    # raise ValueError on otherwise harmless input.
    words = [
        num2words(int(word)) if word.isdecimal() else word
        for word in text.split()
    ]
    text = ' '.join(words)

    # Remove punctuation again: the spelled-out numbers can themselves
    # contain hyphens and commas (e.g. "twenty-one").
    text = text.translate(strip_punctuation)

    return text

# Machine learning text cleaning

In [4]:
def clean_text(text):
    """Normalise a raw review for the machine-learning models.

    Steps, in order: lowercase, expand contractions, drop HTML tags,
    collapse whitespace, map curly quotes to ASCII quotes, strip ASCII
    punctuation, spell out stand-alone integers, strip punctuation again,
    tokenize, remove English stop words, and stem each remaining token.

    Args:
        text: Raw review string.

    Returns:
        The stemmed, stop-word-free tokens joined by single spaces.
    """
    # Convert to lowercase
    text = text.lower()

    # Expand contractions
    text = contractions.fix(text)

    # Remove HTML tags
    text = re.sub('<[^<]+?>', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)

    # Replace special characters with their ASCII equivalent
    text = text.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')

    # Build the punctuation-stripping table once; it is applied twice below.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    # Remove punctuation
    text = text.translate(strip_punctuation)

    # Replace numbers with their written form.
    # isdecimal() is used instead of isdigit(): isdigit() is also True for
    # characters such as '²' or '①' that int() cannot parse, which would
    # raise ValueError on otherwise harmless input.
    words = [
        num2words(int(word)) if word.isdecimal() else word
        for word in text.split()
    ]
    text = ' '.join(words)

    # Remove punctuation again: the spelled-out numbers can themselves
    # contain hyphens and commas (e.g. "twenty-one").
    text = text.translate(strip_punctuation)

    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # Remove stop words and stem what is left
    tokens = [stemmer.stem(token) for token in tokens if token not in stop_words]

    # Join the tokens back into a string
    preprocessed_text = " ".join(tokens)

    # Return the fully processed string. Previously `text` was returned here,
    # which silently discarded the tokenize / stop-word / stemming steps.
    # NOTE(review): confirm the saved vectorizers/models were fitted on text
    # produced by this same pipeline (stop words removed + stemmed).
    return preprocessed_text

# Vader

In [5]:
def catagorise_data(sentiment):
    """Bucket a sentiment score into an integer category from 1 to 5.

    Mapping:
        [-1.0, -0.5) -> 1
        [-0.5,  0.0) -> 2
        exactly 0    -> 3
        ( 0.0,  0.5) -> 4
        [ 0.5,  1.0] -> 5

    Args:
        sentiment: Numeric score.

    Returns:
        The category as an int, or None when the score is outside [-1, 1]
        (this includes NaN, for which every comparison is False).
    """
    # Anything outside the closed interval [-1, 1] has no category.
    if not -1 <= sentiment <= 1:
        return None

    # Exact neutrality gets the middle bucket.
    if sentiment == 0:
        return 3

    # Negative half: split at -0.5 (the boundary belongs to bucket 2).
    if sentiment < 0:
        return 1 if sentiment < -0.5 else 2

    # Positive half: split at 0.5 (the boundary belongs to bucket 5).
    return 4 if sentiment < 0.5 else 5

In [6]:
# Single VADER analyser instance reused for every call to vader_rating.
sid = SentimentIntensityAnalyzer()

def vader_rating(review):
    """Predict a rating category for a review using VADER's compound score.

    The review is cleaned with clean_text_vader, the VADER-specific cleaner,
    rather than the machine-learning cleaner clean_text: clean_text
    lowercases the text, which removes the capitalisation that VADER uses as
    an intensity cue.

    Args:
        review: Raw review string.

    Returns:
        Whatever catagorise_data returns for the compound score: an int from
        1 to 5, or None if the score falls outside [-1, 1].
    """
    preprocessed_text = clean_text_vader(review)
    scores = sid.polarity_scores(preprocessed_text)
    return catagorise_data(scores['compound'])

# SVM

In [7]:
# Load the persisted SVM classifier and its matching vectorizer.
# joblib.load unpickles the files, so only load artifacts you trust.
svm_model = joblib.load('../Models/svm_clf.joblib')
svm_vectorizer = joblib.load('../Models/svm_vectorizer.joblib')

def svm_rating(review):
    """Predict a rating for one review with the loaded SVM model.

    Args:
        review: Raw review string; it is cleaned with clean_text first.

    Returns:
        The first (and only) element of the model's prediction array.
    """
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = svm_vectorizer.transform([clean_text(review)])
    return svm_model.predict(features)[0]

# Naive Bayes

In [8]:
# Load the persisted Naive Bayes classifier and its matching vectorizer.
# joblib.load unpickles the files, so only load artifacts you trust.
bayes_model = joblib.load('../Models/bayes_clf.joblib')
bayes_vectorizer = joblib.load('../Models/bayes_vectorizer.joblib')

def bayes_rating(review):
    """Predict a rating for one review with the loaded Naive Bayes model.

    Args:
        review: Raw review string; it is cleaned with clean_text first.

    Returns:
        The first (and only) element of the model's prediction array.
    """
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = bayes_vectorizer.transform([clean_text(review)])
    return bayes_model.predict(features)[0]

# Logistic Regression

In [9]:
# Load the persisted model. joblib.load unpickles the file, so only load
# artifacts you trust.
logistic_model = joblib.load('../Models/regression.joblib')

def logistic_rating(review):
    """Predict a rating for one review with the loaded model, limited to 1..5.

    The cleaned string is passed to the model directly, with no separate
    vectorizer, so the loaded object presumably bundles its own feature
    extraction (TODO confirm).

    Args:
        review: Raw review string; it is cleaned with clean_text first.

    Returns:
        The model's first prediction after np.clip(..., 1, 5).
    """
    raw_prediction = logistic_model.predict([clean_text(review)])[0]
    # Keep the result inside the 1-5 rating scale.
    return np.clip(raw_prediction, 1, 5)

# Comparison

In [10]:
# Ask for one review, then print every model's prediction for it.
review = input("Enter a review: ")

print('\n')

# (label, rating function) pairs, evaluated and printed in this order.
raters = (
    ('Vader rating:', vader_rating),
    ('SVM rating:', svm_rating),
    ('Naive Bayes rating:', bayes_rating),
    ('Logistic Regression rating:', logistic_rating),
)
for label, rate in raters:
    print(label, rate(review), '\n')

Enter a review: It was ok


Vader rating: 4 

SVM rating: 3 

Naive Bayes rating: 3 

Logistic Regression rating: 3 

