In [70]:
import sys
sys.path.append('../../NLP_analysis/')


In [72]:
import pandas as pd
import plotly.express as px
from Scripts.Preprocesing import pipeline
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB

In [34]:
# this cells does not need to be run again as the results are saved in the csv file
# df = pd.read_csv('data/tripadvisor_hotel_reviews.csv')
# df["cleaned_text"] = pipeline.transform(df["Review"].values)
# df.to_csv("data/cleaned_spellings_Restaurant_reviews.csv", index=False)

In [73]:
df = pd.read_csv('../data/complete_cleaned_spellings_Restaurant_reviews.csv')

In [74]:
X = df['cleaned_text']
y = df['Rating']
y

0       5
1       5
2       5
3       5
4       5
       ..
9782    5
9783    5
9784    5
9785    3
9786    4
Name: Rating, Length: 9787, dtype: int64

While we know TD-IDF will be better, let's try the CountVectorizer first
* 1）Count Vectors + MultinomialNB()

In [75]:
vectorizer = CountVectorizer()
train_test = vectorizer.fit_transform(X)

In [76]:
bag_of_words_df = pd.DataFrame(train_test.toarray())

In [77]:
label_encoder = LabelEncoder()
df['Rating_enc'] = label_encoder.fit_transform(y)

In [78]:
X =bag_of_words_df 
y = df['Rating_enc']

test_size=0.2
random_state=42

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

In [79]:
clf1 = MultinomialNB()
clf1.fit(X_train,y_train)

In [80]:
val_pred1 = clf1.predict(X_test)
class_labels = ["1", "2", "3","4","5"]


In [81]:
confusion_matrix_kwargs = dict(
    text_auto=True,
    title="Confusion Matrix", width=1000, height=800,
    labels=dict(x="Predicted", y="True Label"),
    x=class_labels,
    y=class_labels,
    color_continuous_scale='Blues'
)

def report(y_true, y_pred, class_labels):
    print(classification_report(y_true, y_pred, target_names=class_labels))
    confusion_matrix_data = confusion_matrix(y_true, y_pred)
    fig = px.imshow(
        confusion_matrix_data,
        **confusion_matrix_kwargs
        )
    fig.show()

In [82]:
report(y_test, val_pred1, class_labels)

              precision    recall  f1-score   support

           1       0.71      0.80      0.75       356
           2       0.50      0.02      0.04       141
           3       0.40      0.16      0.22       238
           4       0.42      0.59      0.49       465
           5       0.72      0.77      0.74       758

    accuracy                           0.60      1958
   macro avg       0.55      0.47      0.45      1958
weighted avg       0.59      0.60      0.57      1958



This classifier performs differently on different classes, with some classes 1,5 performing better and others 2,3,4 performing worse. The overall accuracy is 60%, so next we will try to use TF-IDF

In [96]:
class TextClassifier:
    
    def __init__(self, model, vectorizer=None):
        self.model = model 
        self.vectorizer = vectorizer 
        self.X_train_tf = None
        self.X_test_tf = None
        
    def train(self, X_train, y_train):
        self.X_train_tf = self.vectorizer.fit_transform(X_train)
        self.model.fit(self.X_train_tf, y_train)
        
    def evaluate(self, X_test, y_test):
        self.X_test_tf = self.vectorizer.transform(X_test)
        
        y_pred = self.model.predict(self.X_test_tf)
        acc = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        
        print(f"Accuracy: {acc}")
        print(report)
        confusion_matrix_kwargs = dict(
            text_auto=True,
            title="Confusion Matrix", width=1000, height=800,
            labels=dict(x="Predicted", y="True Label"),
            x=class_labels,
            y=class_labels,
            color_continuous_scale='Blues'
        )
        confusion_matrix_data = confusion_matrix(y_test, y_pred)
        fig = px.imshow(
            confusion_matrix_data,
            **confusion_matrix_kwargs
            )
        fig.show()
        
    
    def predict(self, text):
        text_tf = self.vectorizer.transform([text])
        return self.model.predict(text_tf)[0]

In [97]:
X = df['cleaned_text']
y = df['Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


model = TextClassifier(model=LogisticRegression(max_iter=1000), vectorizer=TfidfVectorizer(max_features=5000))
model.train(X_train, y_train)
model.evaluate(X_test, y_test)


Accuracy: 0.6205311542390194
              precision    recall  f1-score   support

           1       0.72      0.85      0.78       356
           2       0.31      0.03      0.05       141
           3       0.42      0.24      0.31       238
           4       0.47      0.50      0.48       465
           5       0.69      0.82      0.75       758

    accuracy                           0.62      1958
   macro avg       0.52      0.49      0.47      1958
weighted avg       0.58      0.62      0.59      1958



First of all, our accuracy rate is 62% which means that 62% of our predictions are correct. It's not a great score, but maybe the complexity of the data and the amount of data we have isn't enough to get a better score.

An important fact to note is that recall rates in categories 2 and 3 are very low. This means that the model cannot distinguish between 5 categories. It mainly classifies the data in the most representative categories for example, category 1 has a high recall score meaning that most 1s are classified as 1s, but the precious is 71% so this means that the model classifies a lot as 1s, but there are also a lot of false positives. This is a big sign of underfitting: the model can't classify data in class 2 or class 3, so it just puts them in the category that is most likely to be correct.

The low recall for Class 2 and Class 3 confirms our idea, as it shows that only 0.02% of Class 2 is classified as Class 2 (precious 0.31, so the model not only forgets Class 2, but also badly classifies the small amount of data it classifies there).

What we want to do is try to improve the second and third types of recall. We will focus on this in the next notebook, even though improved accuracy is recommended