In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.svm import LinearSVC
import pandas as pd
import plotly.express as px

In [47]:
reviews = pd.read_csv('../data/electronics_reviews_cleaned_50k.csv')

vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),  # We tell the algorithm to add to its vocabulary all the single words (unigrams) and pairs of consecutive words (bigrams)
    # For exemple let's take a comment in our document : "it serves my needs quite well". In this example, if we only use unigrams,
    # our vocabulary will only contain for example "quite" and "well" as two different words. But if we also use bigrams,
    # our vocabulary will also contain the expression "quite well" as a single entity, which can be very useful.
    stop_words='english', # remove common English stop words
    min_df=3, # ignore terms that appear in less than 3 documents
    max_df=0.8 # ignore terms that appear in more than 80% of the documents
)

# Well, our vectorizer seems ready, let's fit it to our reviews. But before, it is important to note that the Sklearn's
# TfidfVectorizer will do the tokenization for us, so we don't need to do it manually.

In [48]:
X = vectorizer.fit_transform(reviews['reviewText'])
print(f"So now we still have the same number of documents: {X.shape[0]}, but our vocabulary has grown to {X.shape[1]} unique terms.")
# So now we have our features matrix X, where each row represents a review and each column represents a term from our vocabulary.

So now we still have the same number of documents: 49981, but our vocabulary has grown to 155419 unique terms.


In [49]:
vectorizer.vocabulary_

{'got': 52575,
 'gps': 53076,
 'husband': 60203,
 'otr': 91137,
 'road': 114977,
 'trucker': 138978,
 'impressed': 61013,
 'shipping': 120639,
 'time': 136135,
 'arrived': 4958,
 'days': 30518,
 'earlier': 38470,
 'expected': 42118,
 'week': 149831,
 'use': 142323,
 'started': 128761,
 'freezing': 49131,
 'just': 65071,
 'glitch': 51103,
 'unit': 140830,
 'worked': 152723,
 'great': 53285,
 'work': 152070,
 'normal': 88193,
 'person': 94905,
 'does': 35356,
 'option': 90545,
 'big': 10259,
 'truck': 138971,
 'routes': 115763,
 'tells': 134049,
 'scale': 117360,
 'coming': 24364,
 'ect': 39429,
 'love': 76718,
 'bigger': 10453,
 'screen': 117542,
 'ease': 38735,
 'putting': 106091,
 'addresses': 1761,
 'memory': 79927,
 'really': 109442,
 'bad': 6785,
 'say': 116940,
 'exception': 41795,
 'probably': 102385,
 'million': 80739,
 'luck': 77264,
 'contacted': 27310,
 'seller': 118639,
 'minutes': 81010,
 'email': 39901,
 'received': 110251,
 'instructions': 62851,
 'exchange': 41834,
 'way

In [52]:
# Now let's define our target variable y and split our data into a train_test set while assuring a good distribution of the target variable in both sets.
y = reviews['overall']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Train set size: {X_train.shape[0]} samples"
      f"\nTest set size: {X_test.shape[0]} samples"
      f"\nOverall distribution in the whole dataset:\n{y.value_counts(normalize=True)}"
      f"\nOverall distribution in the train set:\n{y_train.value_counts(normalize=True)}"
      f"\nOverall distribution in the test set:\n{y_test.value_counts(normalize=True)}")

Train set size: 39984 samples
Test set size: 9997 samples
Overall distribution in the whole dataset:
overall
5    0.618515
4    0.202177
3    0.079330
1    0.056722
2    0.043256
Name: proportion, dtype: float64
Overall distribution in the train set:
overall
5    0.618522
4    0.202181
3    0.079332
1    0.056723
2    0.043242
Name: proportion, dtype: float64
Overall distribution in the test set:
overall
5    0.618486
4    0.202161
3    0.079324
1    0.056717
2    0.043313
Name: proportion, dtype: float64


In [59]:
classifier = LinearSVC(random_state = 1)

classifier.fit(X_train, y_train)
# Let's for now already check the accuracy on the train set
train_accuracy = classifier.score(X_train, y_train)
print(f"Train set accuracy: {train_accuracy:.4f}")

Train set accuracy: 0.9912


In [60]:
prediction = classifier.predict(X_test)
test_accuracy = accuracy_score(y_test, prediction)
print(f"Test set accuracy: {test_accuracy:.4f}")

Test set accuracy: 0.6548


In [66]:
# Now let's conclude about our first model.
confusion_matrix_report = classification_report(y_test, prediction)
print("Classification Report:\n", confusion_matrix_report)


Classification Report:
               precision    recall  f1-score   support

           1       0.58      0.43      0.49       567
           2       0.32      0.09      0.14       433
           3       0.34      0.14      0.20       793
           4       0.39      0.25      0.31      2021
           5       0.72      0.91      0.81      6183

    accuracy                           0.65      9997
   macro avg       0.47      0.36      0.39      9997
weighted avg       0.60      0.65      0.61      9997



In [None]:
# Interpretation for the 5-grade class:
# Precision of 72% : Out of all the reviews that were predicted to be 5-star reviews, 72% of them were actually 5-star reviews.
# Recall of 85% : Out of all the actual 5-star reviews, 85% of them were correctly identified by the model.
# F1-score of 78% : This score indicates a good balance between precision and recall

# So the model did well on the 5-star reviews, but the conclusion is not the same for the other classes.
# We can see that the precision, recall and f1-score are quite low for the 1-star, 2-star, 3-star and 4-star classes.