In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from joblib import dump

import re
import string
import numpy as np
import contractions
from num2words import num2words
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from tqdm import tqdm

# Support Vector machine

In [2]:
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

The same clean_text preprocessing function as used in the data preprocessing notebook. This is so we can type our own reviews and see what we get.

In [3]:
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Expand contractions
    text = contractions.fix(text)
    
    # Remove HTML tags
    text = re.sub('<[^<]+?>', '', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    
    # Replace special characters with their ASCII equivalent
    text = text.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Replace numbers with their written form
    words = []
    for word in text.split():
        if word.isdigit():
            words.append(num2words(int(word)))
        else:
            words.append(word)
    text = ' '.join(words)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    
    # Remove stop words
    tokens = [token for token in tokens if token not in stop_words]
    
    # Stem the words
    tokens = [stemmer.stem(token) for token in tokens]
    
    # Join the tokens back into a string
    preprocessed_text = " ".join(tokens)
    
    return text

In [4]:
df = pd.read_csv('../Data/preprocessed_data.csv')

In [5]:
df.isnull().mean()

overall              0.0
reviewText           0.0
asin                 0.0
preprocessed_text    0.0
dtype: float64

In [6]:
df.dropna(inplace=True)

We take an even sample from the dataset so that the model does not bias 5 star reviews. Taking a smaller sample also reduces the training time for the model.

In [7]:
df_sampled = df.groupby('overall').apply(lambda x: x.sample(n=10000, random_state=42)).reset_index(drop=True)
df_sampled.head()

Unnamed: 0,overall,reviewText,asin,preprocessed_text
0,1,It fits fine on Char Broil tru infrared BBQ pi...,B00005MF8V,fit fine char broil tru infrar bbq pit materi ...
1,1,We had a 20K Generac and what a diseaster! The...,B00C2LV4H8,20k generac diseast unit six year old less six...
2,1,I just bought this gas can because my new lawn...,B001QCWQUS,bought gas new lawn mower arriv today need bes...
3,1,This piece of cheaply made piece of junk gives...,B00004TBJI,piec cheapli made piec junk give new mean rink...
4,1,"Great cover, held up during a rough winter las...",B000WEMG2O,great cover held rough winter last year damag ...


In [8]:
X = df_sampled['preprocessed_text']
y = df_sampled['overall']

We are using a Term Frequency-Inverse Document Frequency vectorizer so the model can understand which words are important even if they dont appear frequently within each individual review. We also use a Radial Basis Function kernel as it yielded the best results.

In [9]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
clf = SVC(kernel="rbf", C=1, gamma='scale')
clf.fit(X_train, y_train)

We get an accuracy of 50%

In [12]:
y_pred = clf.predict(X_test)

In [13]:
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.5044


In [14]:
def predict_rating(review):
    preprocessed_text = clean_text(review)
    X = vectorizer.transform([preprocessed_text])
    y_pred = clf.predict(X)
    return y_pred[0]

In [15]:
predict_rating("Good product, works well. However, broke after only 1 use. I'm happy")

2

In [16]:
tqdm.pandas()
df['svm_rating'] = df['preprocessed_text'].progress_apply(predict_rating)
df.head()

100%|████████████████████████████████████████████████████████████████████████| 179642/179642 [1:10:58<00:00, 42.18it/s]


Unnamed: 0,overall,reviewText,asin,preprocessed_text,svm_rating
0,5,I don't spend a lot on my flags because they r...,9539723809,spend lot flag realli get beat lesser qualiti ...,5
1,5,A very dear friend of mine is slowly losing he...,B00000JSZH,dear friend mine slowli lose sight pen make po...,5
2,5,This is absolutely exquisite! It's made of car...,B00000JSZH,absolut exquisit made cardboard like descript ...,5
3,4,"This is really nice to use, however, just not ...",B00000JSZH,realli nice use howev color saddl shimmer give...,3
4,5,This Angel is beautiful. I as so glad I chose ...,B00000JSZH,angel beauti glad chose one even beauti look o...,5


In [17]:
df.describe()

Unnamed: 0,overall,svm_rating
count,179642.0,179642.0
mean,4.088621,3.703176
std,1.307608,1.393932
min,1.0,1.0
25%,4.0,3.0
50%,5.0,4.0
75%,5.0,5.0
max,5.0,5.0


In [18]:
df.to_csv('../Data/svm.csv', index=False)
dump(clf, '../Models/svm_clf.joblib')
dump(vectorizer, '../Models/svm_vectorizer.joblib')

['../Models/svm_vectorizer.joblib']