In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the reviews file, keeping only the review text and its star rating.
df = pd.read_csv(
    "chrome_reviews.csv",
    usecols=["Text", "Star"],
)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Text,Star
0,This is very helpfull aap.,5
1,Good,3
2,Not able to update. Neither able to uninstall.,1
3,Nice app,4
4,Many unwanted ads,1


In [4]:
# Number of reviews per star rating (absolute counts).
df["Star"].value_counts(normalize=False)

5    3871
1    1894
4     652
3     451
2     336
Name: Star, dtype: int64

In [5]:
# Spot-check five randomly drawn reviews; no random_state is passed, so the rows may differ between runs.
df.sample(n=5)

Unnamed: 0,Text,Star
1396,Keeps crashing,1
558,❤️❤️❤️,3
1533,Not updating google chrome in my phone but oth...,1
1720,I don't like how Chrome opens on its own !,1
2535,Bad,4


In [6]:
import re

In [7]:
def get_clean(x):
    """Normalise one review text before vectorisation.

    Steps, in order:
      1. Missing values (``None`` or a float NaN) become an empty string
         instead of the literal text ``"none"`` / ``"nan"``.
      2. The value is converted with ``str()`` and lower-cased.
      3. Backslashes are removed and underscores are turned into spaces.
      4. Any character repeated three or more times in a row is collapsed to
         a single occurrence (``"sooooo"`` -> ``"so"``); doubled characters
         are kept (``"good"`` stays ``"good"``).

    Parameters
    ----------
    x : object
        Raw cell value; non-string values are stringified.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # NaN is the only float that is not equal to itself.
    if x is None or (isinstance(x, float) and x != x):
        return ""
    x = str(x).lower().replace('\\', '').replace('_', ' ')
    # `.` does not match a newline, so runs of line breaks are left as they are.
    x = re.sub(r"(.)\1{2,}", r"\1", x)
    return x

In [8]:
# Run get_clean over every review, one value at a time, and store the result back in the Text column.
df['Text'] = df['Text'].apply(get_clean)

In [9]:
# First five rows after cleaning, to confirm the text is now lower-cased.
df.head(n=5)

Unnamed: 0,Text,Star
0,this is very helpfull aap.,5
1,good,3
2,not able to update. neither able to uninstall.,1
3,nice app,4
4,many unwanted ads,1


In [10]:
# Share of reviews per star rating (fractions rather than counts).
df["Star"].value_counts(normalize=True)

5    0.537340
1    0.262909
4    0.090505
3    0.062604
2    0.046641
Name: Star, dtype: float64

## TF-IDF and Linear SVM

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

In [12]:
# Character-level TF-IDF over n-grams of 1 to 5 characters, capped at 20,000 features.
tfidf = TfidfVectorizer(
    analyzer="char",
    ngram_range=(1, 5),
    max_features=20000,
)

In [13]:
# Target: the star rating of each review.
y = df['Star']
# Features: TF-IDF matrix of the cleaned text.
# NOTE(review): the vectorizer is fitted on every row here, before the train/test split in a later
# cell, so held-out text contributes to the learned vocabulary and IDF weights.
x = tfidf.fit_transform(df['Text'])

In [14]:
# Sanity check: the feature matrix and the target must have the same number of rows.
(x.shape, y.shape)

((7204, 20000), (7204,))

In [15]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [16]:
# Shapes of the training portion after the split.
(x_train.shape, y_train.shape)

((5763, 20000), (5763,))

In [17]:
# Linear SVM with C=20; class_weight="balanced" asks scikit-learn to weight classes by inverse frequency.
lsvm = LinearSVC(class_weight="balanced", C=20)
lsvm.fit(x_train, y_train)



LinearSVC(C=20, class_weight='balanced')

In [18]:
# Predicted star ratings for the held-out rows.
y_pred = lsvm.predict(X=x_test)

In [19]:
# Per-class precision, recall and F1 on the held-out rows.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           1       0.66      0.66      0.66       383
           2       0.11      0.09      0.10        64
           3       0.11      0.07      0.09        97
           4       0.11      0.06      0.08       117
           5       0.73      0.83      0.78       780

    accuracy                           0.64      1441
   macro avg       0.35      0.34      0.34      1441
weighted avg       0.59      0.64      0.61      1441



In [20]:
# Score one hand-written negative review with the fitted vectorizer and classifier.
# The text is kept in its own variable so that the TF-IDF feature matrix `x` built
# earlier is not overwritten (re-running the split cell would otherwise break).
sample_review = "shit!!! dont install that"
sample_review = get_clean(sample_review)
vec = tfidf.transform([sample_review])
lsvm.predict(vec)

array([1], dtype=int64)

In [21]:
# Score one hand-written positive review: clean it, vectorize it, predict its star rating.
x1 = get_clean("This is really a nice app. I liked it")
vec = tfidf.transform([x1])
lsvm.predict(vec)

array([5], dtype=int64)