In [46]:
import pandas as pd
import numpy as np
import nltk
import re 
from nltk.corpus import stopwords
from autocorrect import Speller
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

tokenizer = nltk.RegexpTokenizer(r"\w+")
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pengalo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pengalo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pengalo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\pengalo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Drop the columns 
Delete @, links, #
replace 0 with -1
replace 4 with 1
Tokenize
delete stopwords 
lemmatisation
stemming

In [122]:
def removeStopWords(L) :
  filtered_sentence = []
  for w in L:
    if w not in stop_words:
        filtered_sentence.append(w)
  return filtered_sentence

def lemmatizeWords(L) :
  lemmatizedSentence = []
  for w in L:
    lemmatizedSentence.append(lemmatizer.lemmatize(w))
  return lemmatizedSentence

def stemWords(L) : 
  stemedSentence = []
  for w in L:
    stemedSentence.append(ps.stem(w))
  return stemedSentence

def removeNumbers(L) :
  noNumberSentence = []
  for w in L:
    if not w.isdigit() :
      noNumberSentence.append(w)
  return noNumberSentence

def detokenizer(L) : 
  return TreebankWordDetokenizer().detokenize(L)

def flatener(L) :
    flatenedTokens = []
    for w in L:
      flatenedTokens.append(w[0] + "/" + w[1])
    return flatenedTokens

def reduceScore(Rating):
    if(Rating <=2 ):
        return -1
    else:
        return 1
    
def clean_text(Review):
   
    Review = str(Review).lower() # convert to lowercase
    Review = re.sub('\[.*?\]', '', Review) # Remove []
    Review = re.sub('https?://\S+|www\.\S+', '', Review) # Remove URls
    Review = re.sub('<.*?>+', '', Review) #Remove html code
    Review = re.sub(r'[^a-z0-9\s]', '', Review) # Remove punctuation
    Review = re.sub('\n', '', Review) #remove return to line
    Review = re.sub('\w*\d\w*', '', Review) #Remove numbers
    return Review

def libelToScore(libel):
    if libel == "negative" : return -1
    elif libel == "positive" :return 1
    else : return 0

print(removeNumbers(["awww", "50", "mentioned"]))

['awww', 'mentioned']


# Sentiment 140 database

In [66]:
training_data = pd.read_csv("./Sentiment140 - A Twitter Sentiment Analysis Tool/training.1600000.processed.noemoticon.csv", names = ["polarity", "ID", "Date", "Querry", "User", "tweets"])
training_data.head(5)

Unnamed: 0,polarity,ID,Date,Querry,User,tweets
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [67]:
training_data = training_data.drop( columns = ["ID", "Date", "Querry", "User"])

In [68]:
training_data["Clean_tweets"] = training_data["tweets"].replace('(@\w+.*?)',"", regex=True).replace('(#\w+.*?)',"", regex=True).str.replace('http\S+|www.\S+', '', case=False, regex=True).str.lower()
training_data["polarity"] = training_data["polarity"].replace(0,-1).replace(4,1)

In [69]:
training_data

Unnamed: 0,polarity,tweets,Clean_tweets
0,-1,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","- awww, that's a bummer. you shoulda got da..."
1,-1,is upset that he can't update his Facebook by ...,is upset that he can't update his facebook by ...
2,-1,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball. managed to s...
3,-1,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,-1,"@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am..."
...,...,...,...
1599995,1,Just woke up. Having no school is the best fee...,just woke up. having no school is the best fee...
1599996,1,TheWDB.com - Very cool to hear old Walt interv...,thewdb.com - very cool to hear old walt interv...
1599997,1,Are you ready for your MoJo Makeover? Ask me f...,are you ready for your mojo makeover? ask me f...
1599998,1,Happy 38th Birthday to my boo of alll time!!! ...,happy 38th birthday to my boo of alll time!!! ...


In [70]:
training_data["Tokenized_tweets"] = training_data["Clean_tweets"].apply(tokenizer.tokenize)
training_data

Unnamed: 0,polarity,tweets,Clean_tweets,Tokenized_tweets
0,-1,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","- awww, that's a bummer. you shoulda got da...","[awww, that, s, a, bummer, you, shoulda, got, ..."
1,-1,is upset that he can't update his Facebook by ...,is upset that he can't update his facebook by ...,"[is, upset, that, he, can, t, update, his, fac..."
2,-1,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball. managed to s...,"[i, dived, many, times, for, the, ball, manage..."
3,-1,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its..."
4,-1,"@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am...","[no, it, s, not, behaving, at, all, i, m, mad,..."
...,...,...,...,...
1599995,1,Just woke up. Having no school is the best fee...,just woke up. having no school is the best fee...,"[just, woke, up, having, no, school, is, the, ..."
1599996,1,TheWDB.com - Very cool to hear old Walt interv...,thewdb.com - very cool to hear old walt interv...,"[thewdb, com, very, cool, to, hear, old, walt,..."
1599997,1,Are you ready for your MoJo Makeover? Ask me f...,are you ready for your mojo makeover? ask me f...,"[are, you, ready, for, your, mojo, makeover, a..."
1599998,1,Happy 38th Birthday to my boo of alll time!!! ...,happy 38th birthday to my boo of alll time!!! ...,"[happy, 38th, birthday, to, my, boo, of, alll,..."


In [71]:
training_data["Tokenized_tweets"] = training_data["Tokenized_tweets"].apply(removeStopWords)
training_data

Unnamed: 0,polarity,tweets,Clean_tweets,Tokenized_tweets
0,-1,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","- awww, that's a bummer. you shoulda got da...","[awww, bummer, shoulda, got, david, carr, thir..."
1,-1,is upset that he can't update his Facebook by ...,is upset that he can't update his facebook by ...,"[upset, update, facebook, texting, might, cry,..."
2,-1,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball. managed to s...,"[dived, many, times, ball, managed, save, 50, ..."
3,-1,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,"[whole, body, feels, itchy, like, fire]"
4,-1,"@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am...","[behaving, mad, see]"
...,...,...,...,...
1599995,1,Just woke up. Having no school is the best fee...,just woke up. having no school is the best fee...,"[woke, school, best, feeling, ever]"
1599996,1,TheWDB.com - Very cool to hear old Walt interv...,thewdb.com - very cool to hear old walt interv...,"[thewdb, com, cool, hear, old, walt, interviews]"
1599997,1,Are you ready for your MoJo Makeover? Ask me f...,are you ready for your mojo makeover? ask me f...,"[ready, mojo, makeover, ask, details]"
1599998,1,Happy 38th Birthday to my boo of alll time!!! ...,happy 38th birthday to my boo of alll time!!! ...,"[happy, 38th, birthday, boo, alll, time, tupac..."


In [72]:
training_data["Tokenized_tweets"] = training_data["Tokenized_tweets"].apply(lemmatizeWords)
training_data

Unnamed: 0,polarity,tweets,Clean_tweets,Tokenized_tweets
0,-1,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","- awww, that's a bummer. you shoulda got da...","[awww, bummer, shoulda, got, david, carr, thir..."
1,-1,is upset that he can't update his Facebook by ...,is upset that he can't update his facebook by ...,"[upset, update, facebook, texting, might, cry,..."
2,-1,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball. managed to s...,"[dived, many, time, ball, managed, save, 50, r..."
3,-1,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,"[whole, body, feel, itchy, like, fire]"
4,-1,"@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am...","[behaving, mad, see]"
...,...,...,...,...
1599995,1,Just woke up. Having no school is the best fee...,just woke up. having no school is the best fee...,"[woke, school, best, feeling, ever]"
1599996,1,TheWDB.com - Very cool to hear old Walt interv...,thewdb.com - very cool to hear old walt interv...,"[thewdb, com, cool, hear, old, walt, interview]"
1599997,1,Are you ready for your MoJo Makeover? Ask me f...,are you ready for your mojo makeover? ask me f...,"[ready, mojo, makeover, ask, detail]"
1599998,1,Happy 38th Birthday to my boo of alll time!!! ...,happy 38th birthday to my boo of alll time!!! ...,"[happy, 38th, birthday, boo, alll, time, tupac..."


In [73]:
training_data["Tokenized_tweets"] = training_data["Tokenized_tweets"].apply(stemWords)
training_data

Unnamed: 0,polarity,tweets,Clean_tweets,Tokenized_tweets
0,-1,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","- awww, that's a bummer. you shoulda got da...","[awww, bummer, shoulda, got, david, carr, thir..."
1,-1,is upset that he can't update his Facebook by ...,is upset that he can't update his facebook by ...,"[upset, updat, facebook, text, might, cri, res..."
2,-1,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball. managed to s...,"[dive, mani, time, ball, manag, save, 50, rest..."
3,-1,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,"[whole, bodi, feel, itchi, like, fire]"
4,-1,"@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am...","[behav, mad, see]"
...,...,...,...,...
1599995,1,Just woke up. Having no school is the best fee...,just woke up. having no school is the best fee...,"[woke, school, best, feel, ever]"
1599996,1,TheWDB.com - Very cool to hear old Walt interv...,thewdb.com - very cool to hear old walt interv...,"[thewdb, com, cool, hear, old, walt, interview]"
1599997,1,Are you ready for your MoJo Makeover? Ask me f...,are you ready for your mojo makeover? ask me f...,"[readi, mojo, makeov, ask, detail]"
1599998,1,Happy 38th Birthday to my boo of alll time!!! ...,happy 38th birthday to my boo of alll time!!! ...,"[happi, 38th, birthday, boo, alll, time, tupac..."


In [74]:
training_data["Tokenized_tweets"] = training_data["Tokenized_tweets"].apply(removeNumbers)
training_data

Unnamed: 0,polarity,tweets,Clean_tweets,Tokenized_tweets
0,-1,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","- awww, that's a bummer. you shoulda got da...","[awww, bummer, shoulda, got, david, carr, thir..."
1,-1,is upset that he can't update his Facebook by ...,is upset that he can't update his facebook by ...,"[upset, updat, facebook, text, might, cri, res..."
2,-1,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball. managed to s...,"[dive, mani, time, ball, manag, save, rest, go..."
3,-1,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,"[whole, bodi, feel, itchi, like, fire]"
4,-1,"@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am...","[behav, mad, see]"
...,...,...,...,...
1599995,1,Just woke up. Having no school is the best fee...,just woke up. having no school is the best fee...,"[woke, school, best, feel, ever]"
1599996,1,TheWDB.com - Very cool to hear old Walt interv...,thewdb.com - very cool to hear old walt interv...,"[thewdb, com, cool, hear, old, walt, interview]"
1599997,1,Are you ready for your MoJo Makeover? Ask me f...,are you ready for your mojo makeover? ask me f...,"[readi, mojo, makeov, ask, detail]"
1599998,1,Happy 38th Birthday to my boo of alll time!!! ...,happy 38th birthday to my boo of alll time!!! ...,"[happi, 38th, birthday, boo, alll, time, tupac..."


In [76]:
training_data["Tokenized_tweets"] = training_data["Tokenized_tweets"].apply(nltk.pos_tag)
training_data

Unnamed: 0,polarity,tweets,Clean_tweets,Tokenized_tweets
0,-1,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","- awww, that's a bummer. you shoulda got da...","[(awww, JJ), (bummer, NN), (shoulda, NN), (got..."
1,-1,is upset that he can't update his Facebook by ...,is upset that he can't update his facebook by ...,"[(upset, JJ), (updat, JJ), (facebook, NN), (te..."
2,-1,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball. managed to s...,"[(dive, JJ), (mani, NN), (time, NN), (ball, NN..."
3,-1,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,"[(whole, JJ), (bodi, NN), (feel, VB), (itchi, ..."
4,-1,"@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am...","[(behav, NN), (mad, NNS), (see, VBP)]"
...,...,...,...,...
1599995,1,Just woke up. Having no school is the best fee...,just woke up. having no school is the best fee...,"[(woke, JJ), (school, NN), (best, JJS), (feel,..."
1599996,1,TheWDB.com - Very cool to hear old Walt interv...,thewdb.com - very cool to hear old walt interv...,"[(thewdb, NN), (com, NN), (cool, NN), (hear, V..."
1599997,1,Are you ready for your MoJo Makeover? Ask me f...,are you ready for your mojo makeover? ask me f...,"[(readi, NN), (mojo, NN), (makeov, NN), (ask, ..."
1599998,1,Happy 38th Birthday to my boo of alll time!!! ...,happy 38th birthday to my boo of alll time!!! ...,"[(happi, NN), (38th, CD), (birthday, NN), (boo..."


In [77]:
training_data["Tokenized_tweets"] = training_data["Tokenized_tweets"].apply(flatener)
training_data

Unnamed: 0,polarity,tweets,Clean_tweets,Tokenized_tweets
0,-1,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","- awww, that's a bummer. you shoulda got da...","[awww/JJ, bummer/NN, shoulda/NN, got/VBD, davi..."
1,-1,is upset that he can't update his Facebook by ...,is upset that he can't update his facebook by ...,"[upset/JJ, updat/JJ, facebook/NN, text/NN, mig..."
2,-1,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball. managed to s...,"[dive/JJ, mani/NN, time/NN, ball/NN, manag/NNS..."
3,-1,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,"[whole/JJ, bodi/NN, feel/VB, itchi/NNS, like/I..."
4,-1,"@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am...","[behav/NN, mad/NNS, see/VBP]"
...,...,...,...,...
1599995,1,Just woke up. Having no school is the best fee...,just woke up. having no school is the best fee...,"[woke/JJ, school/NN, best/JJS, feel/NN, ever/RB]"
1599996,1,TheWDB.com - Very cool to hear old Walt interv...,thewdb.com - very cool to hear old walt interv...,"[thewdb/NN, com/NN, cool/NN, hear/VBP, old/JJ,..."
1599997,1,Are you ready for your MoJo Makeover? Ask me f...,are you ready for your mojo makeover? ask me f...,"[readi/NN, mojo/NN, makeov/NN, ask/NN, detail/NN]"
1599998,1,Happy 38th Birthday to my boo of alll time!!! ...,happy 38th birthday to my boo of alll time!!! ...,"[happi/NN, 38th/CD, birthday/NN, boo/VB, alll/..."


In [78]:
training_data["Tokenized_tweets"] = training_data["Tokenized_tweets"].apply(detokenizer)
training_data

Unnamed: 0,polarity,tweets,Clean_tweets,Tokenized_tweets
0,-1,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","- awww, that's a bummer. you shoulda got da...",awww/JJ bummer/NN shoulda/NN got/VBD david/JJ ...
1,-1,is upset that he can't update his Facebook by ...,is upset that he can't update his facebook by ...,upset/JJ updat/JJ facebook/NN text/NN might/MD...
2,-1,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball. managed to s...,dive/JJ mani/NN time/NN ball/NN manag/NNS save...
3,-1,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,whole/JJ bodi/NN feel/VB itchi/NNS like/IN fir...
4,-1,"@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am...",behav/NN mad/NNS see/VBP
...,...,...,...,...
1599995,1,Just woke up. Having no school is the best fee...,just woke up. having no school is the best fee...,woke/JJ school/NN best/JJS feel/NN ever/RB
1599996,1,TheWDB.com - Very cool to hear old Walt interv...,thewdb.com - very cool to hear old walt interv...,thewdb/NN com/NN cool/NN hear/VBP old/JJ walt/...
1599997,1,Are you ready for your MoJo Makeover? Ask me f...,are you ready for your mojo makeover? ask me f...,readi/NN mojo/NN makeov/NN ask/NN detail/NN
1599998,1,Happy 38th Birthday to my boo of alll time!!! ...,happy 38th birthday to my boo of alll time!!! ...,happi/NN 38th/CD birthday/NN boo/VB alll/JJ ti...


In [79]:
X = training_data["Tokenized_tweets"]  
y = training_data["polarity"].astype('int')  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [96]:
uniSVM = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 1))), 
    ('SVM', LinearSVC(C=0.1))
])
biSVM = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(2, 2))), 
    ('SVM', LinearSVC(C=0.1))
])
uniBiSVM = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))), 
    ('SVM', LinearSVC(C=0.2))
])

In [97]:
uniSVM.fit(X_train, y_train)
uniSVM.score(X_test, y_test)

0.773946875

In [98]:
biSVM.fit(X_train, y_train)
biSVM.score(X_test, y_test)

0.7752

In [99]:
uniBiSVM.fit(X_train, y_train)
uniBiSVM.score(X_test, y_test)

0.775825

In [95]:
training_data["polarity"].value_counts()

 1    800000
-1    800000
Name: polarity, dtype: int64

In [100]:
uniBiSVM.score(X_train, y_train)

0.822340625

In [103]:
y_pred = uniSVM.predict(X_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

          -1       0.79      0.75      0.77    160000
           1       0.76      0.80      0.78    160000

    accuracy                           0.77    320000
   macro avg       0.77      0.77      0.77    320000
weighted avg       0.77      0.77      0.77    320000

[[120031  39969]
 [ 32368 127632]]


In [105]:
y_pred = biSVM.predict(X_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

          -1       0.79      0.75      0.77    160000
           1       0.76      0.80      0.78    160000

    accuracy                           0.78    320000
   macro avg       0.78      0.78      0.78    320000
weighted avg       0.78      0.78      0.78    320000

[[120435  39565]
 [ 32371 127629]]


In [106]:
y_pred = uniBiSVM.predict(X_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

          -1       0.79      0.76      0.77    160000
           1       0.77      0.80      0.78    160000

    accuracy                           0.78    320000
   macro avg       0.78      0.78      0.78    320000
weighted avg       0.78      0.78      0.78    320000

[[120992  39008]
 [ 32728 127272]]


# Amazon Review database

- delete reviews with a score = 3
- transform score from {1,2} to -1 and {4,5} to 1 

In [24]:
training_data = pd.read_csv("./Amazon reviews/Reviews.csv")
training_data.head(5)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [25]:
training_data = training_data.drop( columns = ["Id", "ProductId", "UserId", "ProfileName", "HelpfulnessNumerator","HelpfulnessDenominator", "Time", "Summary"])

In [26]:
training_data["Score"] = training_data[training_data["Score"]!=3]
training_data["Score"] = training_data["Score"].apply(reduceScore)
training_data["Score"].value_counts()

 1    486417
-1     82037
Name: Score, dtype: int64

In [27]:
training_data["Text"] = training_data["Text"].apply(clean_text)
training_data

Unnamed: 0,Score,Text
0,1,i have bought several of the vitality canned d...
1,-1,product arrived labeled as jumbo salted peanut...
2,1,this is a confection that has been around a fe...
3,-1,if you are looking for the secret ingredient i...
4,1,great taffy at a great price there was a wide...
...,...,...
568449,1,great for sesame chickenthis is a good if not ...
568450,-1,im disappointed with the flavor the chocolate ...
568451,1,these stars are small so you can give of thos...
568452,1,these are the best treats for training and rew...


In [29]:
training_data["Text"] = training_data["Text"].apply(tokenizer.tokenize)
training_data

Unnamed: 0,Score,Text
0,1,"[i, have, bought, several, of, the, vitality, ..."
1,-1,"[product, arrived, labeled, as, jumbo, salted,..."
2,1,"[this, is, a, confection, that, has, been, aro..."
3,-1,"[if, you, are, looking, for, the, secret, ingr..."
4,1,"[great, taffy, at, a, great, price, there, was..."
...,...,...
568449,1,"[great, for, sesame, chickenthis, is, a, good,..."
568450,-1,"[im, disappointed, with, the, flavor, the, cho..."
568451,1,"[these, stars, are, small, so, you, can, give,..."
568452,1,"[these, are, the, best, treats, for, training,..."


In [30]:
training_data["Text"] = training_data["Text"].apply(removeStopWords)
training_data

Unnamed: 0,Score,Text
0,1,"[bought, several, vitality, canned, dog, food,..."
1,-1,"[product, arrived, labeled, jumbo, salted, pea..."
2,1,"[confection, around, centuries, light, pillowy..."
3,-1,"[looking, secret, ingredient, robitussin, beli..."
4,1,"[great, taffy, great, price, wide, assortment,..."
...,...,...
568449,1,"[great, sesame, chickenthis, good, better, res..."
568450,-1,"[im, disappointed, flavor, chocolate, notes, e..."
568451,1,"[stars, small, give, one, training, session, t..."
568452,1,"[best, treats, training, rewarding, dog, good,..."


In [31]:
training_data["Text"] = training_data["Text"].apply(lemmatizeWords)
training_data

Unnamed: 0,Score,Text
0,1,"[bought, several, vitality, canned, dog, food,..."
1,-1,"[product, arrived, labeled, jumbo, salted, pea..."
2,1,"[confection, around, century, light, pillowy, ..."
3,-1,"[looking, secret, ingredient, robitussin, beli..."
4,1,"[great, taffy, great, price, wide, assortment,..."
...,...,...
568449,1,"[great, sesame, chickenthis, good, better, res..."
568450,-1,"[im, disappointed, flavor, chocolate, note, es..."
568451,1,"[star, small, give, one, training, session, tr..."
568452,1,"[best, treat, training, rewarding, dog, good, ..."


In [32]:
training_data["Text"] = training_data["Text"].apply(stemWords)
training_data

Unnamed: 0,Score,Text
0,1,"[bought, sever, vital, can, dog, food, product..."
1,-1,"[product, arriv, label, jumbo, salt, peanutsth..."
2,1,"[confect, around, centuri, light, pillowi, cit..."
3,-1,"[look, secret, ingredi, robitussin, believ, fo..."
4,1,"[great, taffi, great, price, wide, assort, yum..."
...,...,...
568449,1,"[great, sesam, chickenthi, good, better, restu..."
568450,-1,"[im, disappoint, flavor, chocol, note, especi,..."
568451,1,"[star, small, give, one, train, session, tri, ..."
568452,1,"[best, treat, train, reward, dog, good, groom,..."


In [33]:
training_data["Text"] = training_data["Text"].apply(nltk.pos_tag)
training_data

Unnamed: 0,Score,Text
0,1,"[(bought, VBN), (sever, JJ), (vital, NN), (can..."
1,-1,"[(product, NN), (arriv, NN), (label, JJ), (jum..."
2,1,"[(confect, NN), (around, IN), (centuri, NN), (..."
3,-1,"[(look, NN), (secret, JJ), (ingredi, NN), (rob..."
4,1,"[(great, JJ), (taffi, JJ), (great, JJ), (price..."
...,...,...
568449,1,"[(great, JJ), (sesam, JJ), (chickenthi, NN), (..."
568450,-1,"[(im, JJ), (disappoint, NN), (flavor, NN), (ch..."
568451,1,"[(star, NN), (small, JJ), (give, VBP), (one, C..."
568452,1,"[(best, JJS), (treat, NN), (train, NN), (rewar..."


In [34]:
training_data["Text"] = training_data["Text"].apply(flatener)
training_data

Unnamed: 0,Score,Text
0,1,"[bought/VBN, sever/JJ, vital/NN, can/MD, dog/V..."
1,-1,"[product/NN, arriv/NN, label/JJ, jumbo/JJ, sal..."
2,1,"[confect/NN, around/IN, centuri/NN, light/JJ, ..."
3,-1,"[look/NN, secret/JJ, ingredi/NN, robitussin/NN..."
4,1,"[great/JJ, taffi/JJ, great/JJ, price/NN, wide/..."
...,...,...
568449,1,"[great/JJ, sesam/JJ, chickenthi/NN, good/JJ, b..."
568450,-1,"[im/JJ, disappoint/NN, flavor/NN, chocol/NN, n..."
568451,1,"[star/NN, small/JJ, give/VBP, one/CD, train/NN..."
568452,1,"[best/JJS, treat/NN, train/NN, reward/NN, dog/..."


In [35]:
training_data["Text"] = training_data["Text"].apply(detokenizer)
training_data

Unnamed: 0,Score,Text
0,1,bought/VBN sever/JJ vital/NN can/MD dog/VB foo...
1,-1,product/NN arriv/NN label/JJ jumbo/JJ salt/NN ...
2,1,confect/NN around/IN centuri/NN light/JJ pillo...
3,-1,look/NN secret/JJ ingredi/NN robitussin/NN bel...
4,1,great/JJ taffi/JJ great/JJ price/NN wide/JJ as...
...,...,...
568449,1,great/JJ sesam/JJ chickenthi/NN good/JJ better...
568450,-1,im/JJ disappoint/NN flavor/NN chocol/NN note/N...
568451,1,star/NN small/JJ give/VBP one/CD train/NN sess...
568452,1,best/JJS treat/NN train/NN reward/NN dog/NN go...


In [36]:
X = training_data["Text"]  
y = training_data["Score"].astype('int')  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [37]:
uniSVM = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 1))), 
    ('SVM', LinearSVC(C=0.1))
])
biSVM = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(2, 2))), 
    ('SVM', LinearSVC(C=0.1))
])
uniBiSVM = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))), 
    ('SVM', LinearSVC(C=0.2))
])

In [38]:
uniSVM.fit(X_train, y_train)
uniSVM.score(X_test, y_test)

0.9194395334723066

In [39]:
biSVM.fit(X_train, y_train)
biSVM.score(X_test, y_test)

0.9272941569693292

In [40]:
uniBiSVM.fit(X_train, y_train)
uniBiSVM.score(X_test, y_test)

0.9324660703133933

In [41]:
y_pred = uniSVM.predict(X_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

          -1       0.82      0.57      0.67     16407
           1       0.93      0.98      0.95     97284

    accuracy                           0.92    113691
   macro avg       0.87      0.77      0.81    113691
weighted avg       0.91      0.92      0.91    113691

[[ 9332  7075]
 [ 2084 95200]]


In [42]:
y_pred = biSVM.predict(X_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

          -1       0.85      0.60      0.71     16407
           1       0.94      0.98      0.96     97284

    accuracy                           0.93    113691
   macro avg       0.89      0.79      0.83    113691
weighted avg       0.92      0.93      0.92    113691

[[ 9885  6522]
 [ 1744 95540]]


In [45]:
y_pred = uniBiSVM.predict(X_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

          -1       0.85      0.65      0.74     16407
           1       0.94      0.98      0.96     97284

    accuracy                           0.93    113691
   macro avg       0.89      0.82      0.85    113691
weighted avg       0.93      0.93      0.93    113691

[[10673  5734]
 [ 1944 95340]]


# Trip Advisor Dataset

In [97]:
training_data = pd.read_csv("./Tripadvisor dataset/tripadvisor_hotel_reviews.csv")
training_data.head(5)

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [98]:
training_data["Score"] = training_data["Rating"]
training_data["Text"] = training_data["Review"]
training_data["Score"].value_counts()

5    9054
4    6039
3    2184
2    1793
1    1421
Name: Score, dtype: int64

In [99]:
training_data = training_data[training_data["Score"].astype('int')!=3]
training_data["Score"].value_counts()

5    9054
4    6039
2    1793
1    1421
Name: Score, dtype: int64

In [100]:
training_data["Score"] = training_data["Score"].astype('int').apply(reduceScore)
training_data["Score"].value_counts()

 1    15093
-1     3214
Name: Score, dtype: int64

In [101]:
training_data = training_data.drop(training_data[training_data['Score'] > 0].sample(n=11879).index)

In [102]:
training_data["Score"].value_counts()

 1    3214
-1    3214
Name: Score, dtype: int64

In [104]:
training_data["Text"] = training_data["Text"].apply(clean_text)
training_data

training_data["Text"] = training_data["Text"].apply(tokenizer.tokenize)
training_data

training_data["Text"] = training_data["Text"].apply(removeStopWords)
training_data

training_data["Text"] = training_data["Text"].apply(lemmatizeWords)
training_data

training_data["Text"] = training_data["Text"].apply(stemWords)
training_data

training_data["Text"] = training_data["Text"].apply(nltk.pos_tag)
training_data

training_data["Text"] = training_data["Text"].apply(flatener)
training_data

training_data["Text"] = training_data["Text"].apply(detokenizer)
training_data

X = training_data["Text"]  
y = training_data["Score"].astype('int')  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

uniSVM = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 1))), 
    ('SVM', LinearSVC(C=0.1))
])
biSVM = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(2, 2))), 
    ('SVM', LinearSVC(C=0.1))
])
uniBiSVM = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))), 
    ('SVM', LinearSVC(C=0.2))
])

In [105]:
uniSVM.fit(X_train, y_train)
uniSVM.score(X_test, y_test)

0.9121306376360808

In [106]:
biSVM.fit(X_train, y_train)
biSVM.score(X_test, y_test)

0.921461897356143

In [107]:
uniBiSVM.fit(X_train, y_train)
uniBiSVM.score(X_test, y_test)

0.9191290824261276

In [108]:
y_pred = uniSVM.predict(X_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

          -1       0.90      0.93      0.91       643
           1       0.93      0.89      0.91       643

    accuracy                           0.91      1286
   macro avg       0.91      0.91      0.91      1286
weighted avg       0.91      0.91      0.91      1286

[[599  44]
 [ 69 574]]


In [109]:
y_pred = biSVM.predict(X_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

          -1       0.92      0.92      0.92       643
           1       0.92      0.92      0.92       643

    accuracy                           0.92      1286
   macro avg       0.92      0.92      0.92      1286
weighted avg       0.92      0.92      0.92      1286

[[593  50]
 [ 51 592]]


In [110]:
y_pred = uniBiSVM.predict(X_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

          -1       0.92      0.92      0.92       643
           1       0.92      0.92      0.92       643

    accuracy                           0.92      1286
   macro avg       0.92      0.92      0.92      1286
weighted avg       0.92      0.92      0.92      1286

[[592  51]
 [ 53 590]]


# Winemag dataset 

In [148]:
training_data = pd.read_csv("./news/all-data.csv", names=["Score","Text"])
training_data.head(5)

Unnamed: 0,Score,Text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [149]:
training_data["Score"] = training_data["Score"].apply(libelToScore)
training_data["Score"].value_counts()

 0    2879
 1    1363
-1     604
Name: Score, dtype: int64

In [158]:
training_data["Text"] = training_data["Text"].apply(clean_text)
training_data

training_data["Text"] = training_data["Text"].apply(tokenizer.tokenize)
training_data

training_data["Text"] = training_data["Text"].apply(removeStopWords)
training_data

training_data["Text"] = training_data["Text"].apply(lemmatizeWords)
training_data

training_data["Text"] = training_data["Text"].apply(stemWords)
training_data

training_data["Text"] = training_data["Text"].apply(nltk.pos_tag)
training_data

training_data["Text"] = training_data["Text"].apply(flatener)
training_data

training_data["Text"] = training_data["Text"].apply(detokenizer)
training_data

X = training_data["Text"]  
y = training_data["Score"].astype('int')  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

uniSVM = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 1))), 
    ('SVM', LinearSVC(C=0.1))
])
biSVM = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(2, 2))), 
    ('SVM', LinearSVC(C=0.1))
])
uniBiSVM = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))), 
    ('SVM', LinearSVC(C=0.2))
])

In [125]:
uniSVM.fit(X_train, y_train)
uniSVM.score(X_test, y_test)

0.7154639175257732

In [128]:
biSVM.fit(X_train, y_train)
biSVM.score(X_test, y_test)

0.7061855670103093

In [127]:
uniBiSVM.fit(X_train, y_train)
uniBiSVM.score(X_test, y_test)

0.7350515463917526

## Results with the original database

In [129]:
y_pred = uniSVM.predict(X_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

          -1       0.68      0.36      0.47       121
           0       0.72      0.94      0.82       576
           1       0.71      0.39      0.51       273

    accuracy                           0.72       970
   macro avg       0.70      0.56      0.60       970
weighted avg       0.71      0.72      0.69       970

[[ 43  61  17]
 [  6 544  26]
 [ 14 152 107]]


In [130]:
y_pred = biSVM.predict(X_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

          -1       0.75      0.31      0.44       121
           0       0.70      0.97      0.81       576
           1       0.76      0.33      0.46       273

    accuracy                           0.71       970
   macro avg       0.73      0.54      0.57       970
weighted avg       0.72      0.71      0.67       970

[[ 38  69  14]
 [  5 557  14]
 [  8 175  90]]


In [131]:
y_pred = uniBiSVM.predict(X_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

          -1       0.70      0.43      0.53       121
           0       0.74      0.93      0.82       576
           1       0.72      0.47      0.57       273

    accuracy                           0.74       970
   macro avg       0.72      0.61      0.64       970
weighted avg       0.73      0.74      0.72       970

[[ 52  53  16]
 [  9 533  34]
 [ 13 132 128]]


In [140]:
training_data = training_data.drop(training_data[training_data['Score'] == 1].sample(n=759).index)
training_data = training_data.drop(training_data[training_data['Score'] == 0].sample(n=2275).index)

ValueError: Cannot take a larger sample than population when 'replace=False'

In [142]:
uniSVM.fit(X_train, y_train)
uniSVM.score(X_test, y_test)

0.6115702479338843

In [143]:
biSVM.fit(X_train, y_train)
biSVM.score(X_test, y_test)

0.6253443526170799

In [144]:
uniBiSVM.fit(X_train, y_train)
uniBiSVM.score(X_test, y_test)

0.6363636363636364

## Results after downsampling

In [145]:
y_pred = uniSVM.predict(X_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

          -1       0.72      0.71      0.72       121
           0       0.53      0.64      0.58       121
           1       0.59      0.49      0.53       121

    accuracy                           0.61       363
   macro avg       0.62      0.61      0.61       363
weighted avg       0.62      0.61      0.61       363

[[86 24 11]
 [14 77 30]
 [19 43 59]]


In [146]:
y_pred = biSVM.predict(X_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

          -1       0.74      0.70      0.72       121
           0       0.53      0.71      0.61       121
           1       0.64      0.46      0.54       121

    accuracy                           0.63       363
   macro avg       0.64      0.63      0.62       363
weighted avg       0.64      0.63      0.62       363

[[85 24 12]
 [16 86 19]
 [14 51 56]]


In [147]:
y_pred = uniBiSVM.predict(X_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

          -1       0.75      0.73      0.74       121
           0       0.56      0.60      0.58       121
           1       0.61      0.59      0.60       121

    accuracy                           0.64       363
   macro avg       0.64      0.64      0.64       363
weighted avg       0.64      0.64      0.64       363

[[88 20 13]
 [16 72 33]
 [13 37 71]]


In [156]:
training_data = training_data.append(training_data[training_data['Score'] == -1])
## augmentation -1 * 3 and 1 * 2

In [157]:
training_data["Score"].value_counts()

 0    2879
 1    2726
-1    2416
Name: Score, dtype: int64

In [159]:
uniSVM.fit(X_train, y_train)
uniSVM.score(X_test, y_test)

0.7800623052959501

In [160]:
biSVM.fit(X_train, y_train)
biSVM.score(X_test, y_test)

0.8218068535825546

In [161]:
uniBiSVM.fit(X_train, y_train)
uniBiSVM.score(X_test, y_test)

0.832398753894081

## Results after augmentation

In [162]:
y_pred = uniSVM.predict(X_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

          -1       0.83      0.88      0.85       483
           0       0.74      0.79      0.76       576
           1       0.79      0.67      0.73       546

    accuracy                           0.78      1605
   macro avg       0.78      0.78      0.78      1605
weighted avg       0.78      0.78      0.78      1605

[[427  41  15]
 [ 35 457  84]
 [ 55 123 368]]


In [163]:
y_pred = biSVM.predict(X_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

          -1       0.86      0.93      0.89       483
           0       0.78      0.84      0.81       576
           1       0.83      0.71      0.77       546

    accuracy                           0.82      1605
   macro avg       0.82      0.83      0.82      1605
weighted avg       0.82      0.82      0.82      1605

[[447  24  12]
 [ 25 486  65]
 [ 49 111 386]]


In [164]:
y_pred = uniBiSVM.predict(X_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

          -1       0.86      0.96      0.90       483
           0       0.83      0.78      0.81       576
           1       0.81      0.78      0.79       546

    accuracy                           0.83      1605
   macro avg       0.83      0.84      0.83      1605
weighted avg       0.83      0.83      0.83      1605

[[462  14   7]
 [ 31 450  95]
 [ 46  76 424]]
