**Importing required packages**

In [None]:
import json
import gzip
import pandas as pd
import gensim

**Downloading the dataset from Amazon Snap --> Cell Phones and Accessories**

In [None]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Cell_Phones_and_Accessories_5.json.gz

--2022-04-24 12:20:39--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Cell_Phones_and_Accessories_5.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 169071325 (161M) [application/octet-stream]
Saving to: ‘Cell_Phones_and_Accessories_5.json.gz.1’


2022-04-24 12:20:43 (47.3 MB/s) - ‘Cell_Phones_and_Accessories_5.json.gz.1’ saved [169071325/169071325]



In [None]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFiles/Electronics.json.gz

--2022-04-24 12:20:47--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFiles/Electronics.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3322874357 (3.1G) [application/octet-stream]
Saving to: ‘Electronics.json.gz’


2022-04-24 12:21:45 (54.0 MB/s) - ‘Electronics.json.gz’ saved [3322874357/3322874357]




**Retrieve 1 lakh records and making balanced dataset**

In [None]:
reviews = []
sentiment = []
p=0
ng=0
emp=0
with gzip.open("/content/Cell_Phones_and_Accessories_5.json.gz") as f:
    for l in f:
      if p==50000 and ng==50000:
        break
      d=json.loads(l.strip())
      r=d.get("reviewText","")
      if r=="":
        emp+=1
        continue
      rating=int(d.get('overall'))
      if rating>=3 and p<50000:
        sentiment.append(1)
        reviews.append(r)
        p+=1
      elif rating<3 and ng<50000:
        sentiment.append(0)
        reviews.append(r)
        ng+=1

print(emp)
dataset={'Review':reviews,'Sentiment':sentiment}
df=pd.DataFrame(dataset)
df['Sentiment'].describe()

188


count    100000.000000
mean          0.500000
std           0.500003
min           0.000000
25%           0.000000
50%           0.500000
75%           1.000000
max           1.000000
Name: Sentiment, dtype: float64

In [None]:
print(p,ng)

50000 50000


**Extracted reviews**

In [None]:
df.head()

Unnamed: 0,Review,Sentiment
0,Looks even better in person. Be careful to not...,1
1,When you don't want to spend a whole lot of ca...,1
2,"so the case came on time, i love the design. I...",1
3,DON'T CARE FOR IT. GAVE IT AS A GIFT AND THEY...,0
4,"I liked it because it was cute, but the studs ...",1


**Removing Stop words**

In [None]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import remove_stopwords
newreviews=[]
for line in reviews:
  newreviews.append(remove_stopwords(line))
newreviews[:10]

dataset={'Review':newreviews,'Sentiment':sentiment}
newdf=pd.DataFrame(dataset)
newdf.head()

Unnamed: 0,Review,Sentiment
0,Looks better person. Be careful drop phone rhi...,1
1,When don't want spend lot cash want great deal...,1
2,"case came time, love design. I'm actually miss...",1
3,DON'T CARE FOR IT. GAVE IT AS A GIFT AND THEY ...,0
4,"I liked cute, studs fall easily protect phone ...",1


**Tokenization of reviews and simple preprocessing to remove accents and special characters**

In [None]:
newdf['tokenized']=newdf['Review'].apply(simple_preprocess,deacc=True)
newdf.head()

Unnamed: 0,Review,Sentiment,tokenized
0,Looks better person. Be careful drop phone rhi...,1,"[looks, better, person, be, careful, drop, pho..."
1,When don't want spend lot cash want great deal...,1,"[when, don, want, spend, lot, cash, want, grea..."
2,"case came time, love design. I'm actually miss...",1,"[case, came, time, love, design, actually, mis..."
3,DON'T CARE FOR IT. GAVE IT AS A GIFT AND THEY ...,0,"[don, care, for, it, gave, it, as, gift, and, ..."
4,"I liked cute, studs fall easily protect phone ...",1,"[liked, cute, studs, fall, easily, protect, ph..."


**Stemming the tokens of each reviews**

In [None]:
from gensim.parsing.porter import PorterStemmer
porter_stemmer = PorterStemmer()
newdf['stemmed']=[[porter_stemmer.stem(word) for word in tokens] for tokens in newdf['tokenized'] ]
newdf.head()

Unnamed: 0,Review,Sentiment,tokenized,stemmed
0,Looks better person. Be careful drop phone rhi...,1,"[looks, better, person, be, careful, drop, pho...","[look, better, person, be, care, drop, phone, ..."
1,When don't want spend lot cash want great deal...,1,"[when, don, want, spend, lot, cash, want, grea...","[when, don, want, spend, lot, cash, want, grea..."
2,"case came time, love design. I'm actually miss...",1,"[case, came, time, love, design, actually, mis...","[case, came, time, love, design, actual, miss,..."
3,DON'T CARE FOR IT. GAVE IT AS A GIFT AND THEY ...,0,"[don, care, for, it, gave, it, as, gift, and, ...","[don, care, for, it, gave, it, as, gift, and, ..."
4,"I liked cute, studs fall easily protect phone ...",1,"[liked, cute, studs, fall, easily, protect, ph...","[like, cute, stud, fall, easili, protect, phon..."


**Dropping unwanted columns**

In [None]:
newdf=newdf.drop(columns=["Review","tokenized"],axis=1)
newdf.head()

Unnamed: 0,Sentiment,stemmed
0,1,"[look, better, person, be, care, drop, phone, ..."
1,1,"[when, don, want, spend, lot, cash, want, grea..."
2,1,"[case, came, time, love, design, actual, miss,..."
3,0,"[don, care, for, it, gave, it, as, gift, and, ..."
4,1,"[like, cute, stud, fall, easili, protect, phon..."


**Define the word2vec model**


1.   Each word with 100 dimensions
2.   Window for skip gram is chosen as 3 (3 words before and after current word)
3.   Minimum length review = 1
4.   No. of threads = 4



In [None]:
from gensim.models import Word2Vec
dim = 100
window = 3
min_count = 1
workers = 4 #threads
sg = 1 # skip gram model =1
stemmed_tokens = pd.Series(newdf['stemmed']).values
print(stemmed_tokens)

[list(['look', 'better', 'person', 'be', 'care', 'drop', 'phone', 'rhineston', 'fall', 'duh', 'more', 'decor', 'case', 'protect', 'fit', 'perfectli', 'secur', 'phone', 'overal', 'pleas', 'purchas'])
 list(['when', 'don', 'want', 'spend', 'lot', 'cash', 'want', 'great', 'deal', 'thi', 'shop', 'bui', 'from'])
 list(['case', 'came', 'time', 'love', 'design', 'actual', 'miss', 'stud', 'notic', 'stud', 'bit', 'sloppi', 'bow', 'notic', 'haven', 'phone', 've', 'notic', 'far'])
 ...
 list(['rubber', 'case', 'fit', 'poorli', 'kept', 'week', 'couldn', 'bother', 'send', 'threw', 'out'])
 list(['aw', 'fit', 'everi', 'time', 'try', 'holster', 'start', 'come', 'off', 'veri', 'unhappi', 'thi', 'am', 'activ', 'look', 'case'])
 list(['the', 'phone', 'fell', 'flat', 'screen', 'ft', 'and', 'got', 'shatter', 'screen', 'cost', 'replac', 'ballis', 'letdown', 'stai', 'awai', 'case'])]


**Compile the word2vec model and save the model**

In [None]:
model = Word2Vec(stemmed_tokens, min_count = min_count, size = dim, workers = workers, window = window, sg = sg)
model.save("/content/Cell_Phones_and_Accessories_wordvec.model")

In [None]:
embedded_matrix=[]
vocab_size=len(model.wv.vocab)
words=list(model.wv.vocab.keys())
print(words)
print(len(words))
d=dict()
for w in words:
  try:
    embedded_matrix.append(model.wv[w])
  except:
    print("Word not found")
embedded_matrix[0]

['look', 'better', 'person', 'be', 'care', 'drop', 'phone', 'rhineston', 'fall', 'duh', 'more', 'decor', 'case', 'protect', 'fit', 'perfectli', 'secur', 'overal', 'pleas', 'purchas', 'when', 'don', 'want', 'spend', 'lot', 'cash', 'great', 'deal', 'thi', 'shop', 'bui', 'from', 'came', 'time', 'love', 'design', 'actual', 'miss', 'stud', 'notic', 'bit', 'sloppi', 'bow', 'haven', 've', 'far', 'for', 'it', 'gave', 'as', 'gift', 'and', 'thei', 'were', 'okai', 'with', 'just', 'not', 'what', 'expect', 'like', 'cute', 'easili', 'recommend', 'the', 'product', 'exactli', 'pictur', 'nice', 'howev', 'dai', 'later', 'fell', 'apart', 'disappoint', 'qualiti', 'final', 'got', 'todai', 'took', 'forev', 'here', 'pic', 'right', 'wrote', 'review', 'start', 'off', 'packag', 'come', 'glue', 'opinion', 'good', 'coupl', 'if', 'that', 'wouldn', 'anoth', 'thank', 'none', 'jewel', 'fallen', 'glu', 'well', 'frame', 'given', 'happi', 'thing', 'know', 'is', 'carri', 'jean', 'tightli', 'big', 'you', 'worth', 'do', 'i

array([ 0.07629935,  0.09934748,  0.28636965,  0.17831433,  0.25044376,
        0.35616016,  0.07090731,  0.13086367,  0.69161564, -0.48810902,
        0.07003625, -0.47046646,  0.10394704, -0.24352965,  0.17088366,
       -0.10113315, -0.13200523, -0.6357845 ,  0.25706202,  0.1679882 ,
        0.23189239, -0.7486203 ,  0.43152726, -0.02845055,  0.26779088,
       -0.61599207, -0.44634312,  0.3407318 , -0.30198586, -0.10325061,
       -0.205022  ,  0.05525421,  0.251885  , -0.03910535,  0.31874308,
       -0.00589792, -0.11678181,  0.26002374, -0.26040298, -0.10227366,
        0.08572169, -0.40318742, -0.19379422,  0.40650144,  0.32528234,
        0.63823134,  0.32444438, -0.33788395, -0.10238775, -0.00196284,
        0.1841028 , -0.3237222 ,  0.671873  ,  0.3571726 ,  0.22026415,
       -0.5936914 ,  0.24824527, -0.14264612, -0.237571  ,  0.1981395 ,
        0.05217509, -0.32795203,  0.38160053, -0.22402821,  0.30432406,
        0.12767722, -0.25059065,  0.3463975 ,  0.1231373 , -0.42

In [None]:
m=len(stemmed_tokens[0])
for i in stemmed_tokens:
  l=len(i)
  if m<l:
    m=l
print(m)

1870


**Generate the word embeddings matrix**

In [None]:
import numpy as np
embeddedmatrix=np.array(model.wv)
embeddedmatrix

array(<gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x7f95a557da10>,
      dtype=object)

**Transform each review to corresponding word vector embeddings**



**Each review is represented by 100 dimensional vector (Mean of all the word embeddings)**

Write the result to .csv file

In [None]:
word2vec_filename ='train_review_word2vec_100.csv'
features = (newdf['stemmed'])
labels = newdf['Sentiment']
with open(word2vec_filename, 'w+') as word2vec_file:
    for index, row in features.items():
        model_vector = (np.mean([model[token] for token in row], axis=0)).tolist()
        if index == 0:
            header = ",".join(str(ele) for ele in range(100))
            word2vec_file.write(header)
            word2vec_file.write("\n")
        if type(model_vector) is list:  
            line1 = ",".join( [str(vector_element) for vector_element in model_vector] )
        else:
            line1 = ",".join([str(0) for i in range(100)])
        word2vec_file.write(line1)
        word2vec_file.write('\n')

  
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


**Store the review embeddings to dataframe**

In [None]:
from sklearn.tree import DecisionTreeClassifier
word2vec_df = pd.read_csv(word2vec_filename)
word2vec_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.201102,0.149875,-0.112171,0.118383,0.416979,0.327286,0.154336,0.057478,0.447208,-0.186070,...,-0.268290,0.299024,-0.344833,-0.173966,-0.181065,-0.201993,-0.118203,0.078211,0.175062,-0.164593
1,0.351723,0.146811,-0.206192,0.096770,0.406218,0.204849,0.126309,0.055647,0.284582,-0.273914,...,-0.351445,0.269002,-0.331432,-0.125766,-0.346509,-0.236837,-0.077785,-0.038711,0.112401,-0.158287
2,0.285156,0.148674,0.009843,0.205816,0.458559,0.333724,0.183890,0.081429,0.298612,-0.071473,...,-0.163430,0.249583,-0.244086,0.010536,-0.212766,-0.142644,-0.139323,0.166398,0.321142,-0.183039
3,0.156542,0.214251,-0.047994,0.226156,0.330246,0.323934,-0.128445,0.082838,0.350057,-0.189285,...,-0.273811,0.173252,-0.326897,-0.139023,-0.384983,-0.139406,-0.159024,0.013196,0.300179,-0.248503
4,0.106647,0.122458,-0.093243,0.057714,0.415199,0.330782,0.282219,0.032486,0.366546,-0.092426,...,-0.294113,0.292388,-0.442882,-0.188807,-0.232056,-0.154678,-0.242584,0.122429,0.257907,-0.282877
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0.275240,0.268842,-0.105282,0.047685,0.422296,0.329610,0.170659,-0.101163,0.300907,-0.190797,...,-0.352807,0.447245,-0.294171,-0.294370,-0.192227,-0.030138,-0.216543,0.132777,0.371935,0.049680
99996,0.226305,0.234817,-0.144354,0.016274,0.400869,0.230626,0.053952,0.067196,0.307234,-0.211814,...,-0.306989,0.308030,-0.233209,-0.063928,-0.393219,-0.210882,-0.077741,0.028259,0.200243,-0.133496
99997,0.222326,0.185661,-0.127061,0.117363,0.423375,0.361657,0.139204,0.067513,0.292826,-0.089660,...,-0.305266,0.403562,-0.341631,-0.098265,-0.487246,-0.177771,-0.145369,0.198640,0.294211,-0.012900
99998,0.222889,0.136583,-0.102832,0.101539,0.447973,0.309816,0.045750,0.009233,0.464476,-0.156971,...,-0.156217,0.250843,-0.218813,-0.198054,-0.258687,-0.177395,-0.140799,0.136471,0.135503,-0.096089


**Decision Tree Classifier**

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
clf_decision_word2vec = DecisionTreeClassifier()
features = word2vec_df
labels = newdf['Sentiment']
clf_decision_word2vec.fit(features,labels)

DecisionTreeClassifier()

In [None]:
testreviews = []
testsentiment = []
t_p=0
t_ng=0
t_emp=0
with gzip.open('/content/Electronics.json.gz') as f:
    for l in f:
      if t_p==5000 and t_ng==5000:
        break
      t_d=json.loads(l.strip())
      t_r=t_d.get("reviewText","")
      if t_r=="":
        t_emp+=1
        continue
      t_rating=int(t_d.get('overall'))
      if t_rating>=3 and t_p<5000:
        testsentiment.append(1)
        testreviews.append(t_r)
        t_p+=1
      elif t_rating<3 and t_ng<5000:
        testsentiment.append(0)
        testreviews.append(t_r)
        t_ng+=1

print(t_emp)
testset={'treview':testreviews,'tsentiment':testsentiment}
tdf=pd.DataFrame(testset)
tdf['tsentiment'].describe()

7


count    10000.000000
mean         0.500000
std          0.500025
min          0.000000
25%          0.000000
50%          0.500000
75%          1.000000
max          1.000000
Name: tsentiment, dtype: float64

In [None]:
tdf.head()

Unnamed: 0,treview,tsentiment
0,This was the first time I read Garcia-Aguilera...,1
1,"As with all of Ms. Garcia-Aguilera's books, I ...",1
2,I've not read any of Ms Aguilera's works befor...,1
3,This romance novel is right up there with the ...,1
4,Carolina Garcia Aguilera has done it again. S...,1


In [None]:
newtestreviews=[]
for line in testreviews:
  newtestreviews.append(remove_stopwords(line))
newtestreviews[:10]

testset={'treview':newtestreviews,'tsentiment':testsentiment}
newtestdf=pd.DataFrame(testset)
newtestdf.head()

Unnamed: 0,treview,tsentiment
0,This time I read Garcia-Aguilera. I came book ...,1
1,"As Ms. Garcia-Aguilera's books, I think MUST R...",1
2,"I've read Ms Aguilera's works before, having f...",1
3,This romance novel right rest amazing mystery ...,1
4,Carolina Garcia Aguilera again. She's written ...,1


In [None]:
newtestdf['tokenized']=newtestdf['treview'].apply(simple_preprocess,deacc=True)
newtestdf.head()

Unnamed: 0,treview,tsentiment,tokenized
0,This time I read Garcia-Aguilera. I came book ...,1,"[this, time, read, garcia, aguilera, came, boo..."
1,"As Ms. Garcia-Aguilera's books, I think MUST R...",1,"[as, ms, garcia, aguilera, books, think, must,..."
2,"I've read Ms Aguilera's works before, having f...",1,"[ve, read, ms, aguilera, works, before, having..."
3,This romance novel right rest amazing mystery ...,1,"[this, romance, novel, right, rest, amazing, m..."
4,Carolina Garcia Aguilera again. She's written ...,1,"[carolina, garcia, aguilera, again, she, writt..."


In [None]:
porter_stemmer = PorterStemmer()
newtestdf['stemmed']=[[porter_stemmer.stem(word) for word in tokens] for tokens in newtestdf['tokenized'] ]
newtestdf.head()

Unnamed: 0,treview,tsentiment,tokenized,stemmed
0,This time I read Garcia-Aguilera. I came book ...,1,"[this, time, read, garcia, aguilera, came, boo...","[thi, time, read, garcia, aguilera, came, book..."
1,"As Ms. Garcia-Aguilera's books, I think MUST R...",1,"[as, ms, garcia, aguilera, books, think, must,...","[as, ms, garcia, aguilera, book, think, must, ..."
2,"I've read Ms Aguilera's works before, having f...",1,"[ve, read, ms, aguilera, works, before, having...","[ve, read, ms, aguilera, work, befor, have, fi..."
3,This romance novel right rest amazing mystery ...,1,"[this, romance, novel, right, rest, amazing, m...","[thi, romanc, novel, right, rest, amaz, myster..."
4,Carolina Garcia Aguilera again. She's written ...,1,"[carolina, garcia, aguilera, again, she, writt...","[carolina, garcia, aguilera, again, she, writt..."


In [None]:
newtestdf['stemmed'].head()

0    [thi, time, read, garcia, aguilera, came, book...
1    [as, ms, garcia, aguilera, book, think, must, ...
2    [ve, read, ms, aguilera, work, befor, have, fi...
3    [thi, romanc, novel, right, rest, amaz, myster...
4    [carolina, garcia, aguilera, again, she, writt...
Name: stemmed, dtype: object

In [None]:
newtestdf=newtestdf.drop(columns=["treview","tokenized"],axis=1)
newtestdf.head()

Unnamed: 0,tsentiment,stemmed
0,1,"[thi, time, read, garcia, aguilera, came, book..."
1,1,"[as, ms, garcia, aguilera, book, think, must, ..."
2,1,"[ve, read, ms, aguilera, work, befor, have, fi..."
3,1,"[thi, romanc, novel, right, rest, amaz, myster..."
4,1,"[carolina, garcia, aguilera, again, she, writt..."


In [None]:
dim = 100
window = 3
min_count = 1
workers = 4 #threads
sg = 1 # skip gram model =1
test_stemmed_tokens = pd.Series(newtestdf['stemmed']).values
print(test_stemmed_tokens)

[list(['thi', 'time', 'read', 'garcia', 'aguilera', 'came', 'book', 'live', 'regi', 'kelli', 'thi', 'book', 'exactli', 'look', 'hit', 'spot', 'enjoi', 'book', 'written', 'onc', 'start', 'book', 'kept', 'come', 'more', 'it', 'cultur', 'famili', 'friendship', 'romanc', 'look', 'littl', 'romanc', 'pick', 'book', 'end', 'turn', 'right', 'love', 'main', 'chartacht', 'margarita', 'aka', 'daisi', 've', 'miami', 'wai', 'daisi', 'told', 'stori', 'certainli', 'felt', 'there', 'also', 'go', 'daisi', 'peril', 'close', 'book', 'feel', 'grown', 'emotion', 'well'])
 list(['as', 'ms', 'garcia', 'aguilera', 'book', 'think', 'must', 'read', 'imposs', 'down', 'success', 'deviat', 'past', 'lupe', 'solano', 'seri', 'captur', 'essenc', 'excit', 'local', 'color', 'divers', 'fabric', 'miami', 'sensual', 'cultur', 'enlighten'])
 list(['ve', 'read', 'ms', 'aguilera', 'work', 'befor', 'have', 'finish', 'on', 'hot', 'summer', 'go', 'check', 'lupe', 'solano', 'seri', 've', 'heard', 'about', 'on', 'hot', 'summer', 

In [None]:
testmodel = Word2Vec(test_stemmed_tokens, min_count = min_count, size = dim, workers = workers, window = window, sg = sg)
testmodel.save("/content/electronics10000.model")

In [None]:
word2vec_filename ='test_review_word2vec_electronics.csv'
testfeatures = (newtestdf['stemmed'])
testlabels = newtestdf['tsentiment']
with open(word2vec_filename, 'w+') as word2vec_file:
    for index, row in testfeatures.items():
      model_vector = (np.mean([testmodel[token] for token in row], axis=0)).tolist()
      if index == 0:
          header = ",".join(str(ele) for ele in range(100))
          word2vec_file.write(header)
          word2vec_file.write("\n")
      if type(model_vector) is list:  
          line1 = ",".join( [str(vector_element) for vector_element in model_vector] )
      else:
          line1 = ",".join([str(0) for i in range(100)])
      word2vec_file.write(line1)
      word2vec_file.write('\n')

  
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [None]:
test_df = pd.read_csv(word2vec_filename)
test_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.101114,0.187956,0.112255,0.173764,0.329515,-0.091193,0.017521,0.075244,0.195732,-0.103139,...,-0.169747,-0.003647,-0.113560,-0.178687,-0.143147,-0.007474,-0.032628,0.132370,0.293125,-0.395657
1,0.026571,0.181777,0.112661,0.160100,0.259524,-0.071283,-0.012348,0.072837,0.215870,-0.113231,...,-0.124038,0.021418,-0.084178,-0.109378,-0.130558,0.006736,-0.050064,0.069358,0.267093,-0.354992
2,0.126531,0.207412,0.064850,0.147857,0.347451,-0.062478,0.013363,0.132603,0.212351,-0.099188,...,-0.181747,0.000802,-0.115498,-0.189689,-0.124184,-0.020767,0.004547,0.036262,0.268855,-0.401255
3,0.107136,0.197011,0.117682,0.184625,0.332852,-0.117511,-0.000217,0.054227,0.199995,-0.124115,...,-0.165153,-0.007770,-0.074437,-0.186986,-0.174308,0.027719,-0.060444,0.101355,0.299018,-0.451777
4,0.053885,0.136751,0.107622,0.178785,0.302442,-0.090097,0.017779,0.067449,0.195120,-0.157921,...,-0.178129,0.009265,-0.124857,-0.162127,-0.146592,-0.012956,-0.040450,0.114863,0.268735,-0.373408
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.074040,0.302943,0.098401,0.216110,0.383185,-0.071323,-0.025246,0.131392,0.250428,-0.062728,...,-0.076838,0.032918,-0.131642,-0.205731,-0.104652,-0.050079,-0.028155,0.208551,0.347064,-0.445989
9996,0.107955,0.242960,0.042789,0.166086,0.364952,0.048111,-0.032899,0.075454,0.220500,-0.120591,...,-0.149167,0.058439,-0.108760,-0.235277,-0.049310,-0.037583,0.083080,0.161348,0.267207,-0.416793
9997,0.099178,0.258980,0.042428,0.157956,0.327532,0.018058,-0.074173,0.018857,0.196663,-0.068405,...,-0.162155,-0.011379,-0.114924,-0.269588,-0.006341,-0.039465,0.044309,0.232953,0.290361,-0.401188
9998,0.128668,0.271356,0.055805,0.168621,0.384702,0.039111,-0.042001,0.100323,0.229990,-0.123674,...,-0.155902,0.035042,-0.135866,-0.254373,-0.048874,-0.068895,0.057688,0.169285,0.286728,-0.423430


In [None]:
from sklearn.metrics import classification_report
test_predictions_word2vec = clf_decision_word2vec.predict(test_df)
print(classification_report(newtestdf['tsentiment'].values,test_predictions_word2vec))

              precision    recall  f1-score   support

           0       0.54      0.88      0.67      5000
           1       0.68      0.26      0.38      5000

    accuracy                           0.57     10000
   macro avg       0.61      0.57      0.52     10000
weighted avg       0.61      0.57      0.52     10000



In [None]:
newtestdf['predicted']=test_predictions_word2vec

In [None]:
newtestdf

Unnamed: 0,tsentiment,stemmed,predicted
0,1,"[thi, time, read, garcia, aguilera, came, book...",1
1,1,"[as, ms, garcia, aguilera, book, think, must, ...",0
2,1,"[ve, read, ms, aguilera, work, befor, have, fi...",0
3,1,"[thi, romanc, novel, right, rest, amaz, myster...",1
4,1,"[carolina, garcia, aguilera, again, she, writt...",0
...,...,...,...
9995,0,"[look, great, control, imposs, master, thei, i...",0
9996,0,"[consid, radio, come, box, mark, execut, micro...",1
9997,0,"[reciev, fisher, slim, thing, notic, plastici,...",1
9998,0,"[what, disappoint, thi, look, great, desk, sou...",0


In [None]:
from sklearn import svm

In [None]:
svm_clf=svm.SVC()
svm_clf.fit(features,labels)

SVC()

In [None]:
test_predictions_word2vec = svm_clf.predict(test_df)
print(classification_report(newtestdf['tsentiment'].values,test_predictions_word2vec))

              precision    recall  f1-score   support

           0       0.60      0.23      0.33      5000
           1       0.52      0.85      0.65      5000

    accuracy                           0.54     10000
   macro avg       0.56      0.54      0.49     10000
weighted avg       0.56      0.54      0.49     10000



In [None]:
newtestdf

Unnamed: 0,tsentiment,stemmed,predicted
0,1,"[thi, time, read, garcia, aguilera, came, book...",1
1,1,"[as, ms, garcia, aguilera, book, think, must, ...",0
2,1,"[ve, read, ms, aguilera, work, befor, have, fi...",0
3,1,"[thi, romanc, novel, right, rest, amaz, myster...",1
4,1,"[carolina, garcia, aguilera, again, she, writt...",0
...,...,...,...
9995,0,"[look, great, control, imposs, master, thei, i...",0
9996,0,"[consid, radio, come, box, mark, execut, micro...",1
9997,0,"[reciev, fisher, slim, thing, notic, plastici,...",1
9998,0,"[what, disappoint, thi, look, great, desk, sou...",0


In [None]:
from sklearn.ensemble import RandomForestClassifier
rfmodel = RandomForestClassifier(n_estimators = 100)
rfmodel.fit(features,labels)

RandomForestClassifier()

In [None]:
test_predictions_word2vec = rfmodel.predict(test_df)
print(classification_report(newtestdf['tsentiment'].values,test_predictions_word2vec))

              precision    recall  f1-score   support

           0       0.54      0.94      0.69      5000
           1       0.78      0.19      0.31      5000

    accuracy                           0.57     10000
   macro avg       0.66      0.57      0.50     10000
weighted avg       0.66      0.57      0.50     10000



In [None]:
from sklearn.linear_model import LogisticRegression
lrmodel = LogisticRegression(max_iter=1000)
lrmodel.fit(features,labels)

LogisticRegression(max_iter=1000)

In [None]:
test_predictions_word2vec = lrmodel.predict(test_df)
print(classification_report(newtestdf['tsentiment'].values,test_predictions_word2vec))

              precision    recall  f1-score   support

           0       0.81      0.03      0.05      5000
           1       0.51      0.99      0.67      5000

    accuracy                           0.51     10000
   macro avg       0.66      0.51      0.36     10000
weighted avg       0.66      0.51      0.36     10000

