In [97]:
#wordnet in nltk
from nltk.corpus import wordnet as wn
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from sklearn.metrics import accuracy_score
#import classification report
from sklearn.metrics import classification_report, confusion_matrix
nltk.download('wordnet')
import numpy as np

#random forest
from sklearn.ensemble import RandomForestClassifier

[nltk_data] Downloading package wordnet to /home/dorin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [98]:
#read imdb/imdb.neg
with open('imdb/imdb.neg', 'r') as f:
    neg = f.readlines()
#read imdb/imdb.pos
with open('imdb/imdb.pos', 'r') as f:
    pos = f.readlines()

In [99]:
# use svms for sentiment analysis on the reviews
# use the first 1000 reviews for training and the next 1000 for testing

data_points = 10000

train = neg[:data_points] + pos[:data_points]
test = neg[data_points:data_points*2] + pos[data_points:data_points*2]

#create labels
y_train = [0]*data_points + [1]*data_points
y_test = [0]*data_points + [1]*data_points

#use a count vectorizer to extract features
vectorizer = CountVectorizer(min_df=5, max_df=0.8, ngram_range=(1,2), stop_words='english')
X_train = vectorizer.fit_transform(train)
X_test = vectorizer.transform(test)

In [58]:
#train the svm
clf = svm.SVC(kernel='linear')
clf.fit(X_train, y_train)

In [59]:
#predict the test set
y_pred = clf.predict(X_test)

In [60]:
#classification report of y pred vs y test
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.82      0.79     10000
           1       0.80      0.73      0.77     10000

    accuracy                           0.78     20000
   macro avg       0.78      0.78      0.78     20000
weighted avg       0.78      0.78      0.78     20000



In [69]:
#gensim word2vec
import gensim
from gensim.models import Word2Vec
from nltk.data import find
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)



LookupError: 
**********************************************************************
  Resource [93mword2vec_sample[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('word2vec_sample')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mmodels/word2vec_sample/pruned.word2vec.txt[0m

  Searched in:
    - '/home/dorin/nltk_data'
    - '/home/dorin/miniconda3/envs/uni_env/nltk_data'
    - '/home/dorin/miniconda3/envs/uni_env/share/nltk_data'
    - '/home/dorin/miniconda3/envs/uni_env/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [70]:
#download word2vec model
nltk.download("word2vec_sample")

[nltk_data] Downloading package word2vec_sample to
[nltk_data]     /home/dorin/nltk_data...
[nltk_data]   Unzipping models/word2vec_sample.zip.


True

In [72]:
#load nltk word2vec_sample
from nltk.data import find
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)

In [73]:
#find synonyms of cancer using model
model.most_similar('cancer')

[('lymphoma', 0.7796972393989563),
 ('leukemia', 0.778712272644043),
 ('cancers', 0.7422202229499817),
 ('Cancer', 0.6882086992263794),
 ('disease', 0.6603152751922607),
 ('tumor', 0.6315516829490662),
 ('tumors', 0.6185369491577148),
 ('malignancy', 0.6102085113525391),
 ('Leukemia', 0.5698093175888062),
 ('malignancies', 0.5675305128097534)]

In [82]:
#get embedding for Cancer
model['cancer'].shape

(300,)

In [100]:
#convert the reviews to word2vec vectors
X_train = []
for review in train:
    review_vec = np.zeros(300)
    size = 0
    for word in review.split():
        try:
            review_vec += model[word]
            size += 1
        except KeyError:
            pass
    #average
    if size != 0:
        review_vec /= size
    X_train.append(review_vec)

X_test = []
for review in test:
    review_vec = np.zeros(300)
    size = 0
    for word in review.split():
        try:
            review_vec += model[word]
            size += 1
        except KeyError:
            pass
    #average
    if size != 0:
        review_vec /= size
    X_test.append(review_vec)

In [101]:
#random forest
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.75      0.74     10000
           1       0.74      0.72      0.73     10000

    accuracy                           0.73     20000
   macro avg       0.74      0.73      0.73     20000
weighted avg       0.74      0.73      0.73     20000



In [102]:
#random forest with word vectorizer 
data_points = 10000

train = neg[:data_points] + pos[:data_points]
test = neg[data_points:data_points*2] + pos[data_points:data_points*2]

#create labels
y_train = [0]*data_points + [1]*data_points
y_test = [0]*data_points + [1]*data_points

#use a count vectorizer to extract features
vectorizer = CountVectorizer(min_df=5, max_df=0.8, ngram_range=(1,2), stop_words='english')
X_train = vectorizer.fit_transform(train)
X_test = vectorizer.transform(test)

clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.78      0.77     10000
           1       0.77      0.76      0.76     10000

    accuracy                           0.77     20000
   macro avg       0.77      0.77      0.77     20000
weighted avg       0.77      0.77      0.77     20000



In [92]:
#train svm
clf = svm.SVC(kernel='linear')
clf.fit(X_train, y_train)


In [93]:
#get classification report
y_pred = clf.predict(X_test)

In [94]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.78      0.76     10000
           1       0.77      0.73      0.75     10000

    accuracy                           0.76     20000
   macro avg       0.76      0.76      0.76     20000
weighted avg       0.76      0.76      0.76     20000



In [86]:
X_train[0].shape

(300,)

In [95]:
len(neg)

300000