In [1]:
!ls

README         imdb.vocab     [34mtest[m[m
Untitled.ipynb imdbEr.txt     [34mtrain[m[m


In [16]:
import os
cwd = os.getcwd()
datadir = '/'.join(cwd.split('/')[0:-1]) + '/Sentiment_Analysis/train'
datadir

'/Users/tramy/Documents/GitHub/Sentiment_Analysis/train'

In [19]:
from sklearn.datasets import load_files
import pandas as pd
import numpy as np


In [15]:
reviews_train = load_files(datadir)
# it is not yet a dataframe so cannot use pandas or numpy here

In [28]:
text_train, y_train = reviews_train.data, reviews_train.target
print("type of text_train:{}".format(type(text_train)))
print("length of text_train = total reviews: {}".format(len(text_train)))
print("text_train[3]:\n{}".format(text_train[3]))

type of text_train:<class 'list'>
length of text_train = total reviews: 75000
text_train[1]:
b"Dan Katzir has produced a wonderful film that takes us on a roller-coaster ride through a real romance set in the troubles surrounding modern Israel.<br /><br />For anyone who's ever been in love, the film brings back the uncertainties, the insecurities and heartache that make love so bitter-sweet. The atmosphere of fear and isolation that came with the difficult times in Israel at that time just serve to intensify the feeling. Instantly, you are drawn in to Dan's plight, and you can't fail to be deeply moved.<br /><br />You can't write drama and passion like this - the contrast between the realities of Dan's desperate, snatched relationship with Iris, and the realities of a state in turmoil make this eminently watchable. If you have an ounce of passion, and have ever been in love, see this film."


In [32]:
#Filter out some expression, like <br../>

text_train = [doc.replace(b"<br />", b"") for doc in text_train] #we need b" " because those are bytes-like objects, not string

In [38]:
print ("Samples per class (training): {}".format(np.bincount(y_train)))
print ("Sample y_train[1:4]:\n{}".format(y_train[1:4]))

Samples per class (training): [12500 12500 50000]
Sample y_train[1]:
[2 2 1]


#Bag of words
3 Steps:
    1. Tokenize - CountVectorizer
    2. Vocab building -  CountVectorizer
    3. Encoding - Transform 
    (Based on how often each word appear. return a vector of 0 and 1, 0: not appear, 1: appear)

In [39]:
from sklearn.feature_extraction.text import CountVectorizer

In [41]:
#Apply the CountVectorizer function into vectorizing our training data
vect = CountVectorizer().fit(text_train) 
X_train = vect.transform(text_train)
print("X_train:n\{}".format(X_train))

X_train:n\  (0, 3192)	1
  (0, 3289)	1
  (0, 5908)	3
  (0, 6720)	1
  (0, 6725)	1
  (0, 7429)	1
  (0, 8512)	1
  (0, 11331)	1
  (0, 12958)	1
  (0, 13471)	1
  (0, 14981)	1
  (0, 16997)	1
  (0, 17381)	2
  (0, 17534)	1
  (0, 18209)	1
  (0, 20529)	1
  (0, 22867)	1
  (0, 23532)	1
  (0, 27359)	1
  (0, 34156)	1
  (0, 37162)	1
  (0, 39661)	1
  (0, 41342)	2
  (0, 42840)	2
  (0, 43154)	1
  :	:
  (74999, 119185)	4
  (74999, 119431)	1
  (74999, 119504)	1
  (74999, 119671)	1
  (74999, 120596)	1
  (74999, 121727)	1
  (74999, 122216)	1
  (74999, 122220)	1
  (74999, 122397)	3
  (74999, 122437)	1
  (74999, 122629)	1
  (74999, 122969)	1
  (74999, 123058)	1
  (74999, 123256)	4
  (74999, 123354)	1
  (74999, 123364)	1
  (74999, 123381)	1
  (74999, 123406)	1
  (74999, 123963)	1
  (74999, 124051)	1
  (74999, 124335)	6
  (74999, 124369)	1
  (74999, 124950)	1
  (74999, 124985)	1
  (74999, 126139)	3


In [43]:
print("X_train:n\{}".format(repr(X_train)))

X_train:n\<75000x127229 sparse matrix of type '<class 'numpy.int64'>'
	with 10315468 stored elements in Compressed Sparse Row format>


In [51]:
# To look at features in the vocab list - use the get_feature_names() function
feature_names = vect.get_feature_names()
print ("Number of features: {}".format(len(feature_names)))
print ("First 20 features:/n{}".format(feature_names[::2000]))

Number of features: 127229
First 20 features:/n['00', '9out', 'ages', 'andress', 'aryeman', 'baio', 'bellah', 'bloodstain', 'briers', 'calm', 'cessation', 'circenses', 'complementing', 'countedsix', 'dahan', 'dench', 'discourages', 'dreamkeeeper', 'elderly', 'esposito', 'fang', 'flaquer', 'frizzyhead', 'gerry', 'grandness', 'halycon', 'hesitated', 'hubiriffic', 'incongruence', 'ireperable', 'journal', 'kindsa', 'landingham', 'limply', 'maaaybbbeee', 'marthe', 'mephestophelion', 'modem', 'mushing', 'nigger', 'oghris', 'oxbridge', 'pensamentos', 'pleaaaaaaaase', 'prettified', 'quantify', 'recommendation', 'retrieving', 'rp', 'scam', 'sequiter', 'sidemen', 'snk', 'sprite', 'stroptomycin', 'swings', 'teoe', 'toiled', 'tsiolkovsky', 'unflagging', 'vaporised', 'wakens', 'wilshire', 'ynis']


#Once we have the feature, build a classifier using Logistics regression - it works best for high-dimensional sparse data like this. We can take this step to evaluate our feature extraction

In [52]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

scores = cross_val_score(LogisticRegression(), X_train, y_train, cv=5)
print("Mean cross-validation accuracy: {:.2f}".format(np.mean(scores)))



Mean cross-validation accuracy: 0.71


#Next we are trying to improve this accuracy by filtering out super rare words (appears in less than 5 documents) and 
filter STOP WORDS

In [55]:
#Remove rarely seen features, min-seen-documents = 5
vect2 = CountVectorizer(min_df=5).fit(text_train)
X_train = vect2.transform(text_train)
feature_names2 = vect2.get_feature_names()
print ("Number of features: {}".format(len(feature_names2)))
print ("First 20 features:/n{}".format(feature_names2[:20]))


Number of features: 44550
First 20 features:/n['00', '000', '001', '007', '00am', '00pm', '00s', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '100', '1000', '1001']


In [58]:
#Filter stop words
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
vect3 = CountVectorizer(min_df=5, stop_words = "english").fit(text_train)
feature_names3 = vect3.get_feature_names()
print ("Number of features: {}".format(len(feature_names3)))
print ("First 20 features:/n{}".format(feature_names3[:20]))

Number of features: 44241
First 20 features:/n['00', '000', '001', '007', '00am', '00pm', '00s', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '100', '1000', '1001']


In [61]:
#Rescale data with TF-IDF (term frequency-inverse document frequency)
#skip for now but will try again

#Laten Dirichlet Allocation for topic summarizing
vect_func = CountVectorizer(max_features = 10000, max_df = .15)
X = vect_func.fit_transform(text_train)


In [66]:
from sklearn.decomposition import LatentDirichletAllocation #n_components = number of topics
lda = LatentDirichletAllocation(n_components = 10, learning_method ="batch", max_iter = 25, random_state = 0)

document_topics = lda.fit_transform(X)

In [73]:
!pip install mglearn

import mglearn
sorting = np.argsort(lda.components_, axis = 1)[:, ::-1]
feature_names4 = np.array(vect.get_feature_names())

mglearn.tools.print_topics(topics = range(10), feature_names= feature_names4, sorting =sorting, topics_per_chunk=5, n_words=10)

Collecting mglearn
[?25l  Downloading https://files.pythonhosted.org/packages/fb/01/8d3630ecc767c9de96a9c46e055f2a3a5f9e14a47d3d0348a36a5005fe67/mglearn-0.1.7.tar.gz (540kB)
[K     |████████████████████████████████| 542kB 3.7MB/s eta 0:00:01
Building wheels for collected packages: mglearn
  Building wheel for mglearn (setup.py) ... [?25ldone
[?25h  Stored in directory: /Users/tramy/Library/Caches/pip/wheels/74/cf/8d/04f4932d15854a36726c6210763c7127e62de28f5c8ddfcf3b
Successfully built mglearn
Installing collected packages: mglearn
Successfully installed mglearn-0.1.7




topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
codswallop    associations  benson        conceals      astro         
alleviate     carnality     ddlj          compared      deafening     
cuthbert      death         barnett       crude         adrienne      
decarlo       1981          barry         bagging       busfield      
charlene      1986          decarlo       brands        curfew        
decadence     beijing       campier       beijing       caught        
charlie       caked         benefiting    carol         circulate     
contempt      citations     dat           conceivably   commiserate   
angrily       comfy         ddr           baggy         boogie        
chad          damnation     casca         casca         accurately    


topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      
cann

In [74]:
lda100 = LatentDirichletAllocation(n_components = 100, learning_method ="batch", max_iter = 25, random_state = 0)

document_topics = lda100.fit_transform(X)

In [75]:
sorting = np.argsort(lda100.components_, axis = 1)[:, ::-1]
feature_names5 = np.array(vect.get_feature_names())

topics = np.array([7,16,24,25,28,36,37,45,51,53,54,63,89,97])

mglearn.tools.print_topics(topics = range(10), feature_names= feature_names4, sorting =sorting, topics_per_chunk=7, n_words=20)

topic 0       topic 1       topic 2       topic 3       topic 4       topic 5       topic 6       
--------      --------      --------      --------      --------      --------      --------      
avengers      comfy         alias         care          compared      cann          caracter      
cya           carnality     alyson        bots          bagging       cocoon        decaprio      
apologized    chazz         allegations   anger         baggy         acts          amongst       
cleaver       1981          churchill     camp          commando      consecutive   collinson     
cleavers      afterward     auteurs       coloring      crude         cameron       bleach        
conundrum     autos         authoress     advising      convenience   coe           braless       
alarming      commiserate   bigtime       daniels       caracter      connor        collaborate   
aching        anniversary   buffer        begemot       cahill        connolly      churn         
cuthbert  