# Sentiment Analysis

BOOK: Learning word vectors for sentiment analysis.

## 1. Data Preprocessing

In [50]:
import pyprind
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [3]:
import os
pbar = pyprind.ProgBar(50000)
labels = {'pos':1,'neg':0}
df = pd.DataFrame()
for s in ('test','train'):
    for l in ('pos','neg'):
        path = './aclImdb/%s/%s'%(s,l)
        for file in os.listdir(path):
            with open(os.path.join(path,file),'r') as infiles:
                txt = infiles.read()
            df = df.append([[txt,labels[l]]],ignore_index = True)
            pbar.update()
        

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:02:20


In [4]:
df.columns = ['review','sentiment']
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('./movie_data.csv',index = False)

In [5]:
df = pd.read_csv('./movie_data.csv')
df.head()

Unnamed: 0,review,sentiment
0,My family and I normally do not watch local mo...,1
1,"Believe it or not, this was at one time the wo...",0
2,"After some internet surfing, I found the ""Home...",0
3,One of the most unheralded great works of anim...,1
4,"It was the Sixties, and anyone with long hair ...",0


## 2. Transforming words into feature vectors

 CountVectorizer: Construct a bag-of-word based on the word counts in respective documents

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
docs = np.array(['The sun is shining','The weather is sweet','The sun is shining and the weather is sweet'])
bag = count.fit_transform(docs)

In [7]:
count.vocabulary_

{'the': 5, 'sun': 3, 'is': 1, 'shining': 2, 'weather': 6, 'sweet': 4, 'and': 0}

In [8]:
bag.toarray()

array([[0, 1, 1, 1, 0, 1, 0],
       [0, 1, 0, 0, 1, 1, 1],
       [1, 2, 1, 1, 1, 2, 1]], dtype=int64)

n-grams of size 3 and 4 yield good performances in anti-spam filtering

## 3. TD-IDF

###  3.1 Introduction
tf-idf : term frequency-inverse document frequency 

In [9]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
np.set_printoptions(precision=2)
tfidf.fit_transform(bag).toarray()

array([[0.  , 0.43, 0.56, 0.56, 0.  , 0.43, 0.  ],
       [0.  , 0.43, 0.  , 0.  , 0.56, 0.43, 0.56],
       [0.4 , 0.48, 0.31, 0.31, 0.31, 0.48, 0.31]])

### 3.2 Clean text data

In [10]:
df.loc[0,'review'][-50:]

'to Star Cinema!! Way to go, Jericho and Claudine!!'

In [11]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    return text

In [12]:
preprocessor('is seven .<br /><br /> Title (Brazil): Not available')

'is seven title brazil not available'

In [13]:
df['review'] = df['review'].apply(preprocessor)

In [14]:
df.head()

Unnamed: 0,review,sentiment
0,my family and i normally do not watch local mo...,1
1,believe it or not this was at one time the wor...,0
2,after some internet surfing i found the homefr...,0
3,one of the most unheralded great works of anim...,1
4,it was the sixties and anyone with long hair a...,0


### 3.3 Processing documents into tokens

In [15]:
# Tokenize documents
def tokenizer(text):
    return text.split()

In [16]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [18]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/TobiasChen/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [19]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

## 4. Training LR Model

In [22]:
X_train = df.loc[:25000,'review'].values
y_train = df.loc[:25000,'sentiment'].values
X_test = df.loc[25000:,'review'].values
y_test = df.loc[25000:,'sentiment'].values

In [None]:
#from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV 
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(strip_accents = None,lowercase = False,preprocessor = None)
param_grid = [{'vect__ngarm_range':[(-1,1)],
              'vect__stop_words':[stop,None],
              'vect__tokenizer':[tokenizer,tokenizer_porter],
               'clf__penalty':['l1','l2'],
               'clf__C':[1.0,10.0,100.0]},
              {'vect__ngram_range':[(1,1)],
               'vect__stop_words':[stop,None],
               'vect__tokenizer':[tokenizer,tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty':['l1','l2'],
               'clf__C':[1.0,10.0,100.0]       
              },]
lr_tfidf = Pipeline([('vect',tfidf),('clf',LogisticRegression(random_state = 0))])
gs_lr_tfidf = GridSearchCV(lr_tfidf,param_grid,scoring = 'accuracy',cv = 5, verbose = 1, n_jobs = -1)
gs_lr_tfidf.fit(X_train,y_train)


In [36]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              ]

lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)

In [37]:
gs_lr_tfidf.fit(X_train,y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


KeyboardInterrupt: 

In [None]:
print("Best parameter: {}".format(gs_lr_tfidf.best_params_))

In [None]:
print("CV accuracy: {}".format(gs_lr_tfidf.best_score_))
print("Test accuracy: {}".format(clf.score(X_test,X_train)))

Naive Bayes classifier:  
1. popular in e-mail spam filtering  
2. easy to implement   
3. computationally efficient   
4. perform well in samll database  
BOOK: Naive Bayes and Text Classification I - introduction and theory

## 5. Online Algorithms and out-of-core learning

In [38]:
import numpy as np
import re 
from nltk.corpus import stopwords
stop = stopwords.words('english')
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized


def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv)  # skip header
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

In [39]:
next(stream_docs(path='./movie_data.csv'))

('"My family and I normally do not watch local movies for the simple reason that they are poorly made, they lack the depth, and just not worth our time.<br /><br />The trailer of ""Nasaan ka man"" caught my attention, my daughter in law\'s and daughter\'s so we took time out to watch it this afternoon. The movie exceeded our expectations. The cinematography was very good, the story beautiful and the acting awesome. Jericho Rosales was really very good, so\'s Claudine Barretto. The fact that I despised Diether Ocampo proves he was effective at his role. I have never been this touched, moved and affected by a local movie before. Imagine a cynic like me dabbing my eyes at the end of the movie? Congratulations to Star Cinema!! Way to go, Jericho and Claudine!!"',
 1)

In [40]:
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

In [41]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
vect = HashingVectorizer(decode_error = 'ignore',n_features = 2**21,preprocessor = None,tokenizer = tokenizer)
clf = SGDClassifier(loss ='log',random_state = 1, n_iter = 1)
doc_stream = stream_docs(path = './movie_data.csv')

In [42]:
import pyprind
pbar = pyprind.ProgBar(45)

classes = np.array([0, 1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()



0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:30


In [44]:
X_test,y_test = get_minibatch(doc_stream,size = 5000)
X_test = vect.transform(X_test)
print(clf.score(X_test,y_test))

0.8676


In [49]:
clf = clf.partial_fit(X_test, y_test)

## 6. Reference

Learning word vectors for sentiment analysis.  
Words vs Character N-Grams for anti-spam filtering(Ioannis Kanaris)   
NLP: An algorithm for suffix stripping www.nltk.org/api/nltk/stem/html  
Latent allocation (D.M.Blei)  
Efficient Estimation of Word Representations in Vector Space.