In [None]:
# http://bogotobogo.com/Algorithms/Machine_Learning_NLP_Sentiment_Analysis_1.php

In [None]:
# Load the IMDB movie-review dataset (the ./aclImdb directory) into a DataFrame

In [28]:
import pyprind
import pandas as pd
import os
import io

# Root directory of the extracted IMDB review archive.
basepath = './aclImdb'

# Sub-directory name -> numeric sentiment label.
labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)

# Collect every review as a [text, label] row in a plain Python list and build
# the DataFrame once at the end. Calling `df.append` inside the loop copies the
# whole frame on every iteration (quadratic time), and `DataFrame.append` was
# removed in pandas 2.0.
rows = []
for split in ('test', 'train'):
    for label_name in ('pos', 'neg'):
        review_dir = os.path.join(basepath, split, label_name)
        # os.listdir returns entries in arbitrary order; sorting makes the row
        # order (and therefore any seeded shuffle applied later) reproducible.
        for fname in sorted(os.listdir(review_dir)):
            # io.open behaves the same on Python 2 and Python 3
            # (on Python 3 it is simply the built-in open).
            with io.open(os.path.join(review_dir, fname), 'r', encoding='utf-8') as infile:
                rows.append([infile.read(), labels[label_name]])
            pbar.update()
df = pd.DataFrame(rows, columns=['review', 'sentiment'])

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:16:19


In [35]:
# Materialise the whole frame as a NumPy array and check its (rows, columns) shape.
X = df.values
X.shape

(50000, 2)

In [40]:
import numpy as np

# Shuffle the rows with a fixed seed so the permutation is repeatable.
np.random.seed(0)
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)

# Persist the shuffled reviews (without the index column), then reload them
# from disk and preview the first five rows.
df.to_csv('./movie_data.csv', index=False, encoding='utf-8')

df = pd.read_csv('./movie_data.csv')
df.head()

Unnamed: 0,review,sentiment
0,Going into the movie with the right expectatio...,0
1,I want to believe all new horror films coming ...,0
2,I love this movie like no other. Another time ...,1
3,Of all the versions of the Odyssey (or of any ...,1
4,I watched this film on ITV and I enjoyed it a ...,1


In [58]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer; min_df=1 is passed explicitly (the repr below shows
# every other parameter left at its default value).
vectorizer = CountVectorizer(min_df=1)
# Bare expression: display the vectorizer's full parameter set as the cell output.
vectorizer

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [95]:
# Four-sentence toy corpus used to illustrate the bag-of-words encoding.
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
    ]
# Learn the vocabulary from the corpus and encode each document as one row of
# term counts (returned as a sparse matrix).
bag = vectorizer.fit_transform(corpus)
# Show the matrix dimensions next to its sparse-matrix repr.
bag.shape, bag

((4, 9), <4x9 sparse matrix of type '<type 'numpy.int64'>'
 	with 19 stored elements in Compressed Sparse Row format>)

In [96]:
# Convert the sparse count matrix to a dense array so the individual counts are visible.
bag.toarray()

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 2, 1, 0, 1],
       [1, 0, 0, 0, 1, 0, 1, 1, 0],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]])

In [97]:
# Mapping from each learned term to its column index in the count matrix.
vectorizer.vocabulary_

{u'and': 0,
 u'document': 1,
 u'first': 2,
 u'is': 3,
 u'one': 4,
 u'second': 5,
 u'the': 6,
 u'third': 7,
 u'this': 8}

In [98]:
# Look up the column index assigned to a single term.
vectorizer.vocabulary_.get('third')

7

In [99]:
# None of these words appear in the fitted vocabulary, so the encoded row is all zeros.
vectorizer.transform(['Something completely new.']).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [100]:
# tf-idf computed from the text corpus (term counts come from the CountVectorizer)

In [101]:
from sklearn.feature_extraction.text import TfidfTransformer

# Print floats with two decimals so the weight matrix stays readable.
np.set_printoptions(precision=2)

# Raw term counts for the corpus (this re-fits the vectorizer), then rescale
# them to tf-idf weights with idf smoothing switched off.
corpus_counts = vectorizer.fit_transform(corpus)
tfidf = TfidfTransformer(smooth_idf=False)
tfidf.fit_transform(corpus_counts).toarray()

array([[ 0.  ,  0.43,  0.57,  0.43,  0.  ,  0.  ,  0.34,  0.  ,  0.43],
       [ 0.  ,  0.24,  0.  ,  0.24,  0.  ,  0.89,  0.19,  0.  ,  0.24],
       [ 0.56,  0.  ,  0.  ,  0.  ,  0.56,  0.  ,  0.24,  0.56,  0.  ],
       [ 0.  ,  0.43,  0.57,  0.43,  0.  ,  0.  ,  0.34,  0.  ,  0.43]])

In [104]:
# The per-feature idf weights computed by the fit call above are stored on the
# fitted transformer in its `idf_` attribute (one value per vocabulary term):
tfidf.idf_

array([ 2.39,  1.29,  1.69,  1.29,  2.39,  2.39,  1.  ,  2.39,  1.29])

In [70]:
# tf-idf computed directly from a hand-written matrix of term counts

In [105]:
from sklearn.feature_extraction.text import TfidfTransformer
# Fresh tf-idf transformer with idf smoothing switched off.
transformer = TfidfTransformer(smooth_idf=False)
# Bare expression: display the transformer's parameters as the cell output.
transformer   

TfidfTransformer(norm=u'l2', smooth_idf=False, sublinear_tf=False,
         use_idf=True)

In [106]:
# Hand-written term-count matrix: 6 rows (documents) by 3 columns (terms).
counts = [[3, 0, 1],
          [2, 0, 0],
          [3, 0, 0],
          [4, 0, 0],
          [3, 2, 0],
          [3, 0, 2]]
# NOTE: this rebinds `tfidf`. An earlier cell bound that name to a
# TfidfTransformer; from here on it refers to the transformed sparse matrix,
# so the earlier `tfidf.idf_` cell cannot be re-run after this one.
tfidf = transformer.fit_transform(counts)
# Bare expression: show the sparse-matrix repr of the result.
tfidf

<6x3 sparse matrix of type '<type 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [107]:
# Dense view of the tf-idf weights computed from `counts`.
tfidf.toarray() 

array([[ 0.82,  0.  ,  0.57],
       [ 1.  ,  0.  ,  0.  ],
       [ 1.  ,  0.  ,  0.  ],
       [ 1.  ,  0.  ,  0.  ],
       [ 0.47,  0.88,  0.  ],
       [ 0.58,  0.  ,  0.81]])

In [91]:
# Same counts, but with a transformer built from default settings (no
# smooth_idf=False this time) for comparison with the result above.
# This rebinds `transformer`, replacing the smooth_idf=False instance.
transformer = TfidfTransformer()
# Fit on `counts` and show the resulting weights as a dense array.
transformer.fit_transform(counts).toarray()

array([[ 0.85,  0.  ,  0.52],
       [ 1.  ,  0.  ,  0.  ],
       [ 1.  ,  0.  ,  0.  ],
       [ 1.  ,  0.  ,  0.  ],
       [ 0.55,  0.83,  0.  ],
       [ 0.63,  0.  ,  0.78]])

In [92]:
# idf weights learned by the default-configured transformer, one per column of `counts`.
transformer.idf_

array([ 1.  ,  2.25,  1.85])