### Imports

In [1]:
# Standard imports
import numpy as np
import pandas as pd

# SKLearn related imports
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import preprocessing

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin

# Let's look at some Movie Reviews

In [3]:
# Load the IMDB sentiment reviews and shuffle the rows.
# A fixed seed keeps the shuffle -- and therefore every row-indexed example
# further down the notebook -- identical across Restart & Run All.
df = pd.read_csv('./data/imdb_sentiment.csv')
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.dtypes

Sentiment    object
Text         object
dtype: object

In [4]:
# Preview the first few rows of the shuffled reviews
df.head()

Unnamed: 0,Sentiment,Text
0,Negative,Note to all mad scientists everywhere: if you'...
1,Positive,If you go to this movie expecting something it...
2,Negative,How this film gains a 6.7 rating is beyond bel...
3,Positive,"Just got through watching this version of ""Sam..."
4,Negative,Zombi 3 starts as a group of heavily armed men...


In [5]:
# Full text of the review stored at index label 1
df.loc[1, 'Text']

'If you go to this movie expecting something it isn\'t, you will be disappointed, as with any movie. This movie contains what Hemmingway described as the "iceberg effect". On the surface, its simply a cache of random movie clips smashed together to make a movie. If this would be written in a book, it would be a short story, because the action in the movie is very fast paced, and unless you actually try to catch it, the reasoning behind the plot (along with some subtle foreshadowing) can very well pass you by. Definitely a movie you will have to see twice in order to fully appreciate. Experimental Cinematography barely describes this movie. The camera-work and post production add much to the overall flavour of the film, making it quite artistic at some points and open to interpretation at others (something to be desired in American movies as of late). Although, at some parts it may get a little raunchy, gruesome and too heavy for some audiences, the movie never becomes completely unreal

In [6]:
# Hold out 20% of the reviews for validation; the seed makes the split repeatable
train_df, validation_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Stopwords

# Representing Text through a Bag of Words

In [7]:
# Bag-of-words vectorizer, constructed with no arguments (library defaults)
vectorizer = CountVectorizer()

In [8]:
# Learn the vocabulary from the training reviews only
vectorizer.fit(train_df['Text'].values)

# Show a handful of the learned words
vocabulary = list(vectorizer.vocabulary_)
print("Small sample of the vocabulary:", vocabulary[:20])

# Report how many distinct words were learned
print("\nNumber of distinct words:", len(vocabulary))

Small sample of the vocabulary: ['marvellous', 'beingness', 'lynched', 'masculine', 'galvanized', 'glitters', 'industrial', 'presumptive', 'anticlimactic', 'teamo', 'harry', 'tnn', 'pyun', 'clamour', 'lucian', 'boilers', 'tumbuan', 'calomari', 'intruded', 'jovial']

Number of distinct words: 68587


In [10]:
# Take one training review, kept as a length-1 array so it can be passed
# straight to `transform`.
sentence = train_df['Text'].values[12:13]
print(sentence[0], '\n')

# Transform sentence into bag of words representation
word_count_sentence = vectorizer.transform(sentence)

# Find the column indexes of the words which appear in the sentence.
# Only the second element of nonzero() is needed; indexing it directly avoids
# assigning to `_`, which would override IPython's last-output variable.
columns = word_count_sentence.nonzero()[1]

# Get the inverse map to map vector indexes back to words
vocabulary = vectorizer.vocabulary_
inv_map = {index: word for word, index in vocabulary.items()}

# Extract the corresponding word and count
counts = [(inv_map[col], word_count_sentence[0, col]) for col in columns]

for word, count in counts:
    print(word, ": ", count)

A phenomenal achievement in awfulness. It's actually hilariously awful.<br /><br />First off...Nicholas Cage must now have made it to the finals in the Over-Emoting Category in his acting class. Wearing new hair plugs and with a face that has been lifted so many times his pinned back ears seem to be straining to touch in the back he oozes not only a sick smarmiess but creates a "hero" character that you have no vested interest in.<br /><br />I don't know what it is with Neil Labute and female characters. He makes females out to be totally deviant and evil...and pays them back by having Cage punch several of them directly in the face and call them all "b****es" a few times too. I've enjoyed LaBute's early films and a few of his plays...but it's a strange fascination he has.<br /><br />I'd give this film a 2 out of 10 solely based on Ellen Burstyn's performance. By the time she finally makes her appearance (bravely soldiering through her scenes with her wig line clearly visible on her fo

In [11]:
# Bag-of-words counts for every training review, using the fitted vocabulary
word_count_matrix = vectorizer.transform(
    train_df['Text'].values
)
word_count_matrix.shape

(20000, 68587)

# Kicking it up a notch with TF-IDF

In [12]:
# Fit the TF-IDF transformer on the training counts and re-weight those same
# counts in a single step (equivalent to fit followed by transform).
tfidf = TfidfTransformer()
word_term_frequency_matrix = tfidf.fit_transform(word_count_matrix)

In [13]:
# Display the repr of the TF-IDF weighted training matrix
word_term_frequency_matrix

<20000x68587 sparse matrix of type '<class 'numpy.float64'>'
	with 2763676 stored elements in Compressed Sparse Row format>