In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from termcolor import colored

In [3]:
# Map each data-source name to the tab-separated file holding its reviews.
filepath_dict = {'imdb': 'reviews.txt'}

# Read every source into its own frame. `names=` is supplied, so pandas treats
# the first line of each file as data and labels the two columns itself.
df_list = []
for source, filepath in filepath_dict.items():
  df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
  df_list.append(df)

# ignore_index=True gives the combined frame a unique 0..n-1 index. Without it
# every source keeps its own 0..k-1 labels, so adding a second file would create
# duplicate labels and break later index-aligned column assignments.
df = pd.concat(df_list, ignore_index=True)
print(colored('Visualizing the dataset', 'green'))
print(df.iloc[0], '\n')

[32mVisualizing the dataset[0m
sentence    A very, very, very slow-moving, aimless movie ...
label                                                       0
Name: 0, dtype: object 



In [6]:
sentences = df['sentence']

# min_df=1 keeps every term that occurs in at least one document, i.e. the full
# vocabulary. An integer 0 selects the same terms but is rejected by parameter
# validation in recent scikit-learn releases (ints must be >= 1).
# lowercase=False keeps the original casing, so "Very" and "very" are distinct terms.
vectorizer = CountVectorizer(min_df=1, lowercase=False)
vectorizer.fit(sentences)

print(colored('Vocabulary of all the unique words in the sentence', 'green'))
print(vectorizer.vocabulary_, '\n')

# Show the bag-of-words counts as a dense array (one row per sentence).
vectorizer.transform(sentences).toarray()

[32mVocabulary of all the unique words in the sentence[0m



array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [8]:
# Pull plain arrays out of the frame: labels are the target, raw text the input.
y = df['label'].values
sentences = df['sentence'].values

# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
(sentences_train, sentences_test,
 y_train, y_test) = train_test_split(
    sentences, y, test_size=0.25, random_state=1000)

# The vocabulary is learned from the training sentences only, then reused
# unchanged to encode the held-out sentences.
vectorizer = CountVectorizer()
x_train = vectorizer.fit_transform(sentences_train)
x_test = vectorizer.transform(sentences_test)


In [9]:
# Baseline: logistic regression on the raw bag-of-words counts.
# fit() returns the estimator, so construction and training can be chained.
classifier = LogisticRegression().fit(x_train, y_train)
score = classifier.score(x_test, y_test)

print(colored('Model performance', 'green'))
print('Accuracy:', score)

[32mModel performance[0m
Accuracy: 0.7486631016042781


### To try to improve on the accuracy above, the pipeline is rerun after cleaning and normalizing the text

## Remove all punctuation from the text

In [30]:
import string
def remove_punctuation(text):
    """Return ``text`` with every character listed in ``string.punctuation`` removed.

    Characters are dropped, not replaced, so "slow-moving" becomes "slowmoving".
    """
    kept_chars = filter(lambda ch: ch not in string.punctuation, text)
    return "".join(kept_chars)

In [35]:
# Strip punctuation from every review. The cleaned strings are kept both as a
# plain list (`sentences`) and as the frame's 'sentence' column.
sentences = [remove_punctuation(raw_sentence) for raw_sentence in df['sentence']]
df['sentence'] = sentences
df.head()

Unnamed: 0,sentence,label,tokens
0,A very very very slowmoving aimless movie abou...,0,"[A, very,, very,, very, slow-moving,, aimless,..."
1,Not sure who was more lost the flat character...,0,"[Not, sure, who, was, more, lost, -, the, flat..."
2,Attempting artiness with black white and clev...,0,"[Attempting, artiness, with, black, &, white, ..."
3,Very little music or anything to speak of,0,"[Very, little, music, or, anything, to, speak,..."
4,The best scene in the movie was when Gerardo i...,1,"[The, best, scene, in, the, movie, was, when, ..."


## Tokenize each sentence into a list of words

In [36]:
import nltk
def tokenize(text):
    """Split ``text`` into a list of tokens with NLTK's ``WhitespaceTokenizer``."""
    return nltk.tokenize.WhitespaceTokenizer().tokenize(text)

In [37]:
# Store the token list of every cleaned sentence in a new 'tokens' column.
df['tokens'] = df['sentence'].apply(tokenize)
df.head()

Unnamed: 0,sentence,label,tokens
0,A very very very slowmoving aimless movie abou...,0,"[A, very, very, very, slowmoving, aimless, mov..."
1,Not sure who was more lost the flat character...,0,"[Not, sure, who, was, more, lost, the, flat, c..."
2,Attempting artiness with black white and clev...,0,"[Attempting, artiness, with, black, white, and..."
3,Very little music or anything to speak of,0,"[Very, little, music, or, anything, to, speak,..."
4,The best scene in the movie was when Gerardo i...,1,"[The, best, scene, in, the, movie, was, when, ..."


## Removing short tokens (3 characters or fewer)

In [43]:
def remove_stopwords(text, min_length=4):
    """Drop short tokens from a token list.

    Despite the name, no stop-word list is consulted: this is a pure length
    filter. With the default ``min_length=4`` every token of three characters
    or fewer is removed (e.g. "the", "was", "of") and longer ones are kept
    (e.g. "with", "very").

    Parameters
    ----------
    text : iterable of str
        Tokens of one sentence.
    min_length : int, default 4
        Minimum number of characters a token needs in order to be kept.

    Returns
    -------
    list of str
        The tokens with ``len(token) >= min_length``, in their original order.
    """
    return [token for token in text if len(token) >= min_length]

In [47]:
# Filter each token list through remove_stopwords, replacing the 'tokens' column.
df['tokens'] = df['tokens'].apply(remove_stopwords)
df.head()

Unnamed: 0,sentence,label,tokens
0,A very very very slowmoving aimless movie abou...,0,"[very, very, very, slowmoving, aimless, movie,..."
1,Not sure who was more lost the flat character...,0,"[sure, more, lost, flat, characters, audience,..."
2,Attempting artiness with black white and clev...,0,"[Attempting, artiness, with, black, white, cle..."
3,Very little music or anything to speak of,0,"[Very, little, music, anything, speak]"
4,The best scene in the movie was when Gerardo i...,1,"[best, scene, movie, when, Gerardo, trying, fi..."


## Stemming

In [48]:
import nltk
import pandas as pd
from nltk.stem import PorterStemmer

porter_stemmer = PorterStemmer()

# Stem every token, producing one list of stems per review.
stemmed_texts = [
    [porter_stemmer.stem(word=token) for token in token_arr]
    for token_arr in df['tokens']
]

# Pair each punctuation-free sentence with its stems for side-by-side inspection.
stemdf = pd.DataFrame({'original_sentence': sentences, 'stemmed_sentence': stemmed_texts})
stemdf

Unnamed: 0,original_sentence,stemmed_sentence
0,A very very very slowmoving aimless movie abou...,"[veri, veri, veri, slowmov, aimless, movi, abo..."
1,Not sure who was more lost the flat character...,"[sure, more, lost, flat, charact, audienc, nea..."
2,Attempting artiness with black white and clev...,"[attempt, arti, with, black, white, clever, ca..."
3,Very little music or anything to speak of,"[veri, littl, music, anyth, speak]"
4,The best scene in the movie was when Gerardo i...,"[best, scene, movi, when, gerardo, tri, find, ..."
...,...,...
743,I just got bored watching Jessice Lange take h...,"[just, bore, watch, jessic, lang, take, cloth]"
744,Unfortunately any virtue in this films product...,"[unfortun, virtu, thi, film, product, work, lo..."
745,In a word it is embarrassing,"[word, embarrass]"
746,Exceptionally bad,[except]


## Lemmatization

In [53]:
# The WordNet corpus must be available locally before the lemmatizer can be used.
nltk.download('wordnet')
lemmatiser = nltk.stem.WordNetLemmatizer()

# Lemmatize the cleaned tokens in df['tokens'], not the Porter stems. Stems such
# as "veri" or "movi" are not dictionary words, so a WordNet lookup would return
# them unchanged and this step would have no effect. Reading df['tokens'] also
# matches the recorded output of this cell (unstemmed, case-preserved tokens).
# The names `stemmed_texts`, `stemdf` and 'stemmed_sentence' are kept because a
# later cell reads stemdf['stemmed_sentence'].
stemmed_texts = [
    [lemmatiser.lemmatize(word=word) for word in token_arr]
    for token_arr in df['tokens']
]

stemdf = pd.DataFrame({'original_sentence': sentences, 'stemmed_sentence': stemmed_texts})
stemdf

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,original_sentence,stemmed_sentence
0,A very very very slowmoving aimless movie abou...,"[very, very, very, slowmoving, aimless, movie,..."
1,Not sure who was more lost the flat character...,"[sure, more, lost, flat, character, audience, ..."
2,Attempting artiness with black white and clev...,"[Attempting, artiness, with, black, white, cle..."
3,Very little music or anything to speak of,"[Very, little, music, anything, speak]"
4,The best scene in the movie was when Gerardo i...,"[best, scene, movie, when, Gerardo, trying, fi..."
...,...,...
743,I just got bored watching Jessice Lange take h...,"[just, bored, watching, Jessice, Lange, take, ..."
744,Unfortunately any virtue in this films product...,"[Unfortunately, virtue, this, film, production..."
745,In a word it is embarrassing,"[word, embarrassing]"
746,Exceptionally bad,[Exceptionally]


## Rebuild each sentence from its preprocessed tokens

In [52]:
def combine_sentences(tokens):
    """Join a sequence of string tokens into one space-separated sentence."""
    return " ".join(tokens)

In [61]:
# Overwrite the 'sentence' column with the space-joined preprocessed tokens.
# NOTE(review): the assignment aligns on index labels, so df and stemdf are
# assumed to share the same index - confirm if more sources are added.
df['sentence'] = stemdf['stemmed_sentence'].apply(combine_sentences)
df.head()

Unnamed: 0,sentence,label,tokens
0,very very very slowmoving aimless movie about ...,0,"[very, very, very, slowmoving, aimless, movie,..."
1,sure more lost flat character audience nearly ...,0,"[sure, more, lost, flat, characters, audience,..."
2,Attempting artiness with black white clever ca...,0,"[Attempting, artiness, with, black, white, cle..."
3,Very little music anything speak,0,"[Very, little, music, anything, speak]"
4,best scene movie when Gerardo trying find song...,1,"[best, scene, movie, when, Gerardo, trying, fi..."


In [62]:
# Show the last rows of the frame to check the rebuilt 'sentence' column at the end of the data.
df.tail()

Unnamed: 0,sentence,label,tokens
743,just bored watching Jessice Lange take clothes,0,"[just, bored, watching, Jessice, Lange, take, ..."
744,Unfortunately virtue this film production work...,0,"[Unfortunately, virtue, this, films, productio..."
745,word embarrassing,0,"[word, embarrassing]"
746,Exceptionally,0,[Exceptionally]
747,insult one intelligence huge waste money,0,"[insult, ones, intelligence, huge, waste, money]"


## Python Implementation - Vectorizer

In [58]:
# Same split and encoding as the baseline, now fed with the preprocessed text.
sentences, y = df['sentence'].values, df['label'].values

# Identical test fraction and seed as the baseline split.
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences,
    y,
    test_size=0.25,
    random_state=1000,
)

# fit() returns the vectorizer, so the vocabulary is learned on the training
# sentences in the same statement that creates it.
vectorizer = CountVectorizer().fit(sentences_train)

x_train = vectorizer.transform(sentences_train)
x_test = vectorizer.transform(sentences_test)

## Classify with logistic regression

In [59]:
# Train on the preprocessed features and score on the held-out set.
classifier = LogisticRegression()
score = classifier.fit(x_train, y_train).score(x_test, y_test)

print(colored('Model performance', 'green'))
print('Accuracy:', score)

[32mModel performance[0m
Accuracy: 0.7540106951871658
