# Building a Sentiment Analyzer
This project is using the IMDB movie review dataset. Publications Using the Dataset:
Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts. (2011). Learning Word Vectors for Sentiment Analysis. The 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011). 

> Initial Stage (Compiling the data and building the dataframe)

In [None]:
#Download the data
from urllib.request import urlretrieve as retrieve
url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
retrieve(url, filename="./aclImdb_v1.tar.gz") 
print ("download complete!")

In [None]:
# Unzipping the data
import tarfile
tfile = tarfile.open("aclImdb_v1.tar.gz", "r:gz")
tfile.extractall()

print ("ready to go!")

In [None]:
# Convert the dataset from files to a python DataFrame
import pandas as pd
import os
folder = 'aclImdb'
labels = {'pos': 1, 'neg': 0}
df = pd.DataFrame()
for f in ('test', 'train'):    
    for l in ('pos', 'neg'):
        path = os.path.join(folder, f, l)
        for file in os.listdir (path) :
            with open(os.path.join(path, file),'r', encoding='utf-8') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]],ignore_index=True)
df.columns = ['review', 'sentiment']

In [None]:
# Save the assembled data as .csv file for future retrieve
df.to_csv('movie_data.csv', index=False, encoding='utf-8')
df.head()

> Start working from here after performing the previous steps once

In [1]:
# Read the saved file
import pandas as pd
df = pd.read_csv('movie_data.csv')
df.head()

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
review       50000 non-null object
sentiment    50000 non-null int64
dtypes: int64(1), object(1)
memory usage: 586.0+ KB


## Step 1: Preprocessing

Text preprocessing include basic four steps
- 1-	Tokenization
- 2-	Removing punctuation
- 3-	Removing stop words
- 4-	Stemming

In [2]:
# Using nltk
import nltk

In [3]:
# Defining a function for tokenization and removing punctuations
# RegexpTokenizer can tokenize and remove punctuation in the same time
def mytokenizer(item):
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    item = str(item)
    item = tokenizer.tokenize(item)
    return item

In [4]:
# Defining a function for removing stop words
def mystopwordsremover(item):
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    item = [w for w in item if not w in stop_words]
    return item

In [5]:
# Defining a function for Stemming
def mystemmer(item):
    from nltk.stem import PorterStemmer
    ps = PorterStemmer()
    item = [ps.stem(word) for word in item]
    return str(item)

In [6]:
# Processing the dataframe
# Clone the dataframe to compare the accuracy of ML algurithms before and after text processing
df2 = df

# Measure processing time
from time import time
t0 = time()

# Tokenization & removing punctuation
df2.review = df2.review.apply(lambda x: mytokenizer(x))

# Remove stopwords
df2.review = df2.review.apply(lambda x: mystopwordsremover(x))

# Stemming
df2.review = df2.review.apply(lambda x: mystemmer(x))

print ("Text preprocessing time:", round(time()-t0, 3), "s")

Text preprocessing time: 1886.523 s


In [8]:
df2.head()

Unnamed: 0,review,sentiment
0,"['I', 'went', 'saw', 'movi', 'last', 'night', ...",1
1,"['actor', 'turn', 'director', 'bill', 'paxton'...",1
2,"['As', 'recreat', 'golfer', 'knowledg', 'sport...",1
3,"['I', 'saw', 'film', 'sneak', 'preview', 'deli...",1
4,"['bill', 'paxton', 'taken', 'true', 'stori', '...",1


In [9]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
review       50000 non-null object
sentiment    50000 non-null int64
dtypes: int64(1), object(1)
memory usage: 586.0+ KB


In [10]:
# Save the processed data as .csv file for future retrieve
df2.to_csv('movie_data_processed.csv', index=False, encoding='utf-8')

## Step 2: Feature Extraction

In [11]:
# Read the saved files
import pandas as pd
df = pd.read_csv('movie_data.csv')
df2 = pd.read_csv('movie_data_processed.csv')

In [21]:
df.head()

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1


In [22]:
df2.head()

Unnamed: 0,review,sentiment
0,"['I', 'went', 'saw', 'movi', 'last', 'night', ...",1
1,"['actor', 'turn', 'director', 'bill', 'paxton'...",1
2,"['As', 'recreat', 'golfer', 'knowledg', 'sport...",1
3,"['I', 'saw', 'film', 'sneak', 'preview', 'deli...",1
4,"['bill', 'paxton', 'taken', 'true', 'stori', '...",1


Using Unprocessed text

In [13]:
# Split data into train and test data. Each 25000 
X_train = df.loc[:24999, 'review'].values
y_train = df.loc[:24999, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [14]:
# Convert the text corpus into the feature vectors (eg. using TfidTransformer)
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)

In [15]:
# check the shape of your files as an indication of successful steps so far
train_vectors.shape

(25000, 73822)

In [16]:
test_vectors.shape

(25000, 73822)

Using processed text

In [17]:
# Split data into train and test data. Each 25000 
X_train2 = df2.loc[:24999, 'review'].values
y_train2 = df2.loc[:24999, 'sentiment'].values
X_test2 = df2.loc[25000:, 'review'].values
y_test2 = df2.loc[25000:, 'sentiment'].values

In [18]:
# Convert the text corpus into the feature vectors (eg. using TfidTransformer)
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
train_vectors2 = vectorizer.fit_transform(X_train)
test_vectors2 = vectorizer.transform(X_test)

In [19]:
# check the shape of your files as an indication of successful steps so far
train_vectors2.shape

(25000, 73822)

In [20]:
test_vectors2.shape

(25000, 73822)

## Step 3: Choosing ML Algorithm
Measure operation time and accuracy for comparing between algorithms

Steps of algorithm development
- Step 0: Import it
- step 1: Create it
- step 2: Fit it
- step 3: Use it (predictions)
- step 4: Evaluate it (calculate accuracy) 

In [23]:
# To compare processing times between different algorithms
from time import time

### Using Basic Naive Bayes Classifier

In [24]:
print ("Basic Naive Bayes Metrics using unprocessed texts")

# Step 3.0: Import it
from sklearn.naive_bayes import MultinomialNB

# step 3.1: Create it
clf = MultinomialNB()

# step 3.2: Fit it 
t0 = time()
clf = clf.fit(train_vectors, y_train)
print ("Fitting time:", round(time()-t0, 3), "s")

# step 3.3: Use it ( for predictions)
t0 = time()
predicted = clf.predict(test_vectors)
print ("Training time:", round(time()-t0, 3), "s")

# step 3.4: Evaluate it (calculate accuracy)
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(predicted, y_test)
print ("Accuracy before text processing =", accuracy) 

Basic Naive Bayes Metrics using unprocessed texts
Fitting time: 0.816 s
Training time: 0.504 s
Accuracy before text processing = 0.83664


In [25]:
print ("Basic Naive Bayes Metrics using processed text")

# Step 3.0: Import it
from sklearn.naive_bayes import MultinomialNB

# step 3.1: Create it
clf = MultinomialNB()

# step 3.2: Fit it 
t0 = time()
clf = clf.fit(train_vectors2, y_train2)
print ("Fitting time:", round(time()-t0, 3), "s")

# step 3.3: Use it ( for predictions)
t0 = time()
predicted = clf.predict(test_vectors2)
print ("Training time:", round(time()-t0, 3), "s")

# step 3.4: Evaluate it (calculate accuracy)
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(predicted, y_test2)
print ("Accuracy after text processing =", accuracy) 

Basic Naive Bayes Metrics using processed text
Fitting time: 0.6 s
Training time: 0.515 s
Accuracy after text processing = 0.83664


### Using SVM

In [28]:
print ("SVM Metrics")

# Step 3.0: Import it
from sklearn.svm import SVC

# step 3.1: Create it
clf = SVC(C=10000.0, kernel="rbf")

# step 3.2: Fit it 
t0 = time()
clf = clf.fit(train_vectors, y_train)
print ("Fitting time:", round(time()-t0, 3), "s")

# step 3.3: Use it ( for predictions)
t0 = time()
predicted = clf.predict(test_vectors)
print ("Training time:", round(time()-t0, 3), "s")

# step 3.4: Evaluate it (calculate accuracy)
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(predicted, y_test)
print ("Accuracy =", accuracy) 

SVM Metrics




Fitting time: 1589.629 s
Training time: 1440.987 s
Accuracy = 0.88052


In [29]:
print ("SVM Metrics using processed text")

# Step 3.0: Import it
from sklearn.svm import SVC

# step 3.1: Create it
clf = SVC(C=10000.0, kernel="rbf")

# step 3.2: Fit it 
t0 = time()
clf = clf.fit(train_vectors2, y_train2)
print ("Fitting time:", round(time()-t0, 3), "s")

# step 3.3: Use it ( for predictions)
t0 = time()
predicted = clf.predict(test_vectors2)
print ("Training time:", round(time()-t0, 3), "s")

# step 3.4: Evaluate it (calculate accuracy)
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(predicted, y_test2)
print ("Accuracy =", accuracy) 

SVM Metrics using processed text
Fitting time: 1405.006 s
Training time: 1349.856 s
Accuracy = 0.88052


### Using Decision Tree

In [30]:
print ("Decision Tree Metrics")

# Step 3.0: Import it
from sklearn import tree

# step 3.1: Create it
clf = tree.DecisionTreeClassifier(min_samples_split=2)

# step 3.2: Fit it 
t0 = time()
clf = clf.fit(train_vectors, y_train)
print ("Fitting time:", round(time()-t0, 3), "s")

# step 3.3: Use it ( for predictions)
t0 = time()
predicted = clf.predict(test_vectors)
print ("Training time:", round(time()-t0, 3), "s")

# step 3.4: Evaluate it (calculate accuracy)
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(predicted, y_test)
print ("Accuracy before text processing =", accuracy) 

Decision Tree Metrics
Fitting time: 298.126 s
Training time: 0.46 s
Accuracy before text processing = 0.70776


In [32]:
print ("Decision Tree Metrics using processed text")

# Step 3.0: Import it
from sklearn import tree

# step 3.1: Create it
clf = tree.DecisionTreeClassifier(min_samples_split=2)

# step 3.2: Fit it 
t0 = time()
clf = clf.fit(train_vectors2, y_train2)
print ("Fitting time:", round(time()-t0, 3), "s")

# step 3.3: Use it ( for predictions)
t0 = time()
predicted = clf.predict(test_vectors2)
print ("Training time:", round(time()-t0, 3), "s")

# step 3.4: Evaluate it (calculate accuracy)
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(predicted, y_test2)
print ("Accuracy after text processing =", accuracy) 

Decision Tree Metrics using processed text
Fitting time: 306.782 s
Training time: 0.435 s
Accuracy after text processing = 0.71044
