Importing packages

In [1]:
import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


Mounting the google drive directory

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


Setting up the root directory of the project

In [3]:
root_data_path = '/content/gdrive/MyDrive/Colab Notebooks/Kaggle-NLP-Tutorial'

os.listdir(root_data_path)

['dataset',
 'Part1_error_solved.ipynb',
 'part3.ipynb',
 'models',
 'Part2-word2vec.ipynb',
 'Part2.ipynb',
 'Part1.ipynb']

Import the pandas package, then use the "read_csv" function to read the labeled training data


In [4]:
import pandas as pd
train = pd.read_csv(root_data_path + '/dataset/' + "labeledTrainData.tsv.zip", header=0, \
                    delimiter="\t", quoting=3)

Reading rows and columns

In [5]:
print (train.shape)
print (train.columns)
print (train.head(5))

(25000, 3)
Index(['id', 'sentiment', 'review'], dtype='object')
         id  sentiment                                             review
0  "5814_8"          1  "With all this stuff going down at the moment ...
1  "2381_9"          1  "\"The Classic War of the Worlds\" by Timothy ...
2  "7759_3"          0  "The film starts with a manager (Nicholas Bell...
3  "3630_4"          0  "It must be assumed that those who praised thi...
4  "9495_8"          1  "Superbly trashy and wondrously unpretentious ...


In [6]:
train.shape

(25000, 3)

In [7]:
train.columns.values

array(['id', 'sentiment', 'review'], dtype=object)

In [None]:
print (train["review"][0])

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

Import BeautifulSoup into your workspace

In [8]:
from bs4 import BeautifulSoup
example1 = BeautifulSoup(train["review"][0])
print (train["review"][0])
print (example1.get_text())

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

Use regular expressions to do a find-and-replace

In [9]:
import re
letters_only = re.sub("[^a-zA-Z]",           # The pattern to search for
                      " ",                   # The pattern to replace it with
                      example1.get_text() )  # The text to search
print (letters_only)

 With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him The actual feature film bit when it finally starts is only on for    mi

Convert to lower case
Split into words

In [10]:
lower_case = letters_only.lower()
words = lower_case.split()

In [11]:
lower_case = letters_only.lower()
words = lower_case.split()

 Download text data sets, including stop words

In [None]:

import nltk
nltk.download()

 Import the stop word list

In [13]:
from nltk.corpus import stopwords
print (stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Remove stop words from "words"

In [14]:
words = [w for w in words if not w in stopwords.words("english")]
print (words)

['stuff', 'going', 'moment', 'mj', 'started', 'listening', 'music', 'watching', 'odd', 'documentary', 'watched', 'wiz', 'watched', 'moonwalker', 'maybe', 'want', 'get', 'certain', 'insight', 'guy', 'thought', 'really', 'cool', 'eighties', 'maybe', 'make', 'mind', 'whether', 'guilty', 'innocent', 'moonwalker', 'part', 'biography', 'part', 'feature', 'film', 'remember', 'going', 'see', 'cinema', 'originally', 'released', 'subtle', 'messages', 'mj', 'feeling', 'towards', 'press', 'also', 'obvious', 'message', 'drugs', 'bad', 'kay', 'visually', 'impressive', 'course', 'michael', 'jackson', 'unless', 'remotely', 'like', 'mj', 'anyway', 'going', 'hate', 'find', 'boring', 'may', 'call', 'mj', 'egotist', 'consenting', 'making', 'movie', 'mj', 'fans', 'would', 'say', 'made', 'fans', 'true', 'really', 'nice', 'actual', 'feature', 'film', 'bit', 'finally', 'starts', 'minutes', 'excluding', 'smooth', 'criminal', 'sequence', 'joe', 'pesci', 'convincing', 'psychopathic', 'powerful', 'drug', 'lord', 

 Function to convert a raw review to a string of words
 The input is a single string (a raw movie review), and
 the output is a single string (a preprocessed movie review)
 1. Remove HTML
 2. Remove non-letters
 3. Convert to lower case, split into individual words
 4. In Python, searching a set is much faster than searching a list, so convert the stop words to a set
 5. Remove stop words
 6. Join the words back into one string separated by space, and return the result.


In [15]:
def review_to_words( raw_review ):
  review_text = BeautifulSoup(raw_review).get_text()
  letters_only = re.sub("[^a-zA-Z]", " ", review_text)
  words = letters_only.lower().split()
  stops = set(stopwords.words("english"))
  meaningful_words = [w for w in words if not w in stops]
  return( " ".join( meaningful_words ))

calling the function for a single review:

In [16]:
clean_review = review_to_words( train["review"][0] )
print (clean_review)

stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mj feeling towards press also obvious message drugs bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pesci character ranted wanted people know supplying drugs etc dunno maybe hates mj music lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually directors hate working

 Get the number of reviews based on the dataframe column size
 Initialize an empty list to hold the clean reviews
 Loop over each review; create an index i that goes from 0 to the length of the movie review list
 Call our function for each one, and add the result to the list of clean reviews

In [17]:
'''
num_reviews = train["review"].size
clean_train_reviews = []
for i in range( 0, num_reviews ):
  clean_train_reviews.append( review_to_words( train["review"][i] ) )
  '''

'\nnum_reviews = train["review"].size\nclean_train_reviews = []\nfor i in range( 0, num_reviews ):\n  clean_train_reviews.append( review_to_words( train["review"][i] ) )\n  '

Cleaning and parsing the training set movie reviews...

In [18]:
print ("Cleaning and parsing the training set movie reviews...\n")
num_reviews = train["review"].size
clean_train_reviews = []
for i in range( 0, num_reviews ):
  if( (i+1)%5000 == 0 ):
    print ("Review %d of %d\n" % ( i+1, num_reviews ))
  clean_train_reviews.append( review_to_words( train["review"][i] ))

Cleaning and parsing the training set movie reviews...



  review_text = BeautifulSoup(raw_review).get_text()


Review 5000 of 25000

Review 10000 of 25000

Review 15000 of 25000

Review 20000 of 25000

Review 25000 of 25000



In [19]:
len(clean_train_reviews)
clean_train_reviews[0]

'stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mj feeling towards press also obvious message drugs bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pesci character ranted wanted people know supplying drugs etc dunno maybe hates mj music lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually directors hate workin

creating the Bag of Words

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000)



In [21]:
train_data_features = vectorizer.fit_transform(clean_train_reviews)
train_data_features = train_data_features.toarray()

In [22]:
train_data_features.shape

(25000, 5000)

To see what the training data array now looks

In [23]:
print (train_data_features.shape)

(25000, 5000)


 the vocabulary:

In [None]:
#vocab = vectorizer.get_feature_names()
#print (vocab)

print the counts of each word in the vocabulary:

In [None]:
'''
import numpy as np
dist = np.sum(train_data_features, axis=0)
for tag, count in zip(vocab, dist):
    print (count, tag)
'''

Training the random forest...

In [24]:
y = train["sentiment"]
print ("Training the random forest...")
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 100)
rf_classifier = forest.fit( train_data_features, y )

Training the random forest...


In [25]:
y_pred = rf_classifier.predict(train_data_features)

In [26]:
from sklearn.metrics._plot.confusion_matrix import confusion_matrix
#EVALUATION metrics
cm = confusion_matrix(y_pred, y)
accuracy = accuracy_score(y_pred, y)
recall = recall_score(y_pred, y)
precision = precision_score(y_pred, y)
f1 = f1_score(y_pred, y)

print ("Confusion matrix: {}".format(cm))
print ("Accuracy: {}".format(accuracy))
print ("Recall: {}".format(recall))
print ("Precision: {}".format(precision))
print ("F1-measure: {}".format(f1))

Confusion matrix: [[12500     0]
 [    0 12500]]
Accuracy: 1.0
Recall: 1.0
Precision: 1.0
F1-measure: 1.0


Read the test data

In [27]:
test = pd.read_csv(root_data_path + '/dataset/' + "testData.tsv.zip", header=0, delimiter="\t", \
                   quoting=3 )

Verify that there are 25,000 rows and 2 columns

In [28]:
print (test.shape)

(25000, 2)


Create an empty list and append the clean reviews one by one

In [29]:
num_reviews = len(test["review"])
clean_test_reviews = []

Cleaning and parsing the test set movie reviews...

In [30]:
print ("Cleaning and parsing the test set movie reviews...\n")
for i in range(0,num_reviews):
    if( (i+1) % 1000 == 0 ):
        print ("Review %d of %d\n" % (i+1, num_reviews))
    clean_review = review_to_words( test["review"][i] )
    clean_test_reviews.append( clean_review )

Cleaning and parsing the test set movie reviews...



  review_text = BeautifulSoup(raw_review).get_text()


Review 1000 of 25000

Review 2000 of 25000

Review 3000 of 25000

Review 4000 of 25000

Review 5000 of 25000

Review 6000 of 25000

Review 7000 of 25000

Review 8000 of 25000

Review 9000 of 25000

Review 10000 of 25000

Review 11000 of 25000

Review 12000 of 25000

Review 13000 of 25000

Review 14000 of 25000

Review 15000 of 25000

Review 16000 of 25000

Review 17000 of 25000

Review 18000 of 25000

Review 19000 of 25000

Review 20000 of 25000

Review 21000 of 25000

Review 22000 of 25000

Review 23000 of 25000

Review 24000 of 25000

Review 25000 of 25000



Get a bag of words for the test set, and convert to a numpy array

In [31]:
test_data_features = vectorizer.transform(clean_test_reviews)
test_data_features = test_data_features.toarray()

Use the random forest to make sentiment label predictions

In [32]:
result = forest.predict(test_data_features)

Copy the results to a pandas dataframe with an "id" column and a "sentiment" column

In [33]:
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )

Use pandas to write the comma-separated output file

In [34]:
output.to_csv( "Bag_of_Words_model.csv", index=False, quoting=3 )