Preparing to Train a Model

In [1]:
!pip install gensim



In [2]:
import pandas as pd
import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from tqdm import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

import gensim
from gensim import corpora
from gensim.models import word2vec

from pprint import pprint

Mounting the google drive directory

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


Setting up the root directory of the project

In [4]:
root_data_path = '/content/gdrive/MyDrive/Colab Notebooks/Kaggle-NLP-Tutorial'
#root_data_path = '/content/gdrive/My Drive/Colab Notebooks/Kaggle-NLP-Tutorial'
os.listdir(root_data_path)

['dataset',
 'part3.ipynb',
 'models',
 'Part2.ipynb',
 'Part1.ipynb',
 'IMDB_Review_Classification_BoWs.ipynb',
 'IMDB_Review_Classification-Word2vec.ipynb']

Read data from files

In [5]:
train = pd.read_csv(root_data_path + '/dataset/' + "labeledTrainData.tsv.zip", header=0, \
                    delimiter="\t", quoting=3)
test = pd.read_csv(root_data_path + '/dataset/' + "testData.tsv.zip", header=0, delimiter="\t", \
                   quoting=3 )
unlabeled_train = pd.read_csv(root_data_path + '/dataset/' + "unlabeledTrainData.tsv.zip", header=0, delimiter="\t", quoting=3 )

Verify the number of reviews that were read (100,000 in total)

In [6]:
print ("Read %d labeled train reviews, %d labeled test reviews, " \
 "and %d unlabeled reviews\n" % (train["review"].size,
 test["review"].size, unlabeled_train["review"].size ))

Read 25000 labeled train reviews, 25000 labeled test reviews, and 50000 unlabeled reviews



Import various modules for string cleaning

In [7]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

Function to convert a document to a sequence of words,
optionally removing stop words.  Returns a list of words.
1. Remove HTML
2. Remove non-letters
3. Convert words to lower case and split them
4. Optionally remove stop words (false by default)
5. Return a list of words

In [8]:
def review_to_wordlist( review, remove_stopwords=False ):
  review_text = BeautifulSoup(review).get_text()
  review_text = re.sub("[^a-zA-Z]"," ", review_text)
  words = review_text.lower().split()
  if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
  return(words)

Download the punkt tokenizer for sentence splitting

In [None]:
import nltk.data
nltk.download()

Load the punkt tokenizer

In [10]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

Define a function to split a review into parsed sentences

In [11]:
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
   raw_sentences = tokenizer.tokenize(review.strip())
   sentences = []
   for raw_sentence in raw_sentences:
      if len(raw_sentence) > 0:
            sentences.append( review_to_wordlist( raw_sentence, \
            remove_stopwords ))
   return sentences

 Apply this function to prepare our data for input to Word2Vec

In [12]:
sentences = []
print ("Parsing sentences from training set")
for review in tqdm(train["review"]):
    sentences += review_to_sentences(review, tokenizer)

print ("Parsing sentences from unlabeled set")
for review in unlabeled_train["review"]:
    sentences += review_to_sentences(review, tokenizer)

Parsing sentences from training set


  review_text = BeautifulSoup(review).get_text()
  review_text = BeautifulSoup(review).get_text()
100%|██████████| 25000/25000 [01:41<00:00, 246.17it/s]


Parsing sentences from unlabeled set


Check how many sentences we have in total - should be around 850,000+

In [13]:
print (len(sentences))
print (sentences[0])
print (sentences[1])

796172
['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again']
['maybe', 'i', 'just', 'want', 'to', 'get', 'a', 'certain', 'insight', 'into', 'this', 'guy', 'who', 'i', 'thought', 'was', 'really', 'cool', 'in', 'the', 'eighties', 'just', 'to', 'maybe', 'make', 'up', 'my', 'mind', 'whether', 'he', 'is', 'guilty', 'or', 'innocent']


Import the built-in logging module and configure it so that Word2Vec
creates nice output messages

In [14]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

Set values for various hyper-parameters of Word2Vec

In [15]:
num_features = 300
min_word_count = 5
num_workers = 4
context = 10
downsampling = 1e-3

Initialize and train the model

In [16]:
print ("Training model...")
model = word2vec.Word2Vec(sentences, workers=num_workers, \
            vector_size= num_features, min_count = min_word_count, \
            window = context, sample = downsampling)
print ("word2vec trained")

Training model...
word2vec trained


Save the model for later use

In [17]:
model_name = "300features_40minwords_10context"
word2vec_model_path = root_data_path + '/models/'+model_name
model.save(word2vec_model_path)
print ("Model saved at {}".format(word2vec_model_path))

Model saved at /content/gdrive/MyDrive/Colab Notebooks/Kaggle-NLP-Tutorial/models/300features_40minwords_10context


# Loading the word2vec model

In [18]:
word2vec_model_path = root_data_path + '/models/'+model_name
saved_model = word2vec.Word2Vec.load(word2vec_model_path)

In [19]:
saved_model.wv.most_similar('actor')

[('performer', 0.6317141056060791),
 ('actress', 0.6225956678390503),
 ('comedian', 0.577025830745697),
 ('role', 0.5471652150154114),
 ('actors', 0.5358803868293762),
 ('thespian', 0.5095436573028564),
 ('performance', 0.4891031086444855),
 ('impersonator', 0.48819369077682495),
 ('villain', 0.4815177321434021),
 ('talent', 0.4556313157081604)]

In [20]:
vocab = list(model.wv.key_to_index.keys())
print (vocab)



### Representing training data using Word2Vec

In [None]:
#train_data_features = np.zeros((len(train.shape[0], num_features), type=np.float64))

train_data_features = []

remove_stopwords = True
for review in tqdm(train["review"]):
  wordlist = review_to_wordlist(review, remove_stopwords)

  word_vectors = []
  for word in wordlist:
    if word in vocab:
      word_vector = model.wv[word]
    else:
      word_vector = np.zeros(num_features)
    word_vectors.append(word_vector)
  vector_representation = np.mean(word_vectors, axis=0)
  #print (vector_representation.shape)

  train_data_features.append(vector_representation)

In [47]:
train_word2vec_features = np.array(train_data_features)

In [48]:
train_word2vec_features.shape

(100, 300)

### Training a ML model

In [49]:
y = train["sentiment"]
print ("Training the random forest...")
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 100)
rf_classifier = forest.fit(train_word2vec_features, y )

Training the random forest...


#### Classification performance on training dataset

In [50]:
y_pred = rf_classifier.predict(train_word2vec_features)

from sklearn.metrics._plot.confusion_matrix import confusion_matrix
#EVALUATION metrics
cm = confusion_matrix(y_pred, y)
accuracy = accuracy_score(y_pred, y)
recall = recall_score(y_pred, y)
precision = precision_score(y_pred, y)
f1 = f1_score(y_pred, y)

print ("Confusion matrix: {}".format(cm))
print ("Accuracy: {}".format(accuracy))
print ("Recall: {}".format(recall))
print ("Precision: {}".format(precision))
print ("F1-measure: {}".format(f1))


Confusion matrix: [[55  0]
 [ 0 45]]
Accuracy: 1.0
Recall: 1.0
Precision: 1.0
F1-measure: 1.0


### Representing testing data using Word2Vec

In [51]:
test_data_features = []

remove_stopwords = True
for review in tqdm(test["review"]):
  wordlist = review_to_wordlist(review, remove_stopwords)

  word_vectors = []
  for word in wordlist:
    if word in vocab:
      word_vector = model.wv[word]
    else:
      word_vector = np.zeros(num_features)

    word_vectors.append(word_vector)
  vector_representation = np.mean(word_vectors, axis=0)
  test_data_features.append(vector_representation)

  review_text = BeautifulSoup(review).get_text()
100%|██████████| 100/100 [00:05<00:00, 19.34it/s]


In [52]:
test_word2vec_features = np.array(test_data_features)

In [53]:
test_word2vec_features.shape

(100, 300)

Using the trained model to make inference on the testing dataset

In [54]:
result = forest.predict(test_word2vec_features)

In [55]:
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )

ValueError: ignored

In [None]:
output.to_csv( "Word2Vec_AverageVectors.csv", index=False, quoting=3 )