# Preliminaries

In [1]:
# install sent2vec
!pip install git+https://github.com/epfml/sent2vec

Collecting git+https://github.com/epfml/sent2vec
  Cloning https://github.com/epfml/sent2vec to /tmp/pip-req-build-hs42t8sz
  Running command git clone -q https://github.com/epfml/sent2vec /tmp/pip-req-build-hs42t8sz
Building wheels for collected packages: sent2vec
  Building wheel for sent2vec (setup.py) ... [?25l- \ | / - \ | / - \ | / - \ | / done
[?25h  Created wheel for sent2vec: filename=sent2vec-0.0.0-cp36-cp36m-linux_x86_64.whl size=1139432 sha256=49322fda39fcffd7f40384274744aa2d0e83ef3b5018f7cc1698b00ed419d0dd
  Stored in directory: /tmp/pip-ephem-wheel-cache-4aexgkan/wheels/f5/1a/52/b5f36e8120688b3f026ac0cefe9c6544905753c51d8190ff17
Successfully built sent2vec
Installing collected packages: sent2vec
Successfully installed sent2vec-0.0.0


Write requirements to file, anytime you run it, in case you have to go back and recover dependencies.

Latest known such requirements are hosted for each notebook in the companion github repo, and can be pulled down and installed here if needed. Companion github repo is located at https://github.com/azunre/transfer-learning-for-nlp

In [2]:
!pip freeze > kaggle_image_requirements.txt

# Download IMDB Movie Review Dataset
Download IMDB dataset

In [3]:
import random
import pandas as pd

## Read-in the reviews and print some basic descriptions of them

!wget -q "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
!tar xzf aclImdb_v1.tar.gz

wget: /opt/conda/lib/libuuid.so.1: no version information available (required by wget)


# Define Tokenization, Stop-word and Punctuation Removal Functions
Before proceeding, we must decide how many samples to draw from each class. We must also decide the maximum number of tokens per email, and the maximum length of each token. This is done by setting the following overarching hyperparameters

In [4]:
Nsamp = 1000 # number of samples to generate in each class - 'spam', 'not spam'
maxtokens = 200 # the maximum number of tokens per document
maxtokenlen = 100 # the maximum length of each token

**Tokenization**

In [5]:
def tokenize(row):
    if row is None or row is '':
        tokens = ""
    else:
        tokens = row.split(" ")[:maxtokens]
    return tokens

**Use regular expressions to remove unnecessary characters**

Next, we define a function to remove punctuation marks and other nonword characters (using regular expressions) from the emails with the help of the ubiquitous python regex library. In the same step, we truncate all tokens to hyperparameter maxtokenlen defined above.

In [6]:
import re

def reg_expressions(row):
    tokens = []
    try:
        for token in row:
            token = token.lower() # make all characters lower case
            token = re.sub(r'[\W\d]', "", token)
            token = token[:maxtokenlen] # truncate token
            tokens.append(token)
    except:
        token = ""
        tokens.append(token)
    return tokens

**Stop-word removal**

Stop-words are also removed. Stop-words are words that are very common in text but offer no useful information that can be used to classify the text. Words such as is, and, the, are are examples of stop-words. The NLTK library contains a list of 127 English stop-words and can be used to filter our tokenized strings.

In [7]:
import nltk

nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = stopwords.words('english')    

# print(stopwords) # see default stopwords
# it may be beneficial to drop negation words from the removal list, as they can change the positive/negative meaning
# of a sentence
# stopwords.remove("no")
# stopwords.remove("nor")
# stopwords.remove("not")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
def stop_word_removal(row):
    token = [token for token in row if token not in stopwords]
    token = filter(None, token)
    return token

# Assemble Embedding Vectors

The following functions are used to extract sent2vec embedding vectors for each review

In [9]:
import time
import sent2vec

model = sent2vec.Sent2vecModel()
start=time.time()
model.load_model('../input/sent2vec/wiki_unigrams.bin')
end = time.time()
print("Loading the sent2vec embedding took %d seconds"%(end-start))

Loading the sent2vec embedding took 51 seconds


In [10]:
def assemble_embedding_vectors(data):
    out = None
    for item in data:
        vec = model.embed_sentence(" ".join(item))
        if vec is not None:
            if out is not None:
                out = np.concatenate((out,vec),axis=0)
            else:
                out = vec                                            
        else:
            pass
        
        
    return out

# Putting It All Together To Assemble Dataset

Now, putting all the preprocessing steps together we assemble our dataset...

In [11]:
import os
import numpy as np

# shuffle raw data first
def unison_shuffle_data(data, header):
    p = np.random.permutation(len(header))
    data = data[p]
    header = np.asarray(header)[p]
    return data, header

# load data in appropriate form
def load_data(path):
    data, sentiments = [], []
    for folder, sentiment in (('neg', 0), ('pos', 1)):
        folder = os.path.join(path, folder)
        for name in os.listdir(folder):
            with open(os.path.join(folder, name), 'r') as reader:
                  text = reader.read()
            text = tokenize(text)
            text = stop_word_removal(text)
            text = reg_expressions(text)
            data.append(text)
            sentiments.append(sentiment)
    data_np = np.array(data)
    data, sentiments = unison_shuffle_data(data_np, sentiments)
    
    return data, sentiments

train_path = os.path.join('aclImdb', 'train')
test_path = os.path.join('aclImdb', 'test')
raw_data, raw_header = load_data(train_path)

print(raw_data.shape)
print(len(raw_header))

(25000,)
25000


In [12]:
# Subsample required number of samples
random_indices = np.random.choice(range(len(raw_header)),size=(Nsamp*2,),replace=False)
data_train = raw_data[random_indices]
header = raw_header[random_indices]

print("DEBUG::data_train::")
print(data_train)

DEBUG::data_train::
[list(['lies', 'tells', 'affair', '', 'year', 'old', 'bucktoothed', 'female', 'student', 'scrawny', '', 'year', 'old', 'married', 'man', 'pair', 'protags', 'spending', 'half', 'screen', 'time', 'engaged', 'naked', 'sex', 'hokey', 'whipping', 'half', 'meandering', 'pathetically', 'naive', 'storyline', 'seems', 'little', 'excuse', 'sex', 'scenes', 'with', 'poor', 'production', 'value', 'including', 'obvious', 'sanitary', 'appliances', 'phony', 'softcore', 'sex', 'story', 'messy', 'mix', 'comedy', 'drama', 'lies', 'quickly', 'becomes', 'redundant', 'ad', 'nauseam', 'with', 'almost', '', 'hour', 'run', 'subtitles', 'little', 'substance', 'lies', 'simply', 'recommendable', 'c'])
 list(['i', 'took', 'flyer', 'renting', 'movie', 'i', 'gotta', 'say', 'very', 'good', 'on', 'fronts', 'script', 'cast', 'director', 'photography', 'high', 'production', 'values', 'etc', 'proves', 'eva', 'longoria', 'parker', 'head', 'shoulders', 'romcom', 'bad', 'actors', 'kate', 'hudson', 'jenni

Display sentiments and their frequencies in the dataset, to ensure it is roughly balanced between classes

In [13]:
unique_elements, counts_elements = np.unique(header, return_counts=True)
print("Sentiments and their frequencies:")
print(unique_elements)
print(counts_elements)

Sentiments and their frequencies:
[0 1]
[ 981 1019]


**Featurize and Create Labels**

In [14]:
EmbeddingVectors = assemble_embedding_vectors(data_train)
print(EmbeddingVectors)

[[-0.10970693  0.0264979  -0.06668812 ... -0.06135891  0.10399647
   0.11283641]
 [ 0.01635483 -0.07069934 -0.07330526 ... -0.06486997 -0.12292957
   0.19007745]
 [ 0.05848995 -0.06804366 -0.05285235 ... -0.04700164 -0.00218878
   0.19437891]
 ...
 [-0.00294127 -0.09474029 -0.19413887 ... -0.02495788 -0.12186395
   0.24457315]
 [-0.09781604  0.0410435   0.15624475 ...  0.16183579 -0.0496471
   0.23804286]
 [ 0.12789544  0.04092479  0.08634546 ... -0.01163953 -0.05667458
   0.305419  ]]


In [15]:
data = EmbeddingVectors

idx = int(0.7*data.shape[0])

# 70% of data for training
train_x = data[:idx,:]
train_y = header[:idx]
# # remaining 30% for testing
test_x = data[idx:,:]
test_y = header[idx:] 

print("train_x/train_y list details, to make sure it is of the right form:")
print(len(train_x))
print(train_x)
print(train_y[:5])
print(len(train_y))

train_x/train_y list details, to make sure it is of the right form:
1400
[[-0.10970693  0.0264979  -0.06668812 ... -0.06135891  0.10399647
   0.11283641]
 [ 0.01635483 -0.07069934 -0.07330526 ... -0.06486997 -0.12292957
   0.19007745]
 [ 0.05848995 -0.06804366 -0.05285235 ... -0.04700164 -0.00218878
   0.19437891]
 ...
 [ 0.03469189 -0.12465405 -0.11131437 ... -0.16204219  0.0035756
   0.1894062 ]
 [-0.04250745 -0.1720962   0.07359155 ...  0.16754892 -0.06874844
   0.06055736]
 [-0.07707034 -0.01455164 -0.05762631 ... -0.03803353  0.04129836
   0.14854823]]
[0 1 1 0 1]
1400


# Logistic Regression Classifier

In [16]:
from sklearn.linear_model import LogisticRegression

def fit(train_x,train_y):
    model = LogisticRegression()

    try:
        model.fit(train_x, train_y)
    except:
        pass
    return model

model = fit(train_x,train_y)



In [17]:
predicted_labels = model.predict(test_x)
print("DEBUG::The logistic regression predicted labels are::")
print(predicted_labels)

DEBUG::The logistic regression predicted labels are::
[1 1 1 0 1 0 1 0 1 1 1 1 0 1 1 0 0 1 1 1 1 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 1 1 0 0 1 1 1 0 1 1 0
 1 1 0 1 1 1 0 1 1 1 1 1 1 1 0 1 1 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 0 1 0 0 1
 1 1 1 1 0 1 1 0 0 0 0 0 1 0 1 1 1 1 0 1 0 1 1 0 0 0 1 0 0 1 0 1 1 1 1 0 0
 0 1 1 1 1 0 1 0 0 0 1 1 0 1 0 0 1 0 0 1 0 0 1 1 1 0 1 1 0 1 1 1 0 1 0 0 1
 0 0 1 0 1 0 0 0 0 1 0 1 0 0 0 1 0 1 1 0 0 0 0 0 1 1 1 1 0 1 0 0 0 1 0 1 1
 1 0 0 1 0 1 1 0 1 1 1 0 1 1 0 1 1 1 0 0 0 0 1 0 0 0 1 1 1 0 0 0 1 0 0 0 1
 0 0 1 1 1 0 1 1 1 1 0 0 1 0 1 1 1 1 0 1 1 1 0 1 0 0 1 0 0 0 0 1 1 1 1 1 0
 0 1 0 1 0 1 0 1 0 1 1 0 1 0 1 1 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0 1 1 0 1 1
 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 1 1 1 0 0 1 1 0 1 0 0 0 1 1 0 1 0
 1 1 0 1 1 0 1 1 0 1 1 1 0 0 1 0 1 1 0 0 0 0 0 0 0 1 0 0 1 0 1 1 1 0 0 1 0
 1 1 0 1 0 0 0 1 1 1 0 0 1 0 0 1 1 1 1 0 0 0 1 0 1 1 0 1 0 1 1 1 0 1 0 0 1
 1 1 1 0 0 0 1 0 0 1 0 1 1 0 0 0 0 1 0 1 1 1 1

In [18]:
from sklearn.metrics import accuracy_score

acc_score = accuracy_score(test_y, predicted_labels)

print("The logistic regression accuracy score is::")
print(acc_score)

The logistic regression accuracy score is::
0.8066666666666666


# Random Forests

In [19]:
# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier

# Create a random forest Classifier. By convention, clf means 'Classifier'
clf = RandomForestClassifier(n_jobs=1, random_state=0)

# Train the Classifier to take the training features and learn how they relate
# to the training y (spam, not spam?)
start_time = time.time()
clf.fit(train_x, train_y)
end_time = time.time()
print("Training the Random Forest Classifier took %3d seconds"%(end_time-start_time))

predicted_labels = clf.predict(test_x)
print("DEBUG::The RF predicted labels are::")
print(predicted_labels)

acc_score = accuracy_score(test_y, predicted_labels)

print("DEBUG::The RF testing accuracy score is::")
print(acc_score)



Training the Random Forest Classifier took   0 seconds
DEBUG::The RF predicted labels are::
[1 1 1 0 1 0 0 0 0 1 1 1 0 1 0 0 0 0 0 1 1 1 0 0 1 0 1 1 0 1 0 0 0 0 0 1 1
 0 1 0 0 1 0 1 0 1 0 0 1 0 0 0 1 0 1 0 0 1 1 0 1 0 1 1 0 0 0 1 0 1 1 1 1 1
 1 0 0 1 0 1 1 1 1 1 1 0 1 1 0 1 0 1 0 1 0 0 1 0 1 1 1 1 0 1 0 1 1 0 0 1 1
 1 0 1 0 0 0 0 1 0 1 0 1 1 1 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0
 1 1 0 1 1 0 0 0 1 1 1 1 0 1 0 0 1 1 0 1 0 1 0 0 0 0 0 1 1 1 1 1 0 1 0 0 0
 0 1 1 0 1 0 0 0 0 1 0 1 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 0 0 1 1 0 1 1
 1 0 0 0 0 1 0 1 1 1 0 0 1 0 0 1 1 0 0 1 0 0 0 0 0 0 1 1 0 0 1 1 1 1 0 0 0
 0 0 1 0 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 0 0 0 1 1 1 1 0 1 0 0 1 1 0 1 1 0
 1 1 0 0 0 1 0 1 0 0 1 1 1 0 0 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 1 0 1 1
 0 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 1 0 0 0 0 1 1 1 1 1 0
 1 0 0 0 1 0 0 1 0 1 1 0 0 0 1 0 0 1 0 1 0 0 1 0 0 1 0 0 1 0 1 1 1 0 1 1 0
 1 1 1 1 0 0 0 1 1 0 0 0 1 1 0 0 1 1 1 0 0 0 0 0 1 1 0 1 1 1 0 1 0 1 0 1 1
 1 1 1 0

In [20]:
from IPython.display import HTML
def create_download_link(title = "Download file", filename = "data.csv"):  
    html = '<a href={filename}>{title}</a>'
    html = html.format(title=title,filename=filename)
    return HTML(html)

#create_download_link(filename='GBMimportances.svg')

In [21]:
!rm -rf aclImdb
!rm aclImdb_v1.tar.gz