In [1]:
import sys
import time
from os import listdir
from os.path import isfile, join
import string
import numpy as np
from sklearn.linear_model import LogisticRegression as LR
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import EnglishStemmer
from collections import OrderedDict
import pandas as pd
from nltk.corpus import stopwords

In [2]:
# imports needed and logging
import gzip
import gensim 
import logging
from gensim.models import Word2Vec
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
def get_file_names(path):
    return [path + "/" + f for f in listdir(path) if isfile(join(path, f))]

In [4]:
def file_to_tokens(file_name):#, min_word_size, stemmer=None):
    with open(file_name) as f:
        file_string = f.read().rstrip()

        return gensim.utils.simple_preprocess(file_string)
        #return file_string

In [5]:
def text_process(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    """
    # Check characters to see if they are in punctuation
    nopunc = [char for char in mess if char not in string.punctuation]

    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    
    # Now just remove any stopwords
    return ' '.join([word for word in nopunc.split() if word.lower() not in stopwords.words('english')])

## Train Word Embeddings


In [6]:
def build_data(pos_file_path, neg_file_path ):
    pos_file_names = get_file_names(pos_file_path)
    neg_file_names = get_file_names(neg_file_path)
    # m is the number of training examples
    m_pos = len(pos_file_names)
    m_neg = len(neg_file_names)
    m = m_pos + m_neg
    print(m)
    pos_labels = np.ones(m_pos)
    neg_labels = np.zeros(m_neg)
    y = np.concatenate((pos_labels, neg_labels), axis=0)
    pos_data=pd.DataFrame(index=list(range(m_pos)),columns=['Review'])
    neg_data=pd.DataFrame(index=list(range(m_neg)),columns=['Review'])
    for i in range(m_pos):
        #pos_data.loc[i,'Review']=text_process(file_to_tokens(pos_file_names[i]))
        pos_data.loc[i,'Review']=file_to_tokens(pos_file_names[i])
    pos_data['Label'] = pos_labels
    for i in range(m_neg):
        #neg_data.loc[i,'Review']=text_process(file_to_tokens(neg_file_names[i]))
        neg_data.loc[i,'Review']=file_to_tokens(neg_file_names[i])
    neg_data['Label'] = neg_labels
    data=pd.concat([pos_data,neg_data],axis=0)
    data=data.sample(frac=1).reset_index(drop=True)
    return data



In [7]:
data_small = build_data('review_polarity_data/pos', 'review_polarity_data/neg' )
data_big_part1 = build_data ('aclImdb/train/pos','aclImdb/train/neg')
data_big_part2 = build_data ('aclImdb/test/pos','aclImdb/test/neg')
data_big = pd.concat([data_big_part1,data_big_part2],axis=0)
data_big = data_big.sample(frac=1).reset_index(drop=True)
data_huge = pd.concat([data_small,data_big],axis=0)
data_huge = data_huge.sample(frac=1).reset_index(drop=True)


2000
25000
25000


In [8]:
len(data_huge)

52000

In [9]:
# build vocabulary and train model
model = gensim.models.Word2Vec(
    data_huge['Review'],
    size=100,
    window=10,
    min_count=2,
    workers=5)
model.train(data_huge['Review'], total_examples=len(data_huge['Review']), epochs=100)

2019-02-09 08:19:07,933 : INFO : collecting all words and their counts
2019-02-09 08:19:07,935 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-02-09 08:19:08,328 : INFO : PROGRESS: at sentence #10000, processed 2406964 words, keeping 53602 word types
2019-02-09 08:19:08,739 : INFO : PROGRESS: at sentence #20000, processed 4805138 words, keeping 71228 word types
2019-02-09 08:19:09,151 : INFO : PROGRESS: at sentence #30000, processed 7186538 words, keeping 83867 word types
2019-02-09 08:19:09,553 : INFO : PROGRESS: at sentence #40000, processed 9572711 words, keeping 94161 word types
2019-02-09 08:19:09,944 : INFO : PROGRESS: at sentence #50000, processed 11946187 words, keeping 102923 word types
2019-02-09 08:19:10,029 : INFO : collected 104557 word types from a corpus of 12431273 raw words and 52000 sentences
2019-02-09 08:19:10,030 : INFO : Loading a fresh vocabulary
2019-02-09 08:19:10,349 : INFO : effective_min_count=2 retains 65356 unique words (62%

2019-02-09 08:19:53,258 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-09 08:19:53,261 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-09 08:19:53,263 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-09 08:19:53,264 : INFO : EPOCH - 4 : training on 12431273 raw words (9482213 effective words) took 11.2s, 843744 effective words/s
2019-02-09 08:19:54,274 : INFO : EPOCH 5 - PROGRESS: at 8.02% examples, 783593 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:19:55,286 : INFO : EPOCH 5 - PROGRESS: at 17.65% examples, 839106 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:19:56,287 : INFO : EPOCH 5 - PROGRESS: at 27.14% examples, 858474 words/s, in_qsize 8, out_qsize 1
2019-02-09 08:19:57,298 : INFO : EPOCH 5 - PROGRESS: at 36.57% examples, 866739 words/s, in_qsize 8, out_qsize 1
2019-02-09 08:19:58,318 : INFO : EPOCH 5 - PROGRESS: at 44.74% examples, 843907 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:19:59,328 

2019-02-09 08:20:41,727 : INFO : EPOCH 4 - PROGRESS: at 37.17% examples, 881726 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:20:42,735 : INFO : EPOCH 4 - PROGRESS: at 46.16% examples, 872749 words/s, in_qsize 8, out_qsize 1
2019-02-09 08:20:43,747 : INFO : EPOCH 4 - PROGRESS: at 55.35% examples, 869509 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:20:44,751 : INFO : EPOCH 4 - PROGRESS: at 64.60% examples, 870787 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:20:45,761 : INFO : EPOCH 4 - PROGRESS: at 73.99% examples, 872161 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:20:46,776 : INFO : EPOCH 4 - PROGRESS: at 83.15% examples, 870269 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:20:47,781 : INFO : EPOCH 4 - PROGRESS: at 91.56% examples, 861486 words/s, in_qsize 8, out_qsize 1
2019-02-09 08:20:48,725 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-02-09 08:20:48,736 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-02-09 08:20:48,739 : I

2019-02-09 08:21:33,556 : INFO : EPOCH 9 - PROGRESS: at 8.82% examples, 860213 words/s, in_qsize 10, out_qsize 1
2019-02-09 08:21:34,565 : INFO : EPOCH 9 - PROGRESS: at 18.13% examples, 863653 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:21:35,575 : INFO : EPOCH 9 - PROGRESS: at 27.50% examples, 869707 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:21:36,583 : INFO : EPOCH 9 - PROGRESS: at 37.17% examples, 879450 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:21:37,586 : INFO : EPOCH 9 - PROGRESS: at 46.76% examples, 882396 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:21:38,587 : INFO : EPOCH 9 - PROGRESS: at 56.08% examples, 882666 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:21:39,600 : INFO : EPOCH 9 - PROGRESS: at 65.55% examples, 883028 words/s, in_qsize 8, out_qsize 1
2019-02-09 08:21:40,603 : INFO : EPOCH 9 - PROGRESS: at 74.93% examples, 883764 words/s, in_qsize 7, out_qsize 2
2019-02-09 08:21:41,609 : INFO : EPOCH 9 - PROGRESS: at 84.60% examples, 886469 words/s, in_qsiz

2019-02-09 08:22:26,293 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-09 08:22:26,299 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-09 08:22:26,300 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-09 08:22:26,302 : INFO : EPOCH - 13 : training on 12431273 raw words (9481887 effective words) took 10.7s, 883488 effective words/s
2019-02-09 08:22:27,308 : INFO : EPOCH 14 - PROGRESS: at 8.97% examples, 872396 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:22:28,311 : INFO : EPOCH 14 - PROGRESS: at 18.77% examples, 895462 words/s, in_qsize 8, out_qsize 1
2019-02-09 08:22:29,321 : INFO : EPOCH 14 - PROGRESS: at 28.43% examples, 900426 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:22:30,327 : INFO : EPOCH 14 - PROGRESS: at 36.55% examples, 867475 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:22:31,334 : INFO : EPOCH 14 - PROGRESS: at 46.25% examples, 873588 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:22:3

2019-02-09 08:23:18,946 : INFO : EPOCH 18 - PROGRESS: at 85.47% examples, 895036 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:23:19,957 : INFO : EPOCH 18 - PROGRESS: at 93.62% examples, 880957 words/s, in_qsize 8, out_qsize 1
2019-02-09 08:23:20,672 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-02-09 08:23:20,673 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-02-09 08:23:20,679 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-09 08:23:20,684 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-09 08:23:20,693 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-09 08:23:20,693 : INFO : EPOCH - 18 : training on 12431273 raw words (9483038 effective words) took 10.8s, 877519 effective words/s
2019-02-09 08:23:21,700 : INFO : EPOCH 19 - PROGRESS: at 7.80% examples, 760664 words/s, in_qsize 7, out_qsize 2
2019-02-09 08:23:22,701 : INFO : EPOCH 19 - PROGRESS: at 17.34% 

2019-02-09 08:24:08,055 : INFO : EPOCH 23 - PROGRESS: at 27.20% examples, 861609 words/s, in_qsize 8, out_qsize 1
2019-02-09 08:24:09,061 : INFO : EPOCH 23 - PROGRESS: at 36.86% examples, 873871 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:24:10,064 : INFO : EPOCH 23 - PROGRESS: at 45.77% examples, 865948 words/s, in_qsize 8, out_qsize 1
2019-02-09 08:24:11,077 : INFO : EPOCH 23 - PROGRESS: at 55.43% examples, 871139 words/s, in_qsize 8, out_qsize 1
2019-02-09 08:24:12,081 : INFO : EPOCH 23 - PROGRESS: at 64.84% examples, 874252 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:24:13,090 : INFO : EPOCH 23 - PROGRESS: at 72.69% examples, 856800 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:24:14,092 : INFO : EPOCH 23 - PROGRESS: at 81.03% examples, 849458 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:24:15,096 : INFO : EPOCH 23 - PROGRESS: at 90.08% examples, 849691 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:24:16,105 : INFO : EPOCH 23 - PROGRESS: at 98.16% examples, 841176 words/s

2019-02-09 08:25:01,046 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-02-09 08:25:01,056 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-02-09 08:25:01,061 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-09 08:25:01,065 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-09 08:25:01,066 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-09 08:25:01,067 : INFO : EPOCH - 27 : training on 12431273 raw words (9482799 effective words) took 10.8s, 878350 effective words/s
2019-02-09 08:25:02,076 : INFO : EPOCH 28 - PROGRESS: at 9.05% examples, 877291 words/s, in_qsize 8, out_qsize 1
2019-02-09 08:25:03,080 : INFO : EPOCH 28 - PROGRESS: at 17.80% examples, 848000 words/s, in_qsize 8, out_qsize 1
2019-02-09 08:25:04,083 : INFO : EPOCH 28 - PROGRESS: at 27.08% examples, 856400 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:25:05,092 : INFO : EPOCH 28 - PROGRESS: at 36.76% 

2019-02-09 08:25:50,879 : INFO : EPOCH 32 - PROGRESS: at 56.23% examples, 881284 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:25:51,885 : INFO : EPOCH 32 - PROGRESS: at 65.47% examples, 879645 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:25:52,890 : INFO : EPOCH 32 - PROGRESS: at 74.62% examples, 877714 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:25:53,893 : INFO : EPOCH 32 - PROGRESS: at 83.38% examples, 872201 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:25:54,898 : INFO : EPOCH 32 - PROGRESS: at 92.86% examples, 872814 words/s, in_qsize 10, out_qsize 1
2019-02-09 08:25:55,625 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-02-09 08:25:55,631 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-02-09 08:25:55,633 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-09 08:25:55,640 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-09 08:25:55,642 : INFO : worker thread finished; awaiting

2019-02-09 08:26:39,985 : INFO : EPOCH 37 - PROGRESS: at 9.13% examples, 886987 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:26:40,989 : INFO : EPOCH 37 - PROGRESS: at 18.86% examples, 898109 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:26:41,993 : INFO : EPOCH 37 - PROGRESS: at 28.57% examples, 906764 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:26:43,010 : INFO : EPOCH 37 - PROGRESS: at 38.18% examples, 902906 words/s, in_qsize 7, out_qsize 2
2019-02-09 08:26:44,017 : INFO : EPOCH 37 - PROGRESS: at 48.17% examples, 908264 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:26:45,021 : INFO : EPOCH 37 - PROGRESS: at 57.74% examples, 908499 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:26:46,037 : INFO : EPOCH 37 - PROGRESS: at 67.48% examples, 907140 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:26:47,050 : INFO : EPOCH 37 - PROGRESS: at 77.12% examples, 907421 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:26:48,071 : INFO : EPOCH 37 - PROGRESS: at 86.97% examples, 907715 words/s,

2019-02-09 08:27:32,694 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-09 08:27:32,696 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-09 08:27:32,703 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-09 08:27:32,704 : INFO : EPOCH - 41 : training on 12431273 raw words (9484328 effective words) took 10.7s, 885518 effective words/s
2019-02-09 08:27:33,707 : INFO : EPOCH 42 - PROGRESS: at 9.21% examples, 897443 words/s, in_qsize 10, out_qsize 1
2019-02-09 08:27:34,716 : INFO : EPOCH 42 - PROGRESS: at 18.86% examples, 897762 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:27:35,724 : INFO : EPOCH 42 - PROGRESS: at 28.49% examples, 902926 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:27:36,728 : INFO : EPOCH 42 - PROGRESS: at 38.03% examples, 900960 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:27:37,736 : INFO : EPOCH 42 - PROGRESS: at 46.68% examples, 880933 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:27:

2019-02-09 08:28:23,910 : INFO : EPOCH 46 - PROGRESS: at 64.86% examples, 876383 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:28:24,912 : INFO : EPOCH 46 - PROGRESS: at 72.85% examples, 861197 words/s, in_qsize 8, out_qsize 1
2019-02-09 08:28:25,917 : INFO : EPOCH 46 - PROGRESS: at 81.79% examples, 859806 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:28:26,920 : INFO : EPOCH 46 - PROGRESS: at 90.86% examples, 858881 words/s, in_qsize 8, out_qsize 1
2019-02-09 08:28:27,909 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-02-09 08:28:27,910 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-02-09 08:28:27,917 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-09 08:28:27,928 : INFO : EPOCH 46 - PROGRESS: at 99.92% examples, 858327 words/s, in_qsize 1, out_qsize 1
2019-02-09 08:28:27,929 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-09 08:28:27,931 : INFO : worker thread finished; awaiting 

2019-02-09 08:29:13,216 : INFO : EPOCH 51 - PROGRESS: at 17.71% examples, 842608 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:29:14,220 : INFO : EPOCH 51 - PROGRESS: at 27.16% examples, 857551 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:29:15,229 : INFO : EPOCH 51 - PROGRESS: at 36.76% examples, 870291 words/s, in_qsize 8, out_qsize 1
2019-02-09 08:29:16,238 : INFO : EPOCH 51 - PROGRESS: at 46.51% examples, 876761 words/s, in_qsize 8, out_qsize 1
2019-02-09 08:29:17,238 : INFO : EPOCH 51 - PROGRESS: at 55.86% examples, 878285 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:29:18,242 : INFO : EPOCH 51 - PROGRESS: at 65.40% examples, 881516 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:29:19,244 : INFO : EPOCH 51 - PROGRESS: at 74.84% examples, 883527 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:29:20,266 : INFO : EPOCH 51 - PROGRESS: at 84.04% examples, 879557 words/s, in_qsize 8, out_qsize 1
2019-02-09 08:29:21,270 : INFO : EPOCH 51 - PROGRESS: at 93.92% examples, 884010 words/s

2019-02-09 08:30:04,289 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-09 08:30:04,296 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-09 08:30:04,297 : INFO : EPOCH - 55 : training on 12431273 raw words (9480683 effective words) took 10.5s, 905385 effective words/s
2019-02-09 08:30:05,302 : INFO : EPOCH 56 - PROGRESS: at 9.13% examples, 889985 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:30:06,309 : INFO : EPOCH 56 - PROGRESS: at 18.78% examples, 894539 words/s, in_qsize 8, out_qsize 1
2019-02-09 08:30:07,315 : INFO : EPOCH 56 - PROGRESS: at 28.42% examples, 901391 words/s, in_qsize 8, out_qsize 1
2019-02-09 08:30:08,319 : INFO : EPOCH 56 - PROGRESS: at 38.31% examples, 909327 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:30:09,334 : INFO : EPOCH 56 - PROGRESS: at 47.84% examples, 903113 words/s, in_qsize 7, out_qsize 2
2019-02-09 08:30:10,337 : INFO : EPOCH 56 - PROGRESS: at 57.38% examples, 903102 words/s, in_qsize 8, out_qsi

2019-02-09 08:30:56,866 : INFO : EPOCH 60 - PROGRESS: at 94.24% examples, 887788 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:30:57,452 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-02-09 08:30:57,459 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-02-09 08:30:57,471 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-09 08:30:57,477 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-09 08:30:57,478 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-09 08:30:57,479 : INFO : EPOCH - 60 : training on 12431273 raw words (9482550 effective words) took 10.7s, 888453 effective words/s
2019-02-09 08:30:58,487 : INFO : EPOCH 61 - PROGRESS: at 9.30% examples, 901318 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:30:59,497 : INFO : EPOCH 61 - PROGRESS: at 19.05% examples, 902753 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:31:00,507 : INFO : EPOCH 61 - PROGRESS: at 28.66% 

2019-02-09 08:31:46,098 : INFO : EPOCH 65 - PROGRESS: at 46.33% examples, 872844 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:31:47,109 : INFO : EPOCH 65 - PROGRESS: at 54.15% examples, 848581 words/s, in_qsize 10, out_qsize 0
2019-02-09 08:31:48,119 : INFO : EPOCH 65 - PROGRESS: at 61.74% examples, 829828 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:31:49,133 : INFO : EPOCH 65 - PROGRESS: at 69.32% examples, 813849 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:31:50,139 : INFO : EPOCH 65 - PROGRESS: at 77.54% examples, 810233 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:31:51,152 : INFO : EPOCH 65 - PROGRESS: at 86.56% examples, 813556 words/s, in_qsize 8, out_qsize 1
2019-02-09 08:31:52,154 : INFO : EPOCH 65 - PROGRESS: at 94.83% examples, 809490 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:31:52,813 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-02-09 08:31:52,825 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-02-09 08:31:52

2019-02-09 08:32:34,591 : INFO : EPOCH - 69 : training on 12431273 raw words (9482016 effective words) took 10.4s, 914060 effective words/s
2019-02-09 08:32:35,606 : INFO : EPOCH 70 - PROGRESS: at 9.30% examples, 896159 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:32:36,622 : INFO : EPOCH 70 - PROGRESS: at 19.18% examples, 904977 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:32:37,626 : INFO : EPOCH 70 - PROGRESS: at 28.91% examples, 911319 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:32:38,626 : INFO : EPOCH 70 - PROGRESS: at 38.64% examples, 913771 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:32:39,626 : INFO : EPOCH 70 - PROGRESS: at 48.40% examples, 913657 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:32:40,628 : INFO : EPOCH 70 - PROGRESS: at 58.13% examples, 915842 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:32:41,631 : INFO : EPOCH 70 - PROGRESS: at 67.89% examples, 915007 words/s, in_qsize 8, out_qsize 1
2019-02-09 08:32:42,646 : INFO : EPOCH 70 - PROGRESS: at 77.63%

2019-02-09 08:33:28,151 : INFO : EPOCH 74 - PROGRESS: at 15.09% examples, 721797 words/s, in_qsize 8, out_qsize 1
2019-02-09 08:33:29,157 : INFO : EPOCH 74 - PROGRESS: at 24.18% examples, 766519 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:33:30,159 : INFO : EPOCH 74 - PROGRESS: at 33.22% examples, 790191 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:33:31,169 : INFO : EPOCH 74 - PROGRESS: at 41.87% examples, 793291 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:33:32,170 : INFO : EPOCH 74 - PROGRESS: at 50.53% examples, 795958 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:33:33,171 : INFO : EPOCH 74 - PROGRESS: at 59.51% examples, 803983 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:33:34,186 : INFO : EPOCH 74 - PROGRESS: at 68.62% examples, 808676 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:33:35,199 : INFO : EPOCH 74 - PROGRESS: at 76.88% examples, 805894 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:33:36,210 : INFO : EPOCH 74 - PROGRESS: at 85.86% examples, 808904 words/s

2019-02-09 08:34:22,694 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-02-09 08:34:22,706 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-02-09 08:34:22,708 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-09 08:34:22,713 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-09 08:34:22,736 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-09 08:34:22,737 : INFO : EPOCH - 78 : training on 12431273 raw words (9483634 effective words) took 11.7s, 808549 effective words/s
2019-02-09 08:34:23,743 : INFO : EPOCH 79 - PROGRESS: at 8.81% examples, 859483 words/s, in_qsize 10, out_qsize 2
2019-02-09 08:34:24,748 : INFO : EPOCH 79 - PROGRESS: at 18.52% examples, 884143 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:34:25,755 : INFO : EPOCH 79 - PROGRESS: at 27.80% examples, 881413 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:34:26,778 : INFO : EPOCH 79 - PROGRESS: at 37.33%

2019-02-09 08:35:12,406 : INFO : EPOCH 83 - PROGRESS: at 54.95% examples, 863135 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:35:13,409 : INFO : EPOCH 83 - PROGRESS: at 63.52% examples, 856997 words/s, in_qsize 8, out_qsize 1
2019-02-09 08:35:14,412 : INFO : EPOCH 83 - PROGRESS: at 72.05% examples, 849734 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:35:15,428 : INFO : EPOCH 83 - PROGRESS: at 79.72% examples, 834601 words/s, in_qsize 8, out_qsize 1
2019-02-09 08:35:16,443 : INFO : EPOCH 83 - PROGRESS: at 88.22% examples, 830886 words/s, in_qsize 10, out_qsize 1
2019-02-09 08:35:17,449 : INFO : EPOCH 83 - PROGRESS: at 97.34% examples, 833027 words/s, in_qsize 8, out_qsize 1
2019-02-09 08:35:17,686 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-02-09 08:35:17,695 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-02-09 08:35:17,700 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-09 08:35:17,709 : INFO : worker th

2019-02-09 08:36:06,219 : INFO : EPOCH 87 - PROGRESS: at 21.76% examples, 683606 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:36:07,250 : INFO : EPOCH 87 - PROGRESS: at 27.74% examples, 652100 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:36:08,256 : INFO : EPOCH 87 - PROGRESS: at 34.78% examples, 654674 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:36:09,271 : INFO : EPOCH 87 - PROGRESS: at 39.94% examples, 626150 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:36:10,285 : INFO : EPOCH 87 - PROGRESS: at 46.25% examples, 619011 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:36:11,296 : INFO : EPOCH 87 - PROGRESS: at 52.19% examples, 611228 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:36:12,315 : INFO : EPOCH 87 - PROGRESS: at 58.38% examples, 607911 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:36:13,322 : INFO : EPOCH 87 - PROGRESS: at 65.77% examples, 616059 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:36:14,333 : INFO : EPOCH 87 - PROGRESS: at 73.49% examples, 625986 words/s

2019-02-09 08:37:00,153 : INFO : EPOCH 91 - PROGRESS: at 14.73% examples, 707101 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:37:01,160 : INFO : EPOCH 91 - PROGRESS: at 21.51% examples, 679599 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:37:02,172 : INFO : EPOCH 91 - PROGRESS: at 28.58% examples, 678090 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:37:03,210 : INFO : EPOCH 91 - PROGRESS: at 34.12% examples, 642962 words/s, in_qsize 10, out_qsize 1
2019-02-09 08:37:04,230 : INFO : EPOCH 91 - PROGRESS: at 40.52% examples, 634385 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:37:05,231 : INFO : EPOCH 91 - PROGRESS: at 46.17% examples, 618777 words/s, in_qsize 8, out_qsize 1
2019-02-09 08:37:06,235 : INFO : EPOCH 91 - PROGRESS: at 53.01% examples, 621675 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:37:07,239 : INFO : EPOCH 91 - PROGRESS: at 60.93% examples, 636312 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:37:08,273 : INFO : EPOCH 91 - PROGRESS: at 69.03% examples, 645985 words/

2019-02-09 08:37:52,754 : INFO : EPOCH 95 - PROGRESS: at 18.86% examples, 890507 words/s, in_qsize 8, out_qsize 1
2019-02-09 08:37:53,792 : INFO : EPOCH 95 - PROGRESS: at 27.74% examples, 864503 words/s, in_qsize 9, out_qsize 2
2019-02-09 08:37:54,794 : INFO : EPOCH 95 - PROGRESS: at 34.34% examples, 806712 words/s, in_qsize 8, out_qsize 1
2019-02-09 08:37:55,795 : INFO : EPOCH 95 - PROGRESS: at 40.12% examples, 754576 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:37:56,798 : INFO : EPOCH 95 - PROGRESS: at 46.00% examples, 719914 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:37:57,828 : INFO : EPOCH 95 - PROGRESS: at 51.51% examples, 688530 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:37:58,834 : INFO : EPOCH 95 - PROGRESS: at 57.28% examples, 671291 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:37:59,860 : INFO : EPOCH 95 - PROGRESS: at 63.32% examples, 658273 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:38:00,863 : INFO : EPOCH 95 - PROGRESS: at 70.31% examples, 658162 words/s

2019-02-09 08:38:46,657 : INFO : EPOCH 99 - PROGRESS: at 39.94% examples, 754976 words/s, in_qsize 10, out_qsize 0
2019-02-09 08:38:47,658 : INFO : EPOCH 99 - PROGRESS: at 48.47% examples, 761518 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:38:48,662 : INFO : EPOCH 99 - PROGRESS: at 56.96% examples, 767848 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:38:49,676 : INFO : EPOCH 99 - PROGRESS: at 64.84% examples, 763170 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:38:50,678 : INFO : EPOCH 99 - PROGRESS: at 72.84% examples, 762221 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:38:51,689 : INFO : EPOCH 99 - PROGRESS: at 81.21% examples, 764475 words/s, in_qsize 8, out_qsize 1
2019-02-09 08:38:52,698 : INFO : EPOCH 99 - PROGRESS: at 89.58% examples, 766493 words/s, in_qsize 9, out_qsize 0
2019-02-09 08:38:53,706 : INFO : EPOCH 99 - PROGRESS: at 97.59% examples, 765136 words/s, in_qsize 10, out_qsize 0
2019-02-09 08:38:53,975 : INFO : worker thread finished; awaiting finish of 4 more thr

(948276988, 1243127300)

In [10]:
model.save("word2vec4_huge_data_100,10,2,5,100.model")

2019-02-09 08:39:06,009 : INFO : saving Word2Vec object under word2vec4_huge_data_100,10,2,5,50.model, separately None
2019-02-09 08:39:06,013 : INFO : not storing attribute vectors_norm
2019-02-09 08:39:06,015 : INFO : not storing attribute cum_table
2019-02-09 08:39:07,040 : INFO : saved word2vec4_huge_data_100,10,2,5,50.model


In [11]:
model = Word2Vec.load("word2vec4_huge_data_100,10,2,5,100.model")

2019-02-09 08:39:07,264 : INFO : loading Word2Vec object from word2vec4_huge_data_100,10,2,5,50.model
2019-02-09 08:39:07,997 : INFO : loading wv recursively from word2vec4_huge_data_100,10,2,5,50.model.wv.* with mmap=None
2019-02-09 08:39:07,998 : INFO : setting ignored attribute vectors_norm to None
2019-02-09 08:39:07,999 : INFO : loading vocabulary recursively from word2vec4_huge_data_100,10,2,5,50.model.vocabulary.* with mmap=None
2019-02-09 08:39:08,000 : INFO : loading trainables recursively from word2vec4_huge_data_100,10,2,5,50.model.trainables.* with mmap=None
2019-02-09 08:39:08,000 : INFO : setting ignored attribute cum_table to None
2019-02-09 08:39:08,001 : INFO : loaded word2vec4_huge_data_100,10,2,5,50.model


## NLP_Task : now let's use our pretrained word embedding to accomplish our task

In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier,ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression,RidgeClassifier,Lasso
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
import xgboost as xgb
from sklearn.model_selection import ShuffleSplit
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import make_scorer,f1_score, auc, roc_auc_score
from sklearn.model_selection import ShuffleSplit



In [13]:
word_vectors = model.wv

In [14]:
w2v = dict(zip(model.wv.index2word, model.wv.syn0))

  """Entry point for launching an IPython kernel.


## Pre-trained glove model

In [15]:
with open("glove.6B/glove.6B.200d.txt", "rb") as lines:
    w2v_100 = {line.split()[0]: np.array(map(float, line.split()[1:]))
           for line in lines}
    
encoding="utf-8"
import struct 

glove_small = {}
all_words = set(w for words in data_huge['Review'] for w in words)
with open("glove.6B/glove.6B.200d.txt", "rb") as infile:
    for line in infile:
        parts = line.split()
        word = parts[0].decode(encoding)
        if (word in all_words):
            nums=np.array(parts[1:], dtype=np.float32)
            glove_small[word] = nums


## Classifiers

In [16]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(word2vec.values())

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])


In [18]:
data_huge_train, data_huge_test, label_huge_train, label_huge_test = \
train_test_split(data_huge['Review'], data_huge['Label'], test_size=0.2, random_state = 0)
print(len(data_huge_train), len(data_huge_test), len(data_huge_train) + len(data_huge_test))

41600 10400 52000


In [19]:

pipeline1 = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)),  # strings to token integer counts
    ('classifier', RidgeClassifier(random_state=0))
    #('classifier', xgb.XGBClassifier(random_state=0))
])



In [20]:
pipeline2 = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(glove_small)),  # strings to token integer counts
    ('classifier', RidgeClassifier(random_state=0))
])

In [22]:
pipeline1.fit(data_huge_train,label_huge_train)

Pipeline(memory=None,
     steps=[('word2vec vectorizer', <__main__.MeanEmbeddingVectorizer object at 0x7fb03ee94978>), ('classifier', RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, random_state=0, solver='auto',
        tol=0.001))])

In [23]:
predictions1 = pipeline1.predict(data_huge_test)

In [24]:
print(roc_auc_score(predictions1,label_huge_test))

0.8795685513811701


In [25]:
print(classification_report(predictions1,label_huge_test))

             precision    recall  f1-score   support

        0.0       0.87      0.89      0.88      5163
        1.0       0.89      0.87      0.88      5237

avg / total       0.88      0.88      0.88     10400



In [28]:
predictions2 = pipeline2.predict(data_huge_test)

In [29]:
print(classification_report(predictions2,label_huge_test))

             precision    recall  f1-score   support

        0.0       0.83      0.84      0.84      5196
        1.0       0.84      0.83      0.84      5204

avg / total       0.84      0.84      0.84     10400



In [35]:
print(roc_auc_score(predictions2,label_huge_test))


0.8359649916952614


In [None]:
predictionsTrain = pipeline2.predict(data_big_train)
print(classification_report(predictionsTrain,label_big_train))

In [None]:
label_train

In [None]:
predictions

In [None]:
label_train