In [1]:
import sys
import time
from os import listdir
from os.path import isfile, join
import string
import numpy as np
from sklearn.linear_model import LogisticRegression as LR
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import EnglishStemmer
from collections import OrderedDict
import pandas as pd
from nltk.corpus import stopwords

In [11]:
# imports needed and logging
import gzip
import gensim 
import logging
from gensim.models import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
def get_file_names(path):
    return [path + "/" + f for f in listdir(path) if isfile(join(path, f))]

In [4]:
def file_to_tokens(file_name):#, min_word_size, stemmer=None):
    with open(file_name) as f:
        file_string = f.read().rstrip()

        return gensim.utils.simple_preprocess(file_string)
        #return file_string

## Train Word Embeddings


In [5]:
def build_data(pos_file_path, neg_file_path ):
    pos_file_names = get_file_names(pos_file_path)
    neg_file_names = get_file_names(neg_file_path)
    # m is the number of training examples
    m_pos = len(pos_file_names)
    m_neg = len(neg_file_names)
    m = m_pos + m_neg
    print(m)
    pos_labels = np.ones(m_pos)
    neg_labels = np.zeros(m_neg)
    y = np.concatenate((pos_labels, neg_labels), axis=0)
    pos_data=pd.DataFrame(index=list(range(m_pos)),columns=['Review'])
    neg_data=pd.DataFrame(index=list(range(m_neg)),columns=['Review'])
    for i in range(m_pos):
        #pos_data.loc[i,'Review']=text_process(file_to_tokens(pos_file_names[i]))
        pos_data.loc[i,'Review']=file_to_tokens(pos_file_names[i])
    pos_data['Label'] = pos_labels
    for i in range(m_neg):
        #neg_data.loc[i,'Review']=text_process(file_to_tokens(neg_file_names[i]))
        neg_data.loc[i,'Review']=file_to_tokens(neg_file_names[i])
    neg_data['Label'] = neg_labels
    data=pd.concat([pos_data,neg_data],axis=0)
    data=data.sample(frac=1).reset_index(drop=True)
    return data



In [6]:
data_small = build_data('review_polarity_data/pos', 'review_polarity_data/neg' )
data_big_part1 = build_data ('aclImdb/train/pos','aclImdb/train/neg')
data_big_part2 = build_data ('aclImdb/test/pos','aclImdb/test/neg')
data_big = pd.concat([data_big_part1,data_big_part2],axis=0)
data_big = data_big.sample(frac=1).reset_index(drop=True)
data_huge = pd.concat([data_small,data_big],axis=0)
data_huge = data_huge.sample(frac=1).reset_index(drop=True)


2000
25000
25000


In [7]:
len(data_huge)

52000

In [9]:
glove_input_file = "glove.6B/glove.6B.200d.txt"
word2vec_output_file = "glove.6B/glove.6B.200d.txt.word2vec"

In [10]:
glove2word2vec(glove_input_file, word2vec_output_file)

2019-02-09 13:01:44,264 : INFO : converting 400000 vectors from glove.6B/glove.6B.200d.txt to glove.6B/glove.6B.200d.txt.word2vec


(400000, 200)

In [12]:
model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

2019-02-09 13:03:37,757 : INFO : loading projection weights from glove.6B/glove.6B.200d.txt.word2vec
2019-02-09 13:04:39,062 : INFO : loaded (400000, 200) matrix from glove.6B/glove.6B.200d.txt.word2vec


In [13]:
# build vocabulary and train model
model = gensim.models.Word2Vec(
    data_huge['Review'],
    size=200,
    window=10,
    min_count=2,
    workers=10)
model.train(data_huge['Review'], total_examples=len(data_huge['Review']), epochs=100)

2019-02-09 13:04:48,244 : INFO : collecting all words and their counts
2019-02-09 13:04:48,251 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-02-09 13:04:48,676 : INFO : PROGRESS: at sentence #10000, processed 2378383 words, keeping 53076 word types
2019-02-09 13:04:49,065 : INFO : PROGRESS: at sentence #20000, processed 4774881 words, keeping 70564 word types
2019-02-09 13:04:49,450 : INFO : PROGRESS: at sentence #30000, processed 7165763 words, keeping 83559 word types
2019-02-09 13:04:49,839 : INFO : PROGRESS: at sentence #40000, processed 9560414 words, keeping 93886 word types
2019-02-09 13:04:50,221 : INFO : PROGRESS: at sentence #50000, processed 11940494 words, keeping 102827 word types
2019-02-09 13:04:50,307 : INFO : collected 104557 word types from a corpus of 12431273 raw words and 52000 sentences
2019-02-09 13:04:50,307 : INFO : Loading a fresh vocabulary
2019-02-09 13:04:50,473 : INFO : effective_min_count=2 retains 65356 unique words (62%

2019-02-09 13:05:34,127 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-02-09 13:05:34,144 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-02-09 13:05:34,152 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-02-09 13:05:34,165 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-02-09 13:05:34,175 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-02-09 13:05:34,191 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-02-09 13:05:34,201 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-02-09 13:05:34,205 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-09 13:05:34,216 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-09 13:05:34,218 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-09 13:05:34,219 : INFO : EPOCH - 3 : training on 12431273 raw words (9481856 effec

2019-02-09 13:06:18,366 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-02-09 13:06:18,375 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-02-09 13:06:18,415 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-02-09 13:06:18,425 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-02-09 13:06:18,433 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-02-09 13:06:18,439 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-02-09 13:06:18,440 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-09 13:06:18,445 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-09 13:06:18,447 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-09 13:06:18,448 : INFO : EPOCH - 1 : training on 12431273 raw words (9482883 effective words) took 15.1s, 628608 effective words/s
2019-02-09 13:06:19,478 : INFO : EPOCH 2 -

2019-02-09 13:07:01,578 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-02-09 13:07:01,579 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-09 13:07:01,581 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-09 13:07:01,585 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-09 13:07:01,586 : INFO : EPOCH - 4 : training on 12431273 raw words (9484788 effective words) took 14.6s, 649711 effective words/s
2019-02-09 13:07:02,619 : INFO : EPOCH 5 - PROGRESS: at 6.99% examples, 638621 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:07:03,626 : INFO : EPOCH 5 - PROGRESS: at 14.79% examples, 679945 words/s, in_qsize 20, out_qsize 0
2019-02-09 13:07:04,639 : INFO : EPOCH 5 - PROGRESS: at 22.00% examples, 683799 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:07:05,640 : INFO : EPOCH 5 - PROGRESS: at 28.96% examples, 675620 words/s, in_qsize 19, out_qsize 2
2019-02-09 13:07:06,654 : INFO : EPOCH 5 -

2019-02-09 13:07:46,123 : INFO : EPOCH 8 - PROGRESS: at 6.98% examples, 638955 words/s, in_qsize 20, out_qsize 4
2019-02-09 13:07:47,123 : INFO : EPOCH 8 - PROGRESS: at 13.50% examples, 623256 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:07:48,134 : INFO : EPOCH 8 - PROGRESS: at 20.43% examples, 634251 words/s, in_qsize 17, out_qsize 2
2019-02-09 13:07:49,158 : INFO : EPOCH 8 - PROGRESS: at 27.16% examples, 632433 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:07:50,223 : INFO : EPOCH 8 - PROGRESS: at 33.32% examples, 613732 words/s, in_qsize 20, out_qsize 4
2019-02-09 13:07:51,235 : INFO : EPOCH 8 - PROGRESS: at 40.84% examples, 631397 words/s, in_qsize 20, out_qsize 3
2019-02-09 13:07:52,258 : INFO : EPOCH 8 - PROGRESS: at 48.51% examples, 643161 words/s, in_qsize 18, out_qsize 1
2019-02-09 13:07:53,281 : INFO : EPOCH 8 - PROGRESS: at 54.92% examples, 636445 words/s, in_qsize 17, out_qsize 2
2019-02-09 13:07:54,286 : INFO : EPOCH 8 - PROGRESS: at 62.06% examples, 639174 words/s, 

2019-02-09 13:08:33,049 : INFO : EPOCH 11 - PROGRESS: at 27.65% examples, 643766 words/s, in_qsize 17, out_qsize 2
2019-02-09 13:08:34,057 : INFO : EPOCH 11 - PROGRESS: at 33.95% examples, 632421 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:08:35,074 : INFO : EPOCH 11 - PROGRESS: at 40.61% examples, 632509 words/s, in_qsize 20, out_qsize 0
2019-02-09 13:08:36,079 : INFO : EPOCH 11 - PROGRESS: at 47.79% examples, 640058 words/s, in_qsize 18, out_qsize 1
2019-02-09 13:08:37,120 : INFO : EPOCH 11 - PROGRESS: at 54.92% examples, 640524 words/s, in_qsize 18, out_qsize 2
2019-02-09 13:08:38,154 : INFO : EPOCH 11 - PROGRESS: at 61.97% examples, 640011 words/s, in_qsize 20, out_qsize 0
2019-02-09 13:08:39,197 : INFO : EPOCH 11 - PROGRESS: at 69.08% examples, 642138 words/s, in_qsize 19, out_qsize 3
2019-02-09 13:08:40,210 : INFO : EPOCH 11 - PROGRESS: at 76.75% examples, 648188 words/s, in_qsize 18, out_qsize 1
2019-02-09 13:08:41,211 : INFO : EPOCH 11 - PROGRESS: at 84.08% examples, 652178

2019-02-09 13:09:18,603 : INFO : EPOCH 14 - PROGRESS: at 43.68% examples, 681225 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:09:19,615 : INFO : EPOCH 14 - PROGRESS: at 51.15% examples, 683212 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:09:20,616 : INFO : EPOCH 14 - PROGRESS: at 58.39% examples, 682623 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:09:21,623 : INFO : EPOCH 14 - PROGRESS: at 65.28% examples, 679345 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:09:22,636 : INFO : EPOCH 14 - PROGRESS: at 72.52% examples, 679371 words/s, in_qsize 20, out_qsize 1
2019-02-09 13:09:23,662 : INFO : EPOCH 14 - PROGRESS: at 80.11% examples, 680791 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:09:24,665 : INFO : EPOCH 14 - PROGRESS: at 87.19% examples, 680722 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:09:25,667 : INFO : EPOCH 14 - PROGRESS: at 94.43% examples, 680128 words/s, in_qsize 18, out_qsize 1
2019-02-09 13:09:26,325 : INFO : worker thread finished; awaiting finish of 9 mo

2019-02-09 13:10:05,200 : INFO : EPOCH 17 - PROGRESS: at 80.55% examples, 683259 words/s, in_qsize 20, out_qsize 1
2019-02-09 13:10:06,214 : INFO : EPOCH 17 - PROGRESS: at 87.93% examples, 684261 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:10:07,216 : INFO : EPOCH 17 - PROGRESS: at 95.33% examples, 684505 words/s, in_qsize 18, out_qsize 1
2019-02-09 13:10:07,737 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-02-09 13:10:07,763 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-02-09 13:10:07,800 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-02-09 13:10:07,801 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-02-09 13:10:07,807 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-02-09 13:10:07,821 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-02-09 13:10:07,822 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-02-09 13:10:0

2019-02-09 13:10:50,338 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-02-09 13:10:50,346 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-02-09 13:10:50,347 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-02-09 13:10:50,364 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-02-09 13:10:50,370 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-02-09 13:10:50,384 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-09 13:10:50,390 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-09 13:10:50,391 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-09 13:10:50,392 : INFO : EPOCH - 20 : training on 12431273 raw words (9482968 effective words) took 14.6s, 649043 effective words/s
2019-02-09 13:10:51,397 : INFO : EPOCH 21 - PROGRESS: at 6.16% examples, 581511 words/s, in_qsize 17, out_qsize 2
2019-02-09 13:10:5

2019-02-09 13:11:32,179 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-09 13:11:32,180 : INFO : EPOCH - 23 : training on 12431273 raw words (9482532 effective words) took 13.8s, 686422 effective words/s
2019-02-09 13:11:33,208 : INFO : EPOCH 24 - PROGRESS: at 6.26% examples, 576425 words/s, in_qsize 19, out_qsize 2
2019-02-09 13:11:34,214 : INFO : EPOCH 24 - PROGRESS: at 13.33% examples, 616646 words/s, in_qsize 20, out_qsize 3
2019-02-09 13:11:35,227 : INFO : EPOCH 24 - PROGRESS: at 20.28% examples, 629243 words/s, in_qsize 18, out_qsize 1
2019-02-09 13:11:36,257 : INFO : EPOCH 24 - PROGRESS: at 27.00% examples, 627454 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:11:37,278 : INFO : EPOCH 24 - PROGRESS: at 34.49% examples, 639998 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:11:38,286 : INFO : EPOCH 24 - PROGRESS: at 41.24% examples, 640899 words/s, in_qsize 19, out_qsize 1
2019-02-09 13:11:39,290 : INFO : EPOCH 24 - PROGRESS: at 47.88% examples, 640042

2019-02-09 13:12:17,405 : INFO : EPOCH 27 - PROGRESS: at 21.33% examples, 658845 words/s, in_qsize 20, out_qsize 0
2019-02-09 13:12:18,405 : INFO : EPOCH 27 - PROGRESS: at 28.93% examples, 673296 words/s, in_qsize 19, out_qsize 1
2019-02-09 13:12:19,435 : INFO : EPOCH 27 - PROGRESS: at 36.12% examples, 670922 words/s, in_qsize 19, out_qsize 4
2019-02-09 13:12:20,436 : INFO : EPOCH 27 - PROGRESS: at 43.47% examples, 677354 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:12:21,457 : INFO : EPOCH 27 - PROGRESS: at 50.79% examples, 677068 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:12:22,484 : INFO : EPOCH 27 - PROGRESS: at 58.56% examples, 681404 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:12:23,488 : INFO : EPOCH 27 - PROGRESS: at 65.68% examples, 680963 words/s, in_qsize 19, out_qsize 1
2019-02-09 13:12:24,500 : INFO : EPOCH 27 - PROGRESS: at 72.53% examples, 677235 words/s, in_qsize 18, out_qsize 1
2019-02-09 13:12:25,513 : INFO : EPOCH 27 - PROGRESS: at 79.77% examples, 676946

2019-02-09 13:13:03,587 : INFO : EPOCH 30 - PROGRESS: at 50.79% examples, 680577 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:13:04,591 : INFO : EPOCH 30 - PROGRESS: at 57.93% examples, 679011 words/s, in_qsize 19, out_qsize 2
2019-02-09 13:13:05,623 : INFO : EPOCH 30 - PROGRESS: at 65.20% examples, 678418 words/s, in_qsize 17, out_qsize 2
2019-02-09 13:13:06,645 : INFO : EPOCH 30 - PROGRESS: at 72.62% examples, 679394 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:13:07,647 : INFO : EPOCH 30 - PROGRESS: at 80.39% examples, 684187 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:13:08,663 : INFO : EPOCH 30 - PROGRESS: at 87.60% examples, 683727 words/s, in_qsize 18, out_qsize 1
2019-02-09 13:13:09,670 : INFO : EPOCH 30 - PROGRESS: at 95.09% examples, 684342 words/s, in_qsize 20, out_qsize 1
2019-02-09 13:13:10,246 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-02-09 13:13:10,289 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-02-09 13

2019-02-09 13:13:50,128 : INFO : EPOCH 33 - PROGRESS: at 88.75% examples, 688692 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:13:51,128 : INFO : EPOCH 33 - PROGRESS: at 95.87% examples, 687002 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:13:51,608 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-02-09 13:13:51,625 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-02-09 13:13:51,632 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-02-09 13:13:51,637 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-02-09 13:13:51,649 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-02-09 13:13:51,661 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-02-09 13:13:51,662 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-02-09 13:13:51,668 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-09 13:13:51,672 : INFO : worker th

2019-02-09 13:14:33,113 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-02-09 13:14:33,115 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-02-09 13:14:33,125 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-02-09 13:14:33,129 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-09 13:14:33,138 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-09 13:14:33,142 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-09 13:14:33,143 : INFO : EPOCH - 36 : training on 12431273 raw words (9482810 effective words) took 13.7s, 692321 effective words/s
2019-02-09 13:14:34,161 : INFO : EPOCH 37 - PROGRESS: at 7.08% examples, 655967 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:14:35,162 : INFO : EPOCH 37 - PROGRESS: at 14.61% examples, 679298 words/s, in_qsize 18, out_qsize 1
2019-02-09 13:14:36,209 : INFO : EPOCH 37 - PROGRESS: at 21.84% examples, 675888 word

2019-02-09 13:15:15,851 : INFO : EPOCH 40 - PROGRESS: at 7.07% examples, 641520 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:15:16,859 : INFO : EPOCH 40 - PROGRESS: at 14.61% examples, 669772 words/s, in_qsize 18, out_qsize 1
2019-02-09 13:15:17,894 : INFO : EPOCH 40 - PROGRESS: at 22.18% examples, 682130 words/s, in_qsize 20, out_qsize 0
2019-02-09 13:15:18,901 : INFO : EPOCH 40 - PROGRESS: at 29.77% examples, 687898 words/s, in_qsize 20, out_qsize 0
2019-02-09 13:15:19,931 : INFO : EPOCH 40 - PROGRESS: at 37.10% examples, 686714 words/s, in_qsize 18, out_qsize 1
2019-02-09 13:15:20,965 : INFO : EPOCH 40 - PROGRESS: at 44.27% examples, 684683 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:15:21,978 : INFO : EPOCH 40 - PROGRESS: at 51.90% examples, 687057 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:15:22,980 : INFO : EPOCH 40 - PROGRESS: at 59.12% examples, 685942 words/s, in_qsize 18, out_qsize 1
2019-02-09 13:15:24,018 : INFO : EPOCH 40 - PROGRESS: at 66.54% examples, 685624 

2019-02-09 13:16:02,316 : INFO : EPOCH 43 - PROGRESS: at 44.04% examples, 689481 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:16:03,341 : INFO : EPOCH 43 - PROGRESS: at 51.31% examples, 685636 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:16:04,357 : INFO : EPOCH 43 - PROGRESS: at 58.97% examples, 688204 words/s, in_qsize 19, out_qsize 1
2019-02-09 13:16:05,358 : INFO : EPOCH 43 - PROGRESS: at 66.23% examples, 688777 words/s, in_qsize 17, out_qsize 2
2019-02-09 13:16:06,359 : INFO : EPOCH 43 - PROGRESS: at 73.58% examples, 689481 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:16:07,361 : INFO : EPOCH 43 - PROGRESS: at 80.86% examples, 689213 words/s, in_qsize 17, out_qsize 2
2019-02-09 13:16:08,377 : INFO : EPOCH 43 - PROGRESS: at 88.00% examples, 687814 words/s, in_qsize 19, out_qsize 1
2019-02-09 13:16:09,385 : INFO : EPOCH 43 - PROGRESS: at 95.58% examples, 688541 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:16:09,894 : INFO : worker thread finished; awaiting finish of 9 mo

2019-02-09 13:16:48,684 : INFO : EPOCH 46 - PROGRESS: at 81.00% examples, 689423 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:16:49,686 : INFO : EPOCH 46 - PROGRESS: at 88.16% examples, 688747 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:16:50,723 : INFO : EPOCH 46 - PROGRESS: at 95.96% examples, 689668 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:16:51,189 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-02-09 13:16:51,197 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-02-09 13:16:51,219 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-02-09 13:16:51,231 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-02-09 13:16:51,237 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-02-09 13:16:51,238 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-02-09 13:16:51,257 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-02-09 13:16:5

2019-02-09 13:17:32,518 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-02-09 13:17:32,526 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-02-09 13:17:32,530 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-02-09 13:17:32,535 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-02-09 13:17:32,548 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-09 13:17:32,559 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-09 13:17:32,561 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-09 13:17:32,562 : INFO : EPOCH - 49 : training on 12431273 raw words (9483152 effective words) took 13.8s, 688445 effective words/s
2019-02-09 13:17:33,636 : INFO : EPOCH 50 - PROGRESS: at 7.30% examples, 642338 words/s, in_qsize 18, out_qsize 1
2019-02-09 13:17:34,647 : INFO : EPOCH 50 - PROGRESS: at 15.08% examples, 679163 words/s, in_qsize 18, out_qs

2019-02-09 13:18:13,895 : INFO : EPOCH - 52 : training on 12431273 raw words (9483471 effective words) took 13.8s, 688028 effective words/s
2019-02-09 13:18:14,916 : INFO : EPOCH 53 - PROGRESS: at 6.67% examples, 625955 words/s, in_qsize 20, out_qsize 7
2019-02-09 13:18:15,946 : INFO : EPOCH 53 - PROGRESS: at 14.68% examples, 672837 words/s, in_qsize 18, out_qsize 1
2019-02-09 13:18:16,947 : INFO : EPOCH 53 - PROGRESS: at 21.73% examples, 674622 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:18:17,953 : INFO : EPOCH 53 - PROGRESS: at 29.19% examples, 680478 words/s, in_qsize 17, out_qsize 2
2019-02-09 13:18:18,981 : INFO : EPOCH 53 - PROGRESS: at 36.81% examples, 685753 words/s, in_qsize 18, out_qsize 1
2019-02-09 13:18:19,981 : INFO : EPOCH 53 - PROGRESS: at 43.66% examples, 682377 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:18:21,001 : INFO : EPOCH 53 - PROGRESS: at 51.37% examples, 686510 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:18:22,014 : INFO : EPOCH 53 - PROGRESS: at

2019-02-09 13:19:00,395 : INFO : EPOCH 56 - PROGRESS: at 36.97% examples, 680990 words/s, in_qsize 18, out_qsize 1
2019-02-09 13:19:01,399 : INFO : EPOCH 56 - PROGRESS: at 44.12% examples, 683009 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:19:02,405 : INFO : EPOCH 56 - PROGRESS: at 51.55% examples, 684169 words/s, in_qsize 18, out_qsize 1
2019-02-09 13:19:03,414 : INFO : EPOCH 56 - PROGRESS: at 58.98% examples, 684686 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:19:04,430 : INFO : EPOCH 56 - PROGRESS: at 65.97% examples, 682029 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:19:05,433 : INFO : EPOCH 56 - PROGRESS: at 73.51% examples, 684748 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:19:06,472 : INFO : EPOCH 56 - PROGRESS: at 80.69% examples, 682127 words/s, in_qsize 19, out_qsize 3
2019-02-09 13:19:07,484 : INFO : EPOCH 56 - PROGRESS: at 88.00% examples, 682612 words/s, in_qsize 20, out_qsize 3
2019-02-09 13:19:08,499 : INFO : EPOCH 56 - PROGRESS: at 95.88% examples, 685715

2019-02-09 13:19:46,384 : INFO : EPOCH 59 - PROGRESS: at 66.46% examples, 688562 words/s, in_qsize 20, out_qsize 0
2019-02-09 13:19:47,400 : INFO : EPOCH 59 - PROGRESS: at 73.58% examples, 685965 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:19:48,413 : INFO : EPOCH 59 - PROGRESS: at 81.17% examples, 688184 words/s, in_qsize 17, out_qsize 2
2019-02-09 13:19:49,422 : INFO : EPOCH 59 - PROGRESS: at 88.44% examples, 687808 words/s, in_qsize 18, out_qsize 1
2019-02-09 13:19:50,437 : INFO : EPOCH 59 - PROGRESS: at 96.10% examples, 689347 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:19:50,893 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-02-09 13:19:50,915 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-02-09 13:19:50,920 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-02-09 13:19:50,930 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-02-09 13:19:50,931 : INFO : worker thread finished; awai

2019-02-09 13:20:32,490 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-02-09 13:20:32,507 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-02-09 13:20:32,513 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-02-09 13:20:32,517 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-02-09 13:20:32,538 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-02-09 13:20:32,550 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-02-09 13:20:32,556 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-02-09 13:20:32,560 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-09 13:20:32,568 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-09 13:20:32,577 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-09 13:20:32,578 : INFO : EPOCH - 62 : training on 12431273 raw words (9482988 effe

2019-02-09 13:21:14,023 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-09 13:21:14,026 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-09 13:21:14,028 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-09 13:21:14,028 : INFO : EPOCH - 65 : training on 12431273 raw words (9482701 effective words) took 13.8s, 685662 effective words/s
2019-02-09 13:21:15,040 : INFO : EPOCH 66 - PROGRESS: at 7.07% examples, 659579 words/s, in_qsize 19, out_qsize 1
2019-02-09 13:21:16,067 : INFO : EPOCH 66 - PROGRESS: at 14.40% examples, 662320 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:21:17,087 : INFO : EPOCH 66 - PROGRESS: at 22.02% examples, 682426 words/s, in_qsize 20, out_qsize 0
2019-02-09 13:21:18,132 : INFO : EPOCH 66 - PROGRESS: at 29.28% examples, 674576 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:21:19,146 : INFO : EPOCH 66 - PROGRESS: at 36.88% examples, 682685 words/s, in_qsize 19, out_qsize 0
2019-02-09 13

2019-02-09 13:21:58,310 : INFO : EPOCH 69 - PROGRESS: at 21.31% examples, 670415 words/s, in_qsize 17, out_qsize 2
2019-02-09 13:21:59,310 : INFO : EPOCH 69 - PROGRESS: at 29.11% examples, 685761 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:22:00,315 : INFO : EPOCH 69 - PROGRESS: at 36.27% examples, 684419 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:22:01,316 : INFO : EPOCH 69 - PROGRESS: at 43.32% examples, 683621 words/s, in_qsize 18, out_qsize 1
2019-02-09 13:22:02,334 : INFO : EPOCH 69 - PROGRESS: at 50.87% examples, 685668 words/s, in_qsize 17, out_qsize 2
2019-02-09 13:22:03,353 : INFO : EPOCH 69 - PROGRESS: at 58.14% examples, 684010 words/s, in_qsize 18, out_qsize 1
2019-02-09 13:22:04,371 : INFO : EPOCH 69 - PROGRESS: at 65.44% examples, 684026 words/s, in_qsize 20, out_qsize 1
2019-02-09 13:22:05,377 : INFO : EPOCH 69 - PROGRESS: at 72.78% examples, 684753 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:22:06,378 : INFO : EPOCH 69 - PROGRESS: at 79.94% examples, 683766

2019-02-09 13:22:44,937 : INFO : EPOCH 72 - PROGRESS: at 58.97% examples, 685830 words/s, in_qsize 18, out_qsize 1
2019-02-09 13:22:45,941 : INFO : EPOCH 72 - PROGRESS: at 66.16% examples, 685716 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:22:46,947 : INFO : EPOCH 72 - PROGRESS: at 73.27% examples, 684120 words/s, in_qsize 16, out_qsize 3
2019-02-09 13:22:47,965 : INFO : EPOCH 72 - PROGRESS: at 80.79% examples, 685420 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:22:48,998 : INFO : EPOCH 72 - PROGRESS: at 88.00% examples, 684360 words/s, in_qsize 19, out_qsize 2
2019-02-09 13:22:50,025 : INFO : EPOCH 72 - PROGRESS: at 95.65% examples, 684532 words/s, in_qsize 18, out_qsize 1
2019-02-09 13:22:50,492 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-02-09 13:22:50,495 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-02-09 13:22:50,499 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-02-09 13:22:50,522 : INFO : work

2019-02-09 13:23:30,938 : INFO : EPOCH 75 - PROGRESS: at 88.59% examples, 690547 words/s, in_qsize 18, out_qsize 1
2019-02-09 13:23:31,939 : INFO : EPOCH 75 - PROGRESS: at 95.70% examples, 688667 words/s, in_qsize 19, out_qsize 2
2019-02-09 13:23:32,413 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-02-09 13:23:32,438 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-02-09 13:23:32,452 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-02-09 13:23:32,459 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-02-09 13:23:32,464 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-02-09 13:23:32,468 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-02-09 13:23:32,471 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-02-09 13:23:32,480 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-09 13:23:32,487 : INFO : worker th

2019-02-09 13:24:13,656 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-02-09 13:24:13,658 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-02-09 13:24:13,664 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-02-09 13:24:13,666 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-09 13:24:13,675 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-09 13:24:13,676 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-09 13:24:13,677 : INFO : EPOCH - 78 : training on 12431273 raw words (9481639 effective words) took 13.7s, 690531 effective words/s
2019-02-09 13:24:14,689 : INFO : EPOCH 79 - PROGRESS: at 7.05% examples, 659636 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:24:15,699 : INFO : EPOCH 79 - PROGRESS: at 14.53% examples, 674842 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:24:16,759 : INFO : EPOCH 79 - PROGRESS: at 21.83% examples, 672566 word

2019-02-09 13:24:55,947 : INFO : EPOCH 82 - PROGRESS: at 6.49% examples, 609020 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:24:56,949 : INFO : EPOCH 82 - PROGRESS: at 14.45% examples, 674911 words/s, in_qsize 19, out_qsize 1
2019-02-09 13:24:57,989 : INFO : EPOCH 82 - PROGRESS: at 21.79% examples, 677199 words/s, in_qsize 20, out_qsize 3
2019-02-09 13:24:59,005 : INFO : EPOCH 82 - PROGRESS: at 29.78% examples, 692123 words/s, in_qsize 20, out_qsize 0
2019-02-09 13:25:00,005 : INFO : EPOCH 82 - PROGRESS: at 35.98% examples, 672032 words/s, in_qsize 18, out_qsize 1
2019-02-09 13:25:01,010 : INFO : EPOCH 82 - PROGRESS: at 43.46% examples, 680335 words/s, in_qsize 20, out_qsize 0
2019-02-09 13:25:02,061 : INFO : EPOCH 82 - PROGRESS: at 50.15% examples, 668289 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:25:03,067 : INFO : EPOCH 82 - PROGRESS: at 57.24% examples, 667095 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:25:04,069 : INFO : EPOCH 82 - PROGRESS: at 64.69% examples, 672376 

2019-02-09 13:25:42,837 : INFO : EPOCH 85 - PROGRESS: at 43.52% examples, 681642 words/s, in_qsize 18, out_qsize 1
2019-02-09 13:25:43,850 : INFO : EPOCH 85 - PROGRESS: at 50.86% examples, 681400 words/s, in_qsize 17, out_qsize 2
2019-02-09 13:25:44,862 : INFO : EPOCH 85 - PROGRESS: at 58.30% examples, 682827 words/s, in_qsize 20, out_qsize 0
2019-02-09 13:25:45,864 : INFO : EPOCH 85 - PROGRESS: at 65.52% examples, 683172 words/s, in_qsize 20, out_qsize 1
2019-02-09 13:25:46,912 : INFO : EPOCH 85 - PROGRESS: at 72.77% examples, 680508 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:25:47,929 : INFO : EPOCH 85 - PROGRESS: at 80.48% examples, 683590 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:25:48,943 : INFO : EPOCH 85 - PROGRESS: at 87.42% examples, 681467 words/s, in_qsize 18, out_qsize 1
2019-02-09 13:25:49,950 : INFO : EPOCH 85 - PROGRESS: at 94.42% examples, 678789 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:25:50,659 : INFO : worker thread finished; awaiting finish of 9 mo

2019-02-09 13:26:29,566 : INFO : EPOCH 88 - PROGRESS: at 80.93% examples, 688104 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:26:30,576 : INFO : EPOCH 88 - PROGRESS: at 88.16% examples, 687600 words/s, in_qsize 16, out_qsize 3
2019-02-09 13:26:31,582 : INFO : EPOCH 88 - PROGRESS: at 95.67% examples, 688028 words/s, in_qsize 20, out_qsize 3
2019-02-09 13:26:32,103 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-02-09 13:26:32,115 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-02-09 13:26:32,121 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-02-09 13:26:32,127 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-02-09 13:26:32,131 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-02-09 13:26:32,132 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-02-09 13:26:32,137 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-02-09 13:26:3

2019-02-09 13:27:13,502 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-02-09 13:27:13,504 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-02-09 13:27:13,505 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-02-09 13:27:13,507 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-02-09 13:27:13,520 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-02-09 13:27:13,522 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-02-09 13:27:13,529 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-02-09 13:27:13,530 : INFO : EPOCH - 91 : training on 12431273 raw words (9484184 effective words) took 13.8s, 686620 effective words/s
2019-02-09 13:27:14,558 : INFO : EPOCH 92 - PROGRESS: at 7.07% examples, 649223 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:27:15,564 : INFO : EPOCH 92 - PROGRESS: at 14.53% examples, 670419 words/s, in_qsize 19, out_qs

2019-02-09 13:27:55,112 : INFO : EPOCH - 94 : training on 12431273 raw words (9482013 effective words) took 14.0s, 677031 effective words/s
2019-02-09 13:27:56,127 : INFO : EPOCH 95 - PROGRESS: at 6.97% examples, 650686 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:27:57,135 : INFO : EPOCH 95 - PROGRESS: at 14.30% examples, 663902 words/s, in_qsize 18, out_qsize 1
2019-02-09 13:27:58,153 : INFO : EPOCH 95 - PROGRESS: at 21.77% examples, 679529 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:27:59,153 : INFO : EPOCH 95 - PROGRESS: at 29.11% examples, 681520 words/s, in_qsize 19, out_qsize 3
2019-02-09 13:28:00,182 : INFO : EPOCH 95 - PROGRESS: at 36.42% examples, 680553 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:28:01,209 : INFO : EPOCH 95 - PROGRESS: at 43.90% examples, 685183 words/s, in_qsize 20, out_qsize 0
2019-02-09 13:28:02,234 : INFO : EPOCH 95 - PROGRESS: at 51.08% examples, 681030 words/s, in_qsize 18, out_qsize 1
2019-02-09 13:28:03,237 : INFO : EPOCH 95 - PROGRESS: at

2019-02-09 13:28:41,745 : INFO : EPOCH 98 - PROGRESS: at 36.79% examples, 683722 words/s, in_qsize 18, out_qsize 1
2019-02-09 13:28:42,774 : INFO : EPOCH 98 - PROGRESS: at 43.68% examples, 679191 words/s, in_qsize 19, out_qsize 6
2019-02-09 13:28:43,801 : INFO : EPOCH 98 - PROGRESS: at 51.48% examples, 682908 words/s, in_qsize 20, out_qsize 4
2019-02-09 13:28:44,808 : INFO : EPOCH 98 - PROGRESS: at 59.05% examples, 685530 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:28:45,825 : INFO : EPOCH 98 - PROGRESS: at 66.45% examples, 686845 words/s, in_qsize 20, out_qsize 0
2019-02-09 13:28:46,838 : INFO : EPOCH 98 - PROGRESS: at 73.82% examples, 686898 words/s, in_qsize 19, out_qsize 0
2019-02-09 13:28:47,845 : INFO : EPOCH 98 - PROGRESS: at 81.09% examples, 686709 words/s, in_qsize 20, out_qsize 1
2019-02-09 13:28:48,855 : INFO : EPOCH 98 - PROGRESS: at 88.34% examples, 686399 words/s, in_qsize 20, out_qsize 1
2019-02-09 13:28:49,864 : INFO : EPOCH 98 - PROGRESS: at 96.17% examples, 689449

(948286097, 1243127300)

In [14]:
model.save("enhanced_glove.model")

2019-02-09 13:32:24,873 : INFO : saving Word2Vec object under enhanced_glove.model, separately None
2019-02-09 13:32:24,889 : INFO : storing np array 'vectors' to enhanced_glove.model.wv.vectors.npy
2019-02-09 13:32:25,385 : INFO : not storing attribute vectors_norm
2019-02-09 13:32:25,390 : INFO : storing np array 'syn1neg' to enhanced_glove.model.trainables.syn1neg.npy
2019-02-09 13:32:26,030 : INFO : not storing attribute cum_table
2019-02-09 13:32:26,356 : INFO : saved enhanced_glove.model


In [15]:
model = Word2Vec.load("enhanced_glove.model")

2019-02-09 13:32:43,664 : INFO : loading Word2Vec object from enhanced_glove.model
2019-02-09 13:32:43,790 : INFO : loading wv recursively from enhanced_glove.model.wv.* with mmap=None
2019-02-09 13:32:43,792 : INFO : loading vectors from enhanced_glove.model.wv.vectors.npy with mmap=None
2019-02-09 13:32:43,851 : INFO : setting ignored attribute vectors_norm to None
2019-02-09 13:32:43,852 : INFO : loading vocabulary recursively from enhanced_glove.model.vocabulary.* with mmap=None
2019-02-09 13:32:43,853 : INFO : loading trainables recursively from enhanced_glove.model.trainables.* with mmap=None
2019-02-09 13:32:43,854 : INFO : loading syn1neg from enhanced_glove.model.trainables.syn1neg.npy with mmap=None
2019-02-09 13:32:43,881 : INFO : setting ignored attribute cum_table to None
2019-02-09 13:32:43,882 : INFO : loaded enhanced_glove.model


## NLP_Task : now let's use our pretrained word embedding to accomplish our task

In [35]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier,ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression,RidgeClassifier,Lasso
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
import xgboost as xgb
from sklearn.model_selection import ShuffleSplit, GridSearchCV
from sklearn.metrics import make_scorer,f1_score, auc, roc_auc_score

In [18]:
w2v = dict(zip(model.wv.index2word, model.wv.syn0))

  """Entry point for launching an IPython kernel.


In [19]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(word2vec.values())

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])


In [20]:
data_huge_train, data_huge_test, label_huge_train, label_huge_test = \
train_test_split(data_huge['Review'], data_huge['Label'], test_size=0.2, random_state = 0)
print(len(data_huge_train), len(data_huge_test), len(data_huge_train) + len(data_huge_test))

41600 10400 52000


In [21]:

pipeline1 = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)),  # strings to token integer counts
    ('classifier', RidgeClassifier(random_state=0))
    #('classifier', xgb.XGBClassifier(random_state=0))
])



In [23]:
pipeline1.fit(data_huge_train,label_huge_train)

Pipeline(memory=None,
     steps=[('word2vec vectorizer', <__main__.MeanEmbeddingVectorizer object at 0x7f734e7617f0>), ('classifier', RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, random_state=0, solver='auto',
        tol=0.001))])

In [24]:
predictions1 = pipeline1.predict(data_huge_test)

In [25]:
print(roc_auc_score(predictions1,label_huge_test))

0.8839663119148804


In [26]:
print(classification_report(predictions1,label_huge_test))

             precision    recall  f1-score   support

        0.0       0.87      0.89      0.88      5129
        1.0       0.89      0.88      0.88      5271

avg / total       0.88      0.88      0.88     10400

