In [14]:
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
import gensim
import sys

In [15]:
import numpy
from random import shuffle
from sklearn.linear_model import LogisticRegression

In [16]:
class LabeledLineSentence(object):
    def __init__(self, sources):
        self.sources = sources
        flipped = {}
        for key, value in sources.items():
            if value not in flipped:
                flipped[value] = [key]
            else:
                raise Exception('Non-unique prefix encountered')
    def __iter__(self):
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no]) 
    def to_array(self):
        self.sentences = []
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    self.sentences.append((utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
        return self.sentences
    def sentences_perm(self):
        shuffle(self.sentences)
        return self.sentences

In [17]:
class VocabItem:
    def __init__(self, word):
        self.word = word
        self.count = 0

In [18]:
class Vocabulary:
    def __init__(self, fi, min_count):
        vocab_items = []
        vocab_hash = {}
        word_count = 0
        #fi = open(fi, 'r')
        # Add special tokens <bol> (beginning of line) and <eol> (end of line)
        for token in ['<bol>', '<eol>']:
            vocab_hash[token] = len(vocab_items)
            vocab_items.append(VocabItem(token))
        for line in fi:
            tokens = line[0]
            #print("\rReading line %s" %tokens)
            for token in tokens:
                if token not in vocab_hash:
                    vocab_hash[token] = len(vocab_items)
                    #print ("\r\r token %s" %token)
                    #print ("\t\t token value",vocab_hash[token])
                    vocab_items.append(VocabItem(token))
                #assert vocab_items[vocab_hash[token]].word == token, 'Wrong vocab_hash index'
                vocab_items[vocab_hash[token]].count += 1
                word_count += 1
                if word_count % 10000 == 0:
                    sys.stdout.write("\rReading word %d" % word_count)
                    sys.stdout.flush()

            # Add special tokens <bol> (beginning of line) and <eol> (end of line)
            vocab_items[vocab_hash['<bol>']].count += 1
            vocab_items[vocab_hash['<eol>']].count += 1
            word_count += 2
        self.vocab_items = vocab_items # List of VocabItem objects
        self.vocab_hash = vocab_hash  # Mapping from each token to its index in vocab
        self.word_count = word_count # Total number of words in train file
        # Add special token <unk> (unknown),
        # merge words occurring less than min_count into <unk>, and
        # sort vocab in descending order by frequency in train file
        self.__sort(min_count)
        print ('Total words in training file: %d' % self.word_count)
        #print ('Total bytes in training file: %d' % self.bytes)
        print ('Vocab size: %d' % len(self))
    def __getitem__(self, i):
        return self.vocab_items[i]

    def __len__(self):
        return len(self.vocab_items)

    def __iter__(self):
        return iter(self.vocab_items)

    def __contains__(self, key):
        return key in self.vocab_hash

    def __sort(self, min_count):
        tmp = []
        tmp.append(VocabItem('<unk>'))
        unk_hash = 0
        
        count_unk = 0
        for token in self.vocab_items:
            if token.count < min_count:
                count_unk += 1
                tmp[unk_hash].count += token.count
                #print("word setting as unknow:",token.word)
            else:
                tmp.append(token)

        tmp.sort(key=lambda token : token.count, reverse=True)

        # Update vocab_hash
        vocab_hash = {}
        for i, token in enumerate(tmp):
            vocab_hash[token.word] = i

        self.vocab_items = tmp
        self.vocab_hash = vocab_hash
        #print ("printing vocab_hash")
        #for key,value in vocab_hash.items():
         #   print (key,value)
        #print ('Unknown vocab size:', count_unk)

    def indices(self, tokens):
        return [self.vocab_hash[token] if token in self else self.vocab_hash['<unk>'] for token in tokens]

In [19]:
class paraItem:
    def __init__(self,par):
        self.label = par
        self.wc = 0
        self.filename = None
        self.dmvec = np.random.uniform(low=-0.5/100, high=0.5/100, size=(100)) #for every phrase size of dim
        self.words = []

In [20]:
class paragrahps:
    def __init__(self,fi):
        paras =[]
        for line in fi:
            tokens = line[0]
            if tokens:
                paras.append(paraItem(line[1][0])) #list of objects with sentence index as lable.
                paras[len(paras)-1].words = tokens #add the words into objects. 
                paras[len(paras)-1].wc = len(tokens) # upated word count in the para
            else:
                pass
        self.paras =paras
    def __getlist__(self):
        return self.paras

In [21]:
import os
path = 'C:\\Users\\bpotinen\\financial'
os.chdir( path )
sources = {'neg.txt':'NEG', 'pos.txt':'POS','unk.txt':'UNK'}
sentences = LabeledLineSentence(sources)

In [22]:
sent = sentences.to_array()


In [23]:
vocab = Vocabulary(sent,5)

Reading word 7160000Total words in training file: 7163532
Vocab size: 55314


In [24]:
import numpy as np
paras = paragrahps(sent)

In [25]:
para_list = paras.__getlist__()

In [26]:
len(para_list)

22999

In [27]:
import argparse
import math
import struct
import sys
import time
import warnings
import numpy as np
import itertools 
import time 

In [28]:
class UnigramTable:
    """
    A list of indices of tokens in the vocab following a power law distribution,
    used to draw negative samples.
    """
    def __init__(self, vocab):
        vocab_size = len(vocab)
        power = 0.75
        norm = sum([math.pow(t.count, power) for t in vocab]) # Normalizing constant
        print (norm)
        table_size = 1e8 # Length of the unigram table depends on vocab
        #print table_size
        table = np.zeros(table_size, dtype=np.uint32)

        print ('Filling unigram table')
        p = 0 # Cumulative probability
        i = 0
        old_i = 0 
        for j, unigram in enumerate(vocab):
            #print "j",j
            #print "unigram",unigram
            
            p += float(math.pow(unigram.count, power))/norm
            while i < table_size and float(i) / table_size < p:
                table[i] = j
                i += 1
            old_i = i - old_i
            sys.stdout.write("\r propability for word '%s' is %f, kept it  %d times" %(unigram.word,p,old_i))
            sys.stdout.flush()
            #print("propability for word %s is %f, kept it  %d times" %(unigram.word,p,old_i))
        self.table = table
    def sample(self, count):
        indices = np.random.randint(low=0, high=len(self.table), size=count)
        return [self.table[i] for i in indices]

In [29]:
table = UnigramTable(vocab)

1197825.719168978
Filling unigram table
 propability for word 'Bhaumik' is 1.000000, kept it  49729009 times



In [30]:
def initialize(dim, vocab_size):
    # Init input words with random numbers from a uniform distribution on the interval [-0.5, 0.5]/dim
    tmp = np.random.uniform(low=-0.5/dim, high=0.5/dim, size=(vocab_size, dim))
    input_word = tmp 
    # Init weights with zeros
    tmp = np.zeros(shape=(vocab_size, dim))
    weights = tmp
  

    return (input_word,weights)

In [31]:
def sigmoid(z): #sigmoid function goes from -6 to +6
    if z > 6:
        return 1.0
    elif z < -6:
        return 0.0
    else:
        return 1 / (1 + math.exp(-z))

In [32]:
input_word,weights = initialize(100,len(vocab))

In [33]:
start = time.clock()
starting_alpha=0.025
current_word_count=0  
alpha_count = 0
last_alpha_count = 0
win=5 
para_count = 0
for i in para_list:
    current_p = i
    para_count+=1
    sentence = current_p.words
    ind = vocab.indices(sentence)
    for sent_pos, token in enumerate(ind):
        neu1e = np.zeros(100)
        if current_word_count % 10000 == 0:
            alpha_count += (current_word_count - last_alpha_count)
            last_alpha_count = current_word_count
            alpha = starting_alpha * (1 - float(alpha_count) / 17020851)
            if alpha < starting_alpha * 0.0001: alpha = starting_alpha * 0.0001
            sys.stdout.write("\rAlpha: %f  para Progress: %d of %d (%.2f%%)" %
                                 (alpha, para_count, len(para_list),
                                  float(para_count* 100/len(para_list))))
            sys.stdout.flush()
        current_win = np.random.randint(low=1, high=win+1)
        context_start = max(sent_pos - current_win, 0)
        context_end = min(sent_pos + current_win + 1, len(ind))
        context = ind[context_start:sent_pos] + ind[sent_pos+1:context_end]
        for context_word in context:
            neu1e = np.zeros(100)
            classifiers = [(token, 1)] + [(target, 0) for target in table.sample(10)]
            for target, label in classifiers:
                z = np.dot(input_word[context_word],weights[target])
                p = sigmoid(z)
                g = alpha * (label - p)
                neu1e+=g*weights[target]
                weights[target] +=g* input_word[context_word]
            input_word[context_word] +=neu1e #updated word vector
            current_p.dmvec+=neu1e #update para vector
            current_word_count += 1    
print ('time taken',time.clock() - start)

Alpha: 0.000003  para Progress: 22968 of 22999 (99.87%)time taken 2960.363451929243


array([  4.28410196e-05,   6.29690194e-04,  -1.11398223e-02,
         5.08897221e-03,   1.39724287e-02,  -1.00116045e-02,
        -1.28937798e-02,  -6.90131240e-03,  -5.96449961e-04,
        -2.94534286e-02,  -1.67500308e-02,  -1.65756057e-02,
        -2.26600929e-03,  -1.16309271e-02,   1.84101231e-02,
         1.94836671e-02,   2.03266053e-04,   2.02008043e-02,
        -1.67962719e-02,   1.77595092e-04,  -1.52958769e-02,
        -1.27740226e-02,   1.63103281e-03,   2.21804442e-02,
         1.39536675e-02,   1.39552261e-02,  -2.00838001e-03,
        -1.59738557e-02,  -1.89997548e-02,  -1.20090873e-02,
        -1.85338839e-02,  -1.71101703e-02,   1.12887440e-02,
         2.99290874e-03,  -1.08594776e-02,  -1.04301611e-02,
        -2.18665869e-02,  -6.62380178e-03,  -2.79377296e-03,
        -2.81583471e-03,   9.25607146e-04,   7.87585078e-03,
         2.28138325e-03,   7.28638681e-03,  -4.78388836e-03,
        -1.17518613e-02,  -8.51945461e-03,   8.83125597e-03,
         2.76297494e-03,

In [34]:
fo = open('financial_model', 'w',encoding='utf8')
fo.write('%d %d\n' % (len(para_list)+len(vocab), 100))
for i in para_list:
    label = i.label
    vector_str = ' '.join([str(s) for s in i.dmvec])
    fo.write('%s %s\n' % (label, vector_str))
for token, vector in zip(vocab, input_word):
    word = token.word
    vector_str = ' '.join([str(s) for s in vector])
    fo.write('%s %s\n' % (word, vector_str))

In [35]:
fo.close()

In [36]:
 model = Doc2Vec.load_word2vec_format('imdb_para2vec', binary=False)

In [37]:
model.most_similar('NEG_10')

[('NEG_146', 0.9941427111625671),
 ('NEG_183', 0.9939985275268555),
 ('NEG_140', 0.9939755201339722),
 ('NEG_148', 0.9939613342285156),
 ('NEG_71', 0.9939123392105103),
 ('NEG_110', 0.9938923120498657),
 ('Singardo', 0.9938366413116455),
 ('NEG_54', 0.9938062429428101),
 ('NEG_100', 0.9937904477119446),
 ('NEG_133', 0.9937737584114075)]

In [38]:
pos_data =[]
for i in para_list:
    if 'POS' in i.label:
        pos_data.append((i.dmvec,1))

In [39]:
neg_data =[]
for i in para_list:
    if 'NEG' in i.label:
        neg_data.append((i.dmvec,0))

In [40]:
len(pos_data)

2331

In [41]:
len(neg_data)

13656

In [42]:
data= pos_data[:2000]+neg_data[:2000]

In [43]:
x=[]
y=[]
for i in data:
    x.append(i[0])
    y.append(i[1])

In [44]:
from sklearn.cross_validation import train_test_split

In [45]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=44)

In [46]:
model2 = LogisticRegression()
model2.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [47]:
predicted = model2.predict(X_test)

In [48]:
from sklearn import metrics

In [49]:
print (metrics.accuracy_score(y_test, predicted))

0.88375


In [50]:
print (metrics.classification_report(y_test, predicted))

             precision    recall  f1-score   support

          0       0.81      1.00      0.90       400
          1       1.00      0.77      0.87       400

avg / total       0.91      0.88      0.88       800



In [51]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

In [54]:
%matplotlib auto
preds= model2.predict_proba(X_test)[:,1]
fpr,tpr,_ = roc_curve(y_test, preds)
roc_auc = auc(fpr,tpr)
plt.plot(fpr,tpr,label='area = %.2f' %roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc='lower right')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.show()

Using matplotlib backend: Qt4Agg


In [53]:
tpr

array([ 0.51  ,  0.5175,  0.5275,  0.8275,  0.8275,  0.83  ,  0.83  ,
        0.8325,  0.8325,  0.835 ,  0.835 ,  0.8375,  0.8375,  0.84  ,
        0.84  ,  0.8475,  0.8475,  0.85  ,  0.85  ,  0.855 ,  0.855 ,
        0.8575,  0.8575,  0.86  ,  0.86  ,  0.8625,  0.8625,  0.865 ,
        0.865 ,  0.87  ,  0.87  ,  0.8725,  0.8725,  0.875 ,  0.875 ,
        0.8775,  0.8775,  0.88  ,  0.88  ,  0.8825,  0.8825,  0.885 ,
        0.885 ,  0.8875,  0.8875,  0.89  ,  0.89  ,  0.8925,  0.8925,
        0.895 ,  0.895 ,  0.8975,  0.8975,  0.9   ,  0.9   ,  0.9025,
        0.9025,  0.905 ,  0.905 ,  0.91  ,  0.91  ,  0.9125,  0.9125,
        0.915 ,  0.915 ,  0.9175,  0.9175,  0.92  ,  0.92  ,  0.9225,
        0.9225,  0.925 ,  0.925 ,  0.9275,  0.9275,  0.93  ,  0.93  ,
        0.9325,  0.9325,  0.935 ,  0.935 ,  0.9375,  0.9375,  0.94  ,
        0.94  ,  0.9425,  0.9425,  0.945 ,  0.945 ,  0.9475,  0.9475,
        0.95  ,  0.95  ,  0.9525,  0.9525,  0.955 ,  0.955 ,  0.9575,
        0.9575,  0.9

In [61]:
model2.score(X_test,y_test)

0.88375000000000004

In [170]:
for i in para_list:
    if 'NEG_10' in i.label:
        words = i.words
        vec=i.dmvec
        vec = vec.reshape(1, -1)

In [171]:
print (' '.join(word for word in words))

The stock market seems to recoil each time a bigticket acquisition is announced Experts at moneycontrol spoke to believe investors are worried about the financial risks of such costly acquisitions


In [172]:
model2.predict_proba(vec)

array([[ 0.7795666,  0.2204334]])

In [173]:
model2.predict(vec)

array([0])

In [142]:
ve = neg_data[33][0]

In [156]:
model2.predict(ve)



array([0])