In [1]:
import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk import bigrams, trigrams
import math
from nltk.stem import WordNetLemmatizer
import numpy as np
import tqdm

In [2]:
import codecs
with codecs.open('blockchainhealthcarereview.txt', "r",encoding='utf-8', errors='ignore') as fdata:
    data = fdata.readlines()

In [5]:
len(data)

285

In [3]:
stopwords = nltk.corpus.stopwords.words('english')
tokenizer = RegexpTokenizer("[\w']+")
wordnet_lemmatizer = WordNetLemmatizer()

In [4]:
def freq(word, doc):
    return doc.count(word)

def word_count(doc):
    return len(doc)

def tf(word, doc):
    return (freq(word, doc) / float(word_count(doc)))

def num_docs_containing(word, list_of_docs):
    count = 0
    for document in list_of_docs:
        if freq(word, document) > 0:
            count += 1
    return 1 + count

def idf(word, list_of_docs):
    return math.log(len(list_of_docs) /
            float(num_docs_containing(word, list_of_docs)))

def tf_idf(word, doc, list_of_docs):
    return (tf(word, doc) * idf(word, list_of_docs))

In [6]:
#Compute the frequency for each term.
vocabulary = []
docs = {}
all_tips = []

full_text_single = []
full_text_bi = []
full_text_tri = []

for i,tip in tqdm.tqdm(enumerate(data)):
    tokens = [token for token in tokenizer.tokenize(tip) if not token.isdigit()]
    tokens = [token.lower() for token in tokens]
    tokens = [wordnet_lemmatizer.lemmatize(token) for token in tokens if token not in stopwords]
    
    
    bi_tokens = bigrams(tokens)
    tri_tokens = trigrams(tokens)    

    bi_tokens = [' '.join(token) for token in bi_tokens]
#     bi_tokens = [token for token in bi_tokens]

    tri_tokens = [' '.join(token) for token in tri_tokens]
#     tri_tokens = [token for token in tri_tokens]

    final_tokens = []
    final_tokens.extend(tokens)
    final_tokens.extend(bi_tokens)
    final_tokens.extend(tri_tokens)
    
    full_text_single.append(tokens)
    full_text_bi.append(bi_tokens)
    full_text_tri.append(tri_tokens)
    
    docs[i] = {'freq': {}, 'tf': {}, 'idf': {},
                        'tf-idf': {}, 'tokens': []}

    for token in final_tokens:
        #The frequency computed for each tip
        docs[i]['freq'][token] = freq(token, final_tokens)
        #The term-frequency (Normalized Frequency)
        docs[i]['tf'][token] = tf(token, final_tokens)
        docs[i]['tokens'] = final_tokens

    vocabulary.append(final_tokens)



285it [00:33,  8.55it/s]


In [7]:
for doc in tqdm.tqdm(docs):
    for token in docs[doc]['tf']:
        docs[doc]['idf'][token] = idf(token, vocabulary)
        docs[doc]['tf-idf'][token] = tf_idf(token, docs[doc]['tokens'], vocabulary)

100%|██████████| 285/285 [1:45:38<00:00, 22.24s/it]


In [8]:
#Now let's find out the most relevant words by tf-idf.
words = {}
for doc in tqdm.tqdm(docs):
    for token in docs[doc]['tf-idf']:
        if token not in words:
            words[token] = docs[doc]['tf-idf'][token]
        else:
            if docs[doc]['tf-idf'][token] > words[token]:
                words[token] = docs[doc]['tf-idf'][token]

100%|██████████| 285/285 [00:00<00:00, 641.71it/s]


In [9]:
def horizon_visibility(tseries):
    lit = []
    for i in range(0,len(tseries)-1):
        founded_r = False
        founded_l = False
        
        (ta,ya)=tseries[i]
            
        for n in range(i+1,len(tseries)):
            (tb,yb)=tseries[n]
            if(yb>=ya):
                founded_r = True
                lit.append([ta,tb])
                break
        if not founded_r:
#             print(min([tseries[n] for n in range(i+1,len(tseries))],key=lambda x:abs(ya-x[1])))
            right_min = (min([tseries[n] for n in range(i+1,len(tseries))],key=lambda x:abs(ya-x[1])))
            for n in range(i+1,len(tseries)):
                (tb,yb)=tseries[n]
                if yb==right_min:
                    lit.append([ta,tb])
                    break
                    
        for n in range(i-1,0,-1):
            (tb,yb)=tseries[n]
            if(yb>=ya):
                founded_l = True
                lit.append([ta,tb])
                break
        if not founded_l and i>1:
#             print(min([tseries[n] for n in range(i-1,0,-1)],key=lambda x:abs(ya-x[1])))
            left_min = (min([tseries[n] for n in range(i-1,0,-1)],key=lambda x:abs(ya-x[1])))
            for n in range(i-1,0,-1):
                (tb,yb)=tseries[n]
                if(yb==left_min):
                    lit.append([ta,tb])
                    break
        elif not founded_l and i==1:
            lit.append([1,tseries[0][0]])

    return lit

In [10]:
TXT_single = sum(full_text_single,[])
TXT_bi = sum(full_text_bi,[])
TXT_tri = sum(full_text_tri,[])

In [32]:
tup = [[i,words[w]] for i,w in enumerate(TXT_bi)]

In [33]:
% time lit = horizon_visibility(tup)
# dump = lit.copy()

CPU times: user 5min 37s, sys: 4.87 s, total: 5min 42s
Wall time: 5min 44s


In [34]:
for i,_ in enumerate(lit):
    lit[i] = [TXT_bi[lit[i][0]],TXT_bi[lit[i][1]]]

In [35]:
h=nx.Graph()
h.add_edges_from(lit)
# nx.draw(h,with_labels=True)
# pyplot.show()

In [30]:
top_single_words = [x[0] for x in sorted(h.degree(), key=lambda x: x[1], reverse=True)]

In [36]:
top_bigram_words = [x[0] for x in sorted(h.degree(), key=lambda x: x[1], reverse=True)]

In [18]:
top_trigram_words = [x[0] for x in sorted(h.degree(), key=lambda x: x[1], reverse=True)]

In [40]:
len(top_bigram_words)

22923

In [16]:
import os
import csv
import random
import string
import networkx as nx
import matplotlib
from matplotlib import pyplot

In [49]:
N = 500

In [50]:
file = open('more_new.csv','w')
for u in tqdm.tqdm(top_single_words[:N]):
    for b in top_bigram_words[:N]:        
        for t in top_trigram_words[:N]:
            if u in b.split():
                file.write(u+','+b+'\n')
            elif u in t.split():
                file.write(u+','+t+'\n')
#                 print(u,t)
            elif b in t:
                file.write(b+','+t+'\n')
#                 print(u,b,t) 

file.close()

100%|██████████| 500/500 [03:58<00:00,  2.09it/s]
