In [15]:
import pandas as pd

from IPython.display import clear_output
import numpy as np
import datetime

from termcolor import colored

In [16]:
import logging
logging.disable(logging.WARNING)

In [14]:
# ==============================flair models=================================
from flair.data import Sentence
from flair.models import SequenceTagger
# load tagger
tagger = SequenceTagger.load("flair/pos-english")

# ==========================transformers models=============================
from transformers import pipeline,AutoTokenizer, AutoModelForSeq2SeqLM,AutoModelForQuestionAnswering

# Summarization
summarizer = pipeline("summarization", model="google/pegasus-xsum")

# ======================sentence_transformers models=========================
from sentence_transformers import SentenceTransformer, util
model_sentence_sim = SentenceTransformer('all-MiniLM-L6-v2')

In [17]:
import requests
from urllib.parse import urljoin
import bs4

import time

In [43]:
# search the keywords in google
def google_search(tokens):
    
    text= f"What is {tokens}?"
    url = 'https://google.com/search?q=' + text
    
    # Fetch the URL data using requests.get(url),
    # store it in a variable, request_result.
    request_result=requests.get(url)
    
    # Creating soup from the fetched request
    soup = bs4.BeautifulSoup(request_result.text,
                            "html.parser")
    time.sleep(0.5)
    
    all_result = soup.find_all('div', class_="BNeawe s3v9rd AP7Wnd")
    context = ".".join([all_result[i].text for i in range(2)])
    # print(tokens,context)
    return context

# suummarize the google search result
def summary_context(tokens,context):
    context = f'The definition of {tokens} is that:'+context

    summary_text = summarizer(context, max_length=300, min_length=10, do_sample=False)[0]['summary_text']
    return summary_text

# make sure whether the token is related to the crypto
def final_sure(tokens,summary_text):

    related_list = ['bitcoin','btc','ethereum','crypto','cryptocurrency','blockchain','digitalcoin','digital currency','digital asset','vitual currency']
    def_corpus = f"The definication is that: "+" or ".join(related_list)
    sentences = [def_corpus, summary_text]

    #Encode all sentences
    embeddings = model_sentence_sim.encode(sentences)

    #Compute cosine similarity between all pairs
    cos_sim = util.cos_sim(embeddings, embeddings)

    sim_score = float(cos_sim[0][1])
    
    # ========== comment for this line==============
    # print(colored("-similarity score-",'yellow'),colored(tokens,'green'),sim_score)
    
    sim_thred = 0.4
    if sim_score>sim_thred:
        return True
    else:
        return False

  
# select tokens that only are related to the crypto 
def token_is_crypto(tag_text):
    crypto_tokens = []
    for i in range(0, len(tag_text)):
        tokens = tag_text[i]
        
        context = google_search(tokens)
        
        summary_text = summary_context(tokens,context)
        
        # ========== comment for this line==============
        # print(colored('summarized text','yellow'),colored(tokens,'green'),summary_text)

        if final_sure(tokens,summary_text):
            crypto_tokens.append(tokens)
    return crypto_tokens
    

def labeled_title(each_title,each__time):

    t = datetime.datetime.strptime(each_time,"%Y-%m-%dT%H:%M:%SZ")
    t.strftime("%Y-%m-%d %H:%M:%S")
    
    # make sentence
    sentence = Sentence(each_title)

    # predict NER tags
    tagger.predict(sentence)

    all_tag = np.array([i.tag for i in sentence.get_spans('pos')])
    all_text = np.array([i.text for i in sentence.get_spans('pos')])
        
    tag_text = []
    import_lb = ['NNP','NNPS',"NN"]

    for _tag,_text in zip(all_tag,all_text):
        if _tag in import_lb:
            tag_text.append(_text)
            
    crypto_tokens = token_is_crypto(tag_text)
    
    print(colored("Time:",'yellow'), t)
    print(colored("News title: ",'yellow'),each_title)
    print("Keywords: ",tag_text)
    print(colored("Crypto Label: ",'yellow'),colored(crypto_tokens,'green'))
    print("--------------------------------------------------")
    # clear_output(wait=True)

In [19]:
def demo_pre(lengthofnews):
    for i in range(lengthofnews):
        each_title = ori_data['title'][i]
        each_time = ori_data['created_at'][i]
        
        labeled_title(each_title,each_time)

In [20]:
# The fullowing functions are for degging
def sim_score(tokens,summary_text):
    

    related_list = ['bitcoin','btc','ethereum','crypto','cryptocurrency','blockchain','digitalcoin','digital currency','digital asset','vitual currency']
    def_corpus = f"The definication is that: "+" or ".join(related_list)
    sentences = [def_corpus, summary_text]

    #Encode all sentences
    embeddings = model_sentence_sim.encode(sentences)

    #Compute cosine similarity between all pairs
    cos_sim = util.cos_sim(embeddings, embeddings)

    sim_score = float(cos_sim[0][1])
    print(tokens,sim_score)

In [21]:
ori_data = pd.read_csv('sample_cryoto_news.csv',index_col=0)

title = ori_data['title']
create_time = ori_data['created_at']

In [32]:
# =============================single tokens: debugging===============================

tokens = "Library"
context = google_search(tokens)      
summary_text = summary_context(tokens,context)

print("context:",context)
print("summary_text:",summary_text)

related_list = ['bitcoin','btc','ethereum','crypto','cryptocurrency','blockchain','digitalcoin','digital currency','digital asset','vitual currency']
def_corpus = f"The definication is that: "+" or ".join(related_list)
sentences = [def_corpus, summary_text]

#Encode all sentences
embeddings = model_sentence_sim.encode(sentences)

#Compute cosine similarity between all pairs
cos_sim = util.cos_sim(embeddings, embeddings)
sim_score = float(cos_sim[0][1])

print("sim_score:",sim_score)

context: noun
a building or room containing collections of books, periodicals, and sometimes films and recorded music for people to read, borrow, or refer to."a school library".noun

summary_text: A glossary of terms from the BBC News website.
sim_score: 0.3331325650215149


# Demo for single:

In [29]:
i = 5
each_title = ori_data['title'][i]
each_time = ori_data['created_at'][i]
labeled_title(each_title,each_time)

[33mTime:[0m 2021-12-04 12:30:05
[33mNews title: [0m Bitcoin, Ethereum face largest correction since 19 May; is it time to buy the dip
Keywords:  ['Bitcoin', 'Ethereum', 'face', 'correction', 'May', 'time', 'dip']
[33mCrypto Label: [0m [32m['Bitcoin', 'Ethereum'][0m
--------------------------------------------------


# Demo for all:

In [44]:
# before run this function, please comment the related line code.
demo_pre(3)

[33mTime:[0m 2021-12-04 12:30:05
[33mNews title: [0m $2.6 Billion Bug in Solana Program Library Disclosed: Details
Keywords:  ['Bug', 'Solana', 'Program', 'Library']
[33mCrypto Label: [0m [32m[][0m
--------------------------------------------------
[33mTime:[0m 2021-12-04 12:30:05
[33mNews title: [0m El Salvador Increases Bitcoin Holding To Over $60 Million, Tron’s Justin Sun Scoops More BTC In Solidarity
Keywords:  ['El', 'Salvador', 'Bitcoin', 'Holding', 'Tron', 'Justin', 'Sun', 'BTC', 'Solidarity']
[33mCrypto Label: [0m [32m['Bitcoin', 'BTC'][0m
--------------------------------------------------
[33mTime:[0m 2021-12-04 12:30:05
[33mNews title: [0m Shiba Inu Remains Top Holding in Whale Wallets After Market Drops by 25%
Keywords:  ['Shiba', 'Inu', 'Holding', 'Whale', 'Market', '%']
[33mCrypto Label: [0m [32m[][0m
--------------------------------------------------
