In [1]:
import json
import os
from collections import Counter
from datetime import datetime, timedelta
from io import StringIO

import nltk
import pandas as pd
import stweet as st
from gnews import GNews
from nltk.corpus import wordnet
from tqdm import tqdm
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Root directory for the artifacts this notebook writes: the news-date checkpoint,
# one tweet folder per exchange, and the final results folder.
DATA_PATH = "./data"

In [3]:
# Search topics. In make_tweets each key is appended to the exchange name and passed
# as the `all_words` query, while the key's set is joined and passed as `any_word`.
# syn_find can add WordNet lemma names to these sets in place.
TOPICS = {"withdrawal": {"frozen wallets"},
          "fraud": {"front running", "wash trading", "exit scam"}, 
          "hacker attacks": {"DDoS", "breach"}}

## Crypto custodians list

In [3]:

import requests

# Listing page; the first HTML table on it is read for its "Exchanges" column below.
url = 'https://www.cointelligence.com/exchanges_list/'

# Browser-like request headers.
# NOTE(review): presumably needed because the site rejects the default client
# User-Agent — confirm before removing.
header = {
  "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
  "X-Requested-With": "XMLHttpRequest"
}

# requests has no default timeout, so without one a stalled connection would
# block this cell forever.
r = requests.get(url, headers=header, timeout=30)
# Stop here on a 4xx/5xx answer instead of handing an error page to the HTML parser.
r.raise_for_status()

# Wrap the markup in a file-like object: passing literal HTML strings to
# read_html is deprecated in recent pandas releases.
dfs = pd.read_html(StringIO(r.text))


In [4]:
# Take the "Exchanges" column of the first parsed table as a plain Python list.
exchanges_list = dfs[0]["Exchanges"].tolist()

## News list

In [5]:
# GNews client built with its default settings; passed as `news_api` to
# news_dates_for_custodies below.
googlenews = GNews()

In [6]:
def news_dates_for_custodies(custodies, news_api):
    """Collect news titles and publication timestamps for each custodian.

    Args:
        custodies: Iterable of custodian / exchange names; each name is used
            verbatim as the news query.
        news_api: Object exposing ``get_news(query)`` that returns an iterable
            of dicts with ``"title"`` and ``"published date"`` keys.

    Returns:
        Tuple ``(dates_by_custodies, articles_by_custodies)``. Both are dicts
        keyed by custodian name. The first maps to timestamp strings formatted
        ``"%Y-%m-%d %H:%M:%S"``, the second to article titles; the two lists
        of one custodian are index-aligned.

    Side effects:
        Creates ``DATA_PATH`` if needed and writes the dates collected so far
        to ``DATA_PATH/dates_by_custodies.json`` after every tenth custodian
        and once more after the last one, so a crash in a long run does not
        lose everything.
    """
    dates_by_custodies = {}
    articles_by_custodies = {}

    # Same file name as the notebook's explicit save cell, so checkpoint and
    # final dump do not end up as two differently named copies.
    checkpoint_path = os.path.join(DATA_PATH, "dates_by_custodies.json")
    # Without this the very first checkpoint write fails on a fresh checkout.
    os.makedirs(DATA_PATH, exist_ok=True)

    def save_checkpoint():
        with open(checkpoint_path, 'w') as fp:
            json.dump(dates_by_custodies, fp)

    # Wrapping the iterable (not the enumerate object) lets tqdm see its
    # length and show a real percentage / ETA for sized inputs.
    for i, crypto in enumerate(tqdm(custodies, leave=True, position=0)):
        dates = []
        articles = []
        for news in news_api.get_news(crypto):
            articles.append(news["title"])
            # Drops the last four characters before parsing — presumably a
            # trailing " GMT" zone suffix; TODO confirm against GNews output.
            date_string = news["published date"][:-4].rstrip()
            parsed = datetime.strptime(date_string, '%a, %d %b %Y %H:%M:%S')
            dates.append(parsed.strftime("%Y-%m-%d %H:%M:%S"))
        dates_by_custodies[crypto] = dates
        articles_by_custodies[crypto] = articles

        if i % 10 == 0:
            save_checkpoint()

    # Persist the tail of the run too (up to nine custodians after the last
    # periodic checkpoint). Skipped for empty input so an existing checkpoint
    # is not overwritten with an empty dict.
    if dates_by_custodies:
        save_checkpoint()
    return dates_by_custodies, articles_by_custodies

In [8]:
# Query news for every scraped exchange. The recorded run below took over two
# hours for 341 names, so avoid re-running this cell casually.
dates_by_custodies, articles_by_custodies = news_dates_for_custodies(exchanges_list, googlenews)

341it [2:16:16, 23.98s/it]


In [89]:
# Persist the collected publication dates as JSON under DATA_PATH.
with open(os.path.join(DATA_PATH, "dates_by_custodies.json"), 'w') as fp:
    json.dump(dates_by_custodies, fp)

In [28]:
def best_dates(dates_by_custodies):
    """Pick, for each custodian, a ±10-day window around its busiest news day.

    Args:
        dates_by_custodies: Dict mapping a custodian name to a list of
            timestamp strings formatted ``"%Y-%m-%d %H:%M:%S"``.

    Returns:
        Dict mapping each custodian that has at least one timestamp to a tuple
        ``(start, end)`` of ``datetime.date`` objects: the calendar day with
        the most articles minus / plus ten days. Custodians with an empty list
        are omitted. When several days tie, the one that appears first in the
        input list wins.

    Raises:
        ValueError: If a timestamp does not match the expected format.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    margin = timedelta(days=10)

    best_dates_by_crypto = {}
    for crypto, dates in tqdm(dates_by_custodies.items()):
        if not dates:
            continue
        # Count per calendar day. Counting the raw second-resolution strings
        # makes almost every count 1, which turns the "most frequent" pick
        # into an arbitrary element of a hash-ordered set.
        day_counts = Counter(
            datetime.strptime(stamp, timestamp_format).date() for stamp in dates
        )
        # most_common keeps first-seen order among equal counts, so ties are
        # resolved deterministically.
        most_frequent_date = day_counts.most_common(1)[0][0]
        best_dates_by_crypto[crypto] = (
            most_frequent_date - margin,
            most_frequent_date + margin,
        )
    return best_dates_by_crypto

In [30]:
# Derive the ±10-day window per exchange from the fetched publication dates.
best_dates_by_crypto = best_dates(dates_by_custodies)

100%|██████████| 341/341 [00:00<00:00, 19705.40it/s]


In [32]:
# Number of exchanges that received a window; best_dates skips those with no dates.
len(best_dates_by_crypto)

331

## Synonyms

In [33]:
# Fetch the WordNet corpus that syn_find queries through nltk.corpus.wordnet.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/nsd/nltk_data...


True

In [44]:
def syn_find(topics):
    """Extend each topic's term set, in place, with WordNet lemma names.

    Args:
        topics: Dict mapping a topic string to a mutable set of search terms.
            The sets are modified in place.

    Returns:
        The same ``topics`` dict, returned for convenience so the call can be
        used in an expression (earlier callers that ignore the result are
        unaffected).
    """
    for topic, terms in topics.items():
        # WordNet joins the words of a multi-word entry with underscores, so a
        # topic written with spaces has to be converted for the lookup ...
        lookup_key = topic.replace(" ", "_")
        for syn in wordnet.synsets(lookup_key):
            # ... and lemma names are converted back so the added terms look
            # like the hand-written ones (e.g. "exit scam").
            terms.update(lm.name().replace("_", " ") for lm in syn.lemmas())
    return topics

## Tweet extraction

In [69]:
# Read API credentials from a local JSON file rather than embedding them in the notebook.
with open('creds.json') as f:
   creds = json.load(f)

In [70]:
# Unpack the individual credential fields from the loaded JSON.
# NOTE(review): none of these five names is read by any other cell visible in
# this notebook — confirm whether they are still needed.
consumer_key = creds["api_key"]
consumer_secret = creds["api_key_secret"]
access_token = creds["access_token"]
access_token_secret = creds["access_token_secret"]
bearer_token = creds["bearer_token"]

In [103]:
def try_search(fix_words, additional, store_path):
    """Run one stweet search and store the raw results as JSON-lines files.

    Args:
        fix_words: Passed to the search task as ``all_words``; also used as
            the prefix of the tweet output file name.
        additional: Passed to the search task as ``any_word``.
        store_path: Directory that receives ``{fix_words}_raw_tweets.jl``.

    Returns:
        None. The results are only written to disk.
    """
    search_tweets_task = st.SearchTweetsTask(all_words=fix_words, any_word=additional)
    tweets_file = os.path.join(store_path, f'{fix_words}_raw_tweets.jl')
    output_jl_tweets = st.JsonLineFileRawOutput(tweets_file)
    # NOTE(review): user records always go to this single file in the current
    # working directory, independent of store_path — confirm that sharing one
    # file across all searches is intended.
    output_jl_users = st.JsonLineFileRawOutput('output_raw_search_users.jl')
    st.TweetSearchRunner(search_tweets_task=search_tweets_task,
                         tweet_raw_data_outputs=[output_jl_tweets],
                         user_raw_data_outputs=[output_jl_users]).run()

In [104]:
# Smoke test: one search for "Binance" with no optional words, written to the
# current directory as ./Binance_raw_tweets.jl.
try_search("Binance", "", "./")

In [106]:
def make_tweets(custodies, themes):
    """Download tweets for every custodian / theme combination.

    For each custodian a folder ``DATA_PATH/<custodian>`` is created (if it is
    missing) and one search per theme is run through ``try_search``.

    Args:
        custodies: Iterable of custodian / exchange names.
        themes: Dict mapping a theme string to an iterable of extra terms.
            The query's required words are ``"<custodian> <theme>"``; the
            extra terms are joined with spaces and passed as optional words.

    Returns:
        None. Output files are written by ``try_search``.
    """
    for crypto in tqdm(custodies):
        crypto_path = os.path.join(DATA_PATH, crypto)
        # exist_ok avoids the check-then-create race and keeps re-runs safe.
        os.makedirs(crypto_path, exist_ok=True)
        for theme, synon in themes.items():
            main_search = crypto + " " + theme
            # Sets iterate in hash order, which changes between interpreter
            # runs; sorting makes the generated query reproducible.
            possible_words = " ".join(sorted(synon))
            try_search(main_search, possible_words, crypto_path)

In [110]:
# Run every exchange x topic search. This iterates all keys of
# dates_by_custodies, including exchanges whose date list is empty.
# NOTE(review): the windows in best_dates_by_crypto are not passed to the
# search, so tweets are not restricted to those dates — confirm this is intended.
make_tweets(list(dates_by_custodies.keys()), TOPICS)

100%|██████████| 341/341 [47:02<00:00,  8.28s/it] 


## Sentiment analysis

In [4]:
# Load a transformers pipeline for the named sentiment model. sent_cleaning
# below reads the 'label' and 'score' fields of its first result and treats
# the label 'NEG' as negative.
specific_model = pipeline(model="finiteautomata/bertweet-base-sentiment-analysis")

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


In [5]:
# Load the smoke-test download: one JSON document per line of the .jl file.
with open("./Binance_raw_tweets.jl") as f:
    data = [json.loads(line) for line in f]

In [6]:
# Peek at the text of one loaded tweet to see where the message lives in a record.
data[50]['raw_value']['full_text']

'Stop playing guys😂 \n\n#Binance #BTC #v501s #asian https://t.co/TmwOi9OqZx'

In [7]:
# Paths of the directories directly under DATA_PATH.
# NOTE(review): this variable is not read by later visible cells;
# sent_cleaning builds its own list from its `path` argument.
subfolders = [f.path for f in os.scandir(DATA_PATH) if f.is_dir()]

In [8]:
# Sanity check: the first result for an obviously negative input, showing the
# 'label' / 'score' shape that sent_cleaning relies on.
specific_model("Fuck")[0]

{'label': 'NEG', 'score': 0.9488999843597412}

In [9]:
# Display the topic dictionary; sent_cleaning buckets tweets by topic words in file names.
TOPICS

{'withdrawal': {'frozen wallets'},
 'fraud': {'exit scam', 'front running', 'wash trading'},
 'hacker attacks': {'DDoS', 'breach'}}

In [15]:
def sent_cleaning(model, path, threshold):
    """Collect tweets that the model labels negative with a score above ``threshold``.

    Every immediate subfolder of ``path`` is scanned; each ``.jl`` file in it
    is read as JSON lines and the text under ``raw_value -> full_text`` of each
    record is classified.

    Args:
        model: Callable where ``model(text)[0]`` is a dict with a ``'label'``
            and a ``'score'`` entry.
        path: Directory whose subfolders hold the ``.jl`` tweet files.
        threshold: A tweet is kept only if its label is ``'NEG'`` and its
            score is strictly greater than this value.

    Returns:
        Tuple ``(all_negative_tweets, withdrawal, fraud, hackers_attack)`` of
        lists of tweet texts. The last three are a partition of the first,
        chosen by the file name: it contains ``'withdrawal'``, else it
        contains ``'fraud'``, else the tweet goes to ``hackers_attack``.

    Side effects:
        Overwrites ``all_neg_tweets.txt`` in the current working directory
        with every kept tweet followed by a newline.
    """
    all_negative_tweets = []
    withdrawal = []
    fraud = []
    hackers_attack = []
    subfolders = [entry.path for entry in os.scandir(path) if entry.is_dir()]

    # `with` guarantees the log is closed even if a file fails to parse;
    # explicit UTF-8 because tweets contain emoji that a platform-default
    # encoding may not be able to write.
    with open('all_neg_tweets.txt', 'w', encoding='utf-8') as tweets_logs:
        for folder in tqdm(subfolders):
            for file in os.listdir(folder):
                # Only tweet dumps are JSON lines; other files under `path`
                # (e.g. a results folder with .txt output) must not be parsed.
                if not file.endswith('.jl'):
                    continue
                with open(os.path.join(folder, file)) as f:
                    # Blank lines are not valid JSON documents; skip them.
                    data = [json.loads(line) for line in f if line.strip()]

                for tweet in data:
                    text = tweet['raw_value']['full_text']
                    try:
                        res = model(text)[0]
                    except Exception:
                        # Best effort: a tweet the model cannot score is
                        # skipped. Falling through here would reuse the
                        # previous tweet's verdict (or hit an undefined name
                        # on the very first tweet).
                        continue
                    if res['label'] != 'NEG' or not res['score'] > threshold:
                        continue

                    all_negative_tweets.append(text)
                    tweets_logs.write(text + '\n')
                    if 'withdrawal' in file:
                        withdrawal.append(text)
                    elif 'fraud' in file:
                        fraud.append(text)
                    else:
                        hackers_attack.append(text)
    return (all_negative_tweets, withdrawal, fraud, hackers_attack)
                    

In [16]:
# Filter all downloaded tweets, keeping negatives with a score above 0.8.
results = sent_cleaning(specific_model, DATA_PATH, 0.8)

100%|██████████| 341/341 [10:50<00:00,  1.91s/it]


In [17]:
# Number of negative tweets kept (first element of the returned tuple).
len(results[0])

5391

In [12]:
# NOTE(review): this cell was executed as In [12], before the In [16] run with
# threshold 0.8, so at that time `results` came from an earlier sent_cleaning
# call (presumably threshold 0.6 — confirm). On a top-to-bottom re-run this
# name would alias the 0.8 results instead.
all_negative_tweets_60 = results[0]

In [19]:
# Every negative tweet text from the threshold-0.8 run (first element of `results`).
all_negative_tweets_80 = results[0]

In [18]:
# makedirs with exist_ok keeps this cell re-runnable (mkdir raises
# FileExistsError the second time) and also creates DATA_PATH if it is missing.
# NOTE(review): this folder lives under DATA_PATH, which sent_cleaning scans
# for tweet subfolders — confirm that placing results there is acceptable.
os.makedirs(os.path.join(DATA_PATH, 'results'), exist_ok=True)

In [22]:
# Write the kept tweets, each followed by a newline. Explicit UTF-8 because
# tweets contain emoji that a platform-default encoding may fail to encode.
# Tweets can themselves contain line breaks, so one tweet may span several lines.
with open(os.path.join(DATA_PATH, 'results', 'negative_tweets.txt'), 'w', encoding='utf-8') as f:
    for tweet in all_negative_tweets_80:
        f.write(f"{tweet}\n")