In [21]:
import pathlib
import emoji
import numpy as np
import pandas as pd
# Show long text columns in full when displaying DataFrames (see max_colwidth below)
from utils.datasets import load_helper_file
pd.options.display.max_colwidth=500
import nltk
from nltk.corpus import stopwords
from collections import Counter

In [2]:
# Fetch the NLTK English stop-word corpus, then load it as a list of words.
nltk.download('stopwords')
all_stopwords = stopwords.words('english')

# Build the set of every emoji character known to the `emoji` package.
# emoji 1.x exposes UNICODE_EMOJI as {language: {emoji: name}}; emoji >= 2.0
# removed that attribute in favour of EMOJI_DATA, which is keyed directly by
# the emoji character. Prefer the old attribute when present so results are
# unchanged on environments where this notebook already ran.
if hasattr(emoji, 'UNICODE_EMOJI'):
    emoji_dict = {e for lang_table in emoji.UNICODE_EMOJI.values() for e in lang_table}
else:
    emoji_dict = set(emoji.EMOJI_DATA)

# Tokens of the uncased BERT vocabulary, as a set for fast membership tests.
# NOTE(review): presumably load_helper_file returns an iterable of token
# strings -- confirm against utils.datasets.
bert_uncased_vocabulary = set(load_helper_file('helper_bert_uncased_vocabulary'))

[nltk_data] Downloading package stopwords to /home/egordm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Count how often each whitespace-separated token occurs in the 'text' column
# of every processed chunk, skipping English stop words.
counter = nltk.FreqDist()

# The stop-word check runs once per token; a set makes each lookup constant
# time instead of a linear scan over the stop-word list.
stopword_set = set(all_stopwords)

# Sort the chunk paths so the processing order (and therefore the chunk
# numbering and the tie order in most_common) is reproducible between runs.
files = sorted(pathlib.Path("../../data/bitcoin_twitter_processed/").glob("part_*.parquet"))
for chunk, file in enumerate(files):
    print(f'Processing chunk: {chunk}')
    data = pd.read_parquet(file)
    # Iterate the column directly: iterrows() builds a Series per row, which
    # is needlessly slow when only one field is read.
    for text in data['text']:
        # Null / non-string entries have no tokens to count.
        if not isinstance(text, str):
            continue
        counter.update(w for w in text.split() if w not in stopword_set)

Processing chunk: 0
Processing chunk: 1
Processing chunk: 2
Processing chunk: 3
Processing chunk: 4
Processing chunk: 5
Processing chunk: 6
Processing chunk: 7
Processing chunk: 8
Processing chunk: 9
Processing chunk: 10
Processing chunk: 11
Processing chunk: 12
Processing chunk: 13
Processing chunk: 14
Processing chunk: 15
Processing chunk: 16
Processing chunk: 17
Processing chunk: 18
Processing chunk: 19
Processing chunk: 20
Processing chunk: 21
Processing chunk: 22
Processing chunk: 23
Processing chunk: 24
Processing chunk: 25
Processing chunk: 26
Processing chunk: 27
Processing chunk: 28


In [6]:
# Report how many distinct tokens were seen, then show the 50 most frequent.
print('#words', len(counter))
print('Most common overall words')
counter.most_common(50)

#words 2156984


[('@CURR[bitcoin]', 8964451),
 ('.', 5779768),
 ('@CURR[cryptocurrency]', 3370100),
 ('@CURR[btc]', 2990085),
 (':', 2871535),
 (',', 2841101),
 ('-', 2709122),
 ('dollar', 2432028),
 ('!', 1921510),
 ('@CURR[eth]', 1474647),
 ('(', 1438136),
 (')', 1289784),
 ('@CURR[blockchain]', 1025807),
 ('percent', 1011898),
 ('/', 869664),
 ('@CURR[ethereum]', 820267),
 ('@HTAG[cryptocurrency]', 795400),
 ('price', 719586),
 ('?', 704271),
 ('@CURR[xrp]', 689087),
 ('@HTAG[blockchain]', 634036),
 ('"', 620573),
 ('|', 598093),
 ('buy', 550162),
 ('#', 492701),
 ('@CURR[ltc]', 482159),
 ('@CURR[crypto]', 468896),
 ('get', 436244),
 ('@CURR[binance]', 416885),
 ('market', 413562),
 ('free', 411801),
 ('@CURR[altcoins]', 386451),
 ('new', 374121),
 ('[', 367414),
 ('1', 366343),
 (']', 365886),
 ('@HTAG[cryptocurrencies]', 362066),
 ('@CURR[ico]', 357804),
 ('sell', 352504),
 ("'", 328919),
 ('$', 307309),
 ('💰', 291540),
 ('via', 288346),
 ('exchange', 284597),
 ('@CURR[bch]', 284131),
 ('@', 2825

In [29]:
# Work on a copy of the raw counts and strip out symbol tokens and every token
# that the BERT uncased vocabulary already contains.
print('Most common overall excluding symbols and existing vocab')
cleaned_counter = counter.copy()

# Single-character punctuation and digit tokens to discard.
SYMBOLS = '{}()[].,:;?#!+-*/&|<>@%"\'=~$1234567890'
for symbol in SYMBOLS:
    cleaned_counter.pop(symbol, None)

# Tokens that occur in the corpus and are also part of the BERT vocabulary
# (computed after the symbol tokens above have been removed).
used_words = {token for token in cleaned_counter if token in bert_uncased_vocabulary}
for token in used_words:
    cleaned_counter.pop(token)

cleaned_counter.most_common(50)

Most common overall excluding symbols and existing vocab


[('@CURR[bitcoin]', 8964451),
 ('@CURR[cryptocurrency]', 3370100),
 ('@CURR[btc]', 2990085),
 ('@CURR[eth]', 1474647),
 ('@CURR[blockchain]', 1025807),
 ('@CURR[ethereum]', 820267),
 ('@HTAG[cryptocurrency]', 795400),
 ('@CURR[xrp]', 689087),
 ('@HTAG[blockchain]', 634036),
 ('@CURR[ltc]', 482159),
 ('@CURR[crypto]', 468896),
 ('@CURR[binance]', 416885),
 ('@CURR[altcoins]', 386451),
 ('@HTAG[cryptocurrencies]', 362066),
 ('@CURR[ico]', 357804),
 ('💰', 291540),
 ('@CURR[bch]', 284131),
 ('@CURR[ripple]', 273187),
 ('@CURR[trx]', 255310),
 ('@CURR[airdrop]', 247287),
 ('@CURR[litecoin]', 241693),
 ('blockchain', 228694),
 ('@HTAG[fintech]', 221343),
 ('@CURR[eos]', 211575),
 ('@URL[twitter.com]', 208157),
 ('@CURR[money]', 194425),
 ('🚀', 193953),
 ('@NUM[1.0]', 192699),
 ('@CURR[trading]', 192043),
 ('@HTAG[ethereum]', 179781),
 ('@CURR[altcoin]', 178858),
 ('@CURR[ada]', 168048),
 ('@HTAG[jobs]', 166556),
 ('@HTAG[hiring]', 164815),
 ('@HTAG[careers]', 163528),
 ('@CURR[btcusd]', 1632

In [30]:
# Summarise vocabulary coverage: total BERT tokens, how many of them occur in
# the corpus, and how many corpus tokens are not in the vocabulary.
print(
    f'Vocab size: {len(bert_uncased_vocabulary)}, '
    f'Vocab used: {len(used_words)}, '
    f'Words unknown: {len(cleaned_counter)}'
)

Vocab size: 30522, Vocab used: 22324, Words unknown: 2134622


In [31]:
# Keep only plain words: drop '@'-prefixed tokens and emoji.
print('Most common words')
word_counter = cleaned_counter.copy()
non_words = [
    token for token in word_counter
    if token.startswith('@') or token in emoji_dict
]
for token in non_words:
    word_counter.pop(token)
word_counter.most_common(50)

Most common words


[('blockchain', 228694),
 ('inr', 157558),
 ('mins', 75892),
 ('arb', 74187),
 ('eur', 53393),
 ('retweet', 52923),
 ('ico', 52531),
 ('bullish', 51783),
 ('buysellbitco', 51064),
 ('satoshi', 50897),
 ('gmt', 47253),
 ('usdt', 37539),
 ('decentralized', 35942),
 ('bittrex', 34871),
 ('ltc', 33233),
 ('referral', 32906),
 ('tweet', 32113),
 ('faucet', 29257),
 ('libra', 28533),
 ('currencies', 28141),
 ('bitmex', 27112),
 ('gbp', 26316),
 ('avg', 26181),
 ('cryptsy', 25640),
 ('giveaway', 25353),
 ('gh', 25006),
 ('antminer', 24731),
 ('bitfinex', 23452),
 ('bearish', 23368),
 ('dm', 23202),
 ('bitmain', 21390),
 ('scam', 20687),
 ('bpi', 20509),
 ('trending', 20499),
 ('paypal', 19915),
 ('forex', 19742),
 ('coindesk', 19079),
 ('xem', 18753),
 ('lsk', 18055),
 ('bitcome', 17542),
 ('mercadobitcoin', 17125),
 ('mxn', 17035),
 ('halving', 16863),
 ('tron', 16828),
 ('eos', 16789),
 ('alts', 16602),
 ('cryptos', 16439),
 ('volatility', 16379),
 ('etf', 16351),
 ('rsi', 15822)]

In [32]:
# Keep only the tokens that are emoji characters.
print('Most common emojis')
emoji_counter = cleaned_counter.copy()
non_emoji = [token for token in emoji_counter if token not in emoji_dict]
for token in non_emoji:
    emoji_counter.pop(token)
emoji_counter.most_common(50)

Most common emojis


[('💰', 291540),
 ('🚀', 193953),
 ('😵', 150367),
 ('📋', 113537),
 ('🔥', 111688),
 ('✅', 97939),
 ('➡', 78104),
 ('📈', 65593),
 ('👉', 61971),
 ('⏰', 59210),
 ('🎁', 55331),
 ('▶', 49518),
 ('👇', 48864),
 ('☠', 45996),
 ('⬇', 34136),
 ('💶', 33369),
 ('😁', 33124),
 ('😂', 32210),
 ('❤', 32052),
 ('⚡', 31330),
 ('🤑', 31148),
 ('💥', 31129),
 ('🔄', 29995),
 ('📢', 29865),
 ('📉', 29611),
 ('⭐', 28476),
 ('👍', 25980),
 ('😎', 25296),
 ('✔', 24233),
 ('💯', 22646),
 ('🗺', 21458),
 ('💸', 19999),
 ('🏆', 19935),
 ('😍', 19311),
 ('💵', 18615),
 ('🚨', 18377),
 ('🤔', 18235),
 ('👀', 16373),
 ('🏻', 15186),
 ('😉', 14460),
 ('🤣', 14365),
 ('🔹', 13806),
 ('💪', 13156),
 ('🏼', 12919),
 ('🙏', 12870),
 ('🎉', 12331),
 ('💎', 10971),
 ('🍀', 10854),
 ('♂', 10668),
 ('👌', 9598)]

In [25]:
# Keep only tokens carrying the '@CURR' prefix.
print('Most common currencies')
currency_counter = cleaned_counter.copy()
CURRENCY_PREFIX = '@CURR'
for token in tuple(currency_counter):
    if token.startswith(CURRENCY_PREFIX):
        continue
    currency_counter.pop(token)
currency_counter.most_common(50)

Most common currencies


[('@CURR[bitcoin]', 8964451),
 ('@CURR[cryptocurrency]', 3370100),
 ('@CURR[btc]', 2990085),
 ('@CURR[eth]', 1474647),
 ('@CURR[blockchain]', 1025807),
 ('@CURR[ethereum]', 820267),
 ('@CURR[xrp]', 689087),
 ('@CURR[ltc]', 482159),
 ('@CURR[crypto]', 468896),
 ('@CURR[binance]', 416885),
 ('@CURR[altcoins]', 386451),
 ('@CURR[ico]', 357804),
 ('@CURR[bch]', 284131),
 ('@CURR[ripple]', 273187),
 ('@CURR[trx]', 255310),
 ('@CURR[airdrop]', 247287),
 ('@CURR[litecoin]', 241693),
 ('@CURR[eos]', 211575),
 ('@CURR[money]', 194425),
 ('@CURR[trading]', 192043),
 ('@CURR[altcoin]', 178858),
 ('@CURR[ada]', 168048),
 ('@CURR[btcusd]', 163270),
 ('@CURR[xlm]', 162536),
 ('@CURR[news]', 152982),
 ('@CURR[bnb]', 151401),
 ('@CURR[neo]', 137556),
 ('@CURR[dash]', 135840),
 ('@CURR[etc]', 115325),
 ('@CURR[doge]', 103940),
 ('@CURR[usd]', 103767),
 ('@CURR[ether]', 102099),
 ('@CURR[tron]', 101576),
 ('@CURR[forex]', 100520),
 ('@CURR[hodl]', 97000),
 ('@CURR[cryptocurrencies]', 95473),
 ('@CURR[xm

In [26]:
# Keep only tokens carrying the '@HTAG' prefix.
print('Most common hashtags')
htag_counter = cleaned_counter.copy()
not_hashtags = [token for token in htag_counter if not token.startswith('@HTAG')]
for token in not_hashtags:
    htag_counter.pop(token)
htag_counter.most_common(50)

Most common hashtags


[('@HTAG[cryptocurrency]', 795400),
 ('@HTAG[blockchain]', 634036),
 ('@HTAG[cryptocurrencies]', 362066),
 ('@HTAG[fintech]', 221343),
 ('@HTAG[ethereum]', 179781),
 ('@HTAG[jobs]', 166556),
 ('@HTAG[hiring]', 164815),
 ('@HTAG[careers]', 163528),
 ('@HTAG[cryptotrading]', 157370),
 ('@HTAG[cryptonews]', 150929),
 ('@HTAG[trading]', 137519),
 ('@HTAG[tokensale]', 135327),
 ('@HTAG[airdrop]', 125440),
 ('@HTAG[altcoins]', 125185),
 ('@HTAG[investing]', 108102),
 ('@HTAG[business]', 106736),
 ('@HTAG[investment]', 93954),
 ('@HTAG[airdrops]', 90604),
 ('@HTAG[bitcoinnews]', 90057),
 ('@HTAG[exchange]', 85979),
 ('@HTAG[finance]', 76330),
 ('@HTAG[litecoin]', 74637),
 ('@HTAG[bitfinex]', 74494),
 ('@HTAG[startup]', 69593),
 ('@HTAG[bitcoins]', 67754),
 ('@HTAG[india]', 65872),
 ('@HTAG[newyork]', 65290),
 ('@HTAG[bitcoincash]', 64087),
 ('@HTAG[tech]', 63090),
 ('@HTAG[altcoin]', 62011),
 ('@HTAG[bittrex]', 59779),
 ('@HTAG[giveaway]', 59011),
 ('@HTAG[ai]', 58821),
 ('@HTAG[bitcoinmining

In [27]:
print('Most common users')
usr_counter = cleaned_counter.copy()
for k in list(usr_counter.keys()):
    if not k.startswith('@USR'):
        usr_counter.pop(k)
usr_counter.most_common(50)

Most common users


[('@USR[coinbase]', 124558),
 ('@USR[bitstamp]', 110260),
 ('@USR[binance]', 34466),
 ('@USR[dapp_com]', 27044),
 ('@USR[bot_strategy]', 25365),
 ('@USR[coindesk]', 23796),
 ('@USR[apompliano]', 20724),
 ('@USR[youtube]', 17569),
 ('@USR[cointelegraph]', 16216),
 ('@USR[cz_binance]', 14954),
 ('@USR[quark_chain]', 14410),
 ('@USR[bitpay]', 10987),
 ('@USR[realdonaldtrump]', 9785),
 ('@USR[listia!]', 9278),
 ('@USR[tradesatoshi]', 9274),
 ('@USR[btctn]', 8339),
 ('@USR[pynk_io]', 8051),
 ('@USR[xcardbymobilum]', 8028),
 ('@USR[cryptoadventura]', 7820),
 ('@USR[peterschiff]', 7336),
 ('@USR[justinsuntron]', 7165),
 ('@USR[forbes]', 7060),
 ('@USR[officialmcafee]', 6272),
 ('@USR[thecollectivego]', 5991),
 ('@USR[bizpaye]', 5947),
 ('@USR[digitexfutures]', 5889),
 ('@USR[murthaburke]', 5445),
 ('@USR[aantonop]', 4994),
 ('@USR[hepays]', 4924),
 ('@USR[p2psf]', 4732),
 ('@USR[securypto]', 4559),
 ('@USR[pngmemobile]', 4516),
 ('@USR[bitcoinmagazine]', 4514),
 ('@USR[whitebit6]', 4473),
 ('

In [28]:
# Keep only tokens carrying the '@NUM' prefix.
print('Most common numbers')
num_counter = cleaned_counter.copy()
not_numbers = [token for token in num_counter if not token.startswith('@NUM')]
for token in not_numbers:
    num_counter.pop(token)
num_counter.most_common(50)

Most common numbers


[('@NUM[1.0]', 192699),
 ('@NUM[0.0]', 138908),
 ('@NUM[24.0]', 116065),
 ('@NUM[100.0]', 79355),
 ('@NUM[10.0]', 75777),
 ('@NUM[5.0]', 64251),
 ('@NUM[10000.0]', 57678),
 ('@NUM[100000.0]', 52738),
 ('@NUM[3.0]', 50523),
 ('@NUM[2.0]', 42143),
 ('@NUM[7.0]', 41165),
 ('@NUM[50.0]', 40642),
 ('@NUM[1000.0]', 39933),
 ('@NUM[4.0]', 37864),
 ('@NUM[1000000.0]', 34611),
 ('@NUM[20.0]', 34568),
 ('@NUM[30.0]', 32560),
 ('@NUM[6.0]', 27586),
 ('@NUM[2.48]', 25851),
 ('@NUM[15.0]', 25683),
 ('@NUM[200.0]', 23445),
 ('@NUM[0.01]', 21967),
 ('@NUM[25.0]', 20242),
 ('@NUM[50000.0]', 20182),
 ('@NUM[8000.0]', 19569),
 ('@NUM[0.15]', 19175),
 ('@NUM[500.0]', 19172),
 ('@NUM[20000.0]', 19122),
 ('@NUM[0.05]', 18921),
 ('@NUM[2000.0]', 18446),
 ('@NUM[5000.0]', 17973),
 ('@NUM[0.2]', 17796),
 ('@NUM[8.0]', 17620),
 ('@NUM[300.0]', 15888),
 ('@NUM[400.0]', 15286),
 ('@NUM[9000.0]', 14737),
 ('@NUM[40.0]', 13718),
 ('@NUM[0.25]', 13238),
 ('@NUM[420.0]', 12656),
 ('@NUM[12.0]', 12240),
 ('@NUM[9.0]'