In [1]:
import pathlib
import emoji
import numpy as np
import pandas as pd
# Widen pandas' column display (max_colwidth, set below) so complete row contents are shown
from utils.datasets import load_helper_file
pd.options.display.max_colwidth=500
import nltk
from nltk.corpus import stopwords
from collections import Counter

In [2]:
# Fetch the NLTK stopword corpus (a no-op when it is already up to date locally).
nltk.download('stopwords')
all_stopwords = stopwords.words('english')
# Set of every emoji string known to the `emoji` package.
# Older releases expose `UNICODE_EMOJI` as {language: {emoji: name}}; that table
# was removed in emoji 2.0, where the keys of `EMOJI_DATA` are the emoji strings.
# Fall back to those so the cell does not raise AttributeError on newer installs.
if hasattr(emoji, 'UNICODE_EMOJI'):
    emoji_dict = set(e for lang in emoji.UNICODE_EMOJI.values() for e in lang)
else:
    emoji_dict = set(emoji.EMOJI_DATA)
# Presumably the tokens of the BERT uncased vocabulary; the loading is done by
# the project helper -- confirm its return format against utils.datasets.
bert_uncased_vocabulary = set(load_helper_file('helper_bert_uncased_vocabulary'))

[nltk_data] Downloading package stopwords to /home/egordm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Count every whitespace-separated token that is not an English stopword,
# accumulated over all processed parquet chunks.
counter = nltk.FreqDist()
# Membership in a set is constant time; testing against the stopword list
# re-scanned the whole list for every single token.
stopword_set = set(all_stopwords)
# glob() yields paths in arbitrary filesystem order; sorting makes the chunk
# numbers (and the tie order in most_common) the same on every run.
files = sorted(pathlib.Path("../../data/bitcoin_twitter_processed/").glob("part_*.parquet"))
for chunk, file in enumerate(files):
    print(f'Processing chunk: {chunk}')
    data = pd.read_parquet(file)
    # Iterate the text column directly rather than iterrows(), which builds a
    # Series per row. dropna() skips missing texts that would fail on .split().
    for text in data['text'].dropna():
        counter.update(w for w in text.split() if w not in stopword_set)

Processing chunk: 0


In [4]:
# Report how many distinct tokens were counted, then show the 50 most frequent.
distinct_tokens = len(counter)
print(f'#words {distinct_tokens}')
print('Most common overall words')
counter.most_common(50)

#words 161643
Most common overall words


[('@CURR[bitcoin]', 362464),
 ('.', 297816),
 ('@HTAG[cryptocurrency]', 198832),
 ('@CURR[ethereum]', 129427),
 (',', 118596),
 ('!', 102430),
 (':', 71365),
 ('usd', 64277),
 ('-', 64120),
 ('@HTAG[altcoins]', 62375),
 ('@CURR[xrp]', 41815),
 ('?', 41640),
 ('percent', 40668),
 ('@HTAG[blockchain]', 36764),
 ('(', 31686),
 (')', 29458),
 ('@CURR[binance_coin]', 28654),
 ('🚀', 26996),
 ('@HTAG[defi]', 26770),
 ('@CURR[litecoin]', 26182),
 ('@USR[binance]', 25761),
 ('"', 24251),
 ('@CURR[chainlink]', 22751),
 ('get', 19903),
 ('@CURR[dogecoin]', 19713),
 ('/', 18594),
 ('price', 18423),
 ('@CURR[cardano]', 17280),
 ('buy', 17241),
 ('new', 15637),
 ('time', 15417),
 ('market', 14154),
 ('@HTAG[airdrop]', 13887),
 ('like', 13675),
 ('@CURR[tron]', 13459),
 ('$', 13213),
 ('us', 12846),
 ('🔥', 12790),
 ('1', 12783),
 ('@CURR[link]', 12692),
 ('@CURR[bitcoin_cash]', 12561),
 ('@HTAG[trading]', 12137),
 ('@HTAG[stellar]', 12108),
 ('one', 11286),
 ('free', 10547),
 ('@CURR[tether]', 10516)

In [5]:
print('Most common overall excluding symbols and existing vocab')
cleaned_counter = counter.copy()
SYMBOLS = '{}()[].,:;?#!+-*/&|<>@%"\'=~$1234567890'
# Remove tokens that are a single punctuation mark or a single digit.
for symbol in SYMBOLS:
    cleaned_counter.pop(symbol, None)

# Remember which vocabulary entries actually occur before discarding them,
# so that only tokens missing from the vocabulary remain in cleaned_counter.
used_words = {token for token in cleaned_counter if token in bert_uncased_vocabulary}
for token in used_words:
    cleaned_counter.pop(token)
cleaned_counter.most_common(50)

Most common overall excluding symbols and existing vocab


[('@CURR[bitcoin]', 362464),
 ('@HTAG[cryptocurrency]', 198832),
 ('@CURR[ethereum]', 129427),
 ('@HTAG[altcoins]', 62375),
 ('@CURR[xrp]', 41815),
 ('@HTAG[blockchain]', 36764),
 ('@CURR[binance_coin]', 28654),
 ('🚀', 26996),
 ('@HTAG[defi]', 26770),
 ('@CURR[litecoin]', 26182),
 ('@USR[binance]', 25761),
 ('@CURR[chainlink]', 22751),
 ('@CURR[dogecoin]', 19713),
 ('@CURR[cardano]', 17280),
 ('@HTAG[airdrop]', 13887),
 ('@CURR[tron]', 13459),
 ('🔥', 12790),
 ('@CURR[link]', 12692),
 ('@CURR[bitcoin_cash]', 12561),
 ('@HTAG[trading]', 12137),
 ('@HTAG[stellar]', 12108),
 ('@CURR[tether]', 10516),
 ('@HTAG[polkadot_new]', 10267),
 ('@CURR[eos]', 8300),
 ('@CURR[uniswap]', 8236),
 ('💰', 8176),
 ('@HTAG[cryptonews]', 7928),
 ('@HTAG[altseason]', 7617),
 ('@HTAG[money]', 7530),
 ('@NUM[1.0]', 7413),
 ('@CURR[tezos]', 6502),
 ('@USR[coinbase]', 6320),
 ('🎁', 6162),
 ('@HTAG[hodl]', 6072),
 ('@HTAG[ico]', 6065),
 ('👇', 5669),
 ('@HTAG[investing]', 5465),
 ('@HTAG[yearn_finance]', 5464),
 ('@

In [6]:
# Summarise coverage: vocabulary size, how many vocabulary entries occur in
# the counted tokens, and how many distinct tokens are outside the vocabulary.
vocab_size = len(bert_uncased_vocabulary)
vocab_used = len(used_words)
words_unknown = len(cleaned_counter)
print(f'Vocab size: {vocab_size}, Vocab used: {vocab_used}, Words unknown: {words_unknown}')

Vocab size: 30522, Vocab used: 17555, Words unknown: 144053


In [7]:
print('Most common words')
# Start from the out-of-vocabulary counts and discard everything that is an
# '@'-prefixed token or an emoji, leaving only plain words.
word_counter = cleaned_counter.copy()
non_words = [token for token in word_counter
             if token.startswith('@') or token in emoji_dict]
for token in non_words:
    word_counter.pop(token)
word_counter.most_common(50)

Most common words


[('bullish', 5340),
 ('referral', 4145),
 ('defi', 3666),
 ('ath', 3444),
 ('retweet', 2300),
 ('alts', 1934),
 ('decentralized', 1760),
 ('sats', 1705),
 ('staking', 1691),
 ('tweet', 1504),
 ('multiply', 1446),
 ('giveaway', 1348),
 ('liquidity', 1331),
 ('bearish', 1308),
 ('satoshi', 1216),
 ('dm', 1196),
 ('lol', 1153),
 ('discord', 1067),
 ('username', 1033),
 ('dips', 970),
 ('currencies', 962),
 ('dont', 939),
 ('scam', 932),
 ('volatility', 885),
 ('fomo', 875),
 ('erc20', 862),
 ('etf', 809),
 ('imo', 800),
 ('faucet', 791),
 ('retest', 770),
 ('elon', 768),
 ('subscribe', 762),
 ('cryptos', 753),
 ('ftx', 743),
 ('tokenized', 732),
 ('markaccy', 731),
 ('nft', 706),
 ('rsi', 643),
 ('martkist', 618),
 ('pullback', 595),
 ('microstrategy', 588),
 ('dividends', 585),
 ('halving', 576),
 ('ico', 573),
 ('spdr', 569),
 ('dyor', 561),
 ('parabolic', 555),
 ('grayscale', 550),
 ('gmt', 550),
 ('cme', 545)]

In [8]:
print('Most common emojis')
# Start from the out-of-vocabulary counts and keep only tokens found in emoji_dict.
emoji_counter = cleaned_counter.copy()
non_emojis = [token for token in emoji_counter if token not in emoji_dict]
for token in non_emojis:
    emoji_counter.pop(token)
emoji_counter.most_common(50)

Most common emojis


[('🚀', 26996),
 ('🔥', 12790),
 ('💰', 8176),
 ('🎁', 6162),
 ('👇', 5669),
 ('📈', 5014),
 ('✅', 4494),
 ('🤑', 3516),
 ('💥', 3509),
 ('😂', 3120),
 ('😵', 3026),
 ('👉', 2969),
 ('👀', 2958),
 ('💸', 2612),
 ('💎', 2443),
 ('😎', 2362),
 ('🚨', 2267),
 ('🏆', 2236),
 ('😁', 1908),
 ('❤', 1843),
 ('🏻', 1800),
 ('🤔', 1788),
 ('🔸', 1741),
 ('⚡', 1685),
 ('🤣', 1674),
 ('😍', 1662),
 ('🙏', 1644),
 ('👥', 1636),
 ('💯', 1623),
 ('💪', 1523),
 ('💵', 1462),
 ('📦', 1390),
 ('⬇', 1388),
 ('▶', 1338),
 ('👍', 1323),
 ('😉', 1223),
 ('📉', 1189),
 ('🎉', 1188),
 ('🥳', 1176),
 ('➡', 1173),
 ('⭐', 1141),
 ('🌐', 1120),
 ('♂', 1088),
 ('☑', 1051),
 ('🏼', 996),
 ('🤩', 870),
 ('👏', 866),
 ('🔹', 827),
 ('⏰', 804),
 ('👌', 802)]

In [9]:
print('Most common currencies')
# Start from the out-of-vocabulary counts and keep only tokens beginning with '@CURR'.
currency_counter = cleaned_counter.copy()
non_currencies = [token for token in currency_counter if not token.startswith('@CURR')]
for token in non_currencies:
    currency_counter.pop(token)
currency_counter.most_common(50)

Most common currencies


[('@CURR[bitcoin]', 362464),
 ('@CURR[ethereum]', 129427),
 ('@CURR[xrp]', 41815),
 ('@CURR[binance_coin]', 28654),
 ('@CURR[litecoin]', 26182),
 ('@CURR[chainlink]', 22751),
 ('@CURR[dogecoin]', 19713),
 ('@CURR[cardano]', 17280),
 ('@CURR[tron]', 13459),
 ('@CURR[link]', 12692),
 ('@CURR[bitcoin_cash]', 12561),
 ('@CURR[tether]', 10516),
 ('@CURR[eos]', 8300),
 ('@CURR[uniswap]', 8236),
 ('@CURR[tezos]', 6502),
 ('@CURR[vechain]', 5219),
 ('@CURR[monero]', 5127),
 ('@CURR[neo]', 4918),
 ('@CURR[dash]', 4767),
 ('@CURR[zilliqa]', 4032),
 ('@CURR[bitcoin_sv]', 3523),
 ('@CURR[algorand]', 3340),
 ('@CURR[iota]', 3107),
 ('@CURR[zcash]', 3106),
 ('@CURR[digibyte]', 2988),
 ('@CURR[reserve_rights]', 2928),
 ('@CURR[aave]', 2661),
 ('@CURR[usd]', 2641),
 ('@CURR[sushiswap]', 2598),
 ('@CURR[ocean_protocol]', 2510),
 ('@CURR[omg]', 2259),
 ('@CURR[elrond_egld]', 2055),
 ('@CURR[polygon]', 2009),
 ('@CURR[quant]', 1852),
 ('@CURR[nem]', 1743),
 ('@CURR[ethereum_classic]', 1684),
 ('@CURR[kus

In [10]:
print('Most common hashtags')
# Start from the out-of-vocabulary counts and keep only tokens beginning with '@HTAG'.
htag_counter = cleaned_counter.copy()
non_hashtags = [token for token in htag_counter if not token.startswith('@HTAG')]
for token in non_hashtags:
    htag_counter.pop(token)
htag_counter.most_common(50)

Most common hashtags


[('@HTAG[cryptocurrency]', 198832),
 ('@HTAG[altcoins]', 62375),
 ('@HTAG[blockchain]', 36764),
 ('@HTAG[defi]', 26770),
 ('@HTAG[airdrop]', 13887),
 ('@HTAG[trading]', 12137),
 ('@HTAG[stellar]', 12108),
 ('@HTAG[polkadot_new]', 10267),
 ('@HTAG[cryptonews]', 7928),
 ('@HTAG[altseason]', 7617),
 ('@HTAG[money]', 7530),
 ('@HTAG[hodl]', 6072),
 ('@HTAG[ico]', 6065),
 ('@HTAG[investing]', 5465),
 ('@HTAG[yearn_finance]', 5464),
 ('@HTAG[cryptotrading]', 5370),
 ('@HTAG[alts]', 4500),
 ('@HTAG[ether]', 4405),
 ('@HTAG[giveaway]', 4387),
 ('@HTAG[fintech]', 4172),
 ('@HTAG[forex]', 4108),
 ('@HTAG[airdrops]', 4016),
 ('@HTAG[nft]', 3997),
 ('@HTAG[investment]', 3702),
 ('@HTAG[etc]', 3679),
 ('@HTAG[btcusd]', 3527),
 ('@HTAG[crypto_com_coin]', 2904),
 ('@HTAG[unicorn_token]', 2836),
 ('@HTAG[gold]', 2800),
 ('@HTAG[finance]', 2798),
 ('@HTAG[news]', 2797),
 ('@HTAG[stocks]', 2691),
 ('@HTAG[exchange]', 2678),
 ('@HTAG[btc]', 2673),
 ('@HTAG[bitcoinnews]', 2645),
 ('@HTAG[vet]', 2589),
 ('

In [11]:
print('Most common users')
# Start from the out-of-vocabulary counts and keep only tokens beginning with '@USR'.
usr_counter = cleaned_counter.copy()
non_users = [token for token in usr_counter if not token.startswith('@USR')]
for token in non_users:
    usr_counter.pop(token)
usr_counter.most_common(50)

Most common users


[('@USR[binance]', 25761),
 ('@USR[coinbase]', 6320),
 ('@USR[paypal]', 2384),
 ('@USR[betfury_io]', 2289),
 ('@USR[elonmusk]', 1910),
 ('@USR[cz_binance]', 1868),
 ('@USR[youtube]', 1569),
 ('@USR[bitmex]', 1510),
 ('@USR[cctip_io]', 1309),
 ('@USR[rariblecom]', 921),
 ('@USR[cointelegraph]', 916),
 ('@USR[markaccy]', 898),
 ('@USR[michael_saylor]', 871),
 ('@USR[bittrex]', 858),
 ('@USR[kraken]', 662),
 ('@USR[coindesk]', 611),
 ('@USR[coinkit_]', 515),
 ('@USR[apompliano]', 474),
 ('@USR[poloniex]', 424),
 ('@USR[uniswapprotocol]', 357),
 ('@USR[bitstamp]', 352),
 ('@USR[coingecko]', 331),
 ('@USR[ripple]', 320),
 ('@USR[decryptmedia]', 313),
 ('@USR[coinmarketcap]', 309),
 ('@USR[vitalikbuterin]', 309),
 ('@USR[ivanontech]', 299),
 ('@USR[blockfolio]', 295),
 ('@USR[opensea]', 275),
 ('@USR[peterschiff]', 275),
 ('@USR[tesla]', 273),
 ('@USR[btczofficial]', 273),
 ('@USR[bitcoinzteam]', 269),
 ('@USR[raoulgmi]', 255),
 ('@USR[visioncryptoapp]', 254),
 ('@USR[grayscale]', 248),
 ('@

In [12]:
print('Most common numbers')
# Start from the out-of-vocabulary counts and keep only tokens beginning with '@NUM'.
num_counter = cleaned_counter.copy()
non_numbers = [token for token in num_counter if not token.startswith('@NUM')]
for token in non_numbers:
    num_counter.pop(token)
num_counter.most_common(50)

Most common numbers


[('@NUM[1.0]', 7413),
 ('@NUM[0.0]', 4241),
 ('@NUM[10.0]', 3866),
 ('@NUM[100.0]', 3512),
 ('@NUM[50.0]', 2864),
 ('@NUM[24.0]', 2568),
 ('@NUM[20000.0]', 2354),
 ('@NUM[10000.0]', 2135),
 ('@NUM[20.0]', 2118),
 ('@NUM[5.0]', 2034),
 ('@NUM[15.0]', 2026),
 ('@NUM[3.0]', 1893),
 ('@NUM[2.0]', 1876),
 ('@NUM[35.0]', 1809),
 ('@NUM[1000.0]', 1806),
 ('@NUM[4.0]', 1698),
 ('@NUM[100000.0]', 1425),
 ('@NUM[25.0]', 1220),
 ('@NUM[40000.0]', 1220),
 ('@NUM[30.0]', 1174),
 ('@NUM[50000.0]', 1105),
 ('@NUM[30000.0]', 1053),
 ('@NUM[0.01]', 1029),
 ('@NUM[40.0]', 956),
 ('@NUM[0.2]', 956),
 ('@NUM[500.0]', 908),
 ('@NUM[200.0]', 887),
 ('@NUM[0.05]', 860),
 ('@NUM[0.15]', 793),
 ('@NUM[1000000.0]', 785),
 ('@NUM[7.0]', 719),
 ('@NUM[8.0]', 666),
 ('@NUM[14000.0]', 666),
 ('@NUM[12000.0]', 658),
 ('@NUM[6.0]', 629),
 ('@NUM[9.0]', 620),
 ('@NUM[0.25]', 602),
 ('@NUM[2000.0]', 578),
 ('@NUM[5000.0]', 570),
 ('@NUM[0.02]', 551),
 ('@NUM[80.0]', 539),
 ('@NUM[300.0]', 538),
 ('@NUM[16000.0]', 538),