In [1]:
import pathlib
import emoji
import numpy as np
import pandas as pd
from utils.datasets import load_helper_file
# Allow up to 500 characters per column so long text values are not cut short
# when DataFrames are displayed.
pd.options.display.max_colwidth=500
import nltk
from nltk.corpus import stopwords
from collections import Counter

In [2]:
# Fetch the NLTK stopword corpus; NLTK reports "already up-to-date" when it
# is present locally.
nltk.download('stopwords')
all_stopwords = stopwords.words('english')

# emoji 1.x exposes UNICODE_EMOJI as {language: {emoji: name}}. emoji >= 2.0
# removed that table in favour of EMOJI_DATA ({emoji: metadata}), so fall back
# to it instead of failing with AttributeError on newer installs.
_legacy_emoji_tables = getattr(emoji, 'UNICODE_EMOJI', None)
if _legacy_emoji_tables is not None:
    emoji_dict = {e for table in _legacy_emoji_tables.values() for e in table}
else:
    emoji_dict = set(emoji.EMOJI_DATA)

# Kept as a set because later cells only perform membership tests against it.
bert_uncased_vocabulary = set(load_helper_file('helper_bert_uncased_vocabulary'))

[nltk_data] Downloading package stopwords to /home/egordm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Count every whitespace-separated, non-stopword token across all processed
# parquet chunks.
counter = nltk.FreqDist()

# `all_stopwords` is a list; testing membership against it for every token is
# a linear scan each time. A set gives the same answers with O(1) lookups.
stopword_set = set(all_stopwords)

# Glob order is filesystem-dependent; sorting makes the chunk numbering (and
# the tie order among equal counts) reproducible between runs.
files = sorted(pathlib.Path("../../data/bitcoin_twitter_processed/").glob("part_*.parquet"))
for chunk, file in enumerate(files):
    print(f'Processing chunk: {chunk}')
    data = pd.read_parquet(file)
    # Iterate the column directly: iterrows() builds a Series per row, which
    # is needless overhead when only 'text' is read.
    for text in data['text']:
        counter.update(w for w in text.split() if w not in stopword_set)

Processing chunk: 0
Processing chunk: 1
Processing chunk: 2
Processing chunk: 3
Processing chunk: 4
Processing chunk: 5
Processing chunk: 6
Processing chunk: 7
Processing chunk: 8
Processing chunk: 9
Processing chunk: 10
Processing chunk: 11
Processing chunk: 12
Processing chunk: 13
Processing chunk: 14
Processing chunk: 15
Processing chunk: 16
Processing chunk: 17
Processing chunk: 18
Processing chunk: 19
Processing chunk: 20
Processing chunk: 21
Processing chunk: 22
Processing chunk: 23
Processing chunk: 24
Processing chunk: 25
Processing chunk: 26
Processing chunk: 27
Processing chunk: 28


In [4]:
# Report how many distinct tokens were counted, then display the 50 with the
# highest counts as the cell result.
distinct_tokens = len(counter)
print(f'#words {distinct_tokens}', 'Most common overall words', sep='\n')
counter.most_common(50)

#words 2167396
Most common overall words


[('@CURR[bitcoin]', 11974954),
 ('.', 5698238),
 ('@HTAG[cryptocurrency]', 5091204),
 (':', 2857194),
 (',', 2817052),
 ('-', 2734337),
 ('usd', 2694688),
 ('@CURR[ethereum]', 2446279),
 ('!', 1921115),
 ('@HTAG[blockchain]', 1887974),
 ('(', 1454720),
 (')', 1306028),
 ('percent', 1014426),
 ('@CURR[xrp]', 925564),
 ('@CURR[litecoin]', 825832),
 ('@HTAG[altcoins]', 752548),
 ('price', 721093),
 ('?', 704258),
 ('/', 681763),
 ('"', 620558),
 ('|', 596574),
 ('buy', 550188),
 ('#', 472014),
 ('get', 437889),
 ('market', 421334),
 ('@USR[binance]', 418264),
 ('free', 411959),
 ('@HTAG[ico]', 403617),
 ('new', 374979),
 ('@HTAG[airdrop]', 372870),
 ('@CURR[tron]', 372191),
 ('[', 367357),
 (']', 365830),
 ('1', 358583),
 ('@CURR[bitcoin_cash]', 352643),
 ('sell', 352517),
 ("'", 338116),
 ('@HTAG[trading]', 330430),
 ('$', 317455),
 ('💰', 291550),
 ('via', 288251),
 ('last', 285195),
 ('exchange', 284439),
 ('@', 278729),
 ('=', 271228),
 ('one', 267979),
 ('join', 248373),
 ('time', 236

In [5]:
print('Most common overall excluding symbols and existing vocab')

# Single-character tokens (punctuation and digits) to discard.
SYMBOLS = '{}()[].,:;?#!+-*/&|<>@%"\'=~$1234567890'

# Work on a copy so `counter` keeps the raw counts.
cleaned_counter = counter.copy()
for symbol in SYMBOLS:
    cleaned_counter.pop(symbol, None)

# Vocabulary entries that still occur in the corpus after symbol removal;
# they are recorded first and then removed from the counter.
used_words = {token for token in cleaned_counter if token in bert_uncased_vocabulary}
for token in used_words:
    cleaned_counter.pop(token)

cleaned_counter.most_common(50)

Most common overall excluding symbols and existing vocab


[('@CURR[bitcoin]', 11974954),
 ('@HTAG[cryptocurrency]', 5091204),
 ('@CURR[ethereum]', 2446279),
 ('@HTAG[blockchain]', 1887974),
 ('@CURR[xrp]', 925564),
 ('@CURR[litecoin]', 825832),
 ('@HTAG[altcoins]', 752548),
 ('@USR[binance]', 418264),
 ('@HTAG[ico]', 403617),
 ('@HTAG[airdrop]', 372870),
 ('@CURR[tron]', 372191),
 ('@CURR[bitcoin_cash]', 352643),
 ('@HTAG[trading]', 330430),
 ('💰', 291550),
 ('@HTAG[fintech]', 233930),
 ('@HTAG[stellar]', 233560),
 ('@CURR[eos]', 229879),
 ('@CURR[cardano]', 223476),
 ('@CURR[dogecoin]', 219238),
 ('@URL[twitter.com]', 208265),
 ('🚀', 198358),
 ('@HTAG[money]', 194465),
 ('@HTAG[news]', 193707),
 ('@NUM[1.0]', 193019),
 ('@HTAG[cryptonews]', 179999),
 ('@HTAG[cryptotrading]', 174403),
 ('@CURR[binance_coin]', 168038),
 ('@HTAG[jobs]', 166553),
 ('@HTAG[hiring]', 164813),
 ('@HTAG[careers]', 163525),
 ('@HTAG[btcusd]', 163488),
 ('@HTAG[tokensale]', 163262),
 ('@NUM[0.0]', 162947),
 ('@CURR[inr]', 158074),
 ('@USR[coinbase]', 151781),
 ('😵', 1

In [6]:
# Summarise coverage: size of the reference vocabulary, how many of its
# entries occur in the corpus, and how many distinct tokens fall outside it.
vocab_size = len(bert_uncased_vocabulary)
vocab_used = len(used_words)
unknown_count = len(cleaned_counter)
print(f'Vocab size: {vocab_size}, Vocab used: {vocab_used}, Words unknown: {unknown_count}')

Vocab size: 30522, Vocab used: 22315, Words unknown: 2145043


In [7]:
print('Most common words')
word_counter = cleaned_counter.copy()
# Collect the keys to discard first (anything starting with '@', plus
# emojis), then remove them, so the counter is never mutated while iterated.
non_word_tokens = [
    token for token in word_counter
    if token.startswith('@') or token in emoji_dict
]
for token in non_word_tokens:
    word_counter.pop(token)
word_counter.most_common(50)

Most common words


[('mins', 75894),
 ('arb', 74187),
 ('retweet', 52951),
 ('ico', 52515),
 ('bullish', 51796),
 ('buysellbitco', 51064),
 ('satoshi', 50868),
 ('gmt', 47162),
 ('decentralized', 35943),
 ('referral', 32912),
 ('tweet', 32074),
 ('faucet', 29256),
 ('libra', 28510),
 ('currencies', 28129),
 ('avg', 26188),
 ('cryptsy', 25639),
 ('etf', 25512),
 ('giveaway', 25366),
 ('gh', 25001),
 ('antminer', 24662),
 ('bitfinex', 23452),
 ('bearish', 23373),
 ('dm', 23230),
 ('scam', 20691),
 ('bpi', 20509),
 ('trending', 20497),
 ('forex', 19755),
 ('coindesk', 19065),
 ('bitcome', 17542),
 ('mercadobitcoin', 17125),
 ('halving', 16854),
 ('alts', 16606),
 ('cryptos', 16438),
 ('volatility', 16380),
 ('rsi', 15835),
 ('edt', 14897),
 ('dont', 14553),
 ('sats', 14515),
 ('lol', 14066),
 ('timeframe', 13677),
 ('ath', 13348),
 ('bakkt', 13137),
 ('dapp', 13105),
 ('fintech', 13057),
 ('tradingview', 12713),
 ('nakamoto', 11977),
 ('satoshis', 11811),
 ('bst', 11311),
 ('btcusd', 11274),
 ('tokenized', 

In [8]:
print('Most common emojis')
# Build the filtered counter directly rather than copying every entry of
# `cleaned_counter` and then popping each non-emoji key. The result has the
# same class, entries and order as the copy-and-pop version.
emoji_counter = type(cleaned_counter)(
    {token: count for token, count in cleaned_counter.items() if token in emoji_dict}
)
emoji_counter.most_common(50)

Most common emojis


[('💰', 291550),
 ('🚀', 198358),
 ('😵', 150707),
 ('📋', 113533),
 ('🔥', 111696),
 ('✅', 97926),
 ('➡', 78096),
 ('📈', 65907),
 ('👉', 62023),
 ('⏰', 59211),
 ('🎁', 55332),
 ('▶', 49456),
 ('👇', 48867),
 ('☠', 46014),
 ('⬇', 34140),
 ('💶', 33369),
 ('😁', 33122),
 ('😂', 32227),
 ('❤', 32053),
 ('⚡', 31352),
 ('🤑', 31142),
 ('💥', 31110),
 ('🔄', 29996),
 ('📉', 29963),
 ('📢', 29887),
 ('⭐', 29116),
 ('👍', 25927),
 ('😎', 25292),
 ('✔', 24230),
 ('💯', 22651),
 ('🗺', 21459),
 ('🏆', 20000),
 ('💸', 19986),
 ('😍', 19311),
 ('💵', 18614),
 ('🚨', 18382),
 ('🤔', 18237),
 ('👀', 16388),
 ('🏻', 15144),
 ('😉', 14457),
 ('🤣', 14391),
 ('💪', 13147),
 ('🏼', 12918),
 ('🙏', 12873),
 ('🎉', 12331),
 ('🔹', 12204),
 ('🌧', 11002),
 ('💎', 10970),
 ('🍀', 10858),
 ('♂', 10668)]

In [9]:
print('Most common currencies')
# Keep only tokens carrying the '@CURR' prefix. Building the filtered counter
# directly avoids copying every entry of `cleaned_counter` just to delete the
# non-matching ones; class, entries and order match the copy-and-pop version.
currency_counter = type(cleaned_counter)(
    {token: count for token, count in cleaned_counter.items() if token.startswith('@CURR')}
)
currency_counter.most_common(50)

Most common currencies


[('@CURR[bitcoin]', 11974954),
 ('@CURR[ethereum]', 2446279),
 ('@CURR[xrp]', 925564),
 ('@CURR[litecoin]', 825832),
 ('@CURR[tron]', 372191),
 ('@CURR[bitcoin_cash]', 352643),
 ('@CURR[eos]', 229879),
 ('@CURR[cardano]', 223476),
 ('@CURR[dogecoin]', 219238),
 ('@CURR[binance_coin]', 168038),
 ('@CURR[inr]', 158074),
 ('@CURR[monero]', 150512),
 ('@CURR[gbp]', 141366),
 ('@CURR[tether]', 139628),
 ('@CURR[neo]', 138132),
 ('@CURR[dash]', 135301),
 ('@CURR[eur]', 106999),
 ('@CURR[usd]', 102758),
 ('@CURR[iota]', 71388),
 ('@CURR[zcash]', 71284),
 ('@CURR[nem]', 64833),
 ('@CURR[chainlink]', 64130),
 ('@CURR[bitcoin_sv]', 63085),
 ('@CURR[digibyte]', 56976),
 ('@CURR[ethereum_classic]', 54463),
 ('@CURR[tezos]', 47392),
 ('@CURR[link]', 46401),
 ('@CURR[omg]', 39918),
 ('@CURR[bittorrent]', 33351),
 ('@CURR[vechain]', 32677),
 ('@CURR[zilliqa]', 29769),
 ('@CURR[bitcoin_gold]', 27572),
 ('@CURR[qtum]', 27084),
 ('@CURR[ontology]', 24523),
 ('@CURR[jpy]', 22616),
 ('@CURR[polygon]', 204

In [10]:
print('Most common hashtags')
# Keep only tokens carrying the '@HTAG' prefix. Building the filtered counter
# directly avoids copying every entry of `cleaned_counter` just to delete the
# non-matching ones; class, entries and order match the copy-and-pop version.
htag_counter = type(cleaned_counter)(
    {token: count for token, count in cleaned_counter.items() if token.startswith('@HTAG')}
)
htag_counter.most_common(50)

Most common hashtags


[('@HTAG[cryptocurrency]', 5091204),
 ('@HTAG[blockchain]', 1887974),
 ('@HTAG[altcoins]', 752548),
 ('@HTAG[ico]', 403617),
 ('@HTAG[airdrop]', 372870),
 ('@HTAG[trading]', 330430),
 ('@HTAG[fintech]', 233930),
 ('@HTAG[stellar]', 233560),
 ('@HTAG[money]', 194465),
 ('@HTAG[news]', 193707),
 ('@HTAG[cryptonews]', 179999),
 ('@HTAG[cryptotrading]', 174403),
 ('@HTAG[jobs]', 166553),
 ('@HTAG[hiring]', 164813),
 ('@HTAG[careers]', 163525),
 ('@HTAG[btcusd]', 163488),
 ('@HTAG[tokensale]', 163262),
 ('@HTAG[forex]', 125821),
 ('@HTAG[business]', 124970),
 ('@HTAG[ai]', 124025),
 ('@HTAG[ether]', 120830),
 ('@HTAG[investing]', 117417),
 ('@HTAG[bounty]', 106162),
 ('@HTAG[hodl]', 97115),
 ('@HTAG[exchange]', 95707),
 ('@HTAG[bitcoinnews]', 94260),
 ('@HTAG[investment]', 93955),
 ('@HTAG[token]', 92127),
 ('@HTAG[airdrops]', 90608),
 ('@HTAG[bitfinex]', 86175),
 ('@HTAG[coin]', 82776),
 ('@HTAG[mining]', 81040),
 ('@HTAG[ieo]', 79904),
 ('@HTAG[price]', 77403),
 ('@HTAG[finance]', 76436),

In [11]:
print('Most common users')
# Keep only tokens carrying the '@USR' prefix. Building the filtered counter
# directly avoids copying every entry of `cleaned_counter` just to delete the
# non-matching ones; class, entries and order match the copy-and-pop version.
usr_counter = type(cleaned_counter)(
    {token: count for token, count in cleaned_counter.items() if token.startswith('@USR')}
)
usr_counter.most_common(50)

Most common users


[('@USR[binance]', 418264),
 ('@USR[coinbase]', 151781),
 ('@USR[bitstamp]', 110915),
 ('@USR[bittrex]', 101042),
 ('@USR[bitmex]', 74383),
 ('@USR[bitmain]', 38031),
 ('@USR[cointelegraph]', 37525),
 ('@USR[poloniex]', 35655),
 ('@USR[paypal]', 34678),
 ('@USR[kraken]', 33195),
 ('@USR[dapp_com]', 27044),
 ('@USR[bot_strategy]', 25365),
 ('@USR[coindesk]', 24237),
 ('@USR[apompliano]', 20729),
 ('@USR[youtube]', 17564),
 ('@USR[cz_binance]', 15037),
 ('@USR[quark_chain]', 14413),
 ('@USR[bitpay]', 10985),
 ('@USR[realdonaldtrump]', 9793),
 ('@USR[listia!]', 9278),
 ('@USR[tradesatoshi]', 9274),
 ('@USR[btctn]', 8339),
 ('@USR[pynk_io]', 8054),
 ('@USR[xcardbymobilum]', 8028),
 ('@USR[cryptoadventura]', 7820),
 ('@USR[forbes]', 7658),
 ('@USR[peterschiff]', 7343),
 ('@USR[justinsuntron]', 7166),
 ('@USR[ripple]', 7074),
 ('@USR[officialmcafee]', 6275),
 ('@USR[thecollectivego]', 5993),
 ('@USR[bizpaye]', 5947),
 ('@USR[digitexfutures]', 5886),
 ('@USR[murthaburke]', 5447),
 ('@USR[cnbc

In [12]:
print('Most common numbers')
# Keep only tokens carrying the '@NUM' prefix. Building the filtered counter
# directly avoids copying every entry of `cleaned_counter` just to delete the
# non-matching ones; class, entries and order match the copy-and-pop version.
num_counter = type(cleaned_counter)(
    {token: count for token, count in cleaned_counter.items() if token.startswith('@NUM')}
)
num_counter.most_common(50)

Most common numbers


[('@NUM[1.0]', 193019),
 ('@NUM[0.0]', 162947),
 ('@NUM[24.0]', 116206),
 ('@NUM[100.0]', 78759),
 ('@NUM[10.0]', 75763),
 ('@NUM[5.0]', 63889),
 ('@NUM[10000.0]', 56932),
 ('@NUM[100000.0]', 51997),
 ('@NUM[3.0]', 50358),
 ('@NUM[2.0]', 41925),
 ('@NUM[7.0]', 41056),
 ('@NUM[50.0]', 40635),
 ('@NUM[1000.0]', 39543),
 ('@NUM[4.0]', 37608),
 ('@NUM[1000000.0]', 34608),
 ('@NUM[20.0]', 34557),
 ('@NUM[30.0]', 32069),
 ('@NUM[6.0]', 27342),
 ('@NUM[2.48]', 25856),
 ('@NUM[15.0]', 25681),
 ('@NUM[200.0]', 22995),
 ('@NUM[0.01]', 22006),
 ('@NUM[25.0]', 20208),
 ('@NUM[8000.0]', 19563),
 ('@NUM[50000.0]', 19455),
 ('@NUM[0.15]', 19175),
 ('@NUM[0.05]', 18956),
 ('@NUM[500.0]', 18837),
 ('@NUM[20000.0]', 18353),
 ('@NUM[5000.0]', 17946),
 ('@NUM[2000.0]', 17923),
 ('@NUM[0.2]', 17801),
 ('@NUM[8.0]', 17570),
 ('@NUM[300.0]', 15488),
 ('@NUM[400.0]', 14907),
 ('@NUM[9000.0]', 14736),
 ('@NUM[40.0]', 13701),
 ('@NUM[0.25]', 13253),
 ('@NUM[420.0]', 12659),
 ('@NUM[12.0]', 12294),
 ('@NUM[9.0]'