In [545]:
# Credit for some parts to: https://www.kaggle.com/kyakovlev/preprocessing-bert-public
# Number extraction and hashtags is my baby

# General imports|  
import pandas as pd
import re, warnings, pickle, itertools, emoji, unicodedata

# custom imports
from gensim.utils import deaccent
from collections import Counter
from bs4 import BeautifulSoup
from utils.datasets import *
from pandarallel import pandarallel
import fasttext

pandarallel.initialize()
warnings.filterwarnings('ignore')
pd.options.display.max_columns = 10
pd.options.display.max_colwidth = 200


INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [546]:
## Initial vars

HELPER_PATH             = '../../data/helpers/'
LOCAL_TEST = True       ## Local test - for test performance on part of the train set only
verbose = True
WPLACEHOLDER = 'word_placeholder'
URL_TAG = '@URL'
USER_TAG = '@USR'
NUMBER_TAG = '@NUM'
HASH_TAG = '@HTAG'
CURRENCY_TAG = '@CURR'
TIME_TAG = '@TIME'
DATE_TAG = '@DATE'
IMMUTABLES = [
    WPLACEHOLDER,
    URL_TAG, USER_TAG, NUMBER_TAG, HASH_TAG, CURRENCY_TAG,
    TIME_TAG, DATE_TAG
]

SEED = 42               ## Seed for enviroment
seed_everything(SEED)   ## Seed everything

In [547]:
## Preprocess helpers
def place_hold(w, tag=WPLACEHOLDER):
    return tag + '[' + re.sub(' ', '___', w) + ']'

## Helpers
def check_replace(w):
    return not bool(re.search('|'.join(IMMUTABLES), w))

def make_cleaning(s, c_dict):
    if check_replace(s):
        s = s.translate(c_dict)
    return s

def make_dict_cleaning(s, w_dict, skip_check=False):
    # Replaces a word using dict if it is mutable
    if skip_check or check_replace(s):
        s = w_dict.get(s, s)
    return s

In [548]:
## Get basic helper data

bert_uncased_vocabulary = load_helper_file('helper_bert_uncased_vocabulary')
bert_cased_vocabulary   = load_helper_file('helper_bert_cased_vocabulary')
bert_char_list          = list(set([c for line in bert_uncased_vocabulary+bert_cased_vocabulary for c in line]))

url_extensions          = load_helper_file('helper_url_extensions')
html_tags               = load_helper_file('helper_html_tags')
good_chars_dieter       = load_helper_file('helper_good_chars_dieter')
bad_chars_dieter        = load_helper_file('helper_bad_chars_dieter')
helper_contractions     = load_helper_file('helper_contractions')
global_vocabulary       = load_helper_file('helper_global_vocabulary')
global_vocabulary_chars = load_helper_file('helper_global_vocabulary_chars')
normalized_chars        = load_helper_file('helper_normalized_chars')
white_list_chars        = load_helper_file('helper_white_list_chars')
white_list_punct        = " '*-.,?!/:;_()[]{}<>=" + '"'
pictograms_to_emoji     = load_helper_file('helper_pictograms_to_emoji')
helper_custom_synonyms     = load_helper_file('helper_custom_synonyms')
helper_currency_synonyms     = load_helper_file('helper_currency_synonyms')
helper_custom_general_synonyms     = load_helper_file('helper_custom_general_synonyms')
emoji_dict = set(e for lang in emoji.UNICODE_EMOJI.values() for e in lang)

In [549]:
## Load Data
good_cols       = ['_id', 'text']
data = pd.read_parquet('../../data/bitcoin_twitter_raw/part_0.parquet')
data = data.iloc[:20000][good_cols]

In [550]:
## Start preprocessing
texts = data['text']
local_vocab = bert_uncased_vocabulary
global_lower=True
texts = texts.astype(str)
if verbose: print('#' *20 ,'Initial State:'); check_vocab(texts, local_vocab)

#################### Initial State:
Unknown words: 63451 | Known words: 6880


In [551]:
def lower(texts):
    texts = texts.apply(lambda x: x.lower())
    if verbose: print('#'*10 ,'Step - Lowering everything:'); check_vocab(texts, local_vocab)
    return texts

if global_lower:
    texts = texts.pipe(lower)

########## Step - Lowering everything:
Unknown words: 54216 | Known words: 7938


In [552]:
# Normalize chars and dots - SEE HELPER FOR DETAILS
def normalize_chars(texts):
    texts = texts.apply(lambda x: ' '.join([make_cleaning(i,normalized_chars) for i in x.split()]))
    texts = texts.apply(lambda x: re.sub('\(dot\)', '.', x))
    texts = texts.apply(lambda x: deaccent(x))
    if verbose: print('#'*10 ,'Step - Normalize chars and dots:'); check_vocab(texts, local_vocab)
    return texts

texts = texts.pipe(normalize_chars)

########## Step - Normalize chars and dots:
Unknown words: 53957 | Known words: 7946


In [553]:
def remove_control_chars(texts):
    global_chars_list = list(set([c for line in texts for c in line]))
    chars_dict = {c:'' for c in global_chars_list if unicodedata.category(c)[0]=='C'}
    texts = texts.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
    if verbose: print('#'*10 ,'Step - Control Chars:'); check_vocab(texts, local_vocab)
    return texts

texts = texts.pipe(remove_control_chars)

########## Step - Control Chars:
Unknown words: 53957 | Known words: 7946


In [554]:
def remove_hrefs(texts):
    texts = texts.apply(lambda x: re.sub(re.findall(r'\<a(.*?)\>', x)[0], '', x) if (len(re.findall(r'\<a (.*?)\>', x))>0) and ('href' in re.findall(r'\<a (.*?)\>', x)[0]) else x)
    if verbose: print('#'*10 ,'Step - Remove hrefs:'); check_vocab(texts, local_vocab)
    return texts

texts = texts.pipe(remove_hrefs)

########## Step - Remove hrefs:
Unknown words: 53957 | Known words: 7946


In [555]:
# Convert or remove Bad Symbols
def convert_remove_bad_symbols(texts):
    global_chars_list = list(set([c for line in texts for c in line]))
    chars = ''.join([c for c in global_chars_list if (c not in bert_char_list) and (c not in emoji_dict) and (c not in white_list_chars)])
    chars_dict = {}
    for char in chars:
        try:
            new_char = unicodedata.name(char).split()[-1:][0].lower()
            if len(new_char)==1:
                chars_dict[ord(char)] = new_char
            else:
                chars_dict[ord(char)] = ''
        except:
            chars_dict[ord(char)] = ''
    texts = texts.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Remove Bad Symbols:'); check_vocab(texts, local_vocab)
    if verbose: print(chars)
    if verbose: print_dict(chars_dict)
    return texts

texts = texts.pipe(convert_remove_bad_symbols)

########## Step - Remove Bad Symbols:
Unknown words: 53826 | Known words: 7956
ㅜ🅴아⁠𝐒𝖋𝖎𝐨【🇲❯🆂까𝖕스𝟵友ᵛ𝖞𝐂🅽㆔𝟲𝐀권󠁢𝑼𝒕留길寒➤로반🇻🇮다ƀ𝒊𝒂𝐬𝐥🇩ด𝖓𝖑貨비🇬🅻𝐞ꮆ想𝒆ѵ𝐄𝟠데แ商特务내𝐮𝑲탑₦리块⟶🇴🇨ꮇ𝖗트𝖔󠁳円𝑳值ผ𝒔𝒓🇦ค익🇵ๆ지기‌ไ𝟎도포¯⋯⃣🅷跌𝖉𝒉ช더货𝒏랬吴🅳는𝖆𝟰𝐠🇿면에서►模𝟏✓󠁿碳줍炮█٪𝐛𝐯정涨￼🇷𝒌₺𝑾렇】바𝟭🇭𝒐￥𝑻₿链🇳𝟙؟🇰𝖙台ꮤ通⟠🇹𝒅𝐡🅼인려𝖘密条▓了회𝖚交코𝖈𝖊⁦▴시𝒄陆𝟔𝐅𝒎𝟘₳＄ะ𝟓🆃🇺𝕮행🇽ㅠ󠁣𝐭𝟬중션🇸𝐚จ🇧币🇪ข𝕽𝟚⁩니忌나🇱价그수󠁧𝐝𝐦가𝐫⋰덕约‍𝒗฿ⓜ𝒍󠁴░
12636 --- u
127348 --- e
50500 --- a
8288 --- 
119826 --- s
120203 --- f
120206 --- i
119848 --- o
12304 --- 
127474 --- m


In [556]:
# Remove Bad Symbols PART 2
def convert_remove_bad_symbols2(texts):
    global_chars_list = list(set([c for line in texts for c in line]))
    chars = '·' + ''.join([c for c in global_chars_list if (c not in white_list_chars) and (c not in emoji_dict) and (c not in white_list_punct) and (ord(c)>256)])
    chars_dict = {}
    for char in chars:
        try:
            new_char = unicodedata.name(char).split()[-1:][0].lower()
            if len(new_char)==1:
                chars_dict[ord(char)] = new_char
            else:
                chars_dict[ord(char)] = ''
        except:
            chars_dict[ord(char)] = ''
    texts = texts.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Remove Bad Symbols PART 2:'); check_vocab(texts, local_vocab)
    if verbose: print(chars)
    if verbose: print_dict(chars_dict)
    return texts

texts = texts.pipe(convert_remove_bad_symbols2)

########## Step - Remove Bad Symbols PART 2:
Unknown words: 53659 | Known words: 7949
·ˢξك→ต…ノي≥√₹п●•ف∞ンทся比چш大ھیу上कثاวمчجヒーыرาअءм下《نタф›دوظน安هتβトقカเพ平区лоـ이لब加и☆。चルд€，यپ„生حцж≈بصضล》аک学ںہยツッвٹाอт？эιมذрюコбگе！عькнπзةгس仮خกشイ
183 --- 
738 --- s
958 --- 
1603 --- 
8594 --- 
3605 --- 
8230 --- 
12494 --- 
1610 --- 
8805 --- 


In [557]:
def remove_html_tags(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    for word in temp_vocab:
        if ('<' in word) and ('>' in word):
            for tag in html_tags:
                if ('<'+tag+'>' in word) or ('</'+tag+'>' in word):
                    temp_dict[word] = BeautifulSoup(word, 'html5lib').text
    texts = texts.apply(lambda x: ' '.join([temp_dict.get(i, i) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - HTML tags:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(remove_html_tags)

########## Step - HTML tags:
Unknown words: 53659 | Known words: 7949


In [558]:
# Remove links (There is valuable information in links (probably you will find a way to use it))
def remove_links(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    url_rule = r'(?P<url>https?://[^\s]+)'
    temp_dict = {k:domain_search(k) for k in temp_vocab if k!= re.compile(url_rule).sub('url', k)}

    for word in temp_dict:
        new_value = temp_dict[word]
        if word.find('http')>2:
            temp_dict[word] =  word[:word.find('http')] + ' ' + place_hold(new_value, URL_TAG)
        else:
            temp_dict[word] = place_hold(new_value, URL_TAG)

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Convert urls part 1:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)

    # Remove twitter urls
    temp_dict = {
        f'{URL_TAG}[t.co]': ''
    }
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict, skip_check=True) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Convert urls part 1.5:'); check_vocab(texts, local_vocab);
    return texts

texts = texts.pipe(remove_links)

########## Step - Convert urls part 1:
Unknown words: 39204 | Known words: 7949
https://t.co/rnoz5w1qci --- @URL[t.co]
https://t.co/cvaqwojj4a --- @URL[t.co]
https://t.co/pnulv4cx1y --- @URL[t.co]
https://t.co/dzcyihtjdo --- @URL[t.co]
https://t.co/clkemg4vde --- @URL[t.co]
https://t.co/pp9niuh7ru --- @URL[t.co]
https://t.co/irk93bqjus --- @URL[t.co]
https://t.co/v6md5qubwx --- @URL[t.co]
https://t.co/jr67ublot7 --- @URL[t.co]
https://t.co/kmkjb5odno --- @URL[t.co]
########## Step - Convert urls part 1.5:
Unknown words: 39203 | Known words: 7949


In [559]:
# Remove escaped html
def remove_escaped_html(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    symbols = {
        '&quot;': '',
        '&amp;': ' and ',
        '&lt;': '',
        '&gt;': '',
    }
    temp_dict = {}
    for word in temp_vocab:
        if any([rep in word for rep in symbols.keys()]):
            new_word = word
            for rep, to in symbols.items():
                new_word = new_word.replace(rep, to)
            temp_dict[word] = new_word

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict, skip_check=True) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Remove escaped html:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(remove_escaped_html)

########## Step - Remove escaped html:
Unknown words: 39129 | Known words: 7951
term-&gt; --- term-
&gt;&gt;&gt;&gt;#bitcoinz&lt;&lt;&lt;&lt; --- #bitcoinz
p&amp;l. --- p and l.
h&amp;s? --- h and s?
&gt;= --- =
faang&amp;m --- faang and m
&gt;^^&lt; --- ^^
s&amp;p --- s and p
c&amp;h --- c and h
(&gt;10x). --- (10x).


In [560]:
# Convert urls part 2
def convert_urls2(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}

    for word in temp_vocab:
        url_check = False
        if 'file:' in word:
            url_check = True
        elif ('http' in word) or ('ww.' in word) or ('.htm' in word) or ('ftp' in word) or ('.php' in word) or ('.aspx' in word):
            if 'Aww' not in word:
                for d_zone in url_extensions:
                    if '.' + d_zone in word:
                        url_check = True
                        break
        elif ('/' in word) and ('.' in word):
            for d_zone in url_extensions:
                if '.' + d_zone + '/' in word:
                    url_check = True
                    break

        if url_check:
            temp_dict[word] =  place_hold(domain_search(word), URL_TAG)

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Convert urls part 2:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(convert_urls2)

########## Step - Convert urls part 2:
Unknown words: 39129 | Known words: 7951
www.maverick-tech.con --- @URL[maverick-tech.con]
.www.rapidsnetwork.io --- @URL[rapidsnetwork.io]


In [561]:
# Normalize pictograms
# Local (only unknown words)
def normalize_pictograms(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    for word in temp_vocab:
        if len(re.compile('[a-zA-Z0-9]').sub('', word))>2:
            for pict in pictograms_to_emoji:
                if (pict in word) and (len(pict)>2):
                    char_pict = pict[-1].isalpha() and pict[0].isalpha()
                    if char_pict:
                        pass
                    else:
                        temp_dict[word] = word.replace(pict, pictograms_to_emoji[pict])
                elif pict==word:
                    temp_dict[word] = pictograms_to_emoji[pict]

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Normalize pictograms:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(normalize_pictograms)

########## Step - Normalize pictograms:
Unknown words: 39128 | Known words: 7951
:-)! --- 😁!
:-) --- 😁
:))) --- 😁)


In [562]:
def isolate_emoji(texts):
    global_chars_list = list(set([c for line in texts for c in line]))
    chars = ''.join([c for c in global_chars_list if c in emoji_dict])
    chars_dict = {ord(c):f' {c} ' for c in chars}
    texts = texts.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Isolate emoji:'); check_vocab(texts, local_vocab)
    if verbose: print(chars)
    return texts

texts = texts.pipe(isolate_emoji)

########## Step - Isolate emoji:
Unknown words: 36781 | Known words: 7974
🥮👹▶🍞💕🥈🍻🤞😖🦗⛈😧🐈💹🛍😘🤯🔺🎆😊📢🙄🛫🥑📐🚒😬🔪👭🎇🛰🥁♦💥💓👆🆚🎍❓📯☄🟧🌕🤑🌲🆗🚑🖖🦆🤧😒🎱⚽🤳▪😞🎁☝🏈📦💜▫🔎🐐🍄😝🍊📍☕👂😛👥💨🍀👬💇➖😴🌓🐮😌😯🤡🫂🖤💲🌘⚪😥⏲🎨👾🔮💧🔆🏴🎢🦡🦺🏂🏀🅰🐒🧯🐎🤏✔🔝🧄🏠🦄🐃🐂🍸🌧⏫📲✅⤴🌍🕯⁉👷☢😑🍒🌸🙁💻🟨😩⚫🎞📱💊🍌🎵🤓💛💤🚘🦢🌝🙋💙🤴🌛⚡🔫⛔🆙💱💟🦎🗣🌼⚒🧁🟩✈📡✋™😲🧵🤸💁🏭👈🌀🥵📣🤖🌹😋➕♉🏿⬜🚦📉🏇🔼⏰📺🪙🚫⏬🔐🙃📆🦮💸🔟🚀💣⬆🐷😆💬🥺🔶🐶👺👌😮💦💌♀🌒👨🔁😳♣🤦🛀❄😭🍡🌚🔒🔄⛪🙇😪⌚🕷🤬🧘🧠💴🥲📗🐙🤩📝🆒💡🥒👐🤐🪖🍦🎦🤍🏵🔑🔴🥱👟💉‼✳🪦🔹🙀🚆😰⛏📹🎈🔽🙅🟠🧢😔🌏🦉🤘🦁🐦😎🪅👁🔗🃏⚠💎🤗💩😤🖐🛒👣🧿😀✊🤌🏯🌠🔯🔔❣🍼🤲🏦🤷🏽〰🎄⬛📊🔊🐰↗✨☑⛳😷🎲⛓🙏🚩🦅🎉🚗🔵❕🚶🖌😉🩸🎩♥🦈🏾🧧🧑🥀⚙☺🛠🐾®🎥🥶🙆😨💶🏗🛸〽😇❇🏄👽☮🦞🤠🤢🌴🔃🥅🧡🐲💋🗻🤨🥓🌈🏃😼🥴👎🥕⭐♎🔖🔸🚄😄🦾💰🌐👻🐼♾🙂🦕🐺💚⏳👄😐👀😜🖼🎯🥰⛅🥜🎧🟥↩©🌱🛤🤤🌿☹🎟😈👏🏖🌙🍺⏯🟢🐄💼🍳🐋✌🧙🍔❤🔂💯🔱⚛👇🤭🥇🥳😻🤔💃🍓🥃🚊⚜⤵👕🍕🧪🔘🍎⛷♂🍿😠😓🐻🏼⛽❌💞🎭ℹ❗🐳❔🏅🏫📅⬅😙☁🤫🅱🛡📖🚂🥂🎶🍹💫🧚😁😱😍🔥👍🏆💷🥉♻👉⛵🌃🛑😢🦽🚨🎮🎊🙌👸🌻🌊✍😶🍫🦚💠🧨🌑🍾🤚📸🐵👋🦖🚋🥩⚔☎⌛🙊🕊🎓🤟🏻⏱🥥🗳🗑☠🌪📈🎣↪🌋🪐😚💔🥛🥸📌🐕🐸🧷🔷😟🤜😏💘🎤🦬🗨🧸➡💵🕺📚👩🔨🐣😃🚚🔻🧐🙈🌌📞🥊🐬🐟🥬⛴🐢🕘⬇🥞🖕😡🍩🍮📩🎬🌖🔋🌔😂🕵😅🌇🌞🤪🤣🦋🛎🔀🦵📰🦧🦍🦑🌟🦊🔛🏧💀🤝🏹🔌🤙🐍🌜💗🎖🤛◽👤🏁👑💳☀🌎💪🕶🍷🐝👊🔜🏡🎰💭🍏💖😫🌗🗓🏋😵🐑🚣🗽🖇🌳👶


In [563]:
# Duplicated dots, question marks and exclamations
def deduplicate_dots(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    for word in temp_vocab:
        new_word = word
        if (Counter(word)['.']>1) or (Counter(word)['!']>1) or (Counter(word)['?']>1) or (Counter(word)[',']>1):
            if (Counter(word)['.']>1):
                new_word = re.sub('\.\.+', ' . . . ', new_word)
            if (Counter(word)['!']>1):
                new_word = re.sub('\!\!+', ' ! ! ! ', new_word)
            if (Counter(word)['?']>1):
                new_word = re.sub('\?\?+', ' ? ? ? ', new_word)
            if (Counter(word)[',']>1):
                new_word = re.sub('\,\,+', ' , , , ', new_word)
            temp_dict[word] = new_word
    temp_dict = {k: v for k, v in temp_dict.items() if k != v}
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Duplicated Chars:'); check_vocab(texts, local_vocab);
    return texts

texts = texts.pipe(deduplicate_dots)

########## Step - Duplicated Chars:
Unknown words: 34752 | Known words: 8028


In [564]:
# Remove underscore for spam words
def remove_underscore_spam(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    for word in temp_vocab:
        if (len(re.compile('[a-zA-Z0-9\-\.\,\/\']').sub('', word))/len(word) > 0.6) and ('_' in word):
            temp_dict[word] = re.sub('_', '', word)
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Remove underscore:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(remove_underscore_spam)

########## Step - Remove underscore:
Unknown words: 34738 | Known words: 8028
________________________ --- 
________ --- 
_____________________ --- 
__________ --- 
___ --- 
#____ --- #
______ --- 
_____________ --- 
#___ --- #
\_()_/ --- \()/


In [565]:
# Isolate spam chars repetition
def isolate_spam_characters(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    for word in temp_vocab:
        if (len(re.compile('[a-zA-Z0-9\-\.\,\/\']').sub('', word))/len(word) > 0.6) and (len(Counter(word))==1) and (len(word)>2):
            temp_dict[word] = ' '.join([' ' + next(iter(Counter(word).keys())) + ' ' for i in range(1)])
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Spam chars repetition:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(isolate_spam_characters)

########## Step - Spam chars repetition:
Unknown words: 34729 | Known words: 8028
***** ---  * 
)))) ---  ) 
*** ---  * 
$$$ ---  $ 
$$$$ ---  $ 
::::::::::::::::::::::::::: ---  : 
$$$$$$$$$$$$ ---  $ 
$$$$$ ---  $ 
**** ---  * 


In [566]:
# Normalize pictograms part 2
# Local (only unknown words)
def normalize_pictograms(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    for word in temp_vocab:
        if len(re.compile('[a-zA-Z0-9]').sub('', word))>1:
            for pict in pictograms_to_emoji:
                if pict==word:
                    temp_dict[word] = pictograms_to_emoji[pict]
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Normalize pictograms part 2:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(normalize_pictograms)

########## Step - Normalize pictograms part 2:
Unknown words: 34724 | Known words: 8028
=) --- 😁
;) --- 😜
:( --- 😡
:] --- 😁
:) --- 😁


In [567]:
# Isolate brakets and quotes
def isolate_brackets(texts):
    chars = '()[]{}<>"'
    chars_dict = {ord(c):f' {c} ' for c in chars}
    texts = texts.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Brackets and quotes:'); check_vocab(texts, local_vocab)
    if verbose: print_dict(chars_dict)
    return texts

texts = texts.pipe(isolate_brackets)

########## Step - Brackets and quotes:
Unknown words: 33135 | Known words: 8087
40 ---  ( 
41 ---  ) 
91 ---  [ 
93 ---  ] 
123 ---  { 
125 ---  } 
60 ---  < 
62 ---  > 
34 ---  " 


In [568]:
# Extract date and time
def extract_date_and_time(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}

    re_inb = re.compile('[,\'"`]')
    re_fix = re.compile('^[$£%€][-+][0-9]')
    time_regex = re.compile('([0-9]{1,2}:[0-9]{1,2}:[0-9]{1,4})')
    date_regex = re.compile('([0-9]{1,4}\/[0-9]{1,2}\/[0-9]{1,4})')
    for word in temp_vocab:
        prefilter = re_inb.sub('', word).replace(',', '.')
        if re_fix.search(prefilter):
            prefilter = prefilter[1] + prefilter[0] + prefilter[2:]

        ## -------- Time
        time_result = time_regex.search(prefilter)
        if time_result:
            prefix = prefilter[:time_result.start()]
            suffix = prefilter[time_result.end():]
            mpart = prefilter[time_result.start():time_result.end()]
            temp_dict[word] = ' '.join([
                prefix,
                place_hold(str(mpart), TIME_TAG),
                suffix
            ])
            continue

        ## -------- Date
        date_result = date_regex.search(prefilter.replace('-', '/'))
        if date_result and len(word.split('/')) == 3:
            prefix = prefilter[:date_result.start()]
            suffix = prefilter[date_result.end():]
            mpart = prefilter[date_result.start():date_result.end()]
            temp_dict[word] = ' '.join([
                prefix,
                place_hold(str(mpart), DATE_TAG),
                suffix
            ])
            continue
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Extract date and time:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(extract_date_and_time)

########## Step - Extract date and time:
Unknown words: 33137 | Known words: 8087
14:26:27 ---  @TIME[14:26:27] 
12/30/20 ---  @DATE[12/30/20] 
16:17:20 ---  @TIME[16:17:20] 
2/6/21 ---  @DATE[2/6/21] 
07/02/2021 ---  @DATE[07/02/2021] 
15:00:02: ---  @TIME[15:00:02] :
5/22/10, ---  @DATE[5/22/10] 
14:00:02: ---  @TIME[14:00:02] :
2/2/21 ---  @DATE[2/2/21] 
18:00:02: ---  @TIME[18:00:02] :


In [569]:
def custom_global_synonyms(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_dict = {}
    for word in temp_vocab:
        if word in helper_custom_general_synonyms:
            temp_dict[word] = helper_custom_general_synonyms[word]

    for k,v in list(temp_dict.items()):
        if k == v:
            temp_dict.pop(k)

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Custom global word synonyms:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(custom_global_synonyms)


########## Step - Custom global word synonyms:
Unknown words: 33132 | Known words: 8087
chg --- change
b4 --- before
u.s. --- united states
m.cap --- market cap
mkt --- market


In [570]:
# Break short words
def break_short_words(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_vocab = [k for k in temp_vocab if len(k)<=20]

    temp_dict = {}
    for word in temp_vocab:
        if '/' in word and not word.startswith('u/') and not word.startswith('r/'):
            temp_dict[word] = re.sub('/', ' / ', word)

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Break short words:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(break_short_words)

########## Step - Break short words:
Unknown words: 32784 | Known words: 8104
2/2 --- 2 / 2
$50/share --- $50 / share
#eos/#btc --- #eos / #btc
#kyc/#aml --- #kyc / #aml
usd/btc, --- usd / btc,
$tfuel/ --- $tfuel / 
$celr/ --- $celr / 
#ada/#btc --- #ada / #btc
9/10 --- 9 / 10
24/7! --- 24 / 7!


In [571]:
# Break long words
def break_long_words(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_vocab = [k for k in temp_vocab if len(k)>20]

    temp_dict = {}
    for word in temp_vocab:
        if '_' in word and not (len(word) > 2 and word[0] in ['#', '$', '@'] and word[1:len(word)-1].replace('\'s', '').replace('_', '').isalnum()):
            temp_dict[word] = re.sub('_', ' ', word)
        elif '/' in word and not word.startswith('u/') and not word.startswith('r/'):
            temp_dict[word] = re.sub('/', ' / ', word)
        elif len(' '.join(word.split('-')).split())>2:
            temp_dict[word] = re.sub('-', ' ', word)
        for s in ',.:;':
            if s in word and not re.compile('[+#@$/,.:;-]').sub('', word).isnumeric():
                temp_dict[word] = word.replace(s, f' {s} ')

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Break long words:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

for i in range(3):
    texts = texts.pipe(break_long_words)

########## Step - Break long words:
Unknown words: 32786 | Known words: 8108
hurdle-turned-support --- hurdle turned support
revenue/terahash-second --- revenue / terahash-second
/jonathan/gabriel/ozo ---  / jonathan / gabriel / ozo
casino-partner/stakeholder. --- casino-partner/stakeholder . 
#fashion.#beautiful.#happy.#cute. --- #fashion . #beautiful . #happy . #cute . 
espadora@protonmail.com --- espadora@protonmail . com
caaaaanntaaaareeee.oh.oh.oh.oh --- caaaaanntaaaareeee . oh . oh . oh . oh
crypto-dinner-futures --- crypto dinner futures
instagram@abiolaa.apparel --- instagram@abiolaa . apparel
every-once-in-a-while, --- every-once-in-a-while , 
########## Step - Break long words:
Unknown words: 32784 | Known words: 8108
casino-partner/stakeholder --- casino-partner / stakeholder
pullback/consolidation --- pullback / consolidation
august/september/october --- august / september / october
every-once-in-a-while --- every once in a while
########## Step - Break long words:
Unknown 

In [572]:
# TODO: add number parsing before
# Diambiguate entities
# Split words on @,# and $ to clear up ambiguities between entitites
def disambiguate_entitites(texts):
    symbols = '@#$'
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ('@' in k or '#' in k or '$' in k)]

    temp_dict = {}
    for word in temp_vocab:
        for symbol in symbols:
            if symbol not in word: continue
            left, *right = word.split(symbol)
            rightz = symbol.join(right)
            if len(left) > 0 and len(right[0]) > 0 and right[0].isalnum():
                temp_dict[word] = f'{left} {symbol}{rightz}'
            break

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Disambiguate entities:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(disambiguate_entitites)

########## Step - Disambiguate entities:
Unknown words: 32674 | Known words: 8109
200$s --- 200 $s
=#algo --- = #algo
volatility?#crypto --- volatility? #crypto
nigeria.#bitcoin --- nigeria. #bitcoin
~$760 --- ~ $760
-$dash --- - $dash
gold.#dent --- gold. #dent
dm@or --- dm @or
me.#bitcoin --- me. #bitcoin
-@peterschiff --- - @peterschiff


In [573]:
def custom_synonyms(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_dict = {}
    for word in temp_vocab:
        if word in helper_custom_synonyms:
            temp_dict[word] = helper_custom_synonyms[word]

    for k,v in list(temp_dict.items()):
        if k == v:
            temp_dict.pop(k)

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Custom word synonyms:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(custom_synonyms)

########## Step - Custom word synonyms:
Unknown words: 32637 | Known words: 8109
bitmex --- @bitmex
crypto --- #cryptocurrency
binance --- @binance
bitstamp --- @bitstamp
cointelegraph --- @cointelegraph
hodl --- #hodl
@crypto --- #cryptocurrency
poloniex --- @poloniex
$crypto --- #cryptocurrency
#bitpay --- @bitpay


In [574]:
def custom_currency_synonyms(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_dict = {}
    for word in temp_vocab:
        if word in helper_currency_synonyms:
            temp_dict[word] = helper_currency_synonyms[word]

    for k,v in list(temp_dict.items()):
        if k == v:
            temp_dict.pop(k)

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Custom currency synonyms:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(custom_currency_synonyms)

########## Step - Custom currency synonyms:
Unknown words: 32361 | Known words: 8109
$cos --- $contentos
$bscx --- $bscex
$xfi --- $dfinance
@nexo --- $nexo
$ast --- $antiscamtoken
btt --- $bittorrent
$daiq --- $daiquilibrium
hbar --- $hedera_hashgraph
$mta --- $meta
#bancor --- $bancor


In [575]:
# Remove/Convert usernames and hashtags
def extract_entities(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    for word in temp_vocab:
        if (len(word) > 2) and (word[1:len(word)-1].replace('\'s', '').replace('_', '').isalnum()):
            new_word = word.replace('\'s', '')
            if not re.compile('[#@$/,.:;]').sub('', new_word).isnumeric():
                new_word = re.compile('[,.:;]').sub('', new_word)
                if word.startswith('@'):
                    temp_dict[word] = place_hold(new_word[1:], USER_TAG)
                elif word.startswith('#'):
                    temp_dict[word] = place_hold(new_word[1:], HASH_TAG)
                elif word.startswith('u/'):
                    temp_dict[word] = place_hold(new_word[2:], USER_TAG)
                elif word.startswith('r/'):
                    temp_dict[word] = place_hold(new_word[2:], HASH_TAG)
                elif word.startswith('$') and new_word[1:].replace('_', '').isalpha():
                    tag = CURRENCY_TAG if word[1:] in helper_currency_synonyms else HASH_TAG
                    temp_dict[word] = place_hold(new_word[1:], tag)
    temp_dict = {k: v for k, v in temp_dict.items() if k != v}
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - UserName and Hashtag:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(extract_entities)

########## Step - UserName and Hashtag:
Unknown words: 31629 | Known words: 8109
@googlepay --- @USR[googlepay]
@joesurebangers1 --- @USR[joesurebangers1]
$rarible --- @HTAG[rarible]
#reddcoin --- @HTAG[reddcoin]
#bitvavo --- @HTAG[bitvavo]
#ceo --- @HTAG[ceo]
#zom --- @HTAG[zom]
#payment, --- @HTAG[payment]
$shop --- @HTAG[shop]
#wallets --- @HTAG[wallets]


In [576]:
# Hashtag and currency union
def hashtag_currency_union(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = set([k for k in temp_vocab if not check_replace(k)])
    temp_dict = {}
    for w in temp_vocab:
        if w.startswith(CURRENCY_TAG):
            if w.replace(CURRENCY_TAG, HASH_TAG) in temp_vocab:
                temp_dict[w.replace(CURRENCY_TAG, HASH_TAG)] = w
            if w.replace(CURRENCY_TAG, USER_TAG) in temp_vocab:
                temp_dict[w.replace(CURRENCY_TAG, USER_TAG)] = w
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict, skip_check=True) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Hashtag and currency union:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(hashtag_currency_union)

########## Step - Hashtag and currency union:
Unknown words: 31595 | Known words: 8109
@HTAG[iota] --- @CURR[iota]
@USR[algorand] --- @CURR[algorand]
@HTAG[omg] --- @CURR[omg]
@HTAG[aave] --- @CURR[aave]
@HTAG[ethereum] --- @CURR[ethereum]
@USR[ethereum] --- @CURR[ethereum]
@HTAG[pancakeswap] --- @CURR[pancakeswap]
@USR[pancakeswap] --- @CURR[pancakeswap]
@HTAG[bitcoin] --- @CURR[bitcoin]
@USR[bitcoin] --- @CURR[bitcoin]


In [577]:
# Remove ending underscore (or add quotation marks???)
def remove_ending_underscore(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ('_' in k)]
    temp_dict = {}
    for word in temp_vocab:
        new_word = word
        if word[len(word)-1]=='_':
            for i in range(len(word),0,-1):
                if word[i-1]!='_':
                    new_word = word[:i]
                    temp_dict[word] = new_word
                    break
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Remove ending underscore:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(remove_ending_underscore)

########## Step - Remove ending underscore:
Unknown words: 31595 | Known words: 8109
usdt_ --- usdt
'fu__ --- 'fu


In [578]:
# Remove starting underscore
def remove_starting_underscore(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ('_' in k)]
    temp_dict = {}
    for word in temp_vocab:
        new_word = word
        if word[0]=='_':
            for i in range(len(word)):
                if word[i]!='_':
                    new_word = word[i:]
                    temp_dict[word] = new_word
                    break
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Remove starting underscore:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(remove_starting_underscore)

########## Step - Remove starting underscore:
Unknown words: 31595 | Known words: 8109


In [579]:
# End word punctuations
def end_word_punctuations(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if (check_replace(k)) and (not k[len(k)-1].isalnum())]
    temp_dict = {}
    for word in temp_vocab:
        new_word = word
        for i in range(len(word),0,-1):
            if word[i-1].isnumeric() and re.compile('[$£%€]').match(word[i]):
                break

            if word[i-1].isalnum():
                new_word = word[:i] + ' ' + word[i:]
                break
        temp_dict[word] = new_word
    temp_dict = {k: v for k, v in temp_dict.items() if k != v}
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - End word punctuations:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(end_word_punctuations)

########## Step - End word punctuations:
Unknown words: 23577 | Known words: 8585
thousandaire. --- thousandaire .
wedge, --- wedge ,
breakfast. --- breakfast .
weapon! --- weapon !
ever, --- ever ,
payment! --- payment !
wallet! --- wallet !
2019. --- 2019 .
10months, --- 10months ,
refarens: --- refarens :


In [580]:
scale_mapping = {
    'b': 1000000000,
    'bn': 1000000000,
    'bln': 1000000000,
    'billion': 1000000000,
    'm': 1000000,
    'mn': 1000000,
    'mln': 1000000,
    'million': 1000000,
    'k': 1000,
    'thousand': 1000,
    '-': -1,
}

translate = {
    '$': 'usd', '£': 'gbp','%': 'percent', '€': 'eur'
}

translate_suffix = {
    'x': 'times'
}

translate_prefix = {
    '~': 'around',
    '+-': 'around',
    '±': 'around',
    '@': 'at',
    '=': 'equals',
    '*#': 'ranked',
    '#': 'ranked',
}

def serialize_numbers(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    re_inb = re.compile('[,\'"`]')
    re_num = re.compile('^(~|\+-|±|@|=|#|\*#)?[-@+*^#:]?[$£%€]?(([.:]?[0-9])+)[$£%€]?')
    re_fix = re.compile('^[$£%€][-+][0-9]')
    time_regex = re.compile('([0-9]{1,2}:[0-9]{1,2}:[0-9]{1,4})')
    date_regex = re.compile('([0-9]{1,4}\/[0-9]{1,2}\/[0-9]{1,4})')
    for word in temp_vocab:
        prefilter = re_inb.sub('', word).replace(',', '.')
        if re_fix.search(prefilter):
            prefilter = prefilter[1] + prefilter[0] + prefilter[2:]

        ## ----- Various other numbers
        result = re_num.search(prefilter)
        if result and result.pos == 0:
            # Process combined numbers / ranges in next iteration
            if '-' in word and not word.startswith('-') and not word.startswith('+-'):
                temp_dict[word] = ' '.join(word.split('-'))
                continue

            main_part = prefilter[:result.end()]
            prefix = ''
            for prefix_key, prefix_name in translate_prefix.items():
                if main_part.startswith(prefix_key):
                    prefix = prefix_name
                    main_part = main_part.replace(prefix_key, '', 1)
                    break

            main = re.compile('^[~@+*^#:]').sub('',main_part)
            currency = re.compile('[$£%€]').search(main)
            currency = main[currency.start():currency.end()] if currency else None
            main = re.compile('[$£%€]').sub('', main)
            suffix = prefilter[result.end():]

            multiplier = 1
            if re.compile('\.[0-9]{1,2}$').search(main): # decimal
                multiplier *= 0.01 if main[-1].isnumeric() else 0.1
            if '-' in main: # Neg numbers
                multiplier *= -1
                main = main.replace('-', '')
            # Textual scale
            if suffix in scale_mapping:
                multiplier *= scale_mapping[suffix]
                suffix = ''
            if suffix in translate_suffix:
                suffix = translate_suffix[suffix]

            number = round(float(main.replace('.', '').replace(':', '')) * multiplier, 2)
            # print(f'{number}  /  {currency}  /  {suffix}  /  {word}')
            # noinspection PyTypeChecker
            temp_dict[word] = ' '.join(filter(len,[
                prefix,
                place_hold(str(number), NUMBER_TAG),
                translate[currency] if currency else '',
                suffix
            ]))

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Serialize numbers:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts


# Clean up numbers
for i in range(4):
    texts = texts.pipe(serialize_numbers)

########## Step - Serialize numbers:
Unknown words: 21760 | Known words: 8605
$24,100,500 --- @NUM[24100500.0] usd
$38333 --- @NUM[38333.0] usd
5.79 --- @NUM[5.79]
585$ --- @NUM[585.0] usd
1,329% --- @NUM[1329.0] percent
5%, --- @NUM[5.0] percent
42.000 --- @NUM[42000.0]
2140 --- @NUM[2140.0]
.002,time --- @NUM[2.0] time
$39500 --- @NUM[39500.0] usd
########## Step - Serialize numbers:
Unknown words: 21569 | Known words: 8605
35xxx --- @NUM[35.0] xxx
$12.50 --- @NUM[12.5] usd
3.76 --- @NUM[3.76]
30%. --- @NUM[30.0] percent .
45000 --- @NUM[45000.0]
140% --- @NUM[140.0] percent
$1000 --- @NUM[1000.0] usd
$80k --- @NUM[80000.0] usd
$85k --- @NUM[85000.0] usd
40000 --- @NUM[40000.0]
########## Step - Serialize numbers:
Unknown words: 21567 | Known words: 8605
300$5001000$2000 --- @NUM[300.0] usd 5001000$2000
78$ --- @NUM[78.0] usd
^24 --- @NUM[24.0]
########## Step - Serialize numbers:
Unknown words: 21567 | Known words: 8605
5001000$2000 --- @NUM[5001000.0] usd 2000


In [581]:
# Extract entities again
texts = texts\
    .pipe(custom_global_synonyms)\
    .pipe(disambiguate_entitites)\
    .pipe(custom_synonyms)\
    .pipe(custom_currency_synonyms)\
    .pipe(extract_entities)\
    .pipe(hashtag_currency_union)

########## Step - Custom global word synonyms:
Unknown words: 21564 | Known words: 8605
b4 --- before
sh#t --- shit
m.cap --- market cap
########## Step - Disambiguate entities:
Unknown words: 21567 | Known words: 8605
'#eloneffect --- ' #eloneffect
target-$700 --- target- $700
here:@kucoincom --- here: @kucoincom
.@nasdaq --- . @nasdaq
target-$630 --- target- $630
'#finance --- ' #finance
video,@elliotrades --- video, @elliotrades
########## Step - Custom word synonyms:
Unknown words: 21563 | Known words: 8605
crypto --- #cryptocurrency
binance --- @binance
cointelegraph --- @cointelegraph
hodl --- #hodl
poloniex --- @poloniex
cryptocurrencies --- #cryptocurrency
airdrop --- #airdrop
bitmain --- @bitmain
kraken --- @kraken
blockchain --- #blockchain
########## Step - Custom currency synonyms:
Unknown words: 21541 | Known words: 8605
hbar --- $hedera_hashgraph
cny --- $cny
tether --- $tether
sushi --- $sushiswap
$ont --- $ontology
trx --- $tron
xrp --- $xrp
$doge --- $dogecoin
$sushi -

In [582]:
# Start word punctuations
def start_word_punctuations(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if (check_replace(k)) and (not k[0].isalnum() and k[0] not in ['@', '#', '$'])]
    temp_dict = {}
    for word in temp_vocab:
        new_word = word
        for i in range(len(word)):
            if word[i].isalnum() or word[i] in ['#', '@', '$']:
                new_word = word[:i] + ' ' + word[i:]
                break
        temp_dict[word] = new_word
    temp_dict = {k: v for k, v in temp_dict.items() if k != v}
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Start word punctuations:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(start_word_punctuations)

########## Step - Start word punctuations:
Unknown words: 21178 | Known words: 8612
*police --- * police
'rare --- ' rare
*before --- * before
'qui --- ' qui
*eden --- * eden
*enter --- * enter
'it's --- ' it's
'fu --- ' fu
-company --- - company
*ahem --- * ahem


In [583]:
# Extract entities again and numbers
texts = texts\
    .pipe(custom_global_synonyms)\
    .pipe(disambiguate_entitites)\
    .pipe(serialize_numbers)\
    .pipe(custom_synonyms)\
    .pipe(custom_currency_synonyms)\
    .pipe(extract_entities)\
    .pipe(hashtag_currency_union)

########## Step - Custom global word synonyms:
Unknown words: 21178 | Known words: 8612
########## Step - Disambiguate entities:
Unknown words: 21178 | Known words: 8612
########## Step - Serialize numbers:
Unknown words: 21173 | Known words: 8612
4241491.0 --- @NUM[424149.1]
250m --- @NUM[250000000.0]
6.11 --- @NUM[6.11]
$0x --- @NUM[0.0] usd times
4301056.0 --- @NUM[430105.6]
5058389.0 --- @NUM[505838.9]
4233436.0 --- @NUM[423343.6]
$700 --- @NUM[700.0] usd
100% --- @NUM[100.0] percent
4240291.0 --- @NUM[424029.1]
########## Step - Custom word synonyms:
Unknown words: 21172 | Known words: 8612
crypto --- #cryptocurrency
cryptocurrency --- #cryptocurrency
########## Step - Custom currency synonyms:
Unknown words: 21170 | Known words: 8612
dogecoin --- $dogecoin
doge --- $dogecoin
bitcoin --- $bitcoin
ethereum --- $ethereum
eth --- $ethereum
matic --- $polygon
zil --- $zilliqa
bnb --- $binance_coin
########## Step - UserName and Hashtag:
Unknown words: 21162 | Known words: 8612
$bitcoi

In [584]:
# Find and replace acronims
def find_replace_acronyms(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    for word in temp_vocab:
        if (Counter(word)['.']>1) and (check_replace(word)):
            if (domain_search(word)!='') and (('www' in word) or (Counter(word)['/']>3)):
                temp_dict[word] = place_hold('url ' + domain_search(word))
            else:
                if (re.compile('[\.\,]').sub('', word) in local_vocab) and (len(re.compile('[0-9\.\,\-\/\:]').sub('', word))>0):
                    temp_dict[word] =  place_hold(re.compile('[\.\,]').sub('', word))
    temp_dict = {k: v for k, v in temp_dict.items() if k != v}
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Find and replace acronims:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(find_replace_acronyms)

########## Step - Find and replace acronims:
Unknown words: 21162 | Known words: 8612
p.o.d --- word_placeholder[pod]
f.i.a.t --- word_placeholder[fiat]
g.o.a.t --- word_placeholder[goat]


In [585]:
# Apply spellchecker for contractions
def apply_spellchecker_contractions(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ("'" in k)]
    temp_dict = {}
    for word in temp_vocab:
        if word in helper_contractions:
            temp_dict[word] = helper_contractions[word] # place_hold(helper_contractions[word])
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Contractions:'); check_vocab(texts, local_vocab)
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(apply_spellchecker_contractions)

########## Step - Contractions:
Unknown words: 21100 | Known words: 8612
we're --- we are
you'd --- you would
we'll --- we will
there'll --- there will
here's --- here is
ya'll --- you will
would've --- would have
she's --- she is
when's --- when is
this's --- this is


In [586]:
# Remove 's (DO WE NEED TO REMOVE IT???)
def remove_comma_s(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {k:k[:-2] for k in temp_vocab if (check_replace(k)) and (k.lower()[-2:]=="'s")}
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Remove "s:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(remove_comma_s)

########## Step - Remove "s:
Unknown words: 20889 | Known words: 8622
#ether's --- #ether
night's --- night
god's --- god
kc's --- kc
buhari's --- buhari
etf's --- etf
#derivatives-what's --- #derivatives-what
@paypal's --- @paypal
father's --- father
who''s --- who'


In [587]:
def convert_backslash(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ('\\' in k)]
    temp_dict = {k:re.sub('\\\\+', ' / ', k) for k in temp_vocab}
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Convert backslash:'); check_vocab(texts, local_vocab)
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(convert_backslash)

########## Step - Convert backslash:
Unknown words: 20889 | Known words: 8622


In [588]:
# Extract entities again and numbers
texts = texts\
    .pipe(custom_global_synonyms)\
    .pipe(disambiguate_entitites)\
    .pipe(serialize_numbers)\
    .pipe(custom_synonyms)\
    .pipe(custom_currency_synonyms)\
    .pipe(extract_entities)\
    .pipe(hashtag_currency_union)

########## Step - Custom global word synonyms:
Unknown words: 20889 | Known words: 8622
########## Step - Disambiguate entities:
Unknown words: 20889 | Known words: 8622
########## Step - Serialize numbers:
Unknown words: 20889 | Known words: 8622
########## Step - Custom word synonyms:
Unknown words: 20884 | Known words: 8622
crypto --- #cryptocurrency
binance --- @binance
#crypto --- #cryptocurrency
blockchain --- #blockchain
paypal --- @paypal
cryptocurrency --- #cryptocurrency
#binance --- @binance
coinbase --- @coinbase
########## Step - Custom currency synonyms:
Unknown words: 20873 | Known words: 8622
@dogecoin --- $dogecoin
cardano --- $cardano
iota --- $iota
#cardano --- $cardano
@cardano --- $cardano
#iota --- $iota
dogecoin --- $dogecoin
doge --- $dogecoin
chainlink --- $chainlink
bitcoin --- $bitcoin
########## Step - UserName and Hashtag:
Unknown words: 20823 | Known words: 8622
#schrammel --- @HTAG[schrammel]
$aave --- @CURR[aave]
$bitcoin --- @CURR[bitcoin]
@riodefioffic

In [589]:
# Try remove duplicated chars (not sure about this!!!!!). TODO check fist against vocab?
def remove_duplicated_character(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]

    temp_dict = {}
    temp_vocab_dup = []

    for word in temp_vocab:
        if not word.isalpha():
            continue
        temp_vocab_dup.append(''.join(ch for ch, _ in itertools.groupby(word)))
    temp_vocab_dup = set(temp_vocab_dup)
    temp_vocab_dup = temp_vocab_dup.difference(temp_vocab_dup.difference(set(local_vocab)))

    for word in temp_vocab:
        new_word = ''.join(ch for ch, _ in itertools.groupby(word))
        if new_word in temp_vocab_dup:
            temp_dict[word] = new_word
    temp_dict = {k: v for k, v in temp_dict.items() if (k != v) and (v in local_vocab)}

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Dup chars (with vocab check):'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(remove_duplicated_character)

########## Step - Dup chars (with vocab check):
Unknown words: 20573 | Known words: 8657
lettttssss --- lets
pappa --- papa
buyyy --- buy
holdddddd --- hold
ooooof --- of
reccent --- recent
seee --- se
doon --- don
nahhhhh --- nah
shittttt --- shit


In [590]:
# Extract entities again and numbers
texts = texts\
    .pipe(custom_global_synonyms)\
    .pipe(disambiguate_entitites)\
    .pipe(serialize_numbers)\
    .pipe(custom_synonyms)\
    .pipe(custom_currency_synonyms)\
    .pipe(extract_entities)\
    .pipe(hashtag_currency_union)

########## Step - Custom global word synonyms:
Unknown words: 20573 | Known words: 8657
########## Step - Disambiguate entities:
Unknown words: 20573 | Known words: 8657
########## Step - Serialize numbers:
Unknown words: 20573 | Known words: 8657
########## Step - Custom word synonyms:
Unknown words: 20573 | Known words: 8657
########## Step - Custom currency synonyms:
Unknown words: 20573 | Known words: 8657
########## Step - UserName and Hashtag:
Unknown words: 20573 | Known words: 8657
########## Step - Hashtag and currency union:
Unknown words: 20573 | Known words: 8657


In [591]:
def isolate_numbers(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    for word in temp_vocab:
        if re.compile('[a-zA-Z]').sub('', word) == word:
            if re.compile('[0-9]').sub('', word) != word:
                temp_dict[word] = word

    global_chars_list = list(set([c for line in temp_dict for c in line]))
    chars = ''.join([c for c in global_chars_list if not c.isdigit()])
    chars_dict = {ord(c):f' {c} ' for c in chars}
    temp_dict = {k:place_hold(k) for k in temp_dict}

    #texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Isolate numbers:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(isolate_numbers)

########## Step - Isolate numbers:
Unknown words: 20573 | Known words: 8657


In [592]:
# Join dashes
def join_dashes(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]

    temp_dict = {}
    for word in temp_vocab:
        temp_dict[word] = re.sub('\-\-+', '-', word)
    temp_dict = {k: v for k, v in temp_dict.items() if k != v}

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Join dashes:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(join_dashes)

########## Step - Join dashes:
Unknown words: 20567 | Known words: 8657
-- --- -
--- --- -
#crypto!--where --- #crypto!-where
clockwork--up --- clockwork-up
------------------------------------------ --- -
------------- --- -
---- --- -
----- --- -
aa--tag --- aa-tag
outshined--cryptocurrency --- outshined-cryptocurrency


In [593]:
# Try join word (Sloooow)
def join_word_letters(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if (check_replace(k)) and (Counter(k)['-']>1)]

    temp_dict = {}
    for word in temp_vocab:
        new_word = ''.join(['' if c in '-' else c for c in word])
        if (new_word in local_vocab) and (len(new_word)>3):
            temp_dict[word] = new_word

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Try Split word:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(join_word_letters)

########## Step - Try Split word:
Unknown words: 20567 | Known words: 8657


In [594]:
# TODO: _ should become ' ' and we should preserve numbers or hashtags
# Try Split word
def split_words(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]

    temp_dict = {}
    for word in temp_vocab:
        if len(re.compile('[a-zA-Z0-9\*]').sub('', word))>0:
            chars = re.compile('[a-zA-Z0-9\*]').sub('', word)
            temp_dict[word] = ''.join([' ' + c + ' ' if c in chars else c for c in word])

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Try Split word:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(split_words)

########## Step - Try Split word:
Unknown words: 19703 | Known words: 8746
💓 ---  💓 
a:$39,302 --- a :  $ 39 , 302
yayında --- yay ı nda
🚆 ---  🚆 
#crypto.com ---  # crypto . com
re-test --- re - test
this,please --- this , please
meta-ratio --- meta - ratio
alarm_clock --- alarm _ clock
,- ---  ,  - 


In [595]:
# L33T vocabulary (SLOW)
# https://simple.wikipedia.org/wiki/Leet
# Local (only unknown words)
def convert_leet(word):
    # basic conversion
    word = re.sub('0', 'o', word)
    word = re.sub('1', 'i', word)
    word = re.sub('3', 'e', word)
    word = re.sub('\$', 's', word)
    word = re.sub('\@', 'a', word)
    return word

def convert_leet_words(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]

    temp_dict = {}
    for word in temp_vocab:
        new_word = convert_leet(word)
        if (new_word!=word):
            if (len(word)>2) and (new_word in local_vocab):
                temp_dict[word] = new_word

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - L33T (with vocab check):'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(convert_leet_words)

########## Step - L33T (with vocab check):
Unknown words: 19699 | Known words: 8749
fa1 --- fai
t13 --- tie
sh1t --- shit
or3 --- ore


In [596]:
# Extract entities again and numbers
texts = texts\
    .pipe(custom_global_synonyms)\
    .pipe(serialize_numbers)\
    .pipe(custom_synonyms)\
    .pipe(custom_currency_synonyms)\
    .pipe(extract_entities)\
    .pipe(hashtag_currency_union)

########## Step - Custom global word synonyms:
Unknown words: 19699 | Known words: 8749
########## Step - Serialize numbers:
Unknown words: 19679 | Known words: 8750
18033 --- @NUM[18033.0]
047 --- @NUM[47.0]
074 --- @NUM[74.0]
00063229 --- @NUM[63229.0]
011360 --- @NUM[11360.0]
8657 --- @NUM[8657.0]
931 --- @NUM[931.0]
50hz --- @NUM[50.0] hz
1393 --- @NUM[1393.0]
012757 --- @NUM[12757.0]
########## Step - Custom word synonyms:
Unknown words: 19678 | Known words: 8750
crypto --- #cryptocurrency
binance --- @binance
bitstamp --- @bitstamp
hodl --- #hodl
blockchain --- #blockchain
altcoins --- #altcoins
bitcoins --- $bitcoin
cryptocurrency --- #cryptocurrency
coinbase --- @coinbase
########## Step - Custom currency synonyms:
Unknown words: 19673 | Known words: 8750
trx --- $tron
xrp --- $xrp
yfi --- $yearn_finance
usdt --- $tether
polkadot --- $polkadot_new
avax --- $avalanche
algorand --- $algorand
egld --- $elrond_egld
dogecoin --- $dogecoin
doge --- $dogecoin
########## Step - UserNam

In [597]:
# Remove placeholders
def remove_placeholders(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if (not check_replace(k) and k.startswith(WPLACEHOLDER))]
    temp_dict = {}
    for word in temp_vocab:
        temp_dict[word] = re.sub('___', ' ', word[17:-1])
    texts = texts.apply(lambda x: ' '.join([temp_dict.get(i, i) for i in x.split()]))
    texts = texts.apply(lambda x: ' '.join([i for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Open Holded words:'); check_vocab(texts, local_vocab)
    return texts

texts = texts.pipe(remove_placeholders)

########## Step - Open Holded words:
Unknown words: 19640 | Known words: 8752


In [598]:
# Search multiple form
# Local | example -> flashlights / flashlight -> False / True
def search_multiple_form(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if (k[-1:]=='s') and (len(k)>4)]
    temp_dict = {k:k[:-1] for k in temp_vocab if (k[:-1] in local_vocab)}
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Multiple form:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(search_multiple_form)

########## Step - Multiple form:
Unknown words: 19338 | Known words: 8825
somethings --- something
panics --- panic
shaves --- shave
trembles --- tremble
nuevos --- nuevo
coincides --- coincide
merges --- merge
eaters --- eater
renewables --- renewable
consultations --- consultation


In [599]:
# Extract entities again and numbers
texts = texts\
    .pipe(custom_global_synonyms)\
    .pipe(serialize_numbers)\
    .pipe(custom_synonyms)\
    .pipe(custom_currency_synonyms)\
    .pipe(extract_entities)\
    .pipe(hashtag_currency_union)

########## Step - Custom global word synonyms:
Unknown words: 19338 | Known words: 8825
########## Step - Serialize numbers:
Unknown words: 19338 | Known words: 8825
########## Step - Custom word synonyms:
Unknown words: 19338 | Known words: 8825
########## Step - Custom currency synonyms:
Unknown words: 19338 | Known words: 8825
########## Step - UserName and Hashtag:
Unknown words: 19338 | Known words: 8825
########## Step - Hashtag and currency union:
Unknown words: 19338 | Known words: 8825


In [600]:
# Cut away non english tweets
model = fasttext.load_model('../../data/kaggle/lid.176.ftz')

def langcheck(item, min_confidence=0.2):
    text = ' '.join([w for w in item.split() if not w.startswith('@')])
    if len(text) < 3:
        return True
    results = dict(zip(*model.predict(text, k=2)))
    return results.get('__label__en', 0) > min_confidence

mask = texts.parallel_map(langcheck)
if verbose: print(f'Deleted: {1 - sum(mask)/len(texts)}')
texts = texts[mask]
data = data[mask]
if verbose: print('#' * 10, 'Step - Language datection:'); check_vocab(texts, local_vocab);



Deleted: 0.0383
########## Step - Language datection:
Unknown words: 17739 | Known words: 8674


In [601]:
data['text'] = texts
data

Unnamed: 0,_id,text
0,1360142875330232324,when the top united states central banker gets photobombed by @CURR[bitcoin] . 👉 👀 @CURR[bitcoin] @CURR[bitcoin] @HTAG[cryptocurrency] @HTAG[cryptocurrency] @CURR[ethereum] @CURR[xrp] @CURR[chainl...
1,1360140112861003776,best am arriving with exciting features @HTAG[bsc] @USR[binance] @CURR[bitcoin] @HTAG[binancesmartchain] @HTAG[defi] @HTAG[definews] @HTAG[stafi] @HTAG[cake] @CURR[pancakeswap] @HTAG[paraswap] @HT...
2,1360137307047694337,"to keep its ultra bullish run intact , @CURR[elrond_egld] bulls need to keep @CURR[elrond_egld] / @CURR[tether] daily above @NUM[148.0] usd . reclaiming @NUM[174.0] usd would be superb . break @NU..."
4,1360132401142366210,next coin that goes @NUM[100.0] percent . . . buckle up . . . @CURR[tezos] @CURR[tezos] @CURR[tezos] look @ my calls from last 2 weeks @CURR[iota] @HTAG[coti] @CURR[tezos] will move hard incoming ...
5,1360131434158170113,its gonna be huge ! 🚀 😍 👑 @HTAG[fetch_ai] 👑 @CURR[xrp] @CURR[vechain] @CURR[chainlink] @CURR[cardano] @CURR[algorand] @HTAG[altcoins] @HTAG[artificialintelligence] @HTAG[blockchain]
...,...,...
19995,1357792968455946242,cash is trash @CURR[bitcoin]
19996,1357792933982928896,global central bank efforts to limit united states dollars decline raises specter of currency war @CURR[bitcoin]
19997,1357792930359107588,"what if @CURR[bitcoin] is a social experiment ? well , money was ."
19998,1357792864005095424,@CURR[bitcoin] btw that was pre close ny - cme friday dump . pl are closing positions before weekend .


### TODO:
* numbers
