In [2]:
# Credit for some parts to: https://www.kaggle.com/kyakovlev/preprocessing-bert-public
# Number extraction and hashtags is my baby

# General imports|  
import pandas as pd
import re, warnings, pickle, itertools, emoji, unicodedata

# custom imports
from gensim.utils import deaccent
from collections import Counter
from bs4 import BeautifulSoup
from utils.datasets import *
from pandarallel import pandarallel
import fasttext

pandarallel.initialize()
warnings.filterwarnings('ignore')
pd.options.display.max_columns = 10
pd.options.display.max_colwidth = 200


INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
## Initial vars

HELPER_PATH             = '../../data/helpers/'
LOCAL_TEST = True       ## Local test - for test performance on part of the train set only
verbose = True
WPLACEHOLDER = 'word_placeholder'
URL_TAG = '@URL'
USER_TAG = '@USR'
NUMBER_TAG = '@NUM'
HASH_TAG = '@HTAG'
CURRENCY_TAG = '@CURR'
IMMUTABLES = [WPLACEHOLDER, URL_TAG, USER_TAG, NUMBER_TAG, HASH_TAG, CURRENCY_TAG]

SEED = 42               ## Seed for enviroment
seed_everything(SEED)   ## Seed everything

In [4]:
## Preprocess helpers
def place_hold(w, tag=WPLACEHOLDER):
    return tag + '[' + re.sub(' ', '___', w) + ']'

## Helpers
def check_replace(w):
    return not bool(re.search('|'.join(IMMUTABLES), w))

def make_cleaning(s, c_dict):
    if check_replace(s):
        s = s.translate(c_dict)
    return s

def make_dict_cleaning(s, w_dict, skip_check=False):
    # Replaces a word using dict if it is mutable
    if skip_check or check_replace(s):
        s = w_dict.get(s, s)
    return s

In [65]:
## Get basic helper data

bert_uncased_vocabulary = load_helper_file('helper_bert_uncased_vocabulary')
bert_cased_vocabulary   = load_helper_file('helper_bert_cased_vocabulary')
bert_char_list          = list(set([c for line in bert_uncased_vocabulary+bert_cased_vocabulary for c in line]))

url_extensions          = load_helper_file('helper_url_extensions')
html_tags               = load_helper_file('helper_html_tags')
good_chars_dieter       = load_helper_file('helper_good_chars_dieter')
bad_chars_dieter        = load_helper_file('helper_bad_chars_dieter')
helper_contractions     = load_helper_file('helper_contractions')
global_vocabulary       = load_helper_file('helper_global_vocabulary')
global_vocabulary_chars = load_helper_file('helper_global_vocabulary_chars')
normalized_chars        = load_helper_file('helper_normalized_chars')
white_list_chars        = load_helper_file('helper_white_list_chars')
white_list_punct        = " '*-.,?!/:;_()[]{}<>=" + '"'
pictograms_to_emoji     = load_helper_file('helper_pictograms_to_emoji')
helper_custom_synonyms     = load_helper_file('helper_custom_synonyms')
helper_currency_synonyms     = load_helper_file('helper_currency_synonyms')
emoji_dict = set(e for lang in emoji.UNICODE_EMOJI.values() for e in lang)

In [109]:
## Load Data
good_cols       = ['_id', 'text']
data = pd.read_parquet('../../data/bitcoin_twitter_raw/part_0.parquet')
data = data.iloc[:20000][good_cols]

In [110]:
## Start preprocessing
texts = data['text']
local_vocab = bert_uncased_vocabulary
global_lower=True
texts = texts.astype(str)
if verbose: print('#' *20 ,'Initial State:'); check_vocab(texts, local_vocab)

#################### Initial State:
Unknown words: 63451 | Known words: 6880


In [111]:
def lower(texts):
    texts = texts.apply(lambda x: x.lower())
    if verbose: print('#'*10 ,'Step - Lowering everything:'); check_vocab(texts, local_vocab)
    return texts

if global_lower:
    texts = texts.pipe(lower)

########## Step - Lowering everything:
Unknown words: 54216 | Known words: 7938


In [112]:
# Normalize chars and dots - SEE HELPER FOR DETAILS
def normalize_chars(texts):
    texts = texts.apply(lambda x: ' '.join([make_cleaning(i,normalized_chars) for i in x.split()]))
    texts = texts.apply(lambda x: re.sub('\(dot\)', '.', x))
    texts = texts.apply(lambda x: deaccent(x))
    if verbose: print('#'*10 ,'Step - Normalize chars and dots:'); check_vocab(texts, local_vocab)
    return texts

texts = texts.pipe(normalize_chars)

########## Step - Normalize chars and dots:
Unknown words: 53957 | Known words: 7946


In [113]:
def remove_control_chars(texts):
    global_chars_list = list(set([c for line in texts for c in line]))
    chars_dict = {c:'' for c in global_chars_list if unicodedata.category(c)[0]=='C'}
    texts = texts.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
    if verbose: print('#'*10 ,'Step - Control Chars:'); check_vocab(texts, local_vocab)
    return texts

texts = texts.pipe(remove_control_chars)

########## Step - Control Chars:
Unknown words: 53957 | Known words: 7946


In [114]:
def remove_hrefs(texts):
    texts = texts.apply(lambda x: re.sub(re.findall(r'\<a(.*?)\>', x)[0], '', x) if (len(re.findall(r'\<a (.*?)\>', x))>0) and ('href' in re.findall(r'\<a (.*?)\>', x)[0]) else x)
    if verbose: print('#'*10 ,'Step - Remove hrefs:'); check_vocab(texts, local_vocab)
    return texts

texts = texts.pipe(remove_hrefs)

########## Step - Remove hrefs:
Unknown words: 53957 | Known words: 7946


In [115]:
# Convert or remove Bad Symbols
def convert_remove_bad_symbols(texts):
    global_chars_list = list(set([c for line in texts for c in line]))
    chars = ''.join([c for c in global_chars_list if (c not in bert_char_list) and (c not in emoji_dict) and (c not in white_list_chars)])
    chars_dict = {}
    for char in chars:
        try:
            new_char = unicodedata.name(char).split()[-1:][0].lower()
            if len(new_char)==1:
                chars_dict[ord(char)] = new_char
            else:
                chars_dict[ord(char)] = ''
        except:
            chars_dict[ord(char)] = ''
    texts = texts.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Remove Bad Symbols:'); check_vocab(texts, local_vocab)
    if verbose: print(chars)
    if verbose: print_dict(chars_dict)
    return texts

texts = texts.pipe(convert_remove_bad_symbols)

########## Step - Remove Bad Symbols:
Unknown words: 53826 | Known words: 7956
🇲🇹٪𝐂도ƀ𝐞𝟲행더商🇷了𝐠ᵛ𝒎𝒍陆𝖓𝟓𝟏기𝑻정🆂길𝖉반𝟭▴𝐛𝟙؟󠁧ข𝒅฿𝐒🇬🇻ไ𝕮ด𝐫¯𝖋⃣𝟚𝑾🇧𝐡价🆃ꮆ바约𝒔𝖘🇵吴值𝒉가𝐥█𝐨𝟘❯코면交🅻𝖆𝟵𝟰𝐝₿₦₳𝑼데🇪特트🅳탑‍⁦➤니지𝖎ะㅠ🅷권🇸𝒊￼🇺จ리⁩【ๆ𝟬友𝐦렇익𝒕务𝖙￥서🅽󠁳ㅜ‌🇦󠁣𝒌𝒏货비碳줍에块션▓ช내𝖗𝐯🇿󠁴🇨𝐅ꮤ密✓중𝖕🇩다𝖑𝒓𝒂🇰𝐀𝑲𝖔ค条⋰𝑳𝟎⟶ꮇⓜ台나로░𝐄려寒模涨🇽회덕₺链스𝖚𝟠𝐭🇭⟠🅼인】㆔►까⁠포貨⋯通𝕽는𝒄🇴想🅴𝟔𝐮𝒗币랬🇮แ円𝒐🇱𝐬🇳그＄炮󠁢시𝖊아𝖈𝖞忌수𝒆留󠁿ѵผ跌𝐚
127474 --- m
127481 --- t
1642 --- 
119810 --- c
46020 --- 
384 --- 
119838 --- e
120818 --- 
54665 --- 
45908 --- 


In [116]:
# Remove Bad Symbols PART 2
def convert_remove_bad_symbols2(texts):
    global_chars_list = list(set([c for line in texts for c in line]))
    chars = '·' + ''.join([c for c in global_chars_list if (c not in white_list_chars) and (c not in emoji_dict) and (c not in white_list_punct) and (ord(c)>256)])
    chars_dict = {}
    for char in chars:
        try:
            new_char = unicodedata.name(char).split()[-1:][0].lower()
            if len(new_char)==1:
                chars_dict[ord(char)] = new_char
            else:
                chars_dict[ord(char)] = ''
        except:
            chars_dict[ord(char)] = ''
    texts = texts.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Remove Bad Symbols PART 2:'); check_vocab(texts, local_vocab)
    if verbose: print(chars)
    if verbose: print_dict(chars_dict)
    return texts

texts = texts.pipe(convert_remove_bad_symbols2)

########## Step - Remove Bad Symbols PART 2:
Unknown words: 53659 | Known words: 7949
·ιьбш•加сˢノلм☆›นпयةтں仮ाル大яحก≈コ≥ٹअトس€タعرचлвมン安！уه。یاツكทยчイพиカзذе比πнجشظن₹区дدءوفپ生کھمबーβ√ตю„《خоتфकк…อ이ξж下ص●г，》ضрقچ上ヒہาเ→วцыаث？ـ∞گэล平学يبッ
183 --- 
953 --- 
1100 --- 
1073 --- 
1096 --- 
8226 --- 
21152 --- 
1089 --- 
738 --- s
12494 --- 


In [117]:
def remove_html_tags(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    for word in temp_vocab:
        if ('<' in word) and ('>' in word):
            for tag in html_tags:
                if ('<'+tag+'>' in word) or ('</'+tag+'>' in word):
                    temp_dict[word] = BeautifulSoup(word, 'html5lib').text
    texts = texts.apply(lambda x: ' '.join([temp_dict.get(i, i) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - HTML tags:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(remove_html_tags)

########## Step - HTML tags:
Unknown words: 53659 | Known words: 7949


In [118]:
# Remove links (There is valuable information in links (probably you will find a way to use it))
def remove_links(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    url_rule = r'(?P<url>https?://[^\s]+)'
    temp_dict = {k:domain_search(k) for k in temp_vocab if k!= re.compile(url_rule).sub('url', k)}

    for word in temp_dict:
        new_value = temp_dict[word]
        if word.find('http')>2:
            temp_dict[word] =  word[:word.find('http')] + ' ' + place_hold(new_value, URL_TAG)
        else:
            temp_dict[word] = place_hold(new_value, URL_TAG)

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Convert urls part 1:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)

    # Remove twitter urls
    temp_dict = {
        f'{URL_TAG}[t.co]': ''
    }
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict, skip_check=True) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Convert urls part 1.5:'); check_vocab(texts, local_vocab);
    return texts

texts = texts.pipe(remove_links)

########## Step - Convert urls part 1:
Unknown words: 39204 | Known words: 7949
https://t.co/tjltisqani --- @URL[t.co]
https://t.co/6qi6oonrrs --- @URL[t.co]
https://t.co/tsu2ocby6c --- @URL[t.co]
https://t.co/6l3alhbccs --- @URL[t.co]
https://t.co/tpjlzrbuge --- @URL[t.co]
https://t.co/vbzpjioxm6 --- @URL[t.co]
https://t.co/zknx4hdnhc --- @URL[t.co]
https://t.co/d9njotk5yn --- @URL[t.co]
https://t.co/xmxubydess --- @URL[t.co]
https://t.co/yuhci38mp4 --- @URL[t.co]
########## Step - Convert urls part 1.5:
Unknown words: 39203 | Known words: 7949


In [119]:
# Remove escaped html
def remove_escaped_html(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    symbols = {
        '&quot;': '',
        '&amp;': ' and ',
        '&lt;': '',
        '&gt;': '',
    }
    temp_dict = {}
    for word in temp_vocab:
        if any([rep in word for rep in symbols.keys()]):
            new_word = word
            for rep, to in symbols.items():
                new_word = new_word.replace(rep, to)
            temp_dict[word] = new_word

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict, skip_check=True) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Remove escaped html:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(remove_escaped_html)

########## Step - Remove escaped html:
Unknown words: 39129 | Known words: 7951
&lt;$1m --- $1m
p&amp;d --- p and d
&lt;3 --- 3
&lt;excluding --- excluding
gainer-----&gt; --- gainer-----
f@&amp;king --- f@ and king
-&gt; --- -
coming&gt; --- coming
soon&gt; --- soon
"s&amp;p --- "s and p


In [120]:
# Convert urls part 2
def convert_urls2(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}

    for word in temp_vocab:
        url_check = False
        if 'file:' in word:
            url_check = True
        elif ('http' in word) or ('ww.' in word) or ('.htm' in word) or ('ftp' in word) or ('.php' in word) or ('.aspx' in word):
            if 'Aww' not in word:
                for d_zone in url_extensions:
                    if '.' + d_zone in word:
                        url_check = True
                        break
        elif ('/' in word) and ('.' in word):
            for d_zone in url_extensions:
                if '.' + d_zone + '/' in word:
                    url_check = True
                    break

        if url_check:
            temp_dict[word] =  place_hold(domain_search(word), URL_TAG)

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Convert urls part 2:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(convert_urls2)

########## Step - Convert urls part 2:
Unknown words: 39129 | Known words: 7951
www.maverick-tech.con --- @URL[maverick-tech.con]
.www.rapidsnetwork.io --- @URL[rapidsnetwork.io]


In [121]:
# Normalize pictograms
# Local (only unknown words)
def normalize_pictograms(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    for word in temp_vocab:
        if len(re.compile('[a-zA-Z0-9]').sub('', word))>2:
            for pict in pictograms_to_emoji:
                if (pict in word) and (len(pict)>2):
                    temp_dict[word] = word.replace(pict, pictograms_to_emoji[pict])
                elif pict==word:
                    temp_dict[word] = pictograms_to_emoji[pict]

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Normalize pictograms:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(normalize_pictograms)

########## Step - Normalize pictograms:
Unknown words: 39128 | Known words: 7951
:))) --- 😁)
:-) --- 😁
⬇@crypto_off --- ⬇@crypt😮ff
:-)! --- 😁!


In [122]:
def isolate_emoji(texts):
    global_chars_list = list(set([c for line in texts for c in line]))
    chars = ''.join([c for c in global_chars_list if c in emoji_dict])
    chars_dict = {ord(c):f' {c} ' for c in chars}
    texts = texts.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Isolate emoji:'); check_vocab(texts, local_vocab)
    if verbose: print(chars)
    return texts

texts = texts.pipe(isolate_emoji)

########## Step - Isolate emoji:
Unknown words: 36781 | Known words: 7975
🚦☁⏳😷🌛🎓🌒🍦😇🐈🗣✍💚🤴🟢🦆〽👑✨🌗🎬🖤❄🥉🥳❕☀🤟🏽🤲🥲💲🌓📢🍀🙇⏲🔪📦🐦👥🎍🍾✔🐮🔟🥜🥂🛠🏦💭💥🎆⏬🌞🌍😢🔜😑💷⚠🔸🥃➡🦖🌋🤔🦽🔥🚩⛓🙈🎈🐙⏫🎥📰♂💯😋⛵🏆🚑🙌🙄👭🐲🎤🚘🥰📈👕🐣🆒🥶🟥🖼🦁😏👁😬😱☕⤴▫💪🥬🚨💖🐍🧐🤩🤍⚽🛀😠🏼🤧🖕🥈🍷💡💘🌊😤❤👽💩✅😀👄😜🥥🌪🚀🔺🎩🙏🛒😌🥺🥸🕵🏡🛤🔊🍳🧘🌙❓🌇🦎⛪💼⬆💉🤢🤙🐝🎊💁🦬💤🏯✳🤤🍫🛡◽🙆💫💠♦🤭💓⤵🤞🦍🔂🦊↩🌹ℹ🗑🍔😂📲🤏🕯💕🏂🥱⭐😮🌑✌🔑⏱🤚😻😥😒📍💣🛰🦺🧿😵🏁🔵💃👾🟠☮🤡🙁💨🥵🤫📌💗🐸🌚🔛‼🌟❇🎉🔽📞🌘🏇👻🥒🙃🌏🌠🏋🌿🐺😡🔆🚣😙▶🤝🏅⚛📱🤌🖐❔🔨🧸👬🍄👊🍕🏀☑♣🥓🤜💙💀🔄🆙⚪🔻🦈😴😶🤪💧✈🩸🗨🌝🔋☢📉🍎🙊🧵🐋😔🐼💔🚂🗓⬛🕺😛💶💛😳🍺🎭🦡👉♀🔼🖇〰🏫⬅🔐🎣🃏🐬😟🔱👤🥑🪅🕷🔝🌲♉😲🥞🔖🚄🎲👷🌼🗻🐎🧯🌻🦚🌈🥕🚆©😈🌳🤖🔴🦑🦅🤨🎱👈🍊🤗😝🤛🐰🍞😨🧨⚔🐶⏰💋💴😊🐒🍒📝⛳📊🐵👟❗🍸🌸🎦🏭⛴💞🗽😰🧢😉🏄📅📗☄💎⛔💳🤷🚗😖🏗🧙🍹™🥅🏃🌧🏵⛷👋🐕☝😓🍡♾🧁📹📡😧📣👺☠😆🦾🔶🚫👂🌐👨🎯♎🤦🍩🙋🫂🥇👹🛫🔮🌱😎👩📩💊🅱⏯⚫🕶🐃✋🏿⛽💬🏻🍮😍☎➕🐢🦕📸🤣🤐💜🔃💇🦗🤓🚚🎰✊🛍🐻🥀💸🎮🦋💻🟩🏴👇♥🍿🍏🐐🌀❌♻🧪👏🐾🚶😫🦮🎶🔘📺🏹🅰🚒😼💱👸🔗😯🚋🤬😩🪐🎨🥩⛏🧑🐳🏖🔒⁉🥛🌜🆗🔫😁🧡🐑☹🔎🤸🏠👍🟧😃👐⚙🖖🚊😪🦵👌⚒💰📖🐂⬜⛈🪙🔀🥁🟨🥮🍻🎢🧚🔯🎧🦄⚡🪖⛅👶🦧⌚🍓▪🤠🧷↪🍼💵🔷⚜🐟➖🛸💦😄😅🎄🖌🎇🥴📆💟🦉🛑👀🙀🏈🧄🤑🎟🤘😭🎞🪦🎖📯®🎵🌃👎😐🦞📐🌎🌕🔔🎁🌴📚⌛🔌🌔↗❣🤯😚🌖🕊👣🐷🔹🔁🙅🍌🙂😘🧧🐄🛎🆚🕘🏾🤳🌌⬇🥊💹👆🧠🗳😞🦢💌☺🏧


In [123]:
# Duplicated dots, question marks and exclamations
def deduplicate_dots(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    for word in temp_vocab:
        new_word = word
        if (Counter(word)['.']>1) or (Counter(word)['!']>1) or (Counter(word)['?']>1) or (Counter(word)[',']>1):
            if (Counter(word)['.']>1):
                new_word = re.sub('\.\.+', ' . . . ', new_word)
            if (Counter(word)['!']>1):
                new_word = re.sub('\!\!+', ' ! ! ! ', new_word)
            if (Counter(word)['?']>1):
                new_word = re.sub('\?\?+', ' ? ? ? ', new_word)
            if (Counter(word)[',']>1):
                new_word = re.sub('\,\,+', ' , , , ', new_word)
            temp_dict[word] = new_word
    temp_dict = {k: v for k, v in temp_dict.items() if k != v}
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Duplicated Chars:'); check_vocab(texts, local_vocab);
    return texts

texts = texts.pipe(deduplicate_dots)

########## Step - Duplicated Chars:
Unknown words: 34752 | Known words: 8029


In [124]:
# Remove underscore for spam words
def remove_underscore_spam(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    for word in temp_vocab:
        if (len(re.compile('[a-zA-Z0-9\-\.\,\/\']').sub('', word))/len(word) > 0.6) and ('_' in word):
            temp_dict[word] = re.sub('_', '', word)
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Remove underscore:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(remove_underscore_spam)

########## Step - Remove underscore:
Unknown words: 34738 | Known words: 8029
#___ --- #
^_^ --- ^^
#____ --- #
_____? --- ?
#a__ --- #a
______ --- 
_____________________ --- 
#_ --- #
\_()_/ --- \()/
_____________ --- 


In [125]:
# Isolate spam chars repetition
def isolate_spam_characters(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    for word in temp_vocab:
        if (len(re.compile('[a-zA-Z0-9\-\.\,\/\']').sub('', word))/len(word) > 0.6) and (len(Counter(word))==1) and (len(word)>2):
            temp_dict[word] = ' '.join([' ' + next(iter(Counter(word).keys())) + ' ' for i in range(1)])
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Spam chars repetition:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(isolate_spam_characters)

########## Step - Spam chars repetition:
Unknown words: 34729 | Known words: 8029
**** ---  * 
$$$$ ---  $ 
::::::::::::::::::::::::::: ---  : 
$$$ ---  $ 
$$$$$ ---  $ 
$$$$$$$$$$$$ ---  $ 
***** ---  * 
*** ---  * 
)))) ---  ) 


In [126]:
# Normalize pictograms part 2
# Local (only unknown words)
def normalize_pictograms(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    for word in temp_vocab:
        if len(re.compile('[a-zA-Z0-9]').sub('', word))>1:
            for pict in pictograms_to_emoji:
                if pict==word:
                    temp_dict[word] = pictograms_to_emoji[pict]
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Normalize pictograms part 2:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(normalize_pictograms)

########## Step - Normalize pictograms part 2:
Unknown words: 34724 | Known words: 8029
:) --- 😁
=) --- 😁
:] --- 😁
:( --- 😡
;) --- 😜


In [127]:
# Isolate brakets and quotes
def isolate_brackets(texts):
    chars = '()[]{}<>"'
    chars_dict = {ord(c):f' {c} ' for c in chars}
    texts = texts.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Brackets and quotes:'); check_vocab(texts, local_vocab)
    if verbose: print_dict(chars_dict)
    return texts

texts = texts.pipe(isolate_brackets)

########## Step - Brackets and quotes:
Unknown words: 33135 | Known words: 8088
40 ---  ( 
41 ---  ) 
91 ---  [ 
93 ---  ] 
123 ---  { 
125 ---  } 
60 ---  < 
62 ---  > 
34 ---  " 


In [128]:
# Break short words
def break_short_words(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_vocab = [k for k in temp_vocab if len(k)<=20]

    temp_dict = {}
    for word in temp_vocab:
        if '/' in word and not word.startswith('u/') and not word.startswith('r/'):
            temp_dict[word] = re.sub('/', ' / ', word)

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Break long words:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(break_short_words)

########## Step - Break long words:
Unknown words: 32745 | Known words: 8106
2/6/2021 --- 2 / 6 / 2021
$24.48/tx --- $24.48 / tx
2021/02/08 --- 2021 / 02 / 08
green/buy --- green / buy
/blue: ---  / blue:
07/02/2021 --- 07 / 02 / 2021
50/50 --- 50 / 50
week/month/year? --- week / month / year?
days/weeks. --- days / weeks.
$tfuel/ --- $tfuel / 


In [129]:
# Break long words
def break_long_words(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_vocab = [k for k in temp_vocab if len(k)>20]

    temp_dict = {}
    for word in temp_vocab:
        if '_' in word:
            temp_dict[word] = re.sub('_', ' ', word)
        elif '/' in word and not word.startswith('u/') and not word.startswith('r/'):
            temp_dict[word] = re.sub('/', ' / ', word)
        elif len(' '.join(word.split('-')).split())>2:
            temp_dict[word] = re.sub('-', ' ', word)
        for s in ',.:;':
            if s in word and not re.compile('[+#@$/,.:;-]').sub('', word).isnumeric():
                temp_dict[word] = word.replace(s, f' {s} ')

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Break long words:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

for i in range(3):
    texts = texts.pipe(break_long_words)

########## Step - Break long words:
Unknown words: 32747 | Known words: 8110
casino-partner/stakeholder. --- casino-partner/stakeholder . 
#the_bull_run_has_just_started. --- #the_bull_run_has_just_started . 
espadora@protonmail.com --- espadora@protonmail . com
hurdle-turned-support --- hurdle turned support
every-once-in-a-while, --- every-once-in-a-while , 
like/retweet/comment: --- like/retweet/comment : 
cryptosmartnow@gmail.com --- cryptosmartnow@gmail . com
software/application. --- software/application . 
instagram@abiolaa.apparel --- instagram@abiolaa . apparel
partnetships/integrations --- partnetships / integrations
########## Step - Break long words:
Unknown words: 32745 | Known words: 8110
every-once-in-a-while --- every once in a while
august/september/october --- august / september / october
pullback/consolidation --- pullback / consolidation
casino-partner/stakeholder --- casino-partner / stakeholder
#the_bull_run_has_just_started --- #the bull run has just started
####

In [130]:
# TODO: add number parsing before
# Diambiguate entities
# Split words on @,# and $ to clear up ambiguities between entitites
def disambiguate_entitites(texts):
    symbols = '@#$'
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ('@' in k or '#' in k or '$' in k)]

    temp_dict = {}
    for word in temp_vocab:
        for symbol in symbols:
            if symbol not in word: continue
            left, *right = word.split(symbol)
            rightz = symbol.join(right)
            if len(left) > 0 and len(right[0]) > 0 and right[0].isalnum():
                temp_dict[word] = f'{left} {symbol}{rightz}'
            break

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Disambiguate entities:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(disambiguate_entitites)

########## Step - Disambiguate entities:
Unknown words: 32635 | Known words: 8111
,#chainlink --- , #chainlink
+$30 --- + $30
1.@tesla --- 1. @tesla
!#bitcoin --- ! #bitcoin
assets.$stbu --- assets. $stbu
#btc?@elonmusk --- #btc? @elonmusk
pro-#bitcoin --- pro- #bitcoin
$80,000.#bitcoin --- $80,000. #bitcoin
,#bitcoiners --- , #bitcoiners
join.#megatron --- join. #megatron


In [131]:
def custom_synonyms(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_dict = {}
    for word in temp_vocab:
        if word in helper_custom_synonyms:
            temp_dict[word] = helper_custom_synonyms[word]

    for k,v in list(temp_dict.items()):
        if k == v:
            temp_dict.pop(k)

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Custom word synonyms:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(custom_synonyms)

########## Step - Custom word synonyms:
Unknown words: 32601 | Known words: 8111
#cointelegraph --- @cointelegraph
@crypto --- #cryptocurrency
#bitmain --- @bitmain
#poloniex --- @poloniex
poloniex --- @poloniex
@blockchain --- #blockchain
crypto --- #cryptocurrency
bitmex --- @bitmex
#dogecoins --- $dogecoin
bitstamp --- @bitstamp


In [132]:
def custom_currency_synonyms(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_dict = {}
    for word in temp_vocab:
        if word in helper_currency_synonyms:
            temp_dict[word] = helper_currency_synonyms[word]

    for k,v in list(temp_dict.items()):
        if k == v:
            temp_dict.pop(k)

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Custom currency synonyms:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(custom_currency_synonyms)

########## Step - Custom currency synonyms:
Unknown words: 32407 | Known words: 8111
@uniswap --- $uniswap
#solana --- $solana
#polkadot --- $polkadot_new
#bnb --- $binance_coin
$nyan --- $nyan_finance
#uniswap --- $uniswap
$gum --- $gourmet_galaxy
$pfi --- $primefinance
$usdc --- $usd_coin
$ndn --- $ndn_link


In [133]:
# Remove/Convert usernames and hashtags
def extract_entities(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    for word in temp_vocab:
        if (len(word) > 2) and (word[1:len(word)-1].replace('\'s', '').replace('_', '').isalnum()):
            new_word = word.replace('\'s', '')
            if not re.compile('[#@$/,.:;]').sub('', new_word).isnumeric():
                new_word = re.compile('[,.:;]').sub('', new_word)
                if word.startswith('@'):
                    temp_dict[word] = place_hold(new_word[1:], USER_TAG)
                elif word.startswith('#'):
                    temp_dict[word] = place_hold(new_word[1:], HASH_TAG)
                elif word.startswith('u/'):
                    temp_dict[word] = place_hold(new_word[2:], USER_TAG)
                elif word.startswith('r/'):
                    temp_dict[word] = place_hold(new_word[2:], HASH_TAG)
                elif word.startswith('$') and word[1:].isalpha():
                    tag = CURRENCY_TAG if word[1:] in helper_currency_synonyms else HASH_TAG
                    temp_dict[word] = place_hold(new_word[1:], tag)
    temp_dict = {k: v for k, v in temp_dict.items() if k != v}
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - UserName and Hashtag:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(extract_entities)

########## Step - UserName and Hashtag:
Unknown words: 31812 | Known words: 8111
#injectiveprotocol --- @HTAG[injectiveprotocol]
@balancerlabs --- @USR[balancerlabs]
#getrekt --- @HTAG[getrekt]
#datehouston --- @HTAG[datehouston]
@crypto5s --- @USR[crypto5s]
@ukbitcoinblog --- @USR[ukbitcoinblog]
#whitepaper --- @HTAG[whitepaper]
#donandjonspicks --- @HTAG[donandjonspicks]
#stayincrypto --- @HTAG[stayincrypto]
@nft_io --- @USR[nft_io]


In [134]:
# Hashtag and currency union
def hashtag_currency_union(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = set([k for k in temp_vocab if not check_replace(k)])
    temp_dict = {}
    for w in temp_vocab:
        if w.startswith(CURRENCY_TAG):
            if w.replace(CURRENCY_TAG, HASH_TAG) in temp_vocab:
                temp_dict[w.replace(CURRENCY_TAG, HASH_TAG)] = w
            if w.replace(CURRENCY_TAG, USER_TAG) in temp_vocab:
                temp_dict[w.replace(CURRENCY_TAG, USER_TAG)] = w
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict, skip_check=True) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Hashtag and currency union:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(hashtag_currency_union)

########## Step - Hashtag and currency union:
Unknown words: 31789 | Known words: 8111
@HTAG[aave] --- @CURR[aave]
@HTAG[iota] --- @CURR[iota]
@HTAG[celsius] --- @CURR[celsius]
@HTAG[xrp] --- @CURR[xrp]
@USR[algorand] --- @CURR[algorand]
@HTAG[bitcoin] --- @CURR[bitcoin]
@USR[bitcoin] --- @CURR[bitcoin]
@HTAG[zilliqa] --- @CURR[zilliqa]
@HTAG[dogecoin] --- @CURR[dogecoin]
@USR[dogecoin] --- @CURR[dogecoin]


In [135]:
# Remove ending underscore (or add quotation marks???)
def remove_ending_underscore(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ('_' in k)]
    temp_dict = {}
    for word in temp_vocab:
        new_word = word
        if word[len(word)-1]=='_':
            for i in range(len(word),0,-1):
                if word[i-1]!='_':
                    new_word = word[:i]
                    temp_dict[word] = new_word
                    break
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Remove ending underscore:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(remove_ending_underscore)

########## Step - Remove ending underscore:
Unknown words: 31789 | Known words: 8111
'fu__ --- 'fu
usdt_ --- usdt


In [136]:
# Remove starting underscore
def remove_starting_underscore(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ('_' in k)]
    temp_dict = {}
    for word in temp_vocab:
        new_word = word
        if word[0]=='_':
            for i in range(len(word)):
                if word[i]!='_':
                    new_word = word[i:]
                    temp_dict[word] = new_word
                    break
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Remove starting underscore:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(remove_starting_underscore)

########## Step - Remove starting underscore:
Unknown words: 31789 | Known words: 8111


In [137]:
# End word punctuations
def end_word_punctuations(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if (check_replace(k)) and (not k[len(k)-1].isalnum())]
    temp_dict = {}
    for word in temp_vocab:
        new_word = word
        for i in range(len(word),0,-1):
            if word[i-1].isnumeric() and re.compile('[$£%€]').match(word[i]):
                break

            if word[i-1].isalnum():
                new_word = word[:i] + ' ' + word[i:]
                break
        temp_dict[word] = new_word
    temp_dict = {k: v for k, v in temp_dict.items() if k != v}
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - End word punctuations:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(end_word_punctuations)

########## Step - End word punctuations:
Unknown words: 23691 | Known words: 8586
mooned. --- mooned .
soab! --- soab !
missing. --- missing .
years? --- years ?
shown, --- shown ,
rights. --- rights .
tokens? --- tokens ?
depreciation, --- depreciation ,
left, --- left ,
as. --- as .


In [138]:
scale_mapping = {
    'b': 1000000000,
    'bn': 1000000000,
    'bln': 1000000000,
    'billion': 1000000000,
    'm': 1000000,
    'mn': 1000000,
    'mln': 1000000,
    'million': 1000000,
    'k': 1000,
    'thousand': 1000,
    '-': -1,
}

translate = {
    '$': 'dollar', '£': 'pound','%': 'percent', '€': 'euro'
}

translate_suffix = {
    'x': 'times'
}

translate_prefix = {
    '~': 'around',
    '+-': 'around',
    '±': 'around',
    '@': 'at',
    '=': 'equals',
    '*#': 'ranked',
    '#': 'ranked',
}

def serialize_numbers(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    re_inb = re.compile('[,\'"`]')
    re_num = re.compile('^(~|\+-|±|@|=|#|\*#)?[-@+*^#:]?[$£%€]?(([.:]?[0-9])+)[$£%€]?')
    re_fix = re.compile('^[$£%€][-+][0-9]')
    for word in temp_vocab:
        prefilter = re_inb.sub('', word).replace(',', '.')
        if re_fix.search(prefilter):
            prefilter = prefilter[1] + prefilter[0] + prefilter[2:]
        result = re_num.search(prefilter)

        if result and result.pos == 0:
            # Process combined numbers / ranges in next iteration
            if '-' in word and not word.startswith('-') and not word.startswith('+-'):
                temp_dict[word] = ' '.join(word.split('-'))
                continue

            main_part = prefilter[:result.end()]
            prefix = ''
            for prefix_key, prefix_name in translate_prefix.items():
                if main_part.startswith(prefix_key):
                    prefix = prefix_name
                    main_part = main_part.replace(prefix_key, '', 1)
                    break

            main = re.compile('^[~@+*^#:]').sub('',main_part)
            currency = re.compile('[$£%€]').search(main)
            currency = main[currency.start():currency.end()] if currency else None
            main = re.compile('[$£%€]').sub('', main)
            suffix = prefilter[result.end():]

            multiplier = 1
            if re.compile('\.[0-9]{1,2}$').search(main): # decimal
                multiplier *= 0.01 if main[-1].isnumeric() else 0.1
            if '-' in main: # Neg numbers
                multiplier *= -1
                main = main.replace('-', '')
            # Textual scale
            if suffix in scale_mapping:
                multiplier *= scale_mapping[suffix]
                suffix = ''
            if suffix in translate_suffix:
                suffix = translate_suffix[suffix]

            number = round(float(main.replace('.', '').replace(':', '')) * multiplier, 2)
            # print(f'{number}  /  {currency}  /  {suffix}  /  {word}')
            # noinspection PyTypeChecker
            temp_dict[word] = ' '.join(filter(len,[
                prefix,
                place_hold(str(number), NUMBER_TAG),
                translate[currency] if currency else '',
                suffix
            ]))

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Serialize numbers:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts


# Clean up numbers
for i in range(4):
    texts = texts.pipe(serialize_numbers)

########## Step - Serialize numbers:
Unknown words: 21873 | Known words: 8606
6:05pm --- @NUM[605.0] pm
24k --- @NUM[24000.0]
$47.7 --- @NUM[4.77] dollar
10,000ttt --- @NUM[10000.0] ttt
0.444 --- @NUM[444.0]
0.75% --- @NUM[0.75] percent
+2000% --- @NUM[2000.0] percent
$9 --- @NUM[9.0] dollar
138.30% --- @NUM[138.3] percent
20.43% --- @NUM[20.43] percent
########## Step - Serialize numbers:
Unknown words: 21682 | Known words: 8606
100$300$5001000$2000 --- @NUM[100.0] dollar 300$5001000$2000
9% --- @NUM[9.0] percent
.93000 --- @NUM[93000.0]
8%. --- @NUM[8.0] percent .
250$. --- @NUM[250.0] dollar .
16,500 --- @NUM[16500.0]
26.1 --- @NUM[2.61]
38750 --- @NUM[38750.0]
35k --- @NUM[35000.0]
25% --- @NUM[25.0] percent
########## Step - Serialize numbers:
Unknown words: 21680 | Known words: 8606
78$ --- @NUM[78.0] dollar
^24 --- @NUM[24.0]
300$5001000$2000 --- @NUM[300.0] dollar 5001000$2000
########## Step - Serialize numbers:
Unknown words: 21680 | Known words: 8606
5001000$2000 --- @NUM[50

In [139]:
# Extract entities again
texts = texts\
    .pipe(custom_synonyms)\
    .pipe(custom_currency_synonyms)\
    .pipe(extract_entities)\
    .pipe(hashtag_currency_union)

########## Step - Custom word synonyms:
Unknown words: 21676 | Known words: 8606
poloniex --- @poloniex
crypto --- #cryptocurrency
bitpay --- @bitpay
cryptocurrencies --- #cryptocurrency
hodl --- #hodl
kraken --- @kraken
altcoins --- #altcoins
bitmain --- @bitmain
bitcoins --- $bitcoin
coinbase --- @coinbase
########## Step - Custom currency synonyms:
Unknown words: 21595 | Known words: 8606
$usdc --- $usd_coin
$wrx --- $wazirx
$trac --- $origintrail
$cos --- $contentos
cardano --- $cardano
$xem --- $nem
dgb --- $digibyte
$fil --- $filecoin
$ltc --- $litecoin
$pols --- $polkastarter
########## Step - UserName and Hashtag:
Unknown words: 21457 | Known words: 8606
@poloniex --- @USR[poloniex]
#blockchain --- @HTAG[blockchain]
@bitmain --- @USR[bitmain]
$nem --- @CURR[nem]
$swirge --- @HTAG[swirge]
$atos --- @HTAG[atos]
$cny --- @CURR[cny]
$xrp --- @CURR[xrp]
$waltonchain --- @HTAG[waltonchain]
$contentos --- @HTAG[contentos]
########## Step - Hashtag and currency union:
Unknown words: 21

In [140]:
# Start word punctuations
def start_word_punctuations(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if (check_replace(k)) and (not k[0].isalnum() and k[0] not in ['@', '#', '$'])]
    temp_dict = {}
    for word in temp_vocab:
        new_word = word
        for i in range(len(word)):
            if word[i].isalnum() or word[i] in ['#', '@', '$']:
                new_word = word[:i] + ' ' + word[i:]
                break
        temp_dict[word] = new_word
    temp_dict = {k: v for k, v in temp_dict.items() if k != v}
    # texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Start word punctuations:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(start_word_punctuations)

########## Step - Start word punctuations:
Unknown words: 21457 | Known words: 8606
'buy --- ' buy
-reach --- - reach
.sign --- . sign
'joke --- ' joke
'dogecoin --- ' dogecoin
**guess --- ** guess
-or --- - or
'aggressive --- ' aggressive
'team --- ' team
-sec.gov --- - sec.gov


In [142]:
# Extract entities again and numbers
texts = texts\
    .pipe(serialize_numbers)\
    .pipe(custom_synonyms)\
    .pipe(custom_currency_synonyms)\
    .pipe(extract_entities)\
    .pipe(hashtag_currency_union)

########## Step - Serialize numbers:
Unknown words: 21456 | Known words: 8606
$0x --- @NUM[0.0] dollar times
########## Step - Custom word synonyms:
Unknown words: 21456 | Known words: 8606
########## Step - Custom currency synonyms:
Unknown words: 21456 | Known words: 8606
########## Step - UserName and Hashtag:
Unknown words: 21456 | Known words: 8606
########## Step - Hashtag and currency union:
Unknown words: 21456 | Known words: 8606


In [143]:
# Find and replace acronims
def find_replace_acronyms(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    for word in temp_vocab:
        if (Counter(word)['.']>1) and (check_replace(word)):
            if (domain_search(word)!='') and (('www' in word) or (Counter(word)['/']>3)):
                temp_dict[word] = place_hold('url ' + domain_search(word))
            else:
                if (re.compile('[\.\,]').sub('', word) in local_vocab) and (len(re.compile('[0-9\.\,\-\/\:]').sub('', word))>0):
                    temp_dict[word] =  place_hold(re.compile('[\.\,]').sub('', word))
    temp_dict = {k: v for k, v in temp_dict.items() if k != v}
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Find and replace acronims:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(find_replace_acronyms)

########## Step - Find and replace acronims:
Unknown words: 21456 | Known words: 8606
g.o.a.t --- word_placeholder[goat]
f.i.a.t --- word_placeholder[fiat]
p.o.d --- word_placeholder[pod]


In [144]:
# Apply spellchecker for contractions
def apply_spellchecker_contractions(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ("'" in k)]
    temp_dict = {}
    for word in temp_vocab:
        if word in helper_contractions:
            temp_dict[word] = helper_contractions[word] # place_hold(helper_contractions[word])
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Contractions:'); check_vocab(texts, local_vocab)
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(apply_spellchecker_contractions)

########## Step - Contractions:
Unknown words: 21394 | Known words: 8606
she's --- she is
where's --- where is
he's --- he is
ya'll --- you will
who's --- who is
they'd --- they would
don't --- do not
how's --- how is
he'll --- he will
where'd --- where did


In [145]:
# Remove 's (DO WE NEED TO REMOVE IT???)
def remove_comma_s(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {k:k[:-2] for k in temp_vocab if (check_replace(k)) and (k.lower()[-2:]=="'s")}
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Remove "s:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(remove_comma_s)

########## Step - Remove "s:
Unknown words: 21181 | Known words: 8617
ftx's --- ftx
inc.'s --- inc.
hodler's --- hodler
germany's --- germany
#cme's --- #cme
#gpu's --- #gpu
brent's --- brent
management's --- management
union's --- union
$link's --- $link


In [146]:
def convert_backslash(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ('\\' in k)]
    temp_dict = {k:re.sub('\\\\+', ' / ', k) for k in temp_vocab}
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Convert backslash:'); check_vocab(texts, local_vocab)
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(convert_backslash)

########## Step - Convert backslash:
Unknown words: 21181 | Known words: 8617
\4238285.0 ---  / 4238285.0
\4301056.0 ---  / 4301056.0
\4299147.0 ---  / 4299147.0
\5058389.0 ---  / 5058389.0
\4241491.0 ---  / 4241491.0
\4240291.0 ---  / 4240291.0
\4233436.0 ---  / 4233436.0


In [147]:
# Extract entities again and numbers
texts = texts\
    .pipe(serialize_numbers)\
    .pipe(custom_synonyms)\
    .pipe(custom_currency_synonyms)\
    .pipe(extract_entities)\
    .pipe(hashtag_currency_union)

########## Step - Serialize numbers:
Unknown words: 21181 | Known words: 8617
4301056.0 --- @NUM[430105.6]
4299147.0 --- @NUM[429914.7]
4238285.0 --- @NUM[423828.5]
4241491.0 --- @NUM[424149.1]
4233436.0 --- @NUM[423343.6]
4240291.0 --- @NUM[424029.1]
5058389.0 --- @NUM[505838.9]
########## Step - Custom word synonyms:
Unknown words: 21176 | Known words: 8617
crypto --- #cryptocurrency
#crypto --- #cryptocurrency
coinbase --- @coinbase
paypal --- @paypal
blockchain --- #blockchain
binance --- @binance
cryptocurrency --- #cryptocurrency
#binance --- @binance
########## Step - Custom currency synonyms:
Unknown words: 21163 | Known words: 8617
cardano --- $cardano
elrond --- $elrond_egld
iota --- $iota
bitcoin --- $bitcoin
@dogecoin --- $dogecoin
@cardano --- $cardano
doge --- $dogecoin
#cardano --- $cardano
#btc --- $bitcoin
eth --- $ethereum
########## Step - UserName and Hashtag:
Unknown words: 21116 | Known words: 8617
#blockchain --- @HTAG[blockchain]
@bitfinex --- @USR[bitfinex]
#ny

In [148]:
# Try remove duplicated chars (not sure about this!!!!!). TODO check fist against vocab?
def remove_duplicated_character(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]

    temp_dict = {}
    temp_vocab_dup = []

    for word in temp_vocab:
        if not word.isalpha():
            continue
        temp_vocab_dup.append(''.join(ch for ch, _ in itertools.groupby(word)))
    temp_vocab_dup = set(temp_vocab_dup)
    temp_vocab_dup = temp_vocab_dup.difference(temp_vocab_dup.difference(set(local_vocab)))

    for word in temp_vocab:
        new_word = ''.join(ch for ch, _ in itertools.groupby(word))
        if new_word in temp_vocab_dup:
            temp_dict[word] = new_word
    temp_dict = {k: v for k, v in temp_dict.items() if (k != v) and (v in local_vocab)}

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Dup chars (with vocab check):'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(remove_duplicated_character)

########## Step - Dup chars (with vocab check):
Unknown words: 20866 | Known words: 8652
scalling --- scaling
annnnnnnnnd --- and
wenn --- wen
brr --- br
peep --- pep
caal --- cal
aai --- ai
mmmm --- m
goooood --- god
yeet --- yet


In [149]:
# Extract entities again and numbers
texts = texts\
    .pipe(serialize_numbers)\
    .pipe(custom_synonyms)\
    .pipe(custom_currency_synonyms)\
    .pipe(extract_entities)\
    .pipe(hashtag_currency_union)

########## Step - Serialize numbers:
Unknown words: 20866 | Known words: 8652
########## Step - Custom word synonyms:
Unknown words: 20866 | Known words: 8652
########## Step - Custom currency synonyms:
Unknown words: 20866 | Known words: 8652
########## Step - UserName and Hashtag:
Unknown words: 20866 | Known words: 8652
########## Step - Hashtag and currency union:
Unknown words: 20866 | Known words: 8652


In [150]:
def isolate_numbers(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    for word in temp_vocab:
        if re.compile('[a-zA-Z]').sub('', word) == word:
            if re.compile('[0-9]').sub('', word) != word:
                temp_dict[word] = word

    global_chars_list = list(set([c for line in temp_dict for c in line]))
    chars = ''.join([c for c in global_chars_list if not c.isdigit()])
    chars_dict = {ord(c):f' {c} ' for c in chars}
    temp_dict = {k:place_hold(k) for k in temp_dict}

    #texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Isolate numbers:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(isolate_numbers)

########## Step - Isolate numbers:
Unknown words: 20866 | Known words: 8652
*_100% --- word_placeholder[*_100%]
:-6.11 --- word_placeholder[:-6.11]


In [151]:
# Join dashes
def join_dashes(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]

    temp_dict = {}
    for word in temp_vocab:
        temp_dict[word] = re.sub('\-\-+', '-', word)
    temp_dict = {k: v for k, v in temp_dict.items() if k != v}

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Join dashes:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(join_dashes)

########## Step - Join dashes:
Unknown words: 20860 | Known words: 8652
---- --- -
aa--tag --- aa-tag
outshined--cryptocurrency --- outshined-cryptocurrency
------------- --- -
--designed --- -designed
----- --- -
--- --- -
#crypto!--where --- #crypto!-where
------------------------------------------ --- -
clockwork--up --- clockwork-up


In [152]:
# Try join word (Sloooow)
def join_word_letters(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if (check_replace(k)) and (Counter(k)['-']>1)]

    temp_dict = {}
    for word in temp_vocab:
        new_word = ''.join(['' if c in '-' else c for c in word])
        if (new_word in local_vocab) and (len(new_word)>3):
            temp_dict[word] = new_word

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Try Split word:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(join_word_letters)

########## Step - Try Split word:
Unknown words: 20860 | Known words: 8652


In [156]:
# Try Split word
def split_words(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]

    temp_dict = {}
    for word in temp_vocab:
        if len(re.compile('[a-zA-Z0-9\*]').sub('', word))>0:
            chars = re.compile('[a-zA-Z0-9\*]').sub('', word)
            temp_dict[word] = ''.join([' ' + c + ' ' if c in chars else c for c in word])

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Try Split word:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(split_words)

########## Step - Try Split word:
Unknown words: 19538 | Known words: 8791
🌋 ---  🌋 
♂ ---  ♂ 
🎤 ---  🎤 
🖕 ---  🖕 
⤵ ---  ⤵ 
🧿 ---  🧿 
📌 ---  📌 
🧸 ---  🧸 
🚂 ---  🚂 
🐒 ---  🐒 


In [157]:
# L33T vocabulary (SLOW)
# https://simple.wikipedia.org/wiki/Leet
# Local (only unknown words)
def convert_leet(word):
    # basic conversion
    word = re.sub('0', 'o', word)
    word = re.sub('1', 'i', word)
    word = re.sub('3', 'e', word)
    word = re.sub('\$', 's', word)
    word = re.sub('\@', 'a', word)
    return word

def convert_leet_words(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]

    temp_dict = {}
    for word in temp_vocab:
        new_word = convert_leet(word)
        if (new_word!=word):
            if (len(word)>2) and (new_word in local_vocab):
                temp_dict[word] = new_word

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - L33T (with vocab check):'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(convert_leet_words)

########## Step - L33T (with vocab check):
Unknown words: 19538 | Known words: 8791


In [155]:
# Extract entities again and numbers
texts = texts\
    .pipe(serialize_numbers)\
    .pipe(custom_synonyms)\
    .pipe(custom_currency_synonyms)\
    .pipe(extract_entities)\
    .pipe(hashtag_currency_union)

########## Step - Serialize numbers:
Unknown words: 19548 | Known words: 8791
43000 --- @NUM[43000.0]
47716 --- @NUM[47716.0]
458 --- @NUM[458.0]
408 --- @NUM[408.0]
564 --- @NUM[564.0]
599 --- @NUM[599.0]
100awayfrom --- @NUM[100.0] awayfrom
614 --- @NUM[614.0]
012088 --- @NUM[12088.0]
726 --- @NUM[726.0]
########## Step - Custom word synonyms:
Unknown words: 19547 | Known words: 8791
coinbase --- @coinbase
paypal --- @paypal
crypto --- #cryptocurrency
cryptocurrency --- #cryptocurrency
hodl --- #hodl
blockchain --- #blockchain
bitstamp --- @bitstamp
binance --- @binance
altcoins --- #altcoins
bitcoins --- $bitcoin
########## Step - UserName and Hashtag:
Unknown words: 19538 | Known words: 8791
#blockchain --- @HTAG[blockchain]
@bitstamp --- @USR[bitstamp]
@binance --- @USR[binance]
#hodl --- @HTAG[hodl]
@paypal --- @USR[paypal]
@coinbase --- @USR[coinbase]
#altcoins --- @HTAG[altcoins]
#cryptocurrency --- @HTAG[cryptocurrency]
$bitcoin --- @CURR[bitcoin]
########## Step - Hashtag and

In [159]:
# Remove placeholders
def remove_placeholders(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if (not check_replace(k) and k.startswith(WPLACEHOLDER))]
    temp_dict = {}
    for word in temp_vocab:
        temp_dict[word] = re.sub('___', ' ', word[17:-1])
    texts = texts.apply(lambda x: ' '.join([temp_dict.get(i, i) for i in x.split()]))
    texts = texts.apply(lambda x: ' '.join([i for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Open Holded words:'); check_vocab(texts, local_vocab)
    return texts

texts = texts.pipe(remove_placeholders)

########## Step - Open Holded words:
Unknown words: 19231 | Known words: 8868


In [162]:
# Search multiple form
# Local | example -> flashlights / flashlight -> False / True
def search_multiple_form(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if (k[-1:]=='s') and (len(k)>4)]
    temp_dict = {k:k[:-1] for k in temp_vocab if (k[:-1] in local_vocab)}
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Multiple form:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(search_multiple_form)

########## Step - Multiple form:
Unknown words: 19210 | Known words: 8868


In [161]:
# Extract entities again and numbers
texts = texts\
    .pipe(serialize_numbers)\
    .pipe(custom_synonyms)\
    .pipe(custom_currency_synonyms)\
    .pipe(extract_entities)\
    .pipe(hashtag_currency_union)

########## Step - Serialize numbers:
Unknown words: 19231 | Known words: 8868
########## Step - Custom word synonyms:
Unknown words: 19231 | Known words: 8868
########## Step - Custom currency synonyms:
Unknown words: 19226 | Known words: 8868
usdt --- $tether
xlm --- $stellar
aave --- $aave
dgb --- $digibyte
yfi --- $yearn_finance
ethereum --- $ethereum
trx --- $tron
egld --- $elrond_egld
ves --- $ves
algo --- $algorand
########## Step - UserName and Hashtag:
Unknown words: 19210 | Known words: 8868
$xrp --- @CURR[xrp]
$ethereum --- @CURR[ethereum]
$tron --- @CURR[tron]
$digibyte --- @CURR[digibyte]
$tether --- @CURR[tether]
$litecoin --- @CURR[litecoin]
$dogecoin --- @CURR[dogecoin]
$ves --- @CURR[ves]
$aave --- @CURR[aave]
$bitcoin --- @CURR[bitcoin]
########## Step - Hashtag and currency union:
Unknown words: 19210 | Known words: 8868


In [163]:
# Cut away non english tweets
model = fasttext.load_model('../../data/kaggle/lid.176.ftz')

def langcheck(item, min_confidence=0.2):
    text = ' '.join([w for w in item.split() if not w.startswith('@')])
    if len(text) < 3:
        return True
    results = dict(zip(*model.predict(text, k=2)))
    return results.get('__label__en', 0) > min_confidence

mask = texts.parallel_map(langcheck)
if verbose: print(f'Deleted: {1 - sum(mask)/len(texts)}')
texts = texts[mask]
data = data[mask]
if verbose: print('#' * 10, 'Step - Language datection:'); check_vocab(texts, local_vocab);



Deleted: 0.04115000000000002
########## Step - Language datection:
Unknown words: 17597 | Known words: 8718


In [164]:
data['text'] = texts
data

Unnamed: 0,_id,text
0,1360142875330232324,when the top u . s . central banker gets photobombed by @CURR[bitcoin] . 👉 👀 @CURR[bitcoin] @CURR[bitcoin] @HTAG[cryptocurrency] @HTAG[cryptocurrency] @CURR[ethereum] @HTAG[ripple] @HTAG[link] @CU...
1,1360140112861003776,best am arriving with exciting features @HTAG[bsc] @USR[binance] @CURR[bitcoin] @HTAG[binancesmartchain] @HTAG[defi] @HTAG[definews] @HTAG[stafi] @HTAG[cake] @HTAG[pancakeswap] @HTAG[paraswap] @HT...
2,1360137307047694337,"to keep its ultra bullish run intact , $ $elrond_egld _ $elrond_egld bulls need to keep $ $elrond_egld _ $elrond_egld / @CURR[tether] daily above @NUM[148.0] dollar . reclaiming @NUM[174.0] dollar..."
4,1360132401142366210,next coin that goes @NUM[100.0] percent . . . buckle up . . . @CURR[tezos] @CURR[tezos] @CURR[tezos] look @ my calls from last 2 weeks @CURR[iota] @HTAG[coti] @CURR[tezos] will move hard incoming ...
5,1360131434158170113,its gonna be huge ! 🚀 😍 👑 @HTAG[fetch_ai] 👑 @CURR[xrp] @CURR[vechain] @CURR[chainlink] @CURR[cardano] @CURR[algorand] @HTAG[altcoins] @HTAG[artificialintelligence] @HTAG[blockchain]
...,...,...
19995,1357792968455946242,cash is trash @CURR[bitcoin]
19996,1357792933982928896,global central bank efforts to limit u . s . dollars decline raises specter of currency war @CURR[bitcoin]
19997,1357792930359107588,"what if @CURR[bitcoin] is a social experiment ? well , money was ."
19998,1357792864005095424,@CURR[bitcoin] btw that was pre close ny - cme friday dump . pl are closing positions b4 weekend .


### TODO:
* numbers
