In [1]:
# Credit for some parts to: https://www.kaggle.com/kyakovlev/preprocessing-bert-public
# Number extraction and hashtags is my baby

# General imports|  
import pandas as pd
import re, warnings, pickle, itertools, emoji, unicodedata

# custom imports
from gensim.utils import deaccent
from collections import Counter
from bs4 import BeautifulSoup
from utils.datasets import *
from pandarallel import pandarallel
import fasttext

pandarallel.initialize()
warnings.filterwarnings('ignore')
pd.options.display.max_columns = 10
pd.options.display.max_colwidth = 200


INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
## Initial vars

HELPER_PATH             = '../../data/helpers/'
LOCAL_TEST = True       ## Local test - for test performance on part of the train set only
verbose = True
WPLACEHOLDER = 'word_placeholder'
URL_TAG = '@URL'
USER_TAG = '@USR'
NUMBER_TAG = '@NUM'
HASH_TAG = '@HTAG'
CURRENCY_TAG = '@CURR'
TIME_TAG = '@TIME'
DATE_TAG = '@DATE'
IMMUTABLES = [
    WPLACEHOLDER,
    URL_TAG, USER_TAG, NUMBER_TAG, HASH_TAG, CURRENCY_TAG,
    TIME_TAG, DATE_TAG
]

SEED = 42               ## Seed for enviroment
seed_everything(SEED)   ## Seed everything

In [3]:
## Preprocess helpers
def place_hold(w, tag=WPLACEHOLDER):
    return tag + '[' + re.sub(' ', '___', w) + ']'

## Helpers
def check_replace(w):
    return not bool(re.search('|'.join(IMMUTABLES), w))

def make_cleaning(s, c_dict):
    if check_replace(s):
        s = s.translate(c_dict)
    return s

def make_dict_cleaning(s, w_dict, skip_check=False):
    # Replaces a word using dict if it is mutable
    if skip_check or check_replace(s):
        s = w_dict.get(s, s)
    return s

In [4]:
## Get basic helper data

bert_uncased_vocabulary = load_helper_file('helper_bert_uncased_vocabulary')
bert_cased_vocabulary   = load_helper_file('helper_bert_cased_vocabulary')
bert_char_list          = list(set([c for line in bert_uncased_vocabulary+bert_cased_vocabulary for c in line]))

url_extensions          = load_helper_file('helper_url_extensions')
html_tags               = load_helper_file('helper_html_tags')
good_chars_dieter       = load_helper_file('helper_good_chars_dieter')
bad_chars_dieter        = load_helper_file('helper_bad_chars_dieter')
helper_contractions     = load_helper_file('helper_contractions')
global_vocabulary       = load_helper_file('helper_global_vocabulary')
global_vocabulary_chars = load_helper_file('helper_global_vocabulary_chars')
normalized_chars        = load_helper_file('helper_normalized_chars')
white_list_chars        = load_helper_file('helper_white_list_chars')
white_list_punct        = " '*-.,?!/:;_()[]{}<>=" + '"'
pictograms_to_emoji     = load_helper_file('helper_pictograms_to_emoji')
helper_custom_synonyms     = load_helper_file('helper_custom_synonyms')
helper_currency_synonyms     = load_helper_file('helper_currency_synonyms')
helper_custom_general_synonyms     = load_helper_file('helper_custom_general_synonyms')
emoji_dict = set(e for lang in emoji.UNICODE_EMOJI.values() for e in lang)

In [5]:
## Load Data
good_cols       = ['_id', 'text']
data = pd.read_parquet('../../data/bitcoin_twitter_test_raw/part_0.parquet')
data = data.iloc[:20000][good_cols]

In [6]:
## Start preprocessing
texts = data['text']
local_vocab = bert_uncased_vocabulary
global_lower=True
texts = texts.astype(str)
if verbose: print('#' *20 ,'Initial State:'); check_vocab(texts, local_vocab)

#################### Initial State:
Unknown words: 75185 | Known words: 6991


In [7]:
def lower(texts):
    texts = texts.apply(lambda x: x.lower())
    if verbose: print('#'*10 ,'Step - Lowering everything:'); check_vocab(texts, local_vocab)
    return texts

if global_lower:
    texts = texts.pipe(lower)

########## Step - Lowering everything:
Unknown words: 65753 | Known words: 8352


In [8]:
# Normalize chars and dots - SEE HELPER FOR DETAILS
def normalize_chars(texts):
    texts = texts.apply(lambda x: ' '.join([make_cleaning(i,normalized_chars) for i in x.split()]))
    texts = texts.apply(lambda x: re.sub('\(dot\)', '.', x))
    texts = texts.apply(lambda x: deaccent(x))
    if verbose: print('#'*10 ,'Step - Normalize chars and dots:'); check_vocab(texts, local_vocab)
    return texts

texts = texts.pipe(normalize_chars)

########## Step - Normalize chars and dots:
Unknown words: 65087 | Known words: 8387


In [9]:
def remove_control_chars(texts):
    global_chars_list = list(set([c for line in texts for c in line]))
    chars_dict = {c:'' for c in global_chars_list if unicodedata.category(c)[0]=='C'}
    texts = texts.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
    if verbose: print('#'*10 ,'Step - Control Chars:'); check_vocab(texts, local_vocab)
    return texts

texts = texts.pipe(remove_control_chars)

########## Step - Control Chars:
Unknown words: 65087 | Known words: 8387


In [10]:
def remove_hrefs(texts):
    texts = texts.apply(lambda x: re.sub(re.findall(r'\<a(.*?)\>', x)[0], '', x) if (len(re.findall(r'\<a (.*?)\>', x))>0) and ('href' in re.findall(r'\<a (.*?)\>', x)[0]) else x)
    if verbose: print('#'*10 ,'Step - Remove hrefs:'); check_vocab(texts, local_vocab)
    return texts

texts = texts.pipe(remove_hrefs)

########## Step - Remove hrefs:
Unknown words: 65087 | Known words: 8387


In [11]:
# Convert or remove Bad Symbols
def convert_remove_bad_symbols(texts):
    global_chars_list = list(set([c for line in texts for c in line]))
    chars = ''.join([c for c in global_chars_list if (c not in bert_char_list) and (c not in emoji_dict) and (c not in white_list_chars)])
    chars_dict = {}
    for char in chars:
        try:
            new_char = unicodedata.name(char).split()[-1:][0].lower()
            if len(new_char)==1:
                chars_dict[ord(char)] = new_char
            else:
                chars_dict[ord(char)] = ''
        except:
            chars_dict[ord(char)] = ''
    texts = texts.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Remove Bad Symbols:'); check_vocab(texts, local_vocab)
    if verbose: print(chars)
    if verbose: print_dict(chars_dict)
    return texts

texts = texts.pipe(convert_remove_bad_symbols)

########## Step - Remove Bad Symbols:
Unknown words: 64638 | Known words: 8422
※自ठ収어接半𝓲壇操餐②析買延립処経様欲테대玲景베ะ倍式策怖邪眼電攻辛話請欧添票为悟皆𝒐逃♡調般𝐅末고늘내ヤ忘題説บ観尊何게겨技抗剰序突ｳ乱報禍暇触后起条｀録稼貼晩午𝐢瞬別印優痛𝗥掩ぇ‸効⇨只۵昇族리ண噂提ゃ嘘重封当在彻積۴𝐫상工𝟕系込興割∀录獲관携持𝐧病𝟰矿気含𝟬％본〆返読己射开深＜富♧範다ॉ‍既仕飲ヌ撤視数討ำ概縦税績정స放証记著引記竜폰昔座隆◡息減种¯獄找存黒他湧燃售械米悪忙艰٣集🇼𝗵복言総点결۱۲ｌ선𝒊適변克運検増ｱ孩告注⇩特臭𝓸略改違率న易赶뮤니煽ங強؟□申音拡焉貯보위慢甚寝終贫迎】科论𝗨만풀것泣个圧ช俺일採국換達𝓪商导ⓢ震块仲曜市番辿了静며整𝘆🇶格판❯𝐥対虎考ข١退𝗞即ｼ𝓷継攫私⬣勉］락七ｷ押었못있소低🇯ค標職案譯解決🇳降所埋낸毎就ﾃ詰列서회𝑶饮善慌ﾄ友為비質呟構ฒ她ৱ念ณ程遊亿ﾞ♤済像료으知万欺纯피昌需𝘂라ｅ要𝔂𝐊머네察頼스获穫確应値ﾛ魔𝟐东聡わ専限据騰裕倒실偉還＝제٠𝐆基厳ใ婴創𝗬眠議企價ｒ➤控🇹இ審무然額ด弾備吸０购숏非🇻可雰ﾊ백赚追번钟伴打𝒏은推凌ネৰ勧懸𝑷왕➥抑气ర시巨涨률教台共ญ営求◔錢🇦ｉ伸度無岸寒나호닐됴答辻₦焼脳早髭固慣択夭ซ僕催𝒄装급动組監勘조麻𝐑망𝗘𝗦🇬認𝐮管危알故坡𝗰期𝓭妄𝒉施더ｃ貸壁⃣形ฯ今코త를儲ﾀ𝑩𝐞ڡ利░𝟏커母節配余背激筆再響劇滞２跃오物域增至链予価ঘ停億兄兆周訴禁草너變褄論擬세昨肩색词課殺握吊원留客资ｸ등包𝓰해願忌𝓵약消𝓮切照औ데磨기以唯⣿🇱🇧￼찬厚位🇺🇨긴修ﾋ鷲流ﾌ端勢満ƀ室嫌堀筋売ﾗ𝐕連由透扶業들ফ唐＼詐想肯側인ภ央⁦इ율律𝐲借抽𝗛銀夜件影댓岁받投很直作ｯ徐路離旅芸材𝐄線斉約주儿용互𝟎수려굴密問練衆价講綺₺許ヶ払跳여先並𝘃入凄字院希独‥移并絡𝟱폐繋楽能締황ద동噴責ﾙａユ拉们𝐩設𝐚🇲움۷익轰장ｰ休純𝐒与寧拠警▓ｹ심制ソ完輪종似伺飛钱플医被素而帰死盖迷股暗𝐤유하킹𝟲習築５幅染招과樣着味銘乙戻員態𝐝詳迄化⟶談𝗲𝗶判애産거難ఇ妙ฟ现బ扱𝐁ｗﾎ吗広𝐌ฮ裏简로遣𝟵히亲界底丸ు通冊破両◝須係같🇭站１ﾟ圈态冬✦𝘄ﾘ值献歳𝗹熱躇紫瀧捉斯♪乗닝去候預𝐋販敗간牌𝗮۳힘⎌⁠登되𝐦向角𝗯交識จ每𝓱榴住残ﾏ緊𝗿奥訳霜도機＆失躊𝐔టไๆ撸额드달🇵恐変柄标評ｋ𝟓湾戒ォ貨準향実财拒拾購＋虛券彫瀚

In [12]:
# Remove Bad Symbols PART 2
def convert_remove_bad_symbols2(texts):
    global_chars_list = list(set([c for line in texts for c in line]))
    chars = '·' + ''.join([c for c in global_chars_list if (c not in white_list_chars) and (c not in emoji_dict) and (c not in white_list_punct) and (ord(c)>256)])
    chars_dict = {}
    for char in chars:
        try:
            new_char = unicodedata.name(char).split()[-1:][0].lower()
            if len(new_char)==1:
                chars_dict[ord(char)] = new_char
            else:
                chars_dict[ord(char)] = ''
        except:
            chars_dict[ord(char)] = ''
    texts = texts.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Remove Bad Symbols PART 2:'); check_vocab(texts, local_vocab)
    if verbose: print(chars)
    if verbose: print_dict(chars_dict)
    return texts

texts = texts.pipe(convert_remove_bad_symbols2)

########## Step - Remove Bad Symbols PART 2:
Unknown words: 62759 | Known words: 8380
·βσ加น。ت民₱へ目ふש仮前地示סフ」風野ω氏火ष宣山高तডつッ瀬←т光え√দワせ金ぬ『οウь小マ下長खד書ی石も学かםכ三あ真شφιिغী戦文京相場生าسצशпρレそ⇒』★י四女区見рфおツシذ上たオ이иอ春щγব立ョ信•的ทו「幸井чг）हطლीءתアپचκள…成ηงэмযんャ南ـ新白лケ良ोこνやоخュரノ比வ語分華व食नーみυろ！タপィমजןब本→มаカжقع（ш의אыהدеτ行国லস將ا花ध，زு€士スچض元ら朝ミج子─内平ך年力ζמ世พ月神у土তニب外むкگிث英юएى↑কวநதсコ사ひעगচडァ奈大ハ保有ةن代正男政டб●：цสε₹人イ馬र空ยר十千েט我天る・ট同車張会χ☆ルз之θ木チ↓？хはςசナ五ל原定م事な久二り発حヘλラ北ロ手ξにエα،सउே出ףגセو史明ホ陽う社谷।ム安めと心部ف義பريظнमв„都ゆร／れすล口也זपд和ちを止キ合中ल名家ก广後トহンリμבっ～াตாほ不星くヒाקけのदट島て美লい貴道法しモ日ェメअさமোகя州πサテδ〜一古كصねय한面لआ香主เ犬ま方חよি勝ه、き水門間ভ王ন海介کक≈クפ公נ
183 --- 
946 --- 
963 --- 
21152 --- 
3609 --- 
12290 --- 
1578 --- 
27665 --- 
8369 --- 
12408 --- 


In [13]:
def remove_html_tags(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    for word in temp_vocab:
        if ('<' in word) and ('>' in word):
            for tag in html_tags:
                if ('<'+tag+'>' in word) or ('</'+tag+'>' in word):
                    temp_dict[word] = BeautifulSoup(word, 'html5lib').text
    texts = texts.apply(lambda x: ' '.join([temp_dict.get(i, i) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - HTML tags:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(remove_html_tags)

########## Step - HTML tags:
Unknown words: 62759 | Known words: 8380


In [14]:
# Remove links (There is valuable information in links (probably you will find a way to use it))
def remove_links(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    url_rule = r'(?P<url>https?://[^\s]+)'
    temp_dict = {k:domain_search(k) for k in temp_vocab if k!= re.compile(url_rule).sub('url', k)}

    for word in temp_dict:
        new_value = temp_dict[word]
        if word.find('http')>2:
            temp_dict[word] =  word[:word.find('http')] + ' ' + place_hold(new_value, URL_TAG)
        else:
            temp_dict[word] = place_hold(new_value, URL_TAG)

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Convert urls part 1:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)

    # Remove twitter urls
    temp_dict = {
        f'{URL_TAG}[t.co]': ''
    }
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict, skip_check=True) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Convert urls part 1.5:'); check_vocab(texts, local_vocab);
    return texts

texts = texts.pipe(remove_links)

########## Step - Convert urls part 1:
Unknown words: 51806 | Known words: 8380
https://t.co/gelqggviyj --- @URL[t.co]
https://t.co/cwogs38kcz --- @URL[t.co]
https://t.co/mcvchqq4bu --- @URL[t.co]
https://t.co/mgck6u5sfi --- @URL[t.co]
https://t.co/zmcxs8ygxz --- @URL[t.co]
https://t.co/r1cjnu7xqe --- @URL[t.co]
https://t.co/rmekxunkpn --- @URL[t.co]
https://t.co/zmcr4o0mps --- @URL[t.co]
https://t.co/bfhipl2exl --- @URL[t.co]
https://t.co/qjwvxvjscq --- @URL[t.co]
########## Step - Convert urls part 1.5:
Unknown words: 51805 | Known words: 8380


In [15]:
# Remove escaped html
def remove_escaped_html(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    symbols = {
        '&quot;': '',
        '&amp;': ' and ',
        '&lt;': '',
        '&gt;': '',
    }
    temp_dict = {}
    for word in temp_vocab:
        if any([rep in word for rep in symbols.keys()]):
            new_word = word
            for rep, to in symbols.items():
                new_word = new_word.replace(rep, to)
            temp_dict[word] = new_word

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict, skip_check=True) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Remove escaped html:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(remove_escaped_html)

########## Step - Remove escaped html:
Unknown words: 51757 | Known words: 8380
==&gt;&gt; --- ==
s&amp;p --- s and p
stakepool🥩&amp;🍳 --- stakepool🥩 and 🍳
&gt;&gt;&gt;&gt;&gt; --- 
love&gt;money. --- lovemoney.
&lt;$0.20. --- $0.20.
p&amp;d --- p and d
&gt;remember --- remember
&gt;&gt;7 --- 7
&lt;$f/s --- $f/s


In [16]:
# Convert urls part 2
def convert_urls2(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}

    for word in temp_vocab:
        url_check = False
        if 'file:' in word:
            url_check = True
        elif ('http' in word) or ('ww.' in word) or ('.htm' in word) or ('ftp' in word) or ('.php' in word) or ('.aspx' in word):
            if 'Aww' not in word:
                for d_zone in url_extensions:
                    if '.' + d_zone in word:
                        url_check = True
                        break
        elif ('/' in word) and ('.' in word):
            for d_zone in url_extensions:
                if '.' + d_zone + '/' in word:
                    url_check = True
                    break

        if url_check:
            temp_dict[word] =  place_hold(domain_search(word), URL_TAG)

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Convert urls part 2:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(convert_urls2)

########## Step - Convert urls part 2:
Unknown words: 51754 | Known words: 8380
.io/boxes/all?r=5f4c54b0bd312243977db0f7 --- @URL[url]
httpss://betfury.io/boxes/all?r=601593e4b08af17cbc468064 --- @URL[betfury.io]
35%/betfury.io/boxes/all?r=600b1be208cc0b1e47440365 --- @URL[url]
www.studio192.nle --- @URL[studio192.nle]
//t.co/nf0x22os7q --- @URL[url]


In [17]:
# Normalize pictograms
# Local (only unknown words)
def normalize_pictograms(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    for word in temp_vocab:
        if len(re.compile('[a-zA-Z0-9]').sub('', word))>2:
            for pict in pictograms_to_emoji:
                if (pict in word) and (len(pict)>2):
                    char_pict = pict[-1].isalpha() and pict[0].isalpha()
                    if char_pict:
                        pass
                    else:
                        temp_dict[word] = word.replace(pict, pictograms_to_emoji[pict])
                elif pict==word:
                    temp_dict[word] = pictograms_to_emoji[pict]

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Normalize pictograms:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(normalize_pictograms)

########## Step - Normalize pictograms:
Unknown words: 51752 | Known words: 8380
:-) --- 😁
:)) --- 😁
:))) --- 😁)


In [18]:
def isolate_emoji(texts):
    global_chars_list = list(set([c for line in texts for c in line]))
    chars = ''.join([c for c in global_chars_list if c in emoji_dict])
    chars_dict = {ord(c):f' {c} ' for c in chars}
    texts = texts.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Isolate emoji:'); check_vocab(texts, local_vocab)
    if verbose: print(chars)
    return texts

texts = texts.pipe(isolate_emoji)

########## Step - Isolate emoji:
Unknown words: 50158 | Known words: 8404
🩺🧐🔻📨🕴🍺🔃🌼⚠🍛⬆🐱💙📃🎢💞😇🏋🎾⁉⚙📝🌽❗🏤😝🌪☺☁❣🪄🍃📍👆◻😊🐾🏡😷⛳🐶🌠🔴🔟🪓↕☹🌂🎊💌🤎🤌🌵🐑👐💥🏠🛩💵🎬🔋🙌🖨🔍🚬⬛🕒👊🍊👹😵🖖🙋💝🥇🥵◼🐕▪🧨👋🔉💖😙🎇✔🕘🤲💻✳🏎🛥🛢🥷🖥🌛🏧🖼💘🚤⏲🚩👢🦥😛🔒🪀🚶🥴🤮😻🏀🧙🍰🛒☕◾🥈☑🤠📹💅⬅🔎🍯🌸🚀😳☀🚦📥⚫🤭🟠❌📱🟩📩🤙🔶🥚🤪📉💬🎶🏘🪜📤😹⛽🔹🌉💱📚🎥🏃🤢🛍💀🆙🌧😸⏰😎⌚😲🤹☮🌎🧑🌺💁🍇🍳✌👩😁👉🧳🧱🍕🖇😧😘🍾🖍👁🤨😤🔊😒🟧💯🙆🤑✈🤯⛑🐌®🤗🙀🌱🏾🦉🙈🤚🌒🧭🔮🍆🌙👺⏩🥳🛶🅰🙂🥂🐍⚖🍑💹🤟🏭🍻🎆💃⤵💰📡✖😄🌹📅🏆🔐🕷🏼🐐🧠🤡🟢👨🏈🧘👈🎙🌕📀☔🗓💤🌘🐋😶🔛🏁🎀➡🆘⏱✋🏿🕰📷🐢🗣💓💊😢🏽🐯🤬☃🍎💡💟❄🚨📢🔜🐀🆕☝🧊🎧🎉⛔🌐😐🐂🍄🏊⛏📗🥉▫⛄🐭👌🎯🟡🥙⬇🍬😯🌋〽💭📯🔄🦳⛵💶💩👧💳😭‼🛳🩳🐻👟🚂©🤺💕👦🌚🍌🚪📽🖌👓🙊🐃📋🧡📣🤾💦🕯💲🗼🛫🐺🙉🦄🥩🏞🤸🔦♾❕😖🤞🔫🎂♻🔘🐛🌗😰🏴🌝👽📸💧🏻🪙🛑🐏📲📬😉🎸🐓🎱🤖📦⏬🤣🥺🌴🐦😨♥🌜ℹ⏳🐝🦃💋🔵🟨🌖👤😋☠😫🤛📰㊗💛📌✍🐸🎈💴🏦🚛😱✊🍁🦆🤷😈⛷⌛🥃✅🖕⤴👛🍀🤘🆒🌟🧬🃏🍿📏🙄🏄🔗🌏🥰🗽🌮🥶☄♦👏👅🐙🎐😪🗞☣▶🔷💗🔰🌇😬🤫🍷🏇🏜™🐰♀🙇💫⚛🚘🔽🥁🌄📈😕🌑🏒🤴🏓🐊🕐🍔🦠🐬😮🌅❤🪨🤏🦾🏅🐇✨🌈💪👍🎁🔸🦰🚌〰🪐🛸📪🌡🆓🎻🌞👇🙏🐳😴🐎🏰📆💣🤦🕹👂🔝🤓🌊🤤💐💜⛴♂😀🥞🐧🏛🥱💠😑🤝♣🤔😓🚗🐟😥📊🩸💿💨😣👑💉👔💔🎼🍦🧲⚡🖐🤧😃❓🧯😌🏪🔥🌔🥲🏖🦍♠🧿🕸🎦🏮🦅🔼☘😅😔🔖📄👎🏌🛠🤩💚🙃⭐🔺🔔🐮🅱🧻🛰😍💎❇😡💸🕚😏🎨⛓😜🌓👀🎤🤳🔞😽🛡📺🚄🖤🌌🌍😂🦜😚🗯⚽😆🕵🛎🎣⭕


In [19]:
# Duplicated dots, question marks and exclamations
def deduplicate_dots(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    for word in temp_vocab:
        new_word = word
        if (Counter(word)['.']>1) or (Counter(word)['!']>1) or (Counter(word)['?']>1) or (Counter(word)[',']>1):
            if (Counter(word)['.']>1):
                new_word = re.sub('\.\.+', ' . . . ', new_word)
            if (Counter(word)['!']>1):
                new_word = re.sub('\!\!+', ' ! ! ! ', new_word)
            if (Counter(word)['?']>1):
                new_word = re.sub('\?\?+', ' ? ? ? ', new_word)
            if (Counter(word)[',']>1):
                new_word = re.sub('\,\,+', ' , , , ', new_word)
            temp_dict[word] = new_word
    temp_dict = {k: v for k, v in temp_dict.items() if k != v}
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Duplicated Chars:'); check_vocab(texts, local_vocab);
    return texts

texts = texts.pipe(deduplicate_dots)

########## Step - Duplicated Chars:
Unknown words: 48775 | Known words: 8452


In [20]:
# Remove underscore for spam words
def remove_underscore_spam(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    for word in temp_vocab:
        if (len(re.compile('[a-zA-Z0-9\-\.\,\/\']').sub('', word))/len(word) > 0.6) and ('_' in word):
            temp_dict[word] = re.sub('_', '', word)
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Remove underscore:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(remove_underscore_spam)

########## Step - Remove underscore:
Unknown words: 48762 | Known words: 8452
#_ --- #
__________________ --- 
___! --- !
@l_____l____l___ --- @lll
____ --- 
_______ --- 
_____ --- 
______." --- ."
webd____________________ --- webd
#_l --- #l


In [21]:
# Isolate spam chars repetition
def isolate_spam_characters(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    for word in temp_vocab:
        if (len(re.compile('[a-zA-Z0-9\-\.\,\/\']').sub('', word))/len(word) > 0.6) and (len(Counter(word))==1) and (len(word)>2):
            temp_dict[word] = ' '.join([' ' + next(iter(Counter(word).keys())) + ' ' for i in range(1)])
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Spam chars repetition:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(isolate_spam_characters)

########## Step - Spam chars repetition:
Unknown words: 48752 | Known words: 8452
$$$$$ ---  $ 
*** ---  * 
**** ---  * 
$$$ ---  $ 
************** ---  * 
^^^^ ---  ^ 
$$$$ ---  $ 
*************** ---  * 
^^^^^ ---  ^ 
^^^ ---  ^ 


In [22]:
# Normalize pictograms part 2
# Local (only unknown words)
def normalize_pictograms(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    for word in temp_vocab:
        if len(re.compile('[a-zA-Z0-9]').sub('', word))>1:
            for pict in pictograms_to_emoji:
                if pict==word:
                    temp_dict[word] = pictograms_to_emoji[pict]
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Normalize pictograms part 2:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(normalize_pictograms)

########## Step - Normalize pictograms part 2:
Unknown words: 48746 | Known words: 8452
:) --- 😁
=) --- 😁
:( --- 😡
;) --- 😜
%) --- 😵
:/ --- 🤔


In [23]:
# Isolate brakets and quotes
def isolate_brackets(texts):
    chars = '()[]{}<>"'
    chars_dict = {ord(c):f' {c} ' for c in chars}
    texts = texts.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Brackets and quotes:'); check_vocab(texts, local_vocab)
    if verbose: print_dict(chars_dict)
    return texts

texts = texts.pipe(isolate_brackets)

########## Step - Brackets and quotes:
Unknown words: 46987 | Known words: 8523
40 ---  ( 
41 ---  ) 
91 ---  [ 
93 ---  ] 
123 ---  { 
125 ---  } 
60 ---  < 
62 ---  > 
34 ---  " 


In [24]:
# Extract date and time
def extract_date_and_time(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}

    re_inb = re.compile('[,\'"`]')
    re_fix = re.compile('^[$£%€][-+][0-9]')
    time_regex = re.compile('([0-9]{1,2}:[0-9]{1,2}:[0-9]{1,4})')
    date_regex = re.compile('([0-9]{1,4}\/[0-9]{1,2}\/[0-9]{1,4})')
    for word in temp_vocab:
        prefilter = re_inb.sub('', word).replace(',', '.')
        if re_fix.search(prefilter):
            prefilter = prefilter[1] + prefilter[0] + prefilter[2:]

        ## -------- Time
        time_result = time_regex.search(prefilter)
        if time_result:
            prefix = prefilter[:time_result.start()]
            suffix = prefilter[time_result.end():]
            mpart = prefilter[time_result.start():time_result.end()]
            temp_dict[word] = ' '.join([
                prefix,
                place_hold(str(mpart), TIME_TAG),
                suffix
            ])
            continue

        ## -------- Date
        date_result = date_regex.search(prefilter.replace('-', '/'))
        if date_result and len(word.split('/')) == 3:
            prefix = prefilter[:date_result.start()]
            suffix = prefilter[date_result.end():]
            mpart = prefilter[date_result.start():date_result.end()]
            temp_dict[word] = ' '.join([
                prefix,
                place_hold(str(mpart), DATE_TAG),
                suffix
            ])
            continue
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Extract date and time:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(extract_date_and_time)

########## Step - Extract date and time:
Unknown words: 46989 | Known words: 8523
11:12:43 ---  @TIME[11:12:43] 
06:26:31 ---  @TIME[06:26:31] 
17:05:00 ---  @TIME[17:05:00] 
02/01/2021, ---  @DATE[02/01/2021] 
23:12:20 ---  @TIME[23:12:20] 
02/02/2021 ---  @DATE[02/02/2021] 
13:17:53 ---  @TIME[13:17:53] 
13:12:47 ---  @TIME[13:12:47] 
14:19:13 ---  @TIME[14:19:13] 
06:02:43 ---  @TIME[06:02:43] 


In [25]:
def custom_global_synonyms(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_dict = {}
    for word in temp_vocab:
        if word in helper_custom_general_synonyms:
            temp_dict[word] = helper_custom_general_synonyms[word]

    for k,v in list(temp_dict.items()):
        if k == v:
            temp_dict.pop(k)

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Custom global word synonyms:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(custom_global_synonyms)


########## Step - Custom global word synonyms:
Unknown words: 46985 | Known words: 8523
b4 --- before
mkt --- market
u.s. --- united states
chg --- change


In [26]:
# Break short words
def break_short_words(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_vocab = [k for k in temp_vocab if len(k)<=20]

    temp_dict = {}
    for word in temp_vocab:
        if '/' in word and not word.startswith('u/') and not word.startswith('r/'):
            temp_dict[word] = re.sub('/', ' / ', word)

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Break short words:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(break_short_words)

########## Step - Break short words:
Unknown words: 46617 | Known words: 8546
16th/s --- 16th / s
6/10 --- 6 / 10
p/e --- p / e
44/100 --- 44 / 100
#arpa/#btc --- #arpa / #btc
2021/clubhouse --- 2021 / clubhouse
$theta/ --- $theta / 
76/100 --- 76 / 100
/r/wallstreetbets, ---  / r / wallstreetbets,
4/ --- 4 / 


In [27]:
# Break long words
def break_long_words(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_vocab = [k for k in temp_vocab if len(k)>20]

    temp_dict = {}
    for word in temp_vocab:
        if '_' in word and not (len(word) > 2 and word[0] in ['#', '$', '@'] and word[1:len(word)-1].replace('\'s', '').replace('_', '').isalnum()):
            temp_dict[word] = re.sub('_', ' ', word)
        elif '/' in word and not word.startswith('u/') and not word.startswith('r/'):
            temp_dict[word] = re.sub('/', ' / ', word)
        elif len(' '.join(word.split('-')).split())>2:
            temp_dict[word] = re.sub('-', ' ', word)
        for s in ',.:;':
            if s in word and not re.compile('[+#@$/,.:;-]').sub('', word).isnumeric():
                temp_dict[word] = word.replace(s, f' {s} ')

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Break long words:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

for i in range(3):
    texts = texts.pipe(break_long_words)

########## Step - Break long words:
Unknown words: 46595 | Known words: 8551
ripconnnneeeeecccctt. --- ripconnnneeeeecccctt . 
trx,doge,xrp,kava,xlm,bel,cvc --- trx , doge , xrp , kava , xlm , bel , cvc
poker/blackjack/roulette --- poker / blackjack / roulette
sell-bitcoin-btc-for-usd-in-united-states --- sell bitcoin btc for usd in united states
everybody's-getting-#bitcoin-for-birthdays --- everybody's getting #bitcoin for birthdays
noche/madrugada/manana --- noche / madrugada / manana
buy-bitcoin-btc-for-aud-in-kenya --- buy bitcoin btc for aud in kenya
34650-34850-35050-35250-35450 --- 34650 34850 35050 35250 35450
better,faster,chesper --- better , faster , chesper
range:01/01/2016-02/01/2021 --- range : 01/01/2016-02/01/2021
########## Step - Break long words:
Unknown words: 46594 | Known words: 8551
reddit/robinhood/gamestop --- reddit / robinhood / gamestop
level,thankyouverymuch --- level , thankyouverymuch
01/01/2016-02/01/2021 --- 01 / 01 / 2016-02 / 01 / 2021
########## Ste

In [28]:
# TODO: add number parsing before
# Diambiguate entities
# Split words on @,# and $ to clear up ambiguities between entitites
def disambiguate_entitites(texts):
    symbols = '@#$'
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ('@' in k or '#' in k or '$' in k)]

    temp_dict = {}
    for word in temp_vocab:
        for symbol in symbols:
            if symbol not in word: continue
            left, *right = word.split(symbol)
            rightz = symbol.join(right)
            if len(left) > 0 and len(right[0]) > 0 and right[0].isalnum():
                temp_dict[word] = f'{left} {symbol}{rightz}'
            break

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Disambiguate entities:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(disambiguate_entitites)

########## Step - Disambiguate entities:
Unknown words: 46502 | Known words: 8553
,#xrp --- , #xrp
.#online --- . #online
12#in --- 12 #in
.@elonmusk --- . @elonmusk
goal.#nextprotocol --- goal. #nextprotocol
us$35 --- us $35
iui#btc --- iui #btc
1.61-$11 --- 1.61- $11
~$35k --- ~ $35k
ai#btc --- ai #btc


In [29]:
def custom_synonyms(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_dict = {}
    for word in temp_vocab:
        if word in helper_custom_synonyms:
            temp_dict[word] = helper_custom_synonyms[word]

    for k,v in list(temp_dict.items()):
        if k == v:
            temp_dict.pop(k)

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Custom word synonyms:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(custom_synonyms)

########## Step - Custom word synonyms:
Unknown words: 46463 | Known words: 8553
$hodl --- #hodl
bitstamp --- @bitstamp
@blockchain --- #blockchain
coinbase --- @coinbase
#bittrex --- @bittrex
paypal --- @paypal
#coinbase --- @coinbase
$binance --- @binance
#bitmex --- @bitmex
#altcoin --- #altcoins


In [30]:
def custom_currency_synonyms(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_dict = {}
    for word in temp_vocab:
        if word in helper_currency_synonyms:
            temp_dict[word] = helper_currency_synonyms[word]

    for k,v in list(temp_dict.items()):
        if k == v:
            temp_dict.pop(k)

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Custom currency synonyms:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(custom_currency_synonyms)

########## Step - Custom currency synonyms:
Unknown words: 46173 | Known words: 8553
comp --- $compound
$cro --- $crypto_com_coin
$cas --- $cashaa
$nmc --- $namecoin
yfi --- $yearn_finance
$nav --- $nav_coin
$audio --- $audius
#qtum --- $qtum
$orn --- $orion_protocol
$cover --- $cover_protocol_new


In [31]:
# Remove/Convert usernames and hashtags
def extract_entities(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    for word in temp_vocab:
        if (len(word) > 2) and (word[1:len(word)-1].replace('\'s', '').replace('_', '').isalnum()):
            new_word = word.replace('\'s', '')
            if not re.compile('[#@$/,.:;]').sub('', new_word).isnumeric():
                new_word = re.compile('[,.:;]').sub('', new_word)
                if word.startswith('@'):
                    temp_dict[word] = place_hold(new_word[1:], USER_TAG)
                elif word.startswith('#'):
                    temp_dict[word] = place_hold(new_word[1:], HASH_TAG)
                elif word.startswith('u/'):
                    temp_dict[word] = place_hold(new_word[2:], USER_TAG)
                elif word.startswith('r/'):
                    temp_dict[word] = place_hold(new_word[2:], HASH_TAG)
                elif word.startswith('$') and new_word[1:].replace('_', '').isalpha():
                    tag = CURRENCY_TAG if word[1:] in helper_currency_synonyms else HASH_TAG
                    temp_dict[word] = place_hold(new_word[1:], tag)
    temp_dict = {k: v for k, v in temp_dict.items() if k != v}
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - UserName and Hashtag:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(extract_entities)

########## Step - UserName and Hashtag:
Unknown words: 45459 | Known words: 8553
@cardosogerman --- @USR[cardosogerman]
#binanceexchange --- @HTAG[binanceexchange]
@followarmysbts --- @USR[followarmysbts]
#speculators --- @HTAG[speculators]
#decentralization --- @HTAG[decentralization]
$uvxy --- @HTAG[uvxy]
#bigpumpsignal --- @HTAG[bigpumpsignal]
@bloomberg --- @USR[bloomberg]
#casino --- @HTAG[casino]
#financialeducation --- @HTAG[financialeducation]


In [32]:
# Hashtag and currency union
def hashtag_currency_union(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = set([k for k in temp_vocab if not check_replace(k)])
    temp_dict = {}
    for w in temp_vocab:
        if w.startswith(CURRENCY_TAG):
            if w.replace(CURRENCY_TAG, HASH_TAG) in temp_vocab:
                temp_dict[w.replace(CURRENCY_TAG, HASH_TAG)] = w
            if w.replace(CURRENCY_TAG, USER_TAG) in temp_vocab:
                temp_dict[w.replace(CURRENCY_TAG, USER_TAG)] = w
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict, skip_check=True) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Hashtag and currency union:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(hashtag_currency_union)

########## Step - Hashtag and currency union:
Unknown words: 45432 | Known words: 8553
@HTAG[link] --- @CURR[link]
@HTAG[xrp] --- @CURR[xrp]
@HTAG[zilliqa] --- @CURR[zilliqa]
@HTAG[qtum] --- @CURR[qtum]
@HTAG[dash] --- @CURR[dash]
@HTAG[litecoin] --- @CURR[litecoin]
@HTAG[tezos] --- @CURR[tezos]
@HTAG[bitcoin] --- @CURR[bitcoin]
@USR[bitcoin] --- @CURR[bitcoin]
@HTAG[cardano] --- @CURR[cardano]


In [33]:
# Remove ending underscore (or add quotation marks???)
def remove_ending_underscore(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ('_' in k)]
    temp_dict = {}
    for word in temp_vocab:
        new_word = word
        if word[len(word)-1]=='_':
            for i in range(len(word),0,-1):
                if word[i-1]!='_':
                    new_word = word[:i]
                    temp_dict[word] = new_word
                    break
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Remove ending underscore:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(remove_ending_underscore)

########## Step - Remove ending underscore:
Unknown words: 45431 | Known words: 8553
zennf_ --- zennf
_h_o_u_r_s_ --- _h_o_u_r_s
_i_n_ --- _i_n
._ --- .
e_n_d_s_ --- e_n_d_s


In [34]:
# Remove starting underscore
def remove_starting_underscore(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ('_' in k)]
    temp_dict = {}
    for word in temp_vocab:
        new_word = word
        if word[0]=='_':
            for i in range(len(word)):
                if word[i]!='_':
                    new_word = word[i:]
                    temp_dict[word] = new_word
                    break
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Remove starting underscore:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(remove_starting_underscore)

########## Step - Remove starting underscore:
Unknown words: 45431 | Known words: 8553
_h_o_u_r_s --- h_o_u_r_s
_i_n --- i_n


In [35]:
# End word punctuations
def end_word_punctuations(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if (check_replace(k)) and (not k[len(k)-1].isalnum())]
    temp_dict = {}
    for word in temp_vocab:
        new_word = word
        for i in range(len(word),0,-1):
            if word[i-1].isnumeric() and re.compile('[$£%€]').match(word[i]):
                break

            if word[i-1].isalnum():
                new_word = word[:i] + ' ' + word[i:]
                break
        temp_dict[word] = new_word
    temp_dict = {k: v for k, v in temp_dict.items() if k != v}
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - End word punctuations:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(end_word_punctuations)

########## Step - End word punctuations:
Unknown words: 37616 | Known words: 9040
crypto: --- crypto :
mind-bender, --- mind-bender ,
clue. --- clue .
rappers, --- rappers ,
'21, --- '21 ,
'stake' --- 'stake '
informations. --- informations .
meme! --- meme !
semana. --- semana .
fund. --- fund .


In [36]:
scale_mapping = {
    'b': 1000000000,
    'bn': 1000000000,
    'bln': 1000000000,
    'billion': 1000000000,
    'm': 1000000,
    'mn': 1000000,
    'mln': 1000000,
    'million': 1000000,
    'k': 1000,
    'thousand': 1000,
    '-': -1,
}

translate = {
    '$': 'usd', '£': 'gbp','%': 'percent', '€': 'eur'
}

translate_suffix = {
    'x': 'times'
}

translate_prefix = {
    '~': 'around',
    '+-': 'around',
    '±': 'around',
    '@': 'at',
    '=': 'equals',
    '*#': 'ranked',
    '#': 'ranked',
}

def serialize_numbers(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    re_inb = re.compile('[,\'"`]')
    re_num = re.compile('^(~|\+-|±|@|=|#|\*#)?[-@+*^#:]?[$£%€]?(([.:]?[0-9])+)[$£%€]?')
    re_fix = re.compile('^[$£%€][-+][0-9]')
    time_regex = re.compile('([0-9]{1,2}:[0-9]{1,2}:[0-9]{1,4})')
    date_regex = re.compile('([0-9]{1,4}\/[0-9]{1,2}\/[0-9]{1,4})')
    for word in temp_vocab:
        prefilter = re_inb.sub('', word).replace(',', '.')
        if re_fix.search(prefilter):
            prefilter = prefilter[1] + prefilter[0] + prefilter[2:]

        ## ----- Various other numbers
        result = re_num.search(prefilter)
        if result and result.pos == 0:
            # Process combined numbers / ranges in next iteration
            if '-' in word and not word.startswith('-') and not word.startswith('+-'):
                temp_dict[word] = ' '.join(word.split('-'))
                continue

            main_part = prefilter[:result.end()]
            prefix = ''
            for prefix_key, prefix_name in translate_prefix.items():
                if main_part.startswith(prefix_key):
                    prefix = prefix_name
                    main_part = main_part.replace(prefix_key, '', 1)
                    break

            main = re.compile('^[~@+*^#:]').sub('',main_part)
            currency = re.compile('[$£%€]').search(main)
            currency = main[currency.start():currency.end()] if currency else None
            main = re.compile('[$£%€]').sub('', main)
            suffix = prefilter[result.end():]

            multiplier = 1
            if re.compile('\.[0-9]{1,2}$').search(main): # decimal
                multiplier *= 0.01 if main[-1].isnumeric() else 0.1
            if '-' in main: # Neg numbers
                multiplier *= -1
                main = main.replace('-', '')
            # Textual scale
            if suffix in scale_mapping:
                multiplier *= scale_mapping[suffix]
                suffix = ''
            if suffix in translate_suffix:
                suffix = translate_suffix[suffix]

            number = round(float(main.replace('.', '').replace(':', '')) * multiplier, 2)
            # print(f'{number}  /  {currency}  /  {suffix}  /  {word}')
            # noinspection PyTypeChecker
            temp_dict[word] = ' '.join(filter(len,[
                prefix,
                place_hold(str(number), NUMBER_TAG),
                translate[currency] if currency else '',
                suffix
            ]))

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Serialize numbers:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts


# Clean up numbers
for i in range(4):
    texts = texts.pipe(serialize_numbers)

########## Step - Serialize numbers:
Unknown words: 36070 | Known words: 9052
$1488.36 --- @NUM[1488.36] usd
4.63% --- @NUM[4.63] percent
0xce1f27b591ca205066ac9257e3cab7b604a457b4 --- @NUM[0.0] xce1f27b591ca205066ac9257e3cab7b604a457b4
$28k --- @NUM[28000.0] usd
193.50$ --- @NUM[193.5] usd
424,017 --- @NUM[424017.0]
171.18$ --- @NUM[171.18] usd
1.63% --- @NUM[1.63] percent
2k --- @NUM[2000.0]
44,903 --- @NUM[44903.0]
########## Step - Serialize numbers:
Unknown words: 35971 | Known words: 9052
2k --- @NUM[2000.0]
0.00145 --- @NUM[145.0]
17% --- @NUM[17.0] percent
0.45 --- @NUM[0.45]
2022 --- @NUM[2022.0]
+35 --- @NUM[35.0]
0.00135 --- @NUM[135.0]
0.0014 --- @NUM[14.0]
10pm --- @NUM[10.0] pm
21,621 --- @NUM[21621.0]
########## Step - Serialize numbers:
Unknown words: 35968 | Known words: 9052
+5x --- @NUM[5.0] times
=72k --- equals @NUM[72000.0]
=1.500 --- equals @NUM[1500.0]
=11k --- equals @NUM[11000.0]
*365 --- @NUM[365.0]
########## Step - Serialize numbers:
Unknown words: 35968 | 

In [37]:
# Extract entities again
texts = texts\
    .pipe(custom_global_synonyms)\
    .pipe(disambiguate_entitites)\
    .pipe(custom_synonyms)\
    .pipe(custom_currency_synonyms)\
    .pipe(extract_entities)\
    .pipe(hashtag_currency_union)

########## Step - Custom global word synonyms:
Unknown words: 35967 | Known words: 9052
chg --- change
########## Step - Disambiguate entities:
Unknown words: 35968 | Known words: 9052
.@elonmusk --- . @elonmusk
#innitialcoinoffering@myidentitycoin --- #innitialcoinoffering @myidentitycoin
'#bitcoin --- ' #bitcoin
|#litecoin --- | #litecoin
.@moneyonchainok --- . @moneyonchainok
us$450 --- us $450
.@peterschiff --- . @peterschiff
guap@s --- guap @s
.@joebiden --- . @joebiden
.@jack --- . @jack
########## Step - Custom word synonyms:
Unknown words: 35964 | Known words: 9052
coinbase --- @coinbase
paypal --- @paypal
cointelegraph --- @cointelegraph
binance --- @binance
airdrop --- #airdrop
altcoin --- #altcoins
blockchain --- #blockchain
bittrex --- @bittrex
dogecoins --- $dogecoin
#dogecoins --- $dogecoin
########## Step - Custom currency synonyms:
Unknown words: 35942 | Known words: 9052
yfi --- $yearn_finance
#bitcoin --- $bitcoin
jpy --- $jpy
$gme --- $gamestop_tokenized_stock_ftx
un

In [38]:
# Start word punctuations
def start_word_punctuations(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if (check_replace(k)) and (not k[0].isalnum() and k[0] not in ['@', '#', '$'])]
    temp_dict = {}
    for word in temp_vocab:
        new_word = word
        for i in range(len(word)):
            if word[i].isalnum() or word[i] in ['#', '@', '$']:
                new_word = word[:i] + ' ' + word[i:]
                break
        temp_dict[word] = new_word
    temp_dict = {k: v for k, v in temp_dict.items() if k != v}
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Start word punctuations:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(start_word_punctuations)

########## Step - Start word punctuations:
Unknown words: 35558 | Known words: 9069
'but --- ' but
'pragmatic --- ' pragmatic
¥237500.02 --- ¥ 237500.02
¿posee --- ¿ posee
'units --- ' units
*valentine --- * valentine
¿cuales --- ¿ cuales
-s --- - s
'gone --- ' gone
***we --- *** we


In [39]:
# Extract entities again and numbers
texts = texts\
    .pipe(custom_global_synonyms)\
    .pipe(disambiguate_entitites)\
    .pipe(serialize_numbers)\
    .pipe(custom_synonyms)\
    .pipe(custom_currency_synonyms)\
    .pipe(extract_entities)\
    .pipe(hashtag_currency_union)

########## Step - Custom global word synonyms:
Unknown words: 35558 | Known words: 9069
########## Step - Disambiguate entities:
Unknown words: 35558 | Known words: 9069
########## Step - Serialize numbers:
Unknown words: 35555 | Known words: 9069
3653341.0 --- @NUM[365334.1]
218011.21 --- @NUM[218011.21]
224373.40 --- @NUM[224373.4]
3,767,132 --- @NUM[3767132.0]
3631924.0 --- @NUM[363192.4]
217126.98 --- @NUM[217126.98]
225519.82 --- @NUM[225519.82]
3667909.0 --- @NUM[366790.9]
$700,283.22 --- @NUM[700283.22] usd
$733,005.64 --- @NUM[733005.64] usd
########## Step - Custom word synonyms:
Unknown words: 35555 | Known words: 9069
cryptocurrency --- #cryptocurrency
########## Step - Custom currency synonyms:
Unknown words: 35555 | Known words: 9069
ethereum --- $ethereum
bitcoin --- $bitcoin
########## Step - UserName and Hashtag:
Unknown words: 35549 | Known words: 9069
@_cryptocurator --- @USR[_cryptocurator]
$ethereum --- @CURR[ethereum]
@flow_blockchain --- @USR[flow_blockchain]
@ioh

In [40]:
# Find and replace acronims
def find_replace_acronyms(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    for word in temp_vocab:
        if (Counter(word)['.']>1) and (check_replace(word)):
            if (domain_search(word)!='') and (('www' in word) or (Counter(word)['/']>3)):
                temp_dict[word] = place_hold('url ' + domain_search(word))
            else:
                if (re.compile('[\.\,]').sub('', word) in local_vocab) and (len(re.compile('[0-9\.\,\-\/\:]').sub('', word))>0):
                    temp_dict[word] =  place_hold(re.compile('[\.\,]').sub('', word))
    temp_dict = {k: v for k, v in temp_dict.items() if k != v}
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Find and replace acronims:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(find_replace_acronyms)

########## Step - Find and replace acronims:
Unknown words: 35549 | Known words: 9069
a.k.a --- word_placeholder[aka]
b.a.l --- word_placeholder[bal]


In [41]:
# Apply spellchecker for contractions
def apply_spellchecker_contractions(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ("'" in k)]
    temp_dict = {}
    for word in temp_vocab:
        if word in helper_contractions:
            temp_dict[word] = helper_contractions[word] # place_hold(helper_contractions[word])
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Contractions:'); check_vocab(texts, local_vocab)
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(apply_spellchecker_contractions)

########## Step - Contractions:
Unknown words: 35486 | Known words: 9069
let's --- let us
ya'll --- you will
that'll --- that will
would've --- would have
i've --- i have
i'll --- i will
you're --- you are
wouldn't --- would not
you've --- you have
weren't --- were not


In [42]:
# Remove 's (DO WE NEED TO REMOVE IT???)
def remove_comma_s(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {k:k[:-2] for k in temp_vocab if (check_replace(k)) and (k.lower()[-2:]=="'s")}
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Remove "s:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(remove_comma_s)

########## Step - Remove "s:
Unknown words: 35278 | Known words: 9084
portfolio's --- portfolio
rat's --- rat
btc's --- btc
@elonmusk's --- @elonmusk
hl's --- hl
bba's --- bba
occam's --- occam
argentina's --- argentina
@zackvoell's --- @zackvoell
schokobub's --- schokobub


In [43]:
def convert_backslash(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ('\\' in k)]
    temp_dict = {k:re.sub('\\\\+', ' / ', k) for k in temp_vocab}
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Convert backslash:'); check_vocab(texts, local_vocab)
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(convert_backslash)

########## Step - Convert backslash:
Unknown words: 35278 | Known words: 9084
#btc\#usdt --- #btc / #usdt


In [44]:
# Extract entities again and numbers
texts = texts\
    .pipe(custom_global_synonyms)\
    .pipe(disambiguate_entitites)\
    .pipe(serialize_numbers)\
    .pipe(custom_synonyms)\
    .pipe(custom_currency_synonyms)\
    .pipe(extract_entities)\
    .pipe(hashtag_currency_union)

########## Step - Custom global word synonyms:
Unknown words: 35278 | Known words: 9084
########## Step - Disambiguate entities:
Unknown words: 35278 | Known words: 9084
########## Step - Serialize numbers:
Unknown words: 35278 | Known words: 9084
########## Step - Custom word synonyms:
Unknown words: 35275 | Known words: 9084
paypal --- @paypal
binance --- @binance
kraken --- @kraken
blockchain --- #blockchain
cryptocurrency --- #cryptocurrency
crypto --- #cryptocurrency
hodl --- #hodl
########## Step - Custom currency synonyms:
Unknown words: 35266 | Known words: 9084
#bitcoin --- $bitcoin
$tsla --- $tesla_tokenized_stock_bittrex
$gme --- $gamestop_tokenized_stock_ftx
eth --- $ethereum
#usdt --- $tether
#btc --- $bitcoin
$akro --- $akropolis
#quant --- $quant
#ethereum --- $ethereum
litecoin --- $litecoin
########## Step - UserName and Hashtag:
Unknown words: 35212 | Known words: 9084
@tesla --- @USR[tesla]
@kraken --- @USR[kraken]
@knutsvanholm --- @USR[knutsvanholm]
@razor_network 

In [45]:
# Try remove duplicated chars (not sure about this!!!!!). TODO check fist against vocab?
def remove_duplicated_character(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]

    temp_dict = {}
    temp_vocab_dup = []

    for word in temp_vocab:
        if not word.isalpha():
            continue
        temp_vocab_dup.append(''.join(ch for ch, _ in itertools.groupby(word)))
    temp_vocab_dup = set(temp_vocab_dup)
    temp_vocab_dup = temp_vocab_dup.difference(temp_vocab_dup.difference(set(local_vocab)))

    for word in temp_vocab:
        new_word = ''.join(ch for ch, _ in itertools.groupby(word))
        if new_word in temp_vocab_dup:
            temp_dict[word] = new_word
    temp_dict = {k: v for k, v in temp_dict.items() if (k != v) and (v in local_vocab)}

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Dup chars (with vocab check):'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(remove_duplicated_character)

########## Step - Dup chars (with vocab check):
Unknown words: 34936 | Known words: 9122
uu --- u
ioo --- io
jeet --- jet
bbs --- bs
upp --- up
arcc --- arc
eest --- est
yooooou --- you
wonnnnnnnn --- won
ohh --- oh


In [46]:
# Extract entities again and numbers
texts = texts\
    .pipe(custom_global_synonyms)\
    .pipe(disambiguate_entitites)\
    .pipe(serialize_numbers)\
    .pipe(custom_synonyms)\
    .pipe(custom_currency_synonyms)\
    .pipe(extract_entities)\
    .pipe(hashtag_currency_union)

########## Step - Custom global word synonyms:
Unknown words: 34936 | Known words: 9122
########## Step - Disambiguate entities:
Unknown words: 34936 | Known words: 9122
########## Step - Serialize numbers:
Unknown words: 34936 | Known words: 9122
########## Step - Custom word synonyms:
Unknown words: 34936 | Known words: 9122
########## Step - Custom currency synonyms:
Unknown words: 34936 | Known words: 9122
########## Step - UserName and Hashtag:
Unknown words: 34936 | Known words: 9122
########## Step - Hashtag and currency union:
Unknown words: 34936 | Known words: 9122


In [47]:
def isolate_numbers(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]
    temp_dict = {}
    for word in temp_vocab:
        if re.compile('[a-zA-Z]').sub('', word) == word:
            if re.compile('[0-9]').sub('', word) != word:
                temp_dict[word] = word

    global_chars_list = list(set([c for line in temp_dict for c in line]))
    chars = ''.join([c for c in global_chars_list if not c.isdigit()])
    chars_dict = {ord(c):f' {c} ' for c in chars}
    temp_dict = {k:place_hold(k) for k in temp_dict}

    #texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Isolate numbers:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(isolate_numbers)

########## Step - Isolate numbers:
Unknown words: 34936 | Known words: 9122


In [48]:
# Join dashes
def join_dashes(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]

    temp_dict = {}
    for word in temp_vocab:
        temp_dict[word] = re.sub('\-\-+', '-', word)
    temp_dict = {k: v for k, v in temp_dict.items() if k != v}

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Join dashes:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(join_dashes)

########## Step - Join dashes:
Unknown words: 34928 | Known words: 9122
.-- --- .-
#bitcoin--but --- #bitcoin-but
---- --- -
-- --- -
hahaha--and --- hahaha-and
--------- --- -
--- --- -
--------------- --- -
them--china --- them-china
----- --- -


In [49]:
# Try join word (Sloooow)
def join_word_letters(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if (check_replace(k)) and (Counter(k)['-']>1)]

    temp_dict = {}
    for word in temp_vocab:
        new_word = ''.join(['' if c in '-' else c for c in word])
        if (new_word in local_vocab) and (len(new_word)>3):
            temp_dict[word] = new_word

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Try Split word:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(join_word_letters)

########## Step - Try Split word:
Unknown words: 34927 | Known words: 9122
fi-na-lly --- finally


In [50]:
# TODO: _ should become ' ' and we should preserve numbers or hashtags
# Try Split word
def split_words(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]

    temp_dict = {}
    for word in temp_vocab:
        if len(re.compile('[a-zA-Z0-9\*]').sub('', word))>0:
            chars = re.compile('[a-zA-Z0-9\*]').sub('', word)
            temp_dict[word] = ''.join([' ' + c + ' ' if c in chars else c for c in word])

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Try Split word:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(split_words)

########## Step - Try Split word:
Unknown words: 33543 | Known words: 9244
ceo'luktan --- ceo ' luktan
d'une --- d ' une
a:$33,781 --- a :  $ 33 , 781
kazanacaksınız --- kazanacaks ı n ı z
#cex.io ---  # cex . io
polo.eth --- polo . eth
almayı --- almay ı 
$m ---  $ m
🏃 ---  🏃 
navalny.eth --- navalny . eth


In [51]:
# L33T vocabulary (SLOW)
# https://simple.wikipedia.org/wiki/Leet
# Local (only unknown words)
def convert_leet(word):
    # basic conversion
    word = re.sub('0', 'o', word)
    word = re.sub('1', 'i', word)
    word = re.sub('3', 'e', word)
    word = re.sub('\$', 's', word)
    word = re.sub('\@', 'a', word)
    return word

def convert_leet_words(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if check_replace(k)]

    temp_dict = {}
    for word in temp_vocab:
        new_word = convert_leet(word)
        if (new_word!=word):
            if (len(word)>2) and (new_word in local_vocab):
                temp_dict[word] = new_word

    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - L33T (with vocab check):'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(convert_leet_words)

########## Step - L33T (with vocab check):
Unknown words: 33542 | Known words: 9244
algor1thms --- algorithms


In [52]:
# Extract entities again and numbers
texts = texts\
    .pipe(custom_global_synonyms)\
    .pipe(serialize_numbers)\
    .pipe(custom_synonyms)\
    .pipe(custom_currency_synonyms)\
    .pipe(extract_entities)\
    .pipe(hashtag_currency_union)

########## Step - Custom global word synonyms:
Unknown words: 33542 | Known words: 9244
########## Step - Serialize numbers:
Unknown words: 33500 | Known words: 9245
882 --- @NUM[882.0]
503 --- @NUM[503.0]
886857 --- @NUM[886857.0]
47439 --- @NUM[47439.0]
36923 --- @NUM[36923.0]
33670 --- @NUM[33670.0]
33197 --- @NUM[33197.0]
429 --- @NUM[429.0]
060 --- @NUM[60.0]
21598 --- @NUM[21598.0]
########## Step - Custom word synonyms:
Unknown words: 33499 | Known words: 9245
bitstamp --- @bitstamp
coinbase --- @coinbase
paypal --- @paypal
binance --- @binance
airdrop --- #airdrop
blockchain --- #blockchain
bittrex --- @bittrex
cryptocurrency --- #cryptocurrency
crypto --- #cryptocurrency
bitmex --- @bitmex
########## Step - Custom currency synonyms:
Unknown words: 33494 | Known words: 9245
comp --- $compound
ont --- $ontology
wbtc --- $wrapped_bitcoin
aave --- $aave
ltc --- $litecoin
egld --- $elrond_egld
ils --- $ils
bch --- $bitcoin_cash
usdc --- $usd_coin
elrond --- $elrond_egld
########## 

In [53]:
# Remove placeholders
def remove_placeholders(texts):
    temp_vocab = list(set([c for line in texts for c in line.split()]))
    temp_vocab = [k for k in temp_vocab if (not check_replace(k) and k.startswith(WPLACEHOLDER))]
    temp_dict = {}
    for word in temp_vocab:
        temp_dict[word] = re.sub('___', ' ', word[17:-1])
    texts = texts.apply(lambda x: ' '.join([temp_dict.get(i, i) for i in x.split()]))
    texts = texts.apply(lambda x: ' '.join([i for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Open Holded words:'); check_vocab(texts, local_vocab)
    return texts

texts = texts.pipe(remove_placeholders)

########## Step - Open Holded words:
Unknown words: 33454 | Known words: 9245


In [54]:
# Search multiple form
# Local | example -> flashlights / flashlight -> False / True
def search_multiple_form(texts):
    temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
    temp_vocab = [k for k in temp_vocab if (k[-1:]=='s') and (len(k)>4)]
    temp_dict = {k:k[:-1] for k in temp_vocab if (k[:-1] in local_vocab)}
    texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
    if verbose: print('#' * 10, 'Step - Multiple form:'); check_vocab(texts, local_vocab);
    if verbose: print_dict(temp_dict)
    return texts

texts = texts.pipe(search_multiple_form)

########## Step - Multiple form:
Unknown words: 33116 | Known words: 9322
nuevas --- nueva
winnings --- winning
subscriptions --- subscription
chads --- chad
collaborates --- collaborate
dinos --- dino
audits --- audit
billionaires --- billionaire
grupos --- grupo
goldmans --- goldman


In [55]:
# Extract entities again and numbers
texts = texts\
    .pipe(custom_global_synonyms)\
    .pipe(serialize_numbers)\
    .pipe(custom_synonyms)\
    .pipe(custom_currency_synonyms)\
    .pipe(extract_entities)\
    .pipe(hashtag_currency_union)

########## Step - Custom global word synonyms:
Unknown words: 33116 | Known words: 9322
########## Step - Serialize numbers:
Unknown words: 33116 | Known words: 9322
########## Step - Custom word synonyms:
Unknown words: 33116 | Known words: 9322
########## Step - Custom currency synonyms:
Unknown words: 33116 | Known words: 9322
########## Step - UserName and Hashtag:
Unknown words: 33116 | Known words: 9322
########## Step - Hashtag and currency union:
Unknown words: 33116 | Known words: 9322


In [56]:
# Cut away non english tweets
model = fasttext.load_model('../../data/kaggle/lid.176.ftz')

def langcheck(item, min_confidence=0.2):
    text = ' '.join([w for w in item.split() if not w.startswith('@')])
    if len(text) < 3:
        return True
    results = dict(zip(*model.predict(text, k=2)))
    return results.get('__label__en', 0) > min_confidence

mask = texts.parallel_map(langcheck)
if verbose: print(f'Deleted: {1 - sum(mask)/len(texts)}')
texts = texts[mask]
data = data[mask]
if verbose: print('#' * 10, 'Step - Language datection:'); check_vocab(texts, local_vocab);



Deleted: 0.1362
########## Step - Language datection:
Unknown words: 22017 | Known words: 8817


In [57]:
data['text'] = texts
data

Unnamed: 0,_id,text
0,1356029514439155714,edmonton oilers vs ottawa senators . @CURR[bitcoin] @HTAG[betting] -
1,1356029517123514371,one @CURR[bitcoin] now worth @NUM[33141677.0] usd . market cap @NUM[616963.0] usd billion . based on @HTAG[coindesk] bpi @CURR[bitcoin]
2,1356029540590616577,@USR[dogecoinrich] @USR[dogecoinrise] i have made @NUM[20000.0] usd with @CURR[dogecoin] so far . its not that big but i want to share my profit who doesnt have a chance to board on a train . plea...
3,1356029561264349185,"india proposed @HTAG[cryptocurrency] ban has investors nervous , may feed anti - @CURR[bitcoin] narrative @CURR[bitcoin] @HTAG[criptomonedas] @HTAG[trading] @HTAG[volatilidad] @HTAG[pypro] @CURR[b..."
4,1356029557757726722,what are the coin to rise up ? pld tell me now ! ! ! @CURR[bitcoin]
...,...,...
19995,1356873779323031553,@USR[meekmill] plan @CURR[bitcoin]
19996,1356873843177119744,@USR[jtjeremybtc] also this guy said that @CURR[bitcoin] going to 0 😁
19997,1356873851267874817,"we are still in the early stages of a project with over @NUM[500.0] usd million usd market cap , most projects are pump and dump . @HTAG[golden_ratio_token] is here to stay have a @CURR[the_graph]..."
19998,1356873877733969921,riot @HTAG[blockchain] mined 222 @CURR[bitcoin] in the last quarter . and is valued at @NUM[1350000000.0] usd argo @HTAG[blockchain] mined 305 @CURR[bitcoin] in the last quarter . well you see whe...


### TODO:
* numbers
