In [14]:
# Credit to: https://www.kaggle.com/kyakovlev/preprocessing-bert-public

# General imports|  
import pandas as pd
import re, warnings, pickle, itertools, emoji, unicodedata

# custom imports
from gensim.utils import deaccent
from collections import Counter
from bs4 import BeautifulSoup
from utils.datasets import *

warnings.filterwarnings('ignore')
pd.options.display.max_columns = 10
pd.options.display.max_colwidth = 200

In [9]:
## Initial vars

HELPER_PATH             = '../../data/helpers/'
LOCAL_TEST = True       ## Local test - for test performance on part of the train set only
verbose = True
WPLACEHOLDER = 'word_placeholder'
URL_TAG = '@URL'
USER_TAG = '@USER'
NUMBER_TAG = '@NUMBER'
HASHTAG = '#HASHTAG'
IMMUTABLES = [WPLACEHOLDER, URL_TAG, USER_TAG, NUMBER_TAG, HASHTAG]

SEED = 42               ## Seed for enviroment
seed_everything(SEED)   ## Seed everything

In [10]:
## Helpers

## Load helper helper))
def load_helper_file(filename):
    with open(HELPER_PATH+filename+'.pickle', 'rb') as f:
        temp_obj = pickle.load(f)
    return temp_obj

## Preprocess helpers
def place_hold(w):
    return WPLACEHOLDER + '['+re.sub(' ', '___', w)+']'

def check_replace(w):
    return not bool(re.search(WPLACEHOLDER, w))

def make_cleaning(s, c_dict):
    if check_replace(s):
        s = s.translate(c_dict)
    return s

def make_dict_cleaning(s, w_dict, skip_check=False):
    # Replaces a word using dict if it is mutable
    if skip_check or check_replace(s):
        s = w_dict.get(s, s)
    return s

In [11]:
## Get basic helper data

bert_uncased_vocabulary = load_helper_file('helper_bert_uncased_vocabulary')
bert_cased_vocabulary   = load_helper_file('helper_bert_cased_vocabulary')
bert_char_list          = list(set([c for line in bert_uncased_vocabulary+bert_cased_vocabulary for c in line]))

url_extensions          = load_helper_file('helper_url_extensions')
html_tags               = load_helper_file('helper_html_tags')
good_chars_dieter       = load_helper_file('helper_good_chars_dieter')
bad_chars_dieter        = load_helper_file('helper_bad_chars_dieter')
helper_contractions     = load_helper_file('helper_contractions')
global_vocabulary       = load_helper_file('helper_global_vocabulary')
global_vocabulary_chars = load_helper_file('helper_global_vocabulary_chars')
normalized_chars        = load_helper_file('helper_normalized_chars')
white_list_chars        = load_helper_file('helper_white_list_chars')
white_list_punct        = " '*-.,?!/:;_()[]{}<>=" + '"'
pictograms_to_emoji     = load_helper_file('helper_pictograms_to_emoji')
helper_custom_synonyms     = load_helper_file('helper_custom_synonyms')

In [12]:
## Load Data
good_cols       = ['_id', 'text']
data = pd.read_parquet('../../data/bitcoin_twitter_raw/part_0.parquet')
data = data.iloc[:20000][good_cols]

In [13]:
## Start preprocessing
texts = data['text']
local_vocab = bert_uncased_vocabulary
global_lower=True
texts = texts.astype(str)
if verbose: print('#' *20 ,'Initial State:'); check_vocab(texts, local_vocab)

#################### Initial State:


NameError: name 'check_vocab' is not defined

In [7]:
if global_lower:
    texts = texts.apply(lambda x: x.lower())
    if verbose: print('#'*10 ,'Step - Lowering everything:'); check_vocab(texts, local_vocab)

########## Step - Lowering everything:
Unknown words: 28815 | Known words: 6405


In [8]:
# Normalize chars and dots - SEE HELPER FOR DETAILS
# Global
texts = texts.apply(lambda x: ' '.join([make_cleaning(i,normalized_chars) for i in x.split()]))
texts = texts.apply(lambda x: re.sub('\(dot\)', '.', x))
texts = texts.apply(lambda x: deaccent(x))
if verbose: print('#'*10 ,'Step - Normalize chars and dots:'); check_vocab(texts, local_vocab)

########## Step - Normalize chars and dots:
Unknown words: 28692 | Known words: 6401


In [9]:
# Remove 'control' chars
# Global
global_chars_list = list(set([c for line in texts for c in line]))
chars_dict = {c:'' for c in global_chars_list if unicodedata.category(c)[0]=='C'}
texts = texts.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
if verbose: print('#'*10 ,'Step - Control Chars:'); check_vocab(texts, local_vocab)

########## Step - Control Chars:
Unknown words: 28692 | Known words: 6401


In [10]:
# Remove hrefs
# Global
texts = texts.apply(lambda x: re.sub(re.findall(r'\<a(.*?)\>', x)[0], '', x) if (len(re.findall(r'\<a (.*?)\>', x))>0) and ('href' in re.findall(r'\<a (.*?)\>', x)[0]) else x)
if verbose: print('#'*10 ,'Step - Remove hrefs:'); check_vocab(texts, local_vocab)

########## Step - Remove hrefs:
Unknown words: 28692 | Known words: 6401


In [11]:
# Convert or remove Bad Symbols
# Global
global_chars_list = list(set([c for line in texts for c in line]))
chars = ''.join([c for c in global_chars_list if (c not in bert_char_list) and (c not in emoji.UNICODE_EMOJI) and (c not in white_list_chars)])
chars_dict = {}
for char in chars:
    try:
        new_char = unicodedata.name(char).split()[-1:][0].lower()
        if len(new_char)==1:
            chars_dict[ord(char)] = new_char
        else:
            chars_dict[ord(char)] = ''
    except:
        chars_dict[ord(char)] = ''
texts = texts.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Remove Bad Symbols:'); check_vocab(texts, local_vocab)
if verbose: print(chars)
if verbose: print_dict(chars_dict)

########## Step - Remove Bad Symbols:
Unknown words: 27130 | Known words: 6422
ѵ🇹🔐🤣♉𝐢🥳🧯☕거►₦👖🐸🆗🏷🎯🟥👨💬🦾🛎🚶🕵량特🌌🌸🇭😋🥴🤳📰🐐货💼🐕𝖙🐦🎞💔🔮🚀🚒💥👆🔟튼😄🦋⚽🔎𝟰🚜💇⌚💜𝐓☀⛳🛑ℹ🥁🗳🇦🎰트👾￼😔개⛔🍦🔽♀🎊😧🔒위🍊💴❯⚡🇲🤪💍📐↪🔺🌏🤞⛓📲🤗⬛🌙🎩📌🥩아📍🌋𝖔☹👸🎉🔥⬜니🛰🐬⬅𝒕😞🥓💹➕⛅🦽🤙⚪⁩𝐍↩🥮🐢클🌘😈🙃🧨🔖🔚🏽📆❔𝐁☠🔂😑🇮𝒗🌹𝟬🕺𝐊𝟙𝐖󠁧✊🔶🤫⛏𝖈🎶🙋👉🍀💪😘❤🔘☄🥊🇪🙂😶🇨🐱💀🔄🌶👐🏭🧠🏇🤍🌲🌴🌔다🟧🦍😤🌞🦁𝕮🥲🌪🎁✓🇰🤩👊😀🔜🤜💦𝐀빗🌜🗣⚔🥅🔵🤧🖐😕〰🍺₳𝖚😛🐻실👻🕯😵☮월𝐂🐶🥺🆓🦊📮💕🌎📩📸🔼🧵🩸😓𝒏𝖊𝖞🍷⏰🧚☝🙄🆙🤑🙅🎢𝐯👟󠁳🔻𝕽🐋💉⚓✅🏋▓🍎🏅🦄🏫🐉📖🌒을🦯💧😊🍕🤨💱합🍿😫🦮🌍𝒄🌽😥🎨💙👌🇸❄𝐝🚗📣🤡🛒🧑🚘😐✔🦡𝟲🟩⁦⛪🧞시😮😁💩😏🍮🚫𝖕🔔🔴😆𝐏説비☺🍞𝒊𝐑𝐎🧐😠🔸⤵😍🌐🤘𝐫🤭𝐞🦆💠😭🍏🖖⛈𝐲𝖘💣🛀😳強🌈𝟘⁉👀💯🤷🙏📢🟨😨🇷🌊𝒐🌿😉🥶👄👎🛤👥💖📗⋰👬😯📞𝐡🥇密🧷฿💃🌱간🌑➡⃣😜💡💘🟢𝟵⚠🥃😲🫂𝖗💞👋来😒더🇵🤯🤚소⏲💷⚒🏁🏈🙆💎₿🥂🛡🔃🤕🏼❕🍫🔹🦢➖✨🐍🔝🤟레𝟚✌🍔⏫🙊🐳🌓🌛↗코플💸󠁴🇬🔫💛⬆░🏃🤤🌟𝐠币🤌🦚🙌🏂🌖➤👈🔷🃏🐄🙁😂🪙📉⏱🎆😝🏀𝐜▶⚛🥬🙈🤲🤸𝐌🤴😟🌝💲일💊🔗🍹󠁢준𝐒𝟭🖼⏳😇👁🌃👗🤓ꮆꮇ리📈𝟠￥👕♂🏿🎲😡📚💚🇩😣😢‼👍尊💰❌식😪💫🚦🌚👂✋𝖆🔙☎🔆😷ꮤ🐈🌕📺块🥰😬😃👇🚣❣⭐🟠🖕💌𝐭🙀🐲💭😴🏄움🚚오🎮🇳💶😩𝐚💗🤦🐣🎱🍡▫후⏬핫👑链🐂👽📊⤴♾🌼🎤🎭🏴🦐🤠🖤🎈󠁣🕊🔑📦🥵𝐄𝐇💳🔌𝐋🌇👩🏧‌🍻👏기⚫🚂🥸🧡⁠◽𝖓👣래‍🍾🛍🏆🛸❗🦖⬇🇺🇿🐮😚🎄🤝𝐨🏻🤔🕶💵🥒ᵛ🦧😌😅🌧𝐈🔊𝐧🏦🚨🥕✍𝐅🇧󠁿🔋💻🤐✈📱♻😎🪐ⓜ🏾🧁인🏠🇽🍒😱⛵⁣❓🥞🦺
1141 --- 
127481 --- t
128272 --- 
129315 --- 
9801 --- 
119842 --- i
129395 --- 
129519 --- 
9749 --- 
44144 --- 


In [12]:
# Remove Bad Symbols PART 2
# Global
global_chars_list = list(set([c for line in texts for c in line]))
chars = '·' + ''.join([c for c in global_chars_list if (c not in white_list_chars) and (c not in emoji.UNICODE_EMOJI) and (c not in white_list_punct) and (ord(c)>256)])
chars_dict = {}
for char in chars:
    try:
        new_char = unicodedata.name(char).split()[-1:][0].lower()
        if len(new_char)==1:
            chars_dict[ord(char)] = new_char
        else:
            chars_dict[ord(char)] = ''
    except:
        chars_dict[ord(char)] = ''
texts = texts.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Remove Bad Symbols PART 2:'); check_vocab(texts, local_vocab)
if verbose: print(chars)
if verbose: print_dict(chars_dict)

########## Step - Remove Bad Symbols PART 2:
Unknown words: 27074 | Known words: 6415
·アツ下け♥نيسयच이♦ب…，یクرाつн▪€めそمکรكイ☆らト比ตوˢてもリ•≥まはξзپбーセ♣т区●अлоتквकサほ三า„иы。₹बのчう≈加с？や★ッеدр！あレ∞а手スмاムな
183 --- 
12450 --- a
12484 --- 
19979 --- 
12369 --- 
9829 --- 
1606 --- 
1610 --- 
1587 --- 
2351 --- 


In [13]:
# Remove html tags
# Global
temp_vocab = list(set([c for line in texts for c in line.split()]))
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if ('<' in word) and ('>' in word):
        for tag in html_tags:
            if ('<'+tag+'>' in word) or ('</'+tag+'>' in word):
                temp_dict[word] = BeautifulSoup(word, 'html5lib').text
texts = texts.apply(lambda x: ' '.join([temp_dict.get(i, i) for i in x.split()]))
if verbose: print('#' * 10, 'Step - HTML tags:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - HTML tags:
Unknown words: 27074 | Known words: 6415


In [14]:
# Remove links (There is valuable information in links (probably you will find a way to use it))
# Global
temp_vocab = list(set([c for line in texts for c in line.split()]))
temp_vocab = [k for k in temp_vocab if check_replace(k)]
url_rule = r'(?P<url>https?://[^\s]+)'
temp_dict = {k:domain_search(k) for k in temp_vocab if k!= re.compile(url_rule).sub('url', k)}

for word in temp_dict:
    new_value = temp_dict[word]
    if word.find('http')>2:
        temp_dict[word] =  word[:word.find('http')] + ' ' + place_hold(new_value)
    else:
        temp_dict[word] = place_hold(new_value)

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Convert urls part 1:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Convert urls part 1:
Unknown words: 19527 | Known words: 6415
https://t.co/qyf6yj7ijy --- word_placeholder[t.co]
https://t.co/3lfzillojy --- word_placeholder[t.co]
https://t.co/2tasheqtkm --- word_placeholder[t.co]
https://t.co/fi4saahcvv --- word_placeholder[t.co]
https://t.co/phb0vqa6of --- word_placeholder[t.co]
https://t.co/lfia7xjggr --- word_placeholder[t.co]
https://t.co/nhp1dymsuz --- word_placeholder[t.co]
https://t.co/xljybfzmm5 --- word_placeholder[t.co]
https://t.co/t552iuzzwe --- word_placeholder[t.co]
https://t.co/d0cb4nxwrh --- word_placeholder[t.co]


In [15]:
# Remove twitter links
temp_dict = {
    'word_placeholder[t.co]': ''
}
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict, skip_check=True) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Convert urls part 1.5:'); check_vocab(texts, local_vocab);

########## Step - Convert urls part 1.5:
Unknown words: 19526 | Known words: 6415


In [16]:
# Remove escaped html
temp_vocab = list(set([c for line in texts for c in line.split()]))
temp_vocab = [k for k in temp_vocab if check_replace(k)]
symbols = {
    '&quot;': '',
    '&&amp;': '',
    '&lt;': '',
    '&gt;': '',
}
temp_dict = {}
for word in temp_vocab:
    if any([rep in word for rep in symbols.keys()]):
        new_word = word
        for rep, to in symbols.items():
            new_word = new_word.replace(rep, to)
        temp_dict[word] = new_word

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict, skip_check=True) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Remove escaped html:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Remove escaped html:
Unknown words: 19499 | Known words: 6416
&lt;30 --- 30
&lt;3 --- 3
&lt;&lt;send --- send
&gt;&gt;&gt;&gt;@oropocket --- @oropocket
&lt;- --- -
store-&gt;pi --- store-pi
nobody&gt;&gt;&gt;&gt;&gt;&gt;&gt; --- nobody
&lt;10m --- 10m
&gt;&gt;&gt;&gt;#bitcoinz&lt;&lt;&lt;&lt; --- #bitcoinz
&gt;#bitcoin --- #bitcoin


In [17]:
# Convert urls part 2
# Global
temp_vocab = list(set([c for line in texts for c in line.split()]))
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}

for word in temp_vocab:
    url_check = False
    if 'file:' in word:
        url_check = True
    elif ('http' in word) or ('ww.' in word) or ('.htm' in word) or ('ftp' in word) or ('.php' in word) or ('.aspx' in word):
        if 'Aww' not in word:
            for d_zone in url_extensions:
                if '.' + d_zone in word:
                    url_check = True
                    break
    elif ('/' in word) and ('.' in word):
        for d_zone in url_extensions:
            if '.' + d_zone + '/' in word:
                url_check = True
                break

    if url_check:
        temp_dict[word] =  place_hold(domain_search(word))

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Convert urls part 2:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Convert urls part 2:
Unknown words: 19499 | Known words: 6416


In [18]:
# Normalize pictograms
# Local (only unknown words)
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if len(re.compile('[a-zA-Z0-9]').sub('', word))>2:
        for pict in pictograms_to_emoji:
            if (pict in word) and (len(pict)>2):
                temp_dict[word] = word.replace(pict, pictograms_to_emoji[pict])
            elif pict==word:
                temp_dict[word] = pictograms_to_emoji[pict]

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Normalize pictograms:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Normalize pictograms:
Unknown words: 19499 | Known words: 6416
:-) --- 😁
:-)! --- 😁!
:))) --- 😁)


In [19]:
# Isolate emoji
# Global
global_chars_list = list(set([c for line in texts for c in line]))
chars = ''.join([c for c in global_chars_list if c in emoji.UNICODE_EMOJI])
chars_dict = {ord(c):f' {c} ' for c in chars}
texts = texts.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Isolate emoji:'); check_vocab(texts, local_vocab)
if verbose: print(chars)

########## Step - Isolate emoji:
Unknown words: 19499 | Known words: 6416



In [20]:
# Duplicated dots, question marks and exclamations
# Local
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    new_word = word
    if (Counter(word)['.']>1) or (Counter(word)['!']>1) or (Counter(word)['?']>1) or (Counter(word)[',']>1):
        if (Counter(word)['.']>1):
            new_word = re.sub('\.\.+', ' . . . ', new_word)
        if (Counter(word)['!']>1):
            new_word = re.sub('\!\!+', ' ! ! ! ', new_word)
        if (Counter(word)['?']>1):
            new_word = re.sub('\?\?+', ' ? ? ? ', new_word)
        if (Counter(word)[',']>1):
            new_word = re.sub('\,\,+', ' , , , ', new_word)
        temp_dict[word] = new_word
temp_dict = {k: v for k, v in temp_dict.items() if k != v}
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Duplicated Chars:'); check_vocab(texts, local_vocab);

########## Step - Duplicated Chars:
Unknown words: 18434 | Known words: 6459


In [21]:
# Remove underscore for spam words
# Local
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if (len(re.compile('[a-zA-Z0-9\-\.\,\/\']').sub('', word))/len(word) > 0.6) and ('_' in word):
        temp_dict[word] = re.sub('_', '', word)
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Remove underscore:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Remove underscore:
Unknown words: 18433 | Known words: 6459
#a__ --- #a


In [22]:
# Isolate spam chars repetition
# Local
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if (len(re.compile('[a-zA-Z0-9\-\.\,\/\']').sub('', word))/len(word) > 0.6) and (len(Counter(word))==1) and (len(word)>2):
        temp_dict[word] = ' '.join([' ' + next(iter(Counter(word).keys())) + ' ' for i in range(1)])
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Spam chars repetition:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Spam chars repetition:
Unknown words: 18428 | Known words: 6459
***** ---  * 
$$$$$$$$$$$$ ---  $ 
#### ---  # 
$$$ ---  $ 
$$$$ ---  $ 


In [23]:
# Normalize pictograms part 2
# Local (only unknown words)
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if len(re.compile('[a-zA-Z0-9]').sub('', word))>1:
        for pict in pictograms_to_emoji:
            if pict==word:
                temp_dict[word] = pictograms_to_emoji[pict]
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Normalize pictograms part 2:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Normalize pictograms part 2:
Unknown words: 18426 | Known words: 6459
=) --- 😁
:) --- 😁
:( --- 😡
;) --- 😜


In [24]:
# Isolate brakets and quotes
# Global
chars = '()[]{}<>"'
chars_dict = {ord(c):f' {c} ' for c in chars}
texts = texts.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Brackets and quotes:'); check_vocab(texts, local_vocab)
if verbose: print_dict(chars_dict)

########## Step - Brackets and quotes:
Unknown words: 17601 | Known words: 6506
40 ---  ( 
41 ---  ) 
91 ---  [ 
93 ---  ] 
123 ---  { 
125 ---  } 
60 ---  < 
62 ---  > 
34 ---  " 


In [25]:
# Break short words
# Global
temp_vocab = list(set([c for line in texts for c in line.split()]))
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_vocab = [k for k in temp_vocab if len(k)<=20]

temp_dict = {}
for word in temp_vocab:
    if '/' in word and not word.startswith('u/') and not word.startswith('r/'):
        temp_dict[word] = re.sub('/', ' / ', word)

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Break long words:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Break long words:
Unknown words: 17444 | Known words: 6526
#eth/usdt --- #eth / usdt
ac/dc --- ac / dc
13.2s/conf --- 13.2s / conf
cluster/range --- cluster / range
p/e --- p / e
08/02/2021 --- 08 / 02 / 2021
6/7 --- 6 / 7
rate/market --- rate / market
and/or --- and / or
when/if --- when / if


In [26]:
# Break long words
# Global
temp_vocab = list(set([c for line in texts for c in line.split()]))
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_vocab = [k for k in temp_vocab if len(k)>20]

temp_dict = {}
for word in temp_vocab:
    if '_' in word:
        temp_dict[word] = re.sub('_', ' ', word)
    elif '/' in word and not word.startswith('u/') and not word.startswith('r/'):
        temp_dict[word] = re.sub('/', ' / ', word)
    elif len(' '.join(word.split('-')).split())>2:
        temp_dict[word] = re.sub('-', ' ', word)

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Break long words:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Break long words:
Unknown words: 17444 | Known words: 6527
software/application. --- software / application.
cbn/okonjo-iweala/luno/ghana --- cbn / okonjo-iweala / luno / ghana
/jonathan/gabriel/ozo ---  / jonathan / gabriel / ozo
#netunrealizedprofit/loss --- #netunrealizedprofit / loss
nigeria/crypto/#bitcoin/piggyvest/endsars --- nigeria / crypto / #bitcoin / piggyvest / endsars
0.078-0.085-0.099-0.105-0.12 --- 0.078 0.085 0.099 0.105 0.12
pullback/consolidation. --- pullback / consolidation.
casino-partner/stakeholder. --- casino-partner / stakeholder.
standard/professional --- standard / professional
#dgb/#pac/#nano/#doge --- #dgb / #pac / #nano / #doge


In [27]:
# Remove/Convert usernames and hashtags
# Global
temp_vocab = list(set([c for line in texts for c in line.split()]))
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    new_word = word
    if (len(word) > 3) and (word[1:len(word)-1].replace('_', '').isalnum()):
        if not re.compile('[#@$/,.:;]').sub('', word).isnumeric():
            if (word.startswith('@')) or (word.startswith('#')):
                new_word = place_hold(new_word[0] + new_word[1:])
            elif word.startswith('u/'):
                new_word = place_hold('@' + new_word[2:])
            elif word.startswith('r/'):
                new_word = place_hold('#' + new_word[2:])
            elif word.startswith('$') and word[1:].isalpha():
                new_word = place_hold('#' + new_word[1:])
    temp_dict[word] = new_word
temp_dict = {k: v for k, v in temp_dict.items() if k != v}
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - UserName and Hashtag:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - UserName and Hashtag:
Unknown words: 17171 | Known words: 6527
#person --- word_placeholder[#person]
$crm --- word_placeholder[#crm]
#likeandshare --- word_placeholder[#likeandshare]
#dax30 --- word_placeholder[#dax30]
$luna --- word_placeholder[#luna]
#poloniex --- word_placeholder[#poloniex]
#theprinceibecame --- word_placeholder[#theprinceibecame]
#sunday. --- word_placeholder[#sunday.]
#gold, --- word_placeholder[#gold,]
#tgbp --- word_placeholder[#tgbp]


In [28]:
# Remove ending underscore (or add quotation marks???)
# Local
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ('_' in k)]
temp_dict = {}
for word in temp_vocab:
    new_word = word
    if word[len(word)-1]=='_':
        for i in range(len(word),0,-1):
            if word[i-1]!='_':
                new_word = word[:i]
                temp_dict[word] = new_word
                break
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Remove ending underscore:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Remove ending underscore:
Unknown words: 17171 | Known words: 6527


In [29]:
# Remove starting underscore
# Local
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ('_' in k)]
temp_dict = {}
for word in temp_vocab:
    new_word = word
    if word[0]=='_':
        for i in range(len(word)):
            if word[i]!='_':
                new_word = word[i:]
                temp_dict[word] = new_word
                break
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Remove starting underscore:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Remove starting underscore:
Unknown words: 17171 | Known words: 6527


In [30]:
# End word punctuations
# Global
temp_vocab = list(set([c for line in texts for c in line.split()]))
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and (not k[len(k)-1].isalnum())]
temp_dict = {}
for word in temp_vocab:
    new_word = word
    for i in range(len(word),0,-1):
        if word[i-1].isalnum():
            new_word = word[:i] + ' ' + word[i:]
            break
    temp_dict[word] = new_word
temp_dict = {k: v for k, v in temp_dict.items() if k != v}
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - End word punctuations:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - End word punctuations:
Unknown words: 11903 | Known words: 6974
expensive, --- expensive ,
grape. --- grape .
bitcoin. --- bitcoin .
more, --- more ,
lolz. --- lolz .
fans, --- fans ,
roll. --- roll .
well: --- well :
virus. --- virus .
2021? --- 2021 ?


In [31]:
# Start word punctuations
# Global
temp_vocab = list(set([c for line in texts for c in line.split()]))
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and (not k[0].isalnum())]
temp_dict = {}
for word in temp_vocab:
    new_word = word
    for i in range(len(word)):
        if word[i].isalnum():
            new_word = word[:i] + ' ' + word[i:]
            break
    temp_dict[word] = new_word
temp_dict = {k: v for k, v in temp_dict.items() if k != v}
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Start word punctuations:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Start word punctuations:
Unknown words: 11275 | Known words: 7044
$231.00 --- $ 231.00
$58,000 --- $ 58,000
$15k --- $ 15k
~19m --- ~ 19m
*eden --- * eden
$luna --- $ luna
:55670.46 --- : 55670.46
$28 --- $ 28
$23,387,504 --- $ 23,387,504
+581.78 --- + 581.78


In [32]:
# Find and replace acronims
# Local (only unknown words)
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if (Counter(word)['.']>1) and (check_replace(word)):
        if (domain_search(word)!='') and (('www' in word) or (Counter(word)['/']>3)):
            temp_dict[word] = place_hold('url ' + domain_search(word))
        else:
            if (re.compile('[\.\,]').sub('', word) in local_vocab) and (len(re.compile('[0-9\.\,\-\/\:]').sub('', word))>0):
                temp_dict[word] =  place_hold(re.compile('[\.\,]').sub('', word))
temp_dict = {k: v for k, v in temp_dict.items() if k != v}
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Find and replace acronims:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Find and replace acronims:
Unknown words: 11275 | Known words: 7044
g.o.a.t --- word_placeholder[goat]


In [33]:
# Apply spellchecker for contractions
# Local (only unknown words)
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ("'" in k)]
temp_dict = {}
for word in temp_vocab:
    if word in helper_contractions:
        temp_dict[word] = helper_contractions[word] # place_hold(helper_contractions[word])
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Contractions:'); check_vocab(texts, local_vocab)
if verbose: print_dict(temp_dict)

########## Step - Contractions:
Unknown words: 11219 | Known words: 7044
haven't --- have not
c'mon --- c'mon
they'll --- they will
isn't --- is not
when's --- when is
he'd --- he would
ain't --- is not
wouldn't --- would not
can't --- cannot
who's --- who is


In [34]:
# Remove 's (DO WE NEED TO REMOVE IT???)
# Local
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {k:k[:-2] for k in temp_vocab if (check_replace(k)) and (k.lower()[-2:]=="'s")}
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Remove "s:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Remove "s:
Unknown words: 11064 | Known words: 7053
job's --- job
neighbor's --- neighbor
elonmusk's --- elonmusk
rarible's --- rarible
monday's --- monday
know's --- know
elon's --- elon
people's --- people
centre's --- centre
lion's --- lion


In [35]:
# Convert backslash
# Global
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ('\\' in k)]
temp_dict = {k:re.sub('\\\\+', ' / ', k) for k in temp_vocab}
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Convert backslash:'); check_vocab(texts, local_vocab)
if verbose: print_dict(temp_dict)

########## Step - Convert backslash:
Unknown words: 11064 | Known words: 7053


In [36]:
# Try remove duplicated chars (not sure about this!!!!!). TODO check fist against vocab?
# Local (only unknown words)
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]

temp_dict = {}
temp_vocab_dup = []

for word in temp_vocab:
    if not word.isalpha():
        continue
    temp_vocab_dup.append(''.join(ch for ch, _ in itertools.groupby(word)))
temp_vocab_dup = set(temp_vocab_dup)
temp_vocab_dup = temp_vocab_dup.difference(temp_vocab_dup.difference(set(local_vocab)))

for word in temp_vocab:
    new_word = ''.join(ch for ch, _ in itertools.groupby(word))
    if new_word in temp_vocab_dup:
        temp_dict[word] = new_word
temp_dict = {k: v for k, v in temp_dict.items() if (k != v) and (v in local_vocab)}

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Dup chars (with vocab check):'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Dup chars (with vocab check):
Unknown words: 10910 | Known words: 7076
boysss --- boys
haalf --- half
ooop --- op
btt --- bt
ummm --- um
kaam --- kam
huuuuge --- huge
aave --- ave
aaaaaaaand --- and
aax --- ax


In [37]:
# Isolate numbers
# Local (only unknown words)
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if re.compile('[a-zA-Z]').sub('', word) == word:
        if re.compile('[0-9]').sub('', word) != word:
            temp_dict[word] = word

global_chars_list = list(set([c for line in temp_dict for c in line]))
chars = ''.join([c for c in global_chars_list if not c.isdigit()])
chars_dict = {ord(c):f' {c} ' for c in chars}
temp_dict = {k:place_hold(k) for k in temp_dict}

#texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Isolate numbers:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Isolate numbers:
Unknown words: 10910 | Known words: 7076
5-7 --- word_placeholder[5-7]
20-50 --- word_placeholder[20-50]
3.95 --- word_placeholder[3.95]
10000000 --- word_placeholder[10000000]
0.05561 --- word_placeholder[0.05561]
254710729282 --- word_placeholder[254710729282]
4.43 --- word_placeholder[4.43]
2.26.2021 --- word_placeholder[2.26.2021]
17:01 --- word_placeholder[17:01]
34300 --- word_placeholder[34300]


In [38]:
# Join dashes
# Local (only unknown words)
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]

temp_dict = {}
for word in temp_vocab:
    temp_dict[word] = re.sub('\-\-+', '-', word)
temp_dict = {k: v for k, v in temp_dict.items() if k != v}

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Join dashes:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Join dashes:
Unknown words: 10904 | Known words: 7076
200---withdraw --- 200-withdraw
----------------- --- -
-- --- -
---- --- -
------------- --- -
----- --- -
transactions--innovate --- transactions-innovate
aa--tag --- aa-tag
--- --- -


In [39]:
# Try join word (Sloooow)
# Local (only unknown words)
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and (Counter(k)['-']>1)]

temp_dict = {}
for word in temp_vocab:
    new_word = ''.join(['' if c in '-' else c for c in word])
    if (new_word in local_vocab) and (len(new_word)>3):
        temp_dict[word] = new_word

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Try Split word:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Try Split word:
Unknown words: 10904 | Known words: 7076


In [40]:
# Try Split word
# Local (only unknown words)
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]

temp_dict = {}
for word in temp_vocab:
    if len(re.compile('[a-zA-Z0-9\*]').sub('', word))>0:
        chars = re.compile('[a-zA-Z0-9\*]').sub('', word)
        temp_dict[word] = ''.join([' ' + c + ' ' if c in chars else c for c in word])

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Try Split word:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Try Split word:
Unknown words: 9466 | Known words: 7287
re-read --- re - read
5-7 --- 5 - 7
?: ---  ?  : 
currency@elonmusk --- currency @ elonmusk
20-50 --- 20 - 50
wo,be --- wo , be
seedphrase#safe --- seedphrase # safe
3.95 --- 3 . 95
0.05561 --- 0 . 05561
4.43 --- 4 . 43


In [41]:
# L33T vocabulary (SLOW)
# https://simple.wikipedia.org/wiki/Leet
# Local (only unknown words)
def convert_leet(word):
    # basic conversion
    word = re.sub('0', 'o', word)
    word = re.sub('1', 'i', word)
    word = re.sub('3', 'e', word)
    word = re.sub('\$', 's', word)
    word = re.sub('\@', 'a', word)
    return word

temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]

temp_dict = {}
for word in temp_vocab:
    new_word = convert_leet(word)
    if (new_word!=word):
        if (len(word)>2) and (new_word in local_vocab):
            temp_dict[word] = new_word

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - L33T (with vocab check):'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - L33T (with vocab check):
Unknown words: 9463 | Known words: 7289
t13 --- tie
fa1 --- fai
sh1t --- shit


In [42]:
# Remove placeholders
# Global
temp_vocab = list(set([c for line in texts for c in line.split()]))
temp_vocab = [k for k in temp_vocab if (not check_replace(k))]
temp_dict = {}
for word in temp_vocab:
    temp_dict[word] = re.sub('___', ' ', word[17:-1])
texts = texts.apply(lambda x: ' '.join([temp_dict.get(i, i) for i in x.split()]))
texts = texts.apply(lambda x: ' '.join([i for i in x.split()]))
if verbose: print('#' * 10, 'Step - Open Holded words:'); check_vocab(texts, local_vocab)

########## Step - Open Holded words:
Unknown words: 9462 | Known words: 7289


In [43]:
# Search multiple form
# Local | example -> flashlights / flashlight -> False / True
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if (k[-1:]=='s') and (len(k)>4)]
temp_dict = {k:k[:-1] for k in temp_vocab if (k[:-1] in local_vocab)}
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Multiple form:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Multiple form:
Unknown words: 9259 | Known words: 7354
asides --- aside
capes --- cape
achieves --- achieve
cravings --- craving
maximizes --- maximize
wastes --- waste
stabilizes --- stabilize
earns --- earn
envelopes --- envelope
forgives --- forgive


In [44]:
data['text'] = texts
data

Unnamed: 0,_id,text
0,1358900722977435655,asia . . . you have one job . . . and that is continuation . . . #btc #eth #alts
1,1358900709584961537,"new art piece straight from the oven ! @apompliano this is for you , man ! hope you like it . you rock ! let us go ! #btc"
2,1358900559508553728,"#metx dd #metx are looking to incorporate blockchain technology into their business . "" we are actively searching for qualified and well - known partners in blockchain industry like ebang "" #btc @..."
3,1358900511538360335,me watching #btc and the giant wall at 45k that is gonna get blasted soon ! ! !
4,1358900445381488641,investing is so much fun ! ! ! @elonmusk #doge #btc #stonks
...,...,...
9995,1357326186540650496,#ont big rise is coming do not miss it #bitcoin #ont
9996,1357326140013174789,this . @wsbchairman @wsbmod #bitcoin
9997,1357326132375298049,bullish ! go happy thursday ! #bitcoin
9998,1357326099349438468,mark cuban talks bitcoin hodlers and blockchain stocks in recent ama #blockchain #bitcoin via


### TODO:
* numbers