In [1]:
# Credit to: https://www.kaggle.com/kyakovlev/preprocessing-bert-public

# General imports
import numpy as np
import pandas as pd
import os, sys, gc, re, warnings, pickle, itertools, emoji, psutil, random, unicodedata, torch

# custom imports
from gensim.utils import deaccent
from collections import Counter
from bs4 import BeautifulSoup
from multiprocessing import Pool

warnings.filterwarnings('ignore')
pd.options.display.max_columns = 10
pd.options.display.max_colwidth = 200

## Initial vars

In [2]:
HELPER_PATH             = '../data/helpers/'
LOCAL_TEST = True       ## Local test - for test performance on part of the train set only
WPLACEHOLDER = 'word_placeholder'

## Load helper helper))
def load_helper_file(filename):
    with open(HELPER_PATH+filename+'.pickle', 'rb') as f:
        temp_obj = pickle.load(f)
    return temp_obj

## Seeder
# :seed to make all processes deterministic     # type: int
def seed_everything(seed=0):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

    if 'torch' in sys.modules:
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True

SEED = 42               ## Seed for enviroment
seed_everything(SEED)   ## Seed everything

## Helpers

In [3]:
## Multiprocessing Run.
# :df - DataFrame to split                      # type: pandas DataFrame
# :func - Function to apply on each split       # type: python function
# This function is NOT 'bulletproof', be carefull and pass only correct types of variables.
def df_parallelize_run(df, func):
    num_partitions, num_cores = 16, psutil.cpu_count()  # number of partitions and cores
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

## Build of vocabulary from file - reading data line by line
## Line splited by 'space' and we store just first argument - Word
# :path - txt/vec/csv absolute file path        # type: str
def get_vocabulary(path):
    with open(path) as f:
        return [line.strip().split()[0] for line in f][0:]

## Check how many words are in Vocabulary
# :c_list - 1d array with 'comment_text'        # type: pandas Series
# :vocabulary - words in vocabulary to check    # type: list of str
# :response - type of response                  # type: str
def check_vocab(c_list, vocabulary, response='default'):
    try:
        words = set([w for line in c_list for w in line.split()])
        u_list = words.difference(set(vocabulary))
        k_list = words.difference(u_list)

        if response=='default':
            print('Unknown words:', len(u_list), '| Known words:', len(k_list))
        elif response=='unknown_list':
            return list(u_list)
        elif response=='known_list':
            return list(k_list)
    except:
        return []

## Simple "Memory profilers" to see memory usage
def get_memory_usage():
    return np.round(psutil.Process(os.getpid()).memory_info()[0]/2.**30, 2)

def sizeof_fmt(num, suffix='B'):
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)

## Domain Search
re_3986_enhanced = re.compile(r"""
        # Parse and capture RFC-3986 Generic URI components.
        ^                                    # anchor to beginning of string
        (?:  (?P<scheme>    [^:/?#\s]+):// )?  # capture optional scheme
        (?:(?P<authority>  [^/?#\s]*)  )?  # capture optional authority
             (?P<path>        [^?#\s]*)      # capture required path
        (?:\?(?P<query>        [^#\s]*)  )?  # capture optional query
        (?:\#(?P<fragment>      [^\s]*)  )?  # capture optional fragment
        $                                    # anchor to end of string
        """, re.MULTILINE | re.VERBOSE)

re_domain =  re.compile(r"""
        # Pick out top two levels of DNS domain from authority.
        (?P<domain>[^.]+\.[A-Za-z]{2,6})  # $domain: top two domain levels.
        (?::[0-9]*)?                      # Optional port number.
        $                                 # Anchor to end of string.
        """,
        re.MULTILINE | re.VERBOSE)

def domain_search(text):
    try:
        return re_domain.search(re_3986_enhanced.match(text).group('authority')).group('domain')
    except:
        return 'url'

## Preprocess helpers
def place_hold(w):
    return WPLACEHOLDER + '['+re.sub(' ', '___', w)+']'

def check_replace(w):
    return not bool(re.search(WPLACEHOLDER, w))

def make_cleaning(s, c_dict):
    if check_replace(s):
        s = s.translate(c_dict)
    return s

def make_dict_cleaning(s, w_dict):
    if check_replace(s):
        s = w_dict.get(s, s)
    return s

def export_dict(temp_dict, serial_num):
    pd.DataFrame.from_dict(temp_dict, orient='index').to_csv('dict_'+str(serial_num)+'.csv')

def print_dict(temp_dict, n_items=10):
    run = 0
    for k,v in temp_dict.items():
        print(k,'---',v)
        run +=1
        if run==n_items:
            break

## Get basic helper data

In [4]:
bert_uncased_vocabulary = load_helper_file('helper_bert_uncased_vocabulary')
bert_cased_vocabulary   = load_helper_file('helper_bert_cased_vocabulary')
bert_char_list          = list(set([c for line in bert_uncased_vocabulary+bert_cased_vocabulary for c in line]))

url_extensions          = load_helper_file('helper_url_extensions')
html_tags               = load_helper_file('helper_html_tags')
good_chars_dieter       = load_helper_file('helper_good_chars_dieter')
bad_chars_dieter        = load_helper_file('helper_bad_chars_dieter')
helper_contractions     = load_helper_file('helper_contractions')
global_vocabulary       = load_helper_file('helper_global_vocabulary')
global_vocabulary_chars = load_helper_file('helper_global_vocabulary_chars')
normalized_chars        = load_helper_file('helper_normalized_chars')
white_list_chars        = load_helper_file('helper_white_list_chars')
white_list_punct        = " '*-.,?!/:;_()[]{}<>=" + '"'
pictograms_to_emoji     = load_helper_file('helper_pictograms_to_emoji')

## Load Data

In [5]:
good_cols       = ['_id', 'text']
data = pd.read_csv('../data/bitcoin_twitter_raw.csv')
if LOCAL_TEST:
    data = data.iloc[:10000][good_cols]

## Word / Vocab cleaning

In [6]:
texts = data['text']
local_vocab = bert_uncased_vocabulary
verbose = True
global_lower=True
texts = texts.astype(str)
if verbose: print('#' *20 ,'Initial State:'); check_vocab(texts, local_vocab)

#################### Initial State:
Unknown words: 35646 | Known words: 5919


In [7]:
if global_lower:
    texts = texts.apply(lambda x: x.lower())
    if verbose: print('#'*10 ,'Step - Lowering everything:'); check_vocab(texts, local_vocab)

########## Step - Lowering everything:
Unknown words: 29702 | Known words: 6942


In [8]:
# Normalize chars and dots - SEE HELPER FOR DETAILS
# Global
texts = texts.apply(lambda x: ' '.join([make_cleaning(i,normalized_chars) for i in x.split()]))
texts = texts.apply(lambda x: re.sub('\(dot\)', '.', x))
texts = texts.apply(lambda x: deaccent(x))
if verbose: print('#'*10 ,'Step - Normalize chars and dots:'); check_vocab(texts, local_vocab)

########## Step - Normalize chars and dots:
Unknown words: 29514 | Known words: 6941


In [9]:
# Remove 'control' chars
# Global
global_chars_list = list(set([c for line in texts for c in line]))
chars_dict = {c:'' for c in global_chars_list if unicodedata.category(c)[0]=='C'}
texts = texts.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
if verbose: print('#'*10 ,'Step - Control Chars:'); check_vocab(texts, local_vocab)

########## Step - Control Chars:
Unknown words: 29514 | Known words: 6941


In [10]:
# Remove hrefs
# Global
texts = texts.apply(lambda x: re.sub(re.findall(r'\<a(.*?)\>', x)[0], '', x) if (len(re.findall(r'\<a (.*?)\>', x))>0) and ('href' in re.findall(r'\<a (.*?)\>', x)[0]) else x)
if verbose: print('#'*10 ,'Step - Remove hrefs:'); check_vocab(texts, local_vocab)

########## Step - Remove hrefs:
Unknown words: 29514 | Known words: 6941


In [11]:
# Convert or remove Bad Symbols
# Global
global_chars_list = list(set([c for line in texts for c in line]))
chars = ''.join([c for c in global_chars_list if (c not in bert_char_list) and (c not in emoji.UNICODE_EMOJI) and (c not in white_list_chars)])
chars_dict = {}
for char in chars:
    try:
        new_char = unicodedata.name(char).split()[-1:][0].lower()
        if len(new_char)==1:
            chars_dict[ord(char)] = new_char
        else:
            chars_dict[ord(char)] = ''
    except:
        chars_dict[ord(char)] = ''
texts = texts.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Remove Bad Symbols:'); check_vocab(texts, local_vocab)
if verbose: print(chars)
if verbose: print_dict(chars_dict)


########## Step - Remove Bad Symbols:
Unknown words: 28072 | Known words: 6962
😈⤵🍾😧✔🔻😤🔒📦▶🧼😻🚜⠀💲🍀𝘼🔹🙏📣🔋😘𝐒📢𝐜𝐔🍎𝟎🍩🏭🩳📊⋆😆𝐋►🥶🇨𝟐😐⚔☄⁠📹👰💜🍄🎄🇭🌹条𝐫🏎📰🥲🟡🇺💷💇💀𝙀😌🛰🔜⭕↪🌕🙈฿𝐚☂🐂📌💳🤓🗣⚖🔫🎊🍿↗자🌎🐲☠✅🏾🤖🤯🎟찬🙁🥣🐕📩🚂🖊🔞🎈🔐👇𝐅🤫👈‼𝐭𝐝🤦😃📞描💍🎞⬆🐄🇫💯𝐈🌞霜🎲𝘿🇵💬🌍🚨😭🤣⛓☮🦮₿🍺🍒💩ค🕵🏻🏀📐🙆⚛𝙐🥺🙄𝐡🛩🇿🥩🎁🦄⬇🤾🏆🅱𝐧트😠🥕👱🎙⁦🤸😜🐊😑🧡🧮⚽피👌𝐀𝗢🚀🙌🌽𝙩📷😽🎶🚶🌈🌸🇳🙋🌇😍🎰🌚🖤💵🚚🏠😒⌚💪🧠ะ🌔🎉🌳𝗜🎧💕🇦🕒💊😯𝗕♾💼☺😅💓👑💸본🥵🟠🤜🥧🏽😞🤔🦢💠🆙😙⚠🇬💁🪄⛔特繋💽🌘𝗧𝐢🤑🔼🦐👉😕🏮🦊🦈👸💡😉🏦😀🐩🔁𝗔🤨🐶⚓👎☕☯열♀해🌾❤🍻🏴🧞𝗘😱🥞🇸🤩✌￼𝐇⚾법📽𝗦👟🏂呪𝙏𝐑인🛡🎯廻📺❗🛸🧪💋🥜🤮🙂𝙞🦖𝙉✋🔙🔝𝗡🍉🦡👐🗻♂‍🔘🦺🧎👩☀🍔📍😶⏰💎💞🏼❓🌶💧⚕🎅💰𝙑🔊💖𝐖🇷🎮🔎✈🌐💱🐮😏瀧익📸🕶𝙄🎢⚘🥳😪﹩🤚🎆🆗🔸❌👆✊📝😁🥸💨🧑🪙𝐊😫悟₦𝙣😩🕺🏔👄💚⛵👾🦅🥁🥷😥🏷📮📜𝐗⛷🐦𝐍💫ด🦍🤍🔽𝐠𝗚🤼📕𝙊𝐄🥮😎币정🐋😼💅😨😄📄🤤🐑🧊🥱🌼🏃🇰術🍰降🖱⁩🕊𝐂𝗠𝗟↩🌌先𝗥🤙🐻비🏁🧵⟶👍💗💌🌙🥂😲😴됴🤪😹✨🤟🔄💦🏿🪨🎤⬅📲왕😂🇹🏄☎💝🌑😷🤛🤠𝙎🛒📒𝟏📈🥋𝘾🤝𝐓➡💥💃😝🔚👻⚙🍨〰👕𝐏🐉🟩😔⃣😟🆓⏱😓🏧𝐯🔥🥪📉🔴𝐉🙊💣⁣👗🪓🌿☹🖖🌜스🔑👨🥅🌴👀🏹🍧🌀😇🎩🦰🤳🤭𝐃𝐌▓𝐎🔮🤷🇩🇧📖💔⛳🤡😮👏✍데😡💴👖🍫코🔔🐬🥤🇱⏳🌝🎵🐳𝐲𝐆🌊🤘🥴🔗🐱🇪🏅📅🪐📶𝐞🎣😊⭐👣𝐘😳👅𝐁림📡🌋ช👽🌖🔌🍭☁🦋✓🤧🍊🤞🕗🌟絵💻😣🤗🦾🍕𝐕🥊🤕방😋🧁🡆⛏⚡💭🧘수📱🖼𝙪💹𝗣🐸🤲🐐░😬👋🎱🚫🌛🏤🎂𝐟🥰🤴🇮🔬🍪👁🍬🧐💶💆𝐨😚🙃🦳🔨👊ไ☝𝙔😢🔲
128520 --- 
10549 --- 
127870 --- 
128551 --- 
10004 --- 
128315 --- 
128548 --- 
128274 --- 
128230 --- 
9654 --- 


In [12]:
# Remove Bad Symbols PART 2
# Global
global_chars_list = list(set([c for line in texts for c in line]))
chars = '·' + ''.join([c for c in global_chars_list if (c not in white_list_chars) and (c not in emoji.UNICODE_EMOJI) and (c not in white_list_punct) and (ord(c)>256)])
chars_dict = {}
for char in chars:
    try:
        new_char = unicodedata.name(char).split()[-1:][0].lower()
        if len(new_char)==1:
            chars_dict[ord(char)] = new_char
        else:
            chars_dict[ord(char)] = ''
    except:
        chars_dict[ord(char)] = ''
texts = texts.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Remove Bad Symbols PART 2:'); check_vocab(texts, local_vocab)
if verbose: print(chars)
if verbose: print_dict(chars_dict)

########## Step - Remove Bad Symbols PART 2:
Unknown words: 28024 | Known words: 6953
·тอ比生н이사ตเ☆、сیе♦ท五น♣い●前⇒تとา！か★ξ星ب…зک明戦おв•มч„。бยنو小たさ♥роа€りき™รん√≥икよл
183 --- 
1090 --- 
3629 --- 
27604 --- 
29983 --- 
1085 --- 
51060 --- i
49324 --- 
3605 --- 
3648 --- e


In [13]:
# Remove html tags
# Global
temp_vocab = list(set([c for line in texts for c in line.split()]))
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if ('<' in word) and ('>' in word):
        for tag in html_tags:
            if ('<'+tag+'>' in word) or ('</'+tag+'>' in word):
                temp_dict[word] = BeautifulSoup(word, 'html5lib').text
texts = texts.apply(lambda x: ' '.join([temp_dict.get(i, i) for i in x.split()]))
if verbose: print('#' * 10, 'Step - HTML tags:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - HTML tags:
Unknown words: 28024 | Known words: 6953


In [14]:
# Remove links (There is valuable information in links (probably you will find a way to use it))
# Global
temp_vocab = list(set([c for line in texts for c in line.split()]))
temp_vocab = [k for k in temp_vocab if check_replace(k)]
url_rule = r'(?P<url>https?://[^\s]+)'
temp_dict = {k:domain_search(k) for k in temp_vocab if k!= re.compile(url_rule).sub('url', k)}

for word in temp_dict:
    new_value = temp_dict[word]
    if word.find('http')>2:
        temp_dict[word] =  word[:word.find('http')] + ' ' + place_hold(new_value)
    else:
        temp_dict[word] = place_hold(new_value)

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Convert urls part 1:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Convert urls part 1:
Unknown words: 20306 | Known words: 6953
https://t.co/r5kewo764x --- word_placeholder[t.co]
https://t.co/ghvivaiu2l --- word_placeholder[t.co]
https://t.co/89gomj1rdr --- word_placeholder[t.co]
https://t.co/fnu8rqjii8 --- word_placeholder[t.co]
https://t.co/sptecbzkgn --- word_placeholder[t.co]
https://t.co/u09c9xnwxe --- word_placeholder[t.co]
https://t.co/xbjftq74su --- word_placeholder[t.co]
https://t.co/3zgq3qjmtc --- word_placeholder[t.co]
https://t.co/uotxann5yl --- word_placeholder[t.co]
https://t.co/b1h2vkqbad --- word_placeholder[t.co]


In [15]:
# Remove twitter links
temp_dict = {
    'word_placeholder[t.co]': ''
}
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Convert urls part 1.5:'); check_vocab(texts, local_vocab);

########## Step - Convert urls part 1.5:
Unknown words: 20306 | Known words: 6953


In [16]:
# Convert urls part 2
# Global
temp_vocab = list(set([c for line in texts for c in line.split()]))
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}

for word in temp_vocab:
    url_check = False
    if 'file:' in word:
        url_check = True
    elif ('http' in word) or ('ww.' in word) or ('.htm' in word) or ('ftp' in word) or ('.php' in word) or ('.aspx' in word):
        if 'Aww' not in word:
            for d_zone in url_extensions:
                if '.' + d_zone in word:
                    url_check = True
                    break
    elif ('/' in word) and ('.' in word):
        for d_zone in url_extensions:
            if '.' + d_zone + '/' in word:
                url_check = True
                break

    if url_check:
        temp_dict[word] =  place_hold(domain_search(word))

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Convert urls part 2:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Convert urls part 2:
Unknown words: 20306 | Known words: 6953


In [17]:
# Normalize pictograms
# Local (only unknown words)
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if len(re.compile('[a-zA-Z0-9]').sub('', word))>2:
        for pict in pictograms_to_emoji:
            if (pict in word) and (len(pict)>2):
                temp_dict[word] = word.replace(pict, pictograms_to_emoji[pict])
            elif pict==word:
                temp_dict[word] = pictograms_to_emoji[pict]

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Normalize pictograms:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Normalize pictograms:
Unknown words: 20306 | Known words: 6953
:))) --- 😁)
:-) --- 😁


In [18]:
# Isolate emoji
# Global
global_chars_list = list(set([c for line in texts for c in line]))
chars = ''.join([c for c in global_chars_list if c in emoji.UNICODE_EMOJI])
chars_dict = {ord(c):f' {c} ' for c in chars}
texts = texts.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Isolate emoji:'); check_vocab(texts, local_vocab)
if verbose: print(chars)

########## Step - Isolate emoji:
Unknown words: 20306 | Known words: 6953



In [19]:
# Duplicated dots, question marks and exclamations
# Local
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    new_word = word
    if (Counter(word)['.']>1) or (Counter(word)['!']>1) or (Counter(word)['?']>1) or (Counter(word)[',']>1):
        if (Counter(word)['.']>1):
            new_word = re.sub('\.\.+', ' . . . ', new_word)
        if (Counter(word)['!']>1):
            new_word = re.sub('\!\!+', ' ! ! ! ', new_word)
        if (Counter(word)['?']>1):
            new_word = re.sub('\?\?+', ' ? ? ? ', new_word)
        if (Counter(word)[',']>1):
            new_word = re.sub('\,\,+', ' , , , ', new_word)
        temp_dict[word] = new_word
temp_dict = {k: v for k, v in temp_dict.items() if k != v}
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Duplicated Chars:'); check_vocab(texts, local_vocab);

########## Step - Duplicated Chars:
Unknown words: 19246 | Known words: 6995


In [20]:
# Remove underscore for spam words
# Local
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if (len(re.compile('[a-zA-Z0-9\-\.\,\/\']').sub('', word))/len(word) > 0.6) and ('_' in word):
        temp_dict[word] = re.sub('_', '', word)
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Remove underscore:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Remove underscore:
Unknown words: 19244 | Known words: 6995
___ --- 
#__ --- #


In [21]:
# Isolate spam chars repetition
# Local
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if (len(re.compile('[a-zA-Z0-9\-\.\,\/\']').sub('', word))/len(word) > 0.6) and (len(Counter(word))==1) and (len(word)>2):
        temp_dict[word] = ' '.join([' ' + next(iter(Counter(word).keys())) + ' ' for i in range(3)])
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Spam chars repetition:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Spam chars repetition:
Unknown words: 19240 | Known words: 6995
************************************* ---  *   *   * 
$$$$ ---  $   $   $ 
#### ---  #   #   # 
$$$ ---  $   $   $ 


In [22]:
# Normalize pictograms part 2
# Local (only unknown words)
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if len(re.compile('[a-zA-Z0-9]').sub('', word))>1:
        for pict in pictograms_to_emoji:
            if pict==word:
                temp_dict[word] = pictograms_to_emoji[pict]
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Normalize pictograms part 2:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Normalize pictograms part 2:
Unknown words: 19239 | Known words: 6995
:( --- 😡
;) --- 😜
:) --- 😁


In [None]:
# Isolate brakets and quotes
# Global
chars = '()[]{}<>"'
chars_dict = {ord(c):f' {c} ' for c in chars}
texts = texts.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Brackets and quotes:'); check_vocab(texts, local_vocab)

In [None]:
# Break short words
# Global
temp_vocab = list(set([c for line in texts for c in line.split()]))
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_vocab = [k for k in temp_vocab if len(k)<=20]

temp_dict = {}
for word in temp_vocab:
    if '/' in word:
        temp_dict[word] = re.sub('/', ' / ', word)

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Break long words:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

In [27]:
# Break long words
# Global
temp_vocab = list(set([c for line in texts for c in line.split()]))
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_vocab = [k for k in temp_vocab if len(k)>20]

temp_dict = {}
for word in temp_vocab:
    if '_' in word:
        temp_dict[word] = re.sub('_', ' ', word)
    elif '/' in word:
        temp_dict[word] = re.sub('/', ' / ', word)
    elif len(' '.join(word.split('-')).split())>2:
        temp_dict[word] = re.sub('-', ' ', word)

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Break long words:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Break long words:
Unknown words: 18865 | Known words: 7012
cbn/okonjo-iweala/luno/ghana --- cbn / okonjo-iweala / luno / ghana
monitoring/purchasing --- monitoring / purchasing
/jonathan/gabriel/ozo ---  / jonathan / gabriel / ozo
standard/professional --- standard / professional
#blockchain_technology --- #blockchain technology
misinterpretation/pseudo-analysis. --- misinterpretation / pseudo-analysis.
#cryptocurrency_mass_adoption --- #cryptocurrency mass adoption
nigeria/crypto/#bitcoin/piggyvest/endsars --- nigeria / crypto / #bitcoin / piggyvest / endsars
eth-&gt;aave-&gt;eth. --- eth &gt;aave &gt;eth.
#netunrealizedprofit/loss --- #netunrealizedprofit / loss


In [23]:
# Remove/Convert usernames and hashtags
# Global
temp_vocab = list(set([c for line in texts for c in line.split()]))
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    new_word = word
    if (len(word) > 3) and (word[1:len(word)-1].isalnum()) and (not re.compile('[#@,.:;]').sub('', word).isnumeric()):
        if word[len(word)-1].isalnum():
            if (word.startswith('@')) or (word.startswith('#')):
                new_word = place_hold(new_word[0] + ' ' + new_word[1:])
            elif word.startswith('u/'):
                 new_word = place_hold('@' + ' ' + new_word[2:])
            elif word.startswith('r/'):
                 new_word = place_hold('#' + ' ' + new_word[2:])
        else:
            if (word.startswith('@')) or (word.startswith('#')):
                new_word = place_hold(new_word[0] + ' ' + new_word[1:len(word)-1]) + ' ' + word[len(word)-1]
            elif word.startswith('u/'):
                 new_word = place_hold('@' + ' ' + new_word[2:])
            elif word.startswith('r/'):
                 new_word = place_hold('#' + ' ' + new_word[2:])
    temp_dict[word] = new_word
temp_dict = {k: v for k, v in temp_dict.items() if k != v}
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - UserName and Hashtag:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - UserName and Hashtag:
Unknown words: 19051 | Known words: 6995
@founderflori, --- word_placeholder[@___founderflori] ,
#ada --- word_placeholder[#___ada]
#fxstrategy --- word_placeholder[#___fxstrategy]
#cdwsocial --- word_placeholder[#___cdwsocial]
@tap2crypto --- word_placeholder[@___tap2crypto]
#bandusdt --- word_placeholder[#___bandusdt]
@peterlbrandt --- word_placeholder[@___peterlbrandt]
#belgium! --- word_placeholder[#___belgium] !
#bitshares --- word_placeholder[#___bitshares]
@80strolls --- word_placeholder[@___80strolls]


In [25]:
# Remove ending underscore (or add quotation marks???)
# Local
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ('_' in k)]
temp_dict = {}
for word in temp_vocab:
    new_word = word
    if word[len(word)-1]=='_':
        for i in range(len(word),0,-1):
            if word[i-1]!='_':
                new_word = word[:i]
                temp_dict[word] = new_word
                break
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Remove ending underscore:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Remove ending underscore:
Unknown words: 18868 | Known words: 7012
@kevin_cage_ --- @kevin_cage
@official_jhay_ --- @official_jhay
@smithie___ --- @smithie
_their_ --- _their
@_checkmatey_ --- @_checkmatey
@chris_belcher_ --- @chris_belcher


In [26]:
# Remove starting underscore
# Local
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ('_' in k)]
temp_dict = {}
for word in temp_vocab:
    new_word = word
    if word[0]=='_':
        for i in range(len(word)):
            if word[i]!='_':
                new_word = word[i:]
                temp_dict[word] = new_word
                break
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Remove starting underscore:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Remove starting underscore:
Unknown words: 18867 | Known words: 7012
_their --- their
_hal9001 --- hal9001


In [28]:
# End word punctuations
# Global
temp_vocab = list(set([c for line in texts for c in line.split()]))
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and (not k[len(k)-1].isalnum())]
temp_dict = {}
for word in temp_vocab:
    new_word = word
    for i in range(len(word),0,-1):
        if word[i-1].isalnum():
            new_word = word[:i] + ' ' + word[i:]
            break
    temp_dict[word] = new_word
temp_dict = {k: v for k, v in temp_dict.items() if k != v}
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - End word punctuations:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - End word punctuations:
Unknown words: 12563 | Known words: 7543
get? --- get ?
states' --- states '
heed. --- heed .
costs, --- costs ,
bleeds, --- bleeds ,
37,227.27$ --- 37,227.27 $
faces. --- faces .
management. --- management .
meat. --- meat .
continues. --- continues .


In [29]:
# Start word punctuations
# Global
temp_vocab = list(set([c for line in texts for c in line.split()]))
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and (not k[0].isalnum())]
temp_dict = {}
for word in temp_vocab:
    new_word = word
    for i in range(len(word)):
        if word[i].isalnum():
            new_word = word[:i] + ' ' + word[i:]
            break
    temp_dict[word] = new_word
temp_dict = {k: v for k, v in temp_dict.items() if k != v}
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Start word punctuations:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Start word punctuations:
Unknown words: 11215 | Known words: 7692
$cl.wt --- $ cl.wt
$eosup --- $ eosup
"exploring --- " exploring
$laho --- $ laho
(ark --- ( ark
$33.8k --- $ 33.8k
$flr --- $ flr
$dogecoin --- $ dogecoin
&gt;500 --- & gt;500
(dmbs --- ( dmbs


In [30]:
# Find and replace acronims
# Local (only unknown words)
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if (Counter(word)['.']>1) and (check_replace(word)):
        if (domain_search(word)!='') and (('www' in word) or (Counter(word)['/']>3)):
            temp_dict[word] = place_hold('url ' + domain_search(word))
        else:
            if (re.compile('[\.\,]').sub('', word) in local_vocab) and (len(re.compile('[0-9\.\,\-\/\:]').sub('', word))>0):
                temp_dict[word] =  place_hold(re.compile('[\.\,]').sub('', word))
temp_dict = {k: v for k, v in temp_dict.items() if k != v}
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Find and replace acronims:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Find and replace acronims:
Unknown words: 11215 | Known words: 7692
h.i.m --- word_placeholder[him]
r.i.p --- word_placeholder[rip]
g.o.a.t --- word_placeholder[goat]


In [31]:
# Apply spellchecker for contractions
# Local (only unknown words)
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ("'" in k)]
temp_dict = {}
for word in temp_vocab:
    if word in helper_contractions:
        temp_dict[word] = place_hold(helper_contractions[word])
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Contractions:'); check_vocab(texts, local_vocab)
if verbose: print_dict(temp_dict)

########## Step - Contractions:
Unknown words: 11213 | Known words: 7692
i'd --- word_placeholder[i___would]
haven't --- word_placeholder[have___not]
how's --- word_placeholder[how___is]
didn't --- word_placeholder[did___not]
it'll --- word_placeholder[it___will]
we're --- word_placeholder[we___are]
that's --- word_placeholder[that___is]
we'd --- word_placeholder[we___would]
c'mon --- word_placeholder[c'mon]
she's --- word_placeholder[she___is]


In [32]:
# Remove 's (DO WE NEED TO REMOVE IT???)
# Local
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {k:k[:-2] for k in temp_vocab if (check_replace(k)) and (k.lower()[-2:]=="'s")}
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Remove "s:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Remove "s:
Unknown words: 11010 | Known words: 7707
queen's --- queen
street's --- street
market's --- market
world's --- world
bloomberg's --- bloomberg
haram's --- haram
person's --- person
ath's --- ath
brother's --- brother
microsoft's --- microsoft


In [33]:
# Convert backslash
# Global
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ('\\' in k)]
temp_dict = {k:re.sub('\\\\+', ' / ', k) for k in temp_vocab}
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Convert backslash:'); check_vocab(texts, local_vocab)
if verbose: print_dict(temp_dict)

########## Step - Convert backslash:
Unknown words: 11010 | Known words: 7707


In [34]:
# Try remove duplicated chars (not sure about this!!!!!). TODO check fist against vocab?
# Local (only unknown words)
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]

temp_dict = {}
temp_vocab_dup = []

for word in temp_vocab:
    if not word.isalpha():
        continue
    temp_vocab_dup.append(''.join(ch for ch, _ in itertools.groupby(word)))
temp_vocab_dup = set(temp_vocab_dup)
temp_vocab_dup = temp_vocab_dup.difference(temp_vocab_dup.difference(set(local_vocab)))

for word in temp_vocab:
    new_word = ''.join(ch for ch, _ in itertools.groupby(word))
    if new_word in temp_vocab_dup:
        temp_dict[word] = new_word
temp_dict = {k: v for k, v in temp_dict.items() if (k != v) and (v in local_vocab)}

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Dup chars (with vocab check):'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Dup chars (with vocab check):
Unknown words: 10776 | Known words: 7760
2800 --- 280
xvii --- xvi
** --- *
reff --- ref
32000 --- 320
49000 --- 490
choosen --- chosen
remmember --- remember
30777 --- 307
brooooo --- bro


In [35]:
# Isolate numbers
# Local (only unknown words)
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if re.compile('[a-zA-Z]').sub('', word) == word:
        if re.compile('[0-9]').sub('', word) != word:
            temp_dict[word] = word

global_chars_list = list(set([c for line in temp_dict for c in line]))
chars = ''.join([c for c in global_chars_list if not c.isdigit()])
chars_dict = {ord(c):f' {c} ' for c in chars}
temp_dict = {k:place_hold(make_cleaning(k,chars_dict)) for k in temp_dict}

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Isolate numbers:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Isolate numbers:
Unknown words: 10776 | Known words: 7760
180,000 --- word_placeholder[180___,___000]
2024 --- word_placeholder[2024]
33,534.57 --- word_placeholder[33___,___534___.___57]
40590.99 --- word_placeholder[40590___.___99]
682,875 --- word_placeholder[682___,___875]
7.4 --- word_placeholder[7___.___4]
5,471,598 --- word_placeholder[5___,___471___,___598]
0.72 --- word_placeholder[0___.___72]
0,04 --- word_placeholder[0___,___04]
36616.9805 --- word_placeholder[36616___.___9805]


In [36]:
# Join dashes
# Local (only unknown words)
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]

temp_dict = {}
for word in temp_vocab:
    temp_dict[word] = re.sub('\-\-+', '-', word)
temp_dict = {k: v for k, v in temp_dict.items() if k != v}

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Join dashes:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)
if verbose: print_dict(temp_dict)

########## Step - Join dashes:
Unknown words: 10774 | Known words: 7760
--=[ --- -=[
--& --- -&
]=-- --- ]=-
;-- --- ;-
transactions--innovate --- transactions-innovate
-----------------& --- -&
free--&gt --- free-&gt


In [37]:
# Try join word (Sloooow)
# Local (only unknown words)
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and (Counter(k)['-']>1)]

temp_dict = {}
for word in temp_vocab:
    new_word = ''.join(['' if c in '-' else c for c in word])
    if (new_word in local_vocab) and (len(new_word)>3):
        temp_dict[word] = new_word

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Try Split word:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Try Split word:
Unknown words: 10774 | Known words: 7760


In [45]:
# Try Split word
# Local (only unknown words)
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]

temp_dict = {}
for word in temp_vocab:
    if len(re.compile('[a-zA-Z0-9\*]').sub('', word))>0:
        chars = re.compile('[a-zA-Z0-9\*]').sub('', word)
        temp_dict[word] = ''.join([' ' + c + ' ' if c in chars else c for c in word])

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Try Split word:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Try Split word:
Unknown words: 7639 | Known words: 8147
😜 ---  😜 
c'mon --- c ' mon
cumhurbaskanıistifa --- cumhurbaskan ı istifa
😡 ---  😡 
twitter.com --- twitter . com
😁 ---  😁 
erdoganınyanındayız --- erdogan ı nyan ı nday ı z
t.co --- t . co


In [39]:
# L33T vocabulary (SLOW)
# https://simple.wikipedia.org/wiki/Leet
# Local (only unknown words)
def convert_leet(word):
    # basic conversion
    word = re.sub('0', 'o', word)
    word = re.sub('1', 'i', word)
    word = re.sub('3', 'e', word)
    word = re.sub('\$', 's', word)
    word = re.sub('\@', 'a', word)
    return word

temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]

temp_dict = {}
for word in temp_vocab:
    new_word = convert_leet(word)
    if (new_word!=word):
        if (len(word)>2) and (new_word in local_vocab):
            temp_dict[word] = new_word

texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - L33T (with vocab check):'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - L33T (with vocab check):
Unknown words: 9972 | Known words: 7868
k3nneth --- kenneth
r10 --- rio
10s --- ios


In [40]:
# Open Holded words
# Global
temp_vocab = list(set([c for line in texts for c in line.split()]))
temp_vocab = [k for k in temp_vocab if (not check_replace(k))]
temp_dict = {}
for word in temp_vocab:
    temp_dict[word] = re.sub('___', ' ', word[17:-1])
texts = texts.apply(lambda x: ' '.join([temp_dict.get(i, i) for i in x.split()]))
texts = texts.apply(lambda x: ' '.join([i for i in x.split()]))
if verbose: print('#' * 10, 'Step - Open Holded words:'); check_vocab(texts, local_vocab)

########## Step - Open Holded words:
Unknown words: 7873 | Known words: 8075


In [41]:
# Search multiple form
# Local | example -> flashlights / flashlight -> False / True
temp_vocab = check_vocab(texts, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if (k[-1:]=='s') and (len(k)>4)]
temp_dict = {k:k[:-1] for k in temp_vocab if (k[:-1] in local_vocab)}
texts = texts.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Multiple form:'); check_vocab(texts, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Multiple form:
Unknown words: 7640 | Known words: 8146
insiders --- insider
intelligents --- intelligent
alerts --- alert
validations --- validation
wednesdays --- wednesday
powerplants --- powerplant
tonights --- tonight
surges --- surge
welcomes --- welcome
whats --- what


In [42]:
data['text'] = texts
data

Unnamed: 0,_id,text
0,1.357811e+18,"blockchains rely on fees to incentivize participation in the decentralized ecosystem . but to get mainstream adoption , we might have to do away with transaction costs altogether . t.co"
1,1.358008e+18,annual percentage yield ( apy ) # blockchain # cryptocurrency # bitcoin t.co
2,1.358061e+18,on a long enough timeline every asset looks flat against # bitcoin t.co
3,1.358010e+18,# bitcoin breaks $ 40k . . . again ! t.co
4,1.358021e+18,life gets cheaper on the # bitcoin standard . @ jclcapital @ crypto _ daily
...,...,...
9995,1.353849e+18,is # bitcoin setting up for this move ? t.co
9996,1.353831e+18,make the best of your circumstances ; focus on being grateful for the things you have . b $ gvt @ genesis _ vision b # altseason2021 # altcoins # bch # bitcoin # bnb # ethereum # dot # link # zrx ...
9997,1.353847e+18,bitcoin price looks to resume bull cycle after rising above $ 34k t.co # bitcoin
9998,1.353818e+18,check out this video . explains exactly what is happening with bitcoin right now ! ! ! for more trader insight please follow me # btc # bitcoin . t.co


In [44]:
if not LOCAL_TEST:
    data['text'] = texts
    data.to_csv('../data/bitcoin_twitter_processed.csv')