In [187]:
import numpy as np
import pandas as pd
import os, sys, gc, re, warnings, pickle, itertools, emoji, psutil, random, unicodedata
import string
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from bs4 import BeautifulSoup
from tqdm import tqdm
import spacy
import random
from spacy.util import minibatch, compounding
from collections import defaultdict, Counter

from sklearn import preprocessing
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer

from nltk.corpus import stopwords
from nltk.util import ngrams
STOP = set(stopwords.words('english'))
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image

from gensim.utils import deaccent



In [188]:
TWEET_PATH = '../data/vaccination_all_tweets.csv'
GEO_PATH = '../data/country_vaccinations.csv'
LABELED_PATH = '../data/covid_vaccine_tweets_with_sentiment.csv'

TWEETS = pd.read_csv(TWEET_PATH)
VACCINATION = pd.read_csv(GEO_PATH)
LABELED = pd.read_csv(LABELED_PATH)

In [189]:
LABELED.head()

Unnamed: 0,tweet_id,label,tweet_text
0,1.360342e+18,1,"4,000 a day dying from the so called Covid-19 ..."
1,1.382896e+18,2,Pranam message for today manifested in Dhyan b...
2,1.375673e+18,2,Hyderabad-based ?@BharatBiotech? has sought fu...
3,1.381311e+18,1,"Confirmation that Chinese #vaccines ""don�t hav..."
4,1.362166e+18,3,"Lab studies suggest #Pfizer, #Moderna vaccines..."


In [190]:
def miss_val(df):
    total=df.isnull().sum()
    return pd.concat([total],axis=1,keys=['Total'])
print("Missing values for train dataset \n")
print(miss_val(LABELED))


Missing values for train dataset 

            Total
tweet_id        0
label           0
tweet_text      0


In [191]:
def remove_link(string): 
    text = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'," ",string)
    return " ".join(text.split())
LABELED['tweet_text']=LABELED['tweet_text'].apply(lambda x:remove_link(x))

In [192]:
## Build of vocabulary from file - reading data line by line
## Line splited by 'space' and we store just first argument - Word
# :path - txt/vec/csv absolute file path        # type: str
def get_vocabulary(path):
    with open(path) as f:
        return [line.strip().split()[0] for line in f][0:]

## Check how many words are in Vocabulary
# :c_list - 1d array with 'comment_text'        # type: pandas Series
# :vocabulary - words in vocabulary to check    # type: list of str
# :response - type of response                  # type: str
def check_vocab(c_list, vocabulary, response='default'):
    try:
        words = set([w for line in c_list for w in line.split()])
        u_list = words.difference(set(vocabulary))
        k_list = words.difference(u_list)
    
        if response=='default':
            print('Unknown words:', len(u_list), '| Known words:', len(k_list))
        elif response=='unknown_list':
            return list(u_list)
        elif response=='known_list':
            return list(k_list)
    except:
        return []
        
## Seeder
# :seed to make all processes deterministic     # type: int
def seed_everything(seed=0):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

    if 'torch' in sys.modules:
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
 
## Simple "Memory profilers" to see memory usage
def get_memory_usage():
    return np.round(psutil.Process(os.getpid()).memory_info()[0]/2.**30, 2) 
        
def sizeof_fmt(num, suffix='B'):
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)
    
## Export pickle
def make_export(tr, tt, file_name):
    train_export = train[['id']]
    test_export = test[['id']]

    try:
        cur_shape = tr.shape[1]>1
        train_export = pd.concat([train_export, tr], axis=1)
        test_export = pd.concat([test_export, tt], axis=1)        
    except:
        train_export['p_comment'] = tr
        test_export['p_comment'] = tt
    
    train_export.to_pickle(file_name + '_x_train.pkl')
    test_export.to_pickle(file_name + '_x_test.pkl')

## Domain Search
re_3986_enhanced = re.compile(r"""
        # Parse and capture RFC-3986 Generic URI components.
        ^                                    # anchor to beginning of string
        (?:  (?P<scheme>    [^:/?#\s]+):// )?  # capture optional scheme
        (?:(?P<authority>  [^/?#\s]*)  )?  # capture optional authority
             (?P<path>        [^?#\s]*)      # capture required path
        (?:\?(?P<query>        [^#\s]*)  )?  # capture optional query
        (?:\#(?P<fragment>      [^\s]*)  )?  # capture optional fragment
        $                                    # anchor to end of string
        """, re.MULTILINE | re.VERBOSE)

re_domain =  re.compile(r"""
        # Pick out top two levels of DNS domain from authority.
        (?P<domain>[^.]+\.[A-Za-z]{2,6})  # $domain: top two domain levels.
        (?::[0-9]*)?                      # Optional port number.
        $                                 # Anchor to end of string.
        """, 
        re.MULTILINE | re.VERBOSE)

def domain_search(text):
    try:
        return re_domain.search(re_3986_enhanced.match(text).group('authority')).group('domain')
    except:
        return 'url'

## Load helper helper))
def load_helper_file(filename):
    with open(HELPER_PATH+filename+'.pickle', 'rb') as f:
        temp_obj = pickle.load(f)
    return temp_obj
        
## Preprocess helpers
def place_hold(w):
    return WPLACEHOLDER + '['+re.sub(' ', '___', w)+']'

def check_replace(w):
    return not bool(re.search(WPLACEHOLDER, w))

def make_cleaning(s, c_dict):
    if check_replace(s):
        s = s.translate(c_dict)
    return s
  
def make_dict_cleaning(s, w_dict):
    if check_replace(s):
        s = w_dict.get(s, s)
    return s

def export_dict(temp_dict, serial_num):
    pd.DataFrame.from_dict(temp_dict, orient='index').to_csv('dict_'+str(serial_num)+'.csv')

def print_dict(temp_dict, n_items=10):
    run = 0
    for k,v in temp_dict.items():
        print(k,'---',v)
        run +=1
        if run==n_items:
            break    
## ----------------------------------------------------------------------------------------------------

In [193]:
########################### Initial vars
#################################################################################
HELPER_PATH             = '../helper/'

LOCAL_TEST = True       ## Local test - for test performance on part of the train set only
SEED = 42               ## Seed for enviroment
seed_everything(SEED)   ## Seed everything

WPLACEHOLDER = 'word_placeholder'

########################### DATA LOAD
#################################################################################
print('1.1. Load Data')
good_cols       = ['tweet_id', 'tweet_text']
if LOCAL_TEST:
    tt          = pd.read_csv('../data/covid_vaccine_tweets_with_sentiment.csv', nrows=200000)
    train       = tt.iloc[:-100000,:]
    test        = tt.iloc[-100000:,:]
    del tt
    train, test = train[good_cols+['label']], test[good_cols]
else:
    train       = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')
    test        = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')    
    train, test = train[good_cols+['label', 'created_date']], test[good_cols]

########################### Get basic helpers
#################################################################################
print('1.2. Basic helpers')
bert_uncased_vocabulary = load_helper_file('helper_bert_uncased_vocabulary')
bert_cased_vocabulary   = load_helper_file('helper_bert_cased_vocabulary')
bert_char_list          = list(set([c for line in bert_uncased_vocabulary+bert_cased_vocabulary for c in line]))

url_extensions          = load_helper_file('helper_url_extensions')
html_tags               = load_helper_file('helper_html_tags')
#good_chars_dieter       = load_helper_file('helper_good_chars_dieter')
#bad_chars_dieter        = load_helper_file('helper_bad_chars_dieter')
helper_contractions     = load_helper_file('helper_contractions')
#global_vocabulary       = load_helper_file('helper_global_vocabulary')
#global_vocabulary_chars = load_helper_file('helper_global_vocabulary_chars')
normalized_chars        = load_helper_file('helper_normalized_chars')
white_list_chars        = load_helper_file('helper_white_list_chars')
white_list_punct        = " '*-.,?!/:;_()[]{}<>=" + '"'
pictograms_to_emoji     = load_helper_file('helper_pictograms_to_emoji')
toxic_misspell_dict     = load_helper_file('helper_toxic_misspell_dict')

1.1. Load Data
1.2. Basic helpers


In [194]:
tweets = LABELED['tweet_text']
local_vocab = bert_uncased_vocabulary
verbose = True
global_lower=True
tweets = tweets.astype(str)
if verbose: print('#' *20 ,'Initial State:'); check_vocab(tweets, local_vocab)

#################### Initial State:
Unknown words: 23251 | Known words: 5358


In [195]:
if global_lower:
    tweets = tweets.apply(lambda x: x.lower())
    if verbose: print('#'*10 ,'Step - Lowering everything:'); check_vocab(tweets, local_vocab)

########## Step - Lowering everything:
Unknown words: 18896 | Known words: 6398


In [196]:
# Normalize chars and dots - SEE HELPER FOR DETAILS
# Global
tweets = tweets.apply(lambda x: ' '.join([make_cleaning(i,normalized_chars) for i in x.split()]))
tweets = tweets.apply(lambda x: re.sub('\(dot\)', '.', x))
tweets = tweets.apply(lambda x: deaccent(x))
if verbose: print('#'*10 ,'Step - Normalize chars and dots:'); check_vocab(tweets, local_vocab)

########## Step - Normalize chars and dots:
Unknown words: 18889 | Known words: 6398


In [197]:
# Remove 'control' chars
# Global    
global_chars_list = list(set([c for line in tweets for c in line]))
chars_dict = {c:'' for c in global_chars_list if unicodedata.category(c)[0]=='C'}
tweets = tweets.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
if verbose: print('#'*10 ,'Step - Control Chars:'); check_vocab(tweets, local_vocab)

########## Step - Control Chars:
Unknown words: 18889 | Known words: 6398


In [198]:
# Remove hrefs
# Global    
tweets = tweets.apply(lambda x: re.sub(re.findall(r'\<a(.*?)\>', x)[0], '', x) if (len(re.findall(r'\<a (.*?)\>', x))>0) and ('href' in re.findall(r'\<a (.*?)\>', x)[0]) else x)
if verbose: print('#'*10 ,'Step - Remove hrefs:'); check_vocab(tweets, local_vocab)

########## Step - Remove hrefs:
Unknown words: 18889 | Known words: 6398


In [199]:
# Convert or remove Bad Symbols
# Global
global_chars_list = list(set([c for line in tweets for c in line]))
chars = ''.join([c for c in global_chars_list if (c not in bert_char_list) and (c not in emoji.UNICODE_EMOJI) and (c not in white_list_chars)])
chars_dict = {}
for char in chars:
    try:
        new_char = unicodedata.name(char).split()[-1:][0].lower()
        if len(new_char)==1:
            chars_dict[ord(char)] = new_char
        else:
            chars_dict[ord(char)] = ''
    except:
        chars_dict[ord(char)] = ''
tweets = tweets.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Remove Bad Symbols:'); check_vocab(tweets, local_vocab)
if verbose: print(chars)
if verbose: print_dict(chars_dict)

########## Step - Remove Bad Symbols:
Unknown words: 18616 | Known words: 6417
�
65533 --- 


In [200]:
# Remove Bad Symbols PART 2
# Global
global_chars_list = list(set([c for line in tweets for c in line]))
chars = '·' + ''.join([c for c in global_chars_list if (c not in white_list_chars) and (c not in emoji.UNICODE_EMOJI) and (c not in white_list_punct) and (ord(c)>256)])
chars_dict = {}
for char in chars:
    try:
        new_char = unicodedata.name(char).split()[-1:][0].lower()
        if len(new_char)==1:
            chars_dict[ord(char)] = new_char
        else:
            chars_dict[ord(char)] = ''
    except:
        chars_dict[ord(char)] = ''
tweets = tweets.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Remove Bad Symbols PART 2:'); check_vocab(tweets, local_vocab)
if verbose: print(chars)
if verbose: print_dict(chars_dict)

########## Step - Remove Bad Symbols PART 2:
Unknown words: 18616 | Known words: 6417
·
183 --- 


In [201]:
# Remove html tags
# Global
temp_vocab = list(set([c for line in tweets for c in line.split()]))
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if ('<' in word) and ('>' in word):
        for tag in html_tags:
            if ('<'+tag+'>' in word) or ('</'+tag+'>' in word):
                temp_dict[word] = BeautifulSoup(word, 'html5lib').text  
tweets = tweets.apply(lambda x: ' '.join([temp_dict.get(i, i) for i in x.split()]))
if verbose: print('#' * 10, 'Step - HTML tags:'); check_vocab(tweets, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - HTML tags:
Unknown words: 18616 | Known words: 6417


In [202]:
# Remove links (There is valuable information in links (probably you will find a way to use it)) 
# Global
temp_vocab = list(set([c for line in tweets for c in line.split()]))
temp_vocab = [k for k in temp_vocab if check_replace(k)]
url_rule = r'(?P<url>https?://[^\s]+)'
temp_dict = {k:domain_search(k) for k in temp_vocab if k!= re.compile(url_rule).sub('url', k)}
    
for word in temp_dict:
    new_value = temp_dict[word]
    if word.find('http')>2:
        temp_dict[word] =  word[:word.find('http')] + ' ' + place_hold(new_value)
    else:
        temp_dict[word] = place_hold(new_value)
            
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Convert urls part 1:'); check_vocab(tweets, local_vocab); 
if verbose: print_dict(temp_dict)

########## Step - Convert urls part 1:
Unknown words: 18616 | Known words: 6417


In [203]:
# Convert urls part 2
# Global
temp_vocab = list(set([c for line in tweets for c in line.split()]))
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}

for word in temp_vocab:
    url_check = False
    if 'file:' in word:
        url_check = True
    elif ('http' in word) or ('ww.' in word) or ('.htm' in word) or ('ftp' in word) or ('.php' in word) or ('.aspx' in word):
        if 'Aww' not in word:
            for d_zone in url_extensions:
                if '.' + d_zone in word:
                    url_check = True
                    break            
    elif ('/' in word) and ('.' in word):
        for d_zone in url_extensions:
            if '.' + d_zone + '/' in word:
                url_check = True
                break

    if url_check:
        temp_dict[word] =  place_hold(domain_search(word))
        
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Convert urls part 2:'); check_vocab(tweets, local_vocab); 
if verbose: print_dict(temp_dict)

########## Step - Convert urls part 2:
Unknown words: 18616 | Known words: 6417


In [204]:
# Normalize pictograms
# Local (only unknown words)
temp_vocab = check_vocab(tweets, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if len(re.compile('[a-zA-Z0-9]').sub('', word))>2:
        for pict in pictograms_to_emoji:
            if (pict in word) and (len(pict)>2):
                temp_dict[word] = word.replace(pict, pictograms_to_emoji[pict])
            elif pict==word:  
                temp_dict[word] = pictograms_to_emoji[pict]

tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Normalize pictograms:'); check_vocab(tweets, local_vocab); 
if verbose: print_dict(temp_dict)

########## Step - Normalize pictograms:
Unknown words: 18616 | Known words: 6417
:-) --- 😁


In [205]:
# Isolate emoji
# Global
global_chars_list = list(set([c for line in tweets for c in line]))
chars = ''.join([c for c in global_chars_list if c in emoji.UNICODE_EMOJI])
chars_dict = {ord(c):f' {c} ' for c in chars}
tweets = tweets.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Isolate emoji:'); check_vocab(tweets, local_vocab)
if verbose: print(chars)

########## Step - Isolate emoji:
Unknown words: 18616 | Known words: 6417



In [206]:
# Duplicated dots, question marks and exclamations
# Local
temp_vocab = check_vocab(tweets, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    new_word = word
    if (Counter(word)['.']>1) or (Counter(word)['!']>1) or (Counter(word)['?']>1) or (Counter(word)[',']>1):
        if (Counter(word)['.']>1):
            new_word = re.sub('\.\.+', ' . . . ', new_word)
        if (Counter(word)['!']>1):
            new_word = re.sub('\!\!+', ' ! ! ! ', new_word)
        if (Counter(word)['?']>1):
            new_word = re.sub('\?\?+', ' ? ? ? ', new_word)
        if (Counter(word)[',']>1):
            new_word = re.sub('\,\,+', ' , , , ', new_word)
        temp_dict[word] = new_word
temp_dict = {k: v for k, v in temp_dict.items() if k != v}
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Duplicated Chars:'); check_vocab(tweets, local_vocab);

########## Step - Duplicated Chars:
Unknown words: 17843 | Known words: 6463


In [207]:
# Remove underscore for spam words
# Local
temp_vocab = check_vocab(tweets, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if (len(re.compile('[a-zA-Z0-9\-\.\,\/\']').sub('', word))/len(word) > 0.6) and ('_' in word):
        temp_dict[word] = re.sub('_', '', word)       
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Remove underscore:'); check_vocab(tweets, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Remove underscore:
Unknown words: 17842 | Known words: 6463
_? --- ?


In [208]:
# Isolate spam chars repetition
# Local
temp_vocab = check_vocab(tweets, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if (len(re.compile('[a-zA-Z0-9\-\.\,\/\']').sub('', word))/len(word) > 0.6) and (len(Counter(word))==1) and (len(word)>2):
        temp_dict[word] = ' '.join([' ' + next(iter(Counter(word).keys())) + ' ' for i in range(3)])
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Spam chars repetition:'); check_vocab(tweets, local_vocab);
if verbose: print_dict(temp_dict)

########## Step - Spam chars repetition:
Unknown words: 17840 | Known words: 6463
*** ---  *   *   * 
$$$ ---  $   $   $ 


In [209]:
# Normalize pictograms part 2
# Local (only unknown words)
temp_vocab = check_vocab(tweets, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if len(re.compile('[a-zA-Z0-9]').sub('', word))>1:
        for pict in pictograms_to_emoji:
            if pict==word:  
                temp_dict[word] = pictograms_to_emoji[pict]
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Normalize pictograms part 2:'); check_vocab(tweets, local_vocab); 
if verbose: print_dict(temp_dict)   

########## Step - Normalize pictograms part 2:
Unknown words: 17839 | Known words: 6463
:) --- 😁
:( --- 😡
;) --- 😜


In [210]:
# Isolate brakets and quotes
# Global
chars = '()[]{}<>"'
chars_dict = {ord(c):f' {c} ' for c in chars}
tweets = tweets.apply(lambda x: ' '.join([make_cleaning(i,chars_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Brackets and quotes:'); check_vocab(tweets, local_vocab)

########## Step - Brackets and quotes:
Unknown words: 17070 | Known words: 6506


In [211]:
# Break short words
# Global
temp_vocab = list(set([c for line in tweets for c in line.split()]))
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_vocab = [k for k in temp_vocab if len(k)<=20]
    
temp_dict = {}
for word in temp_vocab:
    if '/' in word:
        temp_dict[word] = re.sub('/', ' / ', word)
    
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Break long words:'); check_vocab(tweets, local_vocab); 
if verbose: print_dict(temp_dict)           

########## Step - Break long words:
Unknown words: 16918 | Known words: 6525
10/10 --- 10 / 10
w/no --- w / no
15000/-for --- 15000 / -for
gov/icmr --- gov / icmr
b/w --- b / w
w/o --- w / o
13/04/2021 --- 13 / 04 / 2021
clinics/sites --- clinics / sites
p/b, --- p / b,
covid-19/ --- covid-19 / 


In [212]:
# Break long words
# Global
temp_vocab = list(set([c for line in tweets for c in line.split()]))
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_vocab = [k for k in temp_vocab if len(k)>20]
    
temp_dict = {}
for word in temp_vocab:
    if '_' in word:
        temp_dict[word] = re.sub('_', ' ', word)
    elif '/' in word:
        temp_dict[word] = re.sub('/', ' / ', word)
    elif len(' '.join(word.split('-')).split())>2:
        temp_dict[word] = re.sub('-', ' ', word)
    
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Break long words:'); check_vocab(tweets, local_vocab); 
if verbose: print_dict(temp_dict)           

########## Step - Break long words:
Unknown words: 16922 | Known words: 6530
#students_against_covid --- #students against covid
race/gender/faith,why --- race / gender / faith,why
challenging/difficult/stressful --- challenging / difficult / stressful
#covaxin/#covishield. --- #covaxin / #covishield.
update/wibble/waffle. --- update / wibble / waffle.
.@maryam_rajavi:khameneis --- .@maryam rajavi:khameneis
1300gmt/1400cet/0800est --- 1300gmt / 1400cet / 0800est
#tika_vaccination_utsav --- #tika vaccination utsav
#perfomance_enhancing --- #perfomance enhancing
crazy-polar-vortex-winter-storm --- crazy polar vortex winter storm


In [213]:
# Remove/Convert usernames and hashtags (add username/hashtag word?????)
# Global
temp_vocab = list(set([c for line in tweets for c in line.split()]))
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    new_word = word
    if (len(word) > 3) and (word[1:len(word)-1].isalnum()) and (not re.compile('[#@,.:;]').sub('', word).isnumeric()):
        if word[len(word)-1].isalnum():
            if (word.startswith('@')) or (word.startswith('#')):
                new_word = place_hold(new_word[0] + ' ' + new_word[1:]) 
        else:
            if (word.startswith('@')) or (word.startswith('#')):
                new_word = place_hold(new_word[0] + ' ' + new_word[1:len(word)-1]) + ' ' + word[len(word)-1]

    temp_dict[word] = new_word
temp_dict = {k: v for k, v in temp_dict.items() if k != v}
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - UserName and Hashtag:'); check_vocab(tweets, local_vocab);
if verbose: print_dict(temp_dict)           

########## Step - UserName and Hashtag:
Unknown words: 16581 | Known words: 6530
#ahp --- word_placeholder[#___ahp]
@ravishndtv --- word_placeholder[@___ravishndtv]
#chennai --- word_placeholder[#___chennai]
@99freemind --- word_placeholder[@___99freemind]
#aot139spoilers --- word_placeholder[#___aot139spoilers]
@sumanthraman --- word_placeholder[@___sumanthraman]
@kelleypersonal --- word_placeholder[@___kelleypersonal]
@ndtvfeed --- word_placeholder[@___ndtvfeed]
@novy62 --- word_placeholder[@___novy62]
#ino --- word_placeholder[#___ino]


In [214]:
# Remove ending underscore (or add quotation marks???)
# Local
temp_vocab = check_vocab(tweets, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ('_' in k)]
temp_dict = {}
for word in temp_vocab:
    new_word = word
    if word[len(word)-1]=='_':
        for i in range(len(word),0,-1):
            if word[i-1]!='_':
                new_word = word[:i]
                temp_dict[word] = new_word   
                break
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Remove ending underscore:'); check_vocab(tweets, local_vocab);
if verbose: print_dict(temp_dict)       

########## Step - Remove ending underscore:
Unknown words: 16580 | Known words: 6531
@_5andman_ --- @_5andman
@ian_hamilton_ --- @ian_hamilton
@danil_bochkov_ --- @danil_bochkov
_with_ --- _with
cubs___ --- cubs
@_sjpeace_ --- @_sjpeace


In [215]:
# Remove starting underscore 
# Local
temp_vocab = check_vocab(tweets, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ('_' in k)]
temp_dict = {}
for word in temp_vocab:
    new_word = word
    if word[0]=='_':
        for i in range(len(word)):
            if word[i]!='_':
                new_word = word[i:]
                temp_dict[word] = new_word   
                break
data = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Remove starting underscore:'); check_vocab(tweets, local_vocab);
if verbose: print_dict(temp_dict)     

########## Step - Remove starting underscore:
Unknown words: 16580 | Known words: 6531
_with --- with


In [216]:
# End word punctuations
# Global
temp_vocab = list(set([c for line in tweets for c in line.split()]))
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and (not k[len(k)-1].isalnum())]
temp_dict = {}
for word in temp_vocab:
    new_word = word
    for i in range(len(word),0,-1):
        if word[i-1].isalnum():
            new_word = word[:i] + ' ' + word[i:]
            break
    temp_dict[word] = new_word     
temp_dict = {k: v for k, v in temp_dict.items() if k != v}
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - End word punctuations:'); check_vocab(tweets, local_vocab);
if verbose: print_dict(temp_dict)       

########## Step - End word punctuations:
Unknown words: 12190 | Known words: 7022
correct! --- correct !
st. --- st .
understand. --- understand .
underwear, --- underwear ,
structure. --- structure .
imo. --- imo .
kiya? --- kiya ?
info. --- info .
jan.1, --- jan.1 ,
vintage! --- vintage !


In [217]:
# Start word punctuations
# Global
temp_vocab = list(set([c for line in tweets for c in line.split()]))
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and (not k[0].isalnum())]
temp_dict = {}
for word in temp_vocab:
    new_word = word
    for i in range(len(word)):
        if word[i].isalnum():
            new_word = word[:i] + ' ' + word[i:]
            break
    temp_dict[word] = new_word     
temp_dict = {k: v for k, v in temp_dict.items() if k != v}
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Start word punctuations:'); check_vocab(tweets, local_vocab);
if verbose: print_dict(temp_dict)     

########## Step - Start word punctuations:
Unknown words: 11828 | Known words: 7049
##moderna --- ## moderna
-update --- - update
?#vaccinated --- ?# vaccinated
'meanwhile --- ' meanwhile
-25c --- - 25c
?@joebiden --- ?@ joebiden
?10 --- ? 10
@christina_mitas --- @ christina_mitas
@cmo_england --- @ cmo_england
#nation_with_modi --- # nation_with_modi


In [218]:
# Find and replace acronims
# Local (only unknown words)
temp_vocab = check_vocab(tweets, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
temp_dict = {}
for word in temp_vocab:
    if (Counter(word)['.']>1) and (check_replace(word)):
        if (domain_search(word)!='') and (('www' in word) or (Counter(word)['/']>3)):
            temp_dict[word] = place_hold('url ' + domain_search(word))
        else: 
            if (re.compile('[\.\,]').sub('', word) in local_vocab) and (len(re.compile('[0-9\.\,\-\/\:]').sub('', word))>0):
                temp_dict[word] =  place_hold(re.compile('[\.\,]').sub('', word))
temp_dict = {k: v for k, v in temp_dict.items() if k != v}
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Find and replace acronims:'); check_vocab(tweets, local_vocab);
if verbose: print_dict(temp_dict)  

########## Step - Find and replace acronims:
Unknown words: 11828 | Known words: 7049
f.d.a --- word_placeholder[fda]


In [220]:
# Convert backslash
# Global
temp_vocab = check_vocab(tweets, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if (check_replace(k)) and ('\\' in k)]    
temp_dict = {k:re.sub('\\\\+', ' / ', k) for k in temp_vocab}
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Convert backslash:'); check_vocab(tweets, local_vocab)
if verbose: print_dict(temp_dict)

########## Step - Convert backslash:
Unknown words: 11826 | Known words: 7049


In [221]:
# Join dashes
# Local (only unknown words)
temp_vocab = check_vocab(tweets, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
    
temp_dict = {}
for word in temp_vocab:
    temp_dict[word] = re.sub('\-\-+', '-', word)
temp_dict = {k: v for k, v in temp_dict.items() if k != v}
    
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Join dashes:'); check_vocab(tweets, local_vocab);
if verbose: print_dict(temp_dict)  

########## Step - Join dashes:
Unknown words: 11824 | Known words: 7049
-- --- -
death--see --- death-see
---- --- -


In [222]:
# Try Split word
# Local (only unknown words)
temp_vocab = check_vocab(tweets, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
    
temp_dict = {}
for word in temp_vocab:
    if len(re.compile('[a-zA-Z0-9\*]').sub('', word))>0:
        chars = re.compile('[a-zA-Z0-9\*]').sub('', word)
        temp_dict[word] = ''.join([' ' + c + ' ' if c in chars else c for c in word])
    
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Try Split word:'); check_vocab(tweets, local_vocab);
if verbose: print_dict(temp_dict)  

########## Step - Try Split word:
Unknown words: 10749 | Known words: 7298
#' ---  #  ' 
75-year-old --- 75 - year - old
cold-storage --- cold - storage
pre-market --- pre - market
midday,it --- midday , it
best-in-class --- best - in - class
25.950 --- 25 . 950
pdsa_itp --- pdsa _ itp
abhina_prakash --- abhina _ prakash
rich_pratt --- rich _ pratt


In [223]:
# L33T vocabulary (SLOW)
# https://simple.wikipedia.org/wiki/Leet
# Local (only unknown words)
def convert_leet(word):
    # basic conversion 
    word = re.sub('0', 'o', word)
    word = re.sub('1', 'i', word)
    word = re.sub('3', 'e', word)
    word = re.sub('\$', 's', word)
    word = re.sub('\@', 'a', word)
    return word
            
temp_vocab = check_vocab(tweets, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if check_replace(k)]
    
temp_dict = {}
for word in temp_vocab:
    new_word = convert_leet(word)
    if (new_word!=word): 
        if (len(word)>2) and (new_word in local_vocab):
            temp_dict[word] = new_word
    
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - L33T (with vocab check):'); check_vocab(tweets, local_vocab);
if verbose: print_dict(temp_dict)      

########## Step - L33T (with vocab check):
Unknown words: 10745 | Known words: 7300
01l --- oil
on3 --- one
1bn --- ibn
1ra --- ira


In [224]:
# Open Holded words
# Global
temp_vocab = list(set([c for line in tweets for c in line.split()]))
temp_vocab = [k for k in temp_vocab if (not check_replace(k))]
temp_dict = {}
for word in temp_vocab:
    temp_dict[word] = re.sub('___', ' ', word[17:-1])
tweets = tweets.apply(lambda x: ' '.join([temp_dict.get(i, i) for i in x.split()]))
tweets = tweets.apply(lambda x: ' '.join([i for i in x.split()]))
if verbose: print('#' * 10, 'Step - Open Holded words:'); check_vocab(tweets, local_vocab)

########## Step - Open Holded words:
Unknown words: 9326 | Known words: 7524


In [225]:
# Search multiple form
# Local | example -> flashlights / flashlight -> False / True
temp_vocab = check_vocab(tweets, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if (k[-1:]=='s') and (len(k)>4)]
temp_dict = {k:k[:-1] for k in temp_vocab if (k[:-1] in local_vocab)}
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Multiple form:'); check_vocab(tweets, local_vocab);
if verbose: print_dict(temp_dict)    

########## Step - Multiple form:
Unknown words: 9110 | Known words: 7606
companys --- company
paras --- para
yesterdays --- yesterday
pathogens --- pathogen
fevers --- fever
trumps --- trump
louies --- louie
persists --- persist
filipinos --- filipino
heres --- here


In [226]:
# Convert emoji to text
# Local 
temp_vocab = check_vocab(tweets, local_vocab, response='unknown_list')
temp_vocab = [k for k in temp_vocab if (k in emoji.UNICODE_EMOJI)]
temp_dict = {}
for word in temp_vocab:
    temp_dict[word] = re.compile('[:_]').sub(' ', emoji.UNICODE_EMOJI.get(word)) 
tweets = tweets.apply(lambda x: ' '.join([make_dict_cleaning(i,temp_dict) for i in x.split()]))
if verbose: print('#' * 10, 'Step - Convert emoji to text:'); check_vocab(tweets, local_vocab);
if verbose: print_dict(temp_dict)                                                                      

########## Step - Convert emoji to text:
Unknown words: 9110 | Known words: 7606


In [227]:
tweets.head()

0    4 , 000 a day dying from the so called covid -...
1    pranam message for today manifested in dhyan b...
2    hyderabad - based ? @ bharatbiotech ? has soug...
3    confirmation that chinese # vaccines " dont ha...
4    lab studies suggest # pfizer , # moderna vacci...
Name: tweet_text, dtype: object

In [228]:
LABELED.head(10)

Unnamed: 0,tweet_id,label,tweet_text
0,1.360342e+18,1,"4,000 a day dying from the so called Covid-19 ..."
1,1.382896e+18,2,Pranam message for today manifested in Dhyan b...
2,1.375673e+18,2,Hyderabad-based ?@BharatBiotech? has sought fu...
3,1.381311e+18,1,"Confirmation that Chinese #vaccines ""don�t hav..."
4,1.362166e+18,3,"Lab studies suggest #Pfizer, #Moderna vaccines..."
5,1.351285e+18,1,Still want to take the #jab? #PfizerBioNTech #...
6,1.377333e+18,2,"This time, Aerol�neas flight AR1068 goes to Mo..."
7,1.363344e+18,3,#Covaxin effective against mutant virus strain...
8,1.37258e+18,3,Safe and effective. #OxfordAstraZeneca
9,1.367507e+18,2,The day after the #Moderna #COVID19Vaccine... ...
