In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import urllib.request
import urlopen
from wordcloud import WordCloud, STOPWORDS
import operator 
import re
import gensim
import pickle
from edm import report

200


> # Text Cleaning and Embedding Initialization
> May 5, 2019

In [4]:
# Settings
pd.set_option('display.float_format', lambda x: '%.3f' % x) # disable sci notation
font = {'size'   : 14}

plt.rc('font', **font)

In [5]:
# Global params
SPL_SEQ_DICT = {"emojis": [":)", ":-)", ":(", ":-(", ":-/", ":/", "-_-", ":|",  ":-|"],
                "proper nouns": ["republican", "democrat", "trump", "clinton", "hillary"]}
punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ",
                 "×": "x", "²": "2", "—": "-", "–": "-", "’": "'", "_": "-", "`": "'",
                 '“': '"', '”': '"', '“': '"', "£": "e", '∞':'infinity', 'θ': 'theta',
                 '÷': '/', 'α': 'alpha','•': '.', 'à': 'a', '−': '-', 'β': 'beta', '∅': '',
            '³': '3', 'π': 'pi', }

punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'

mispell_dict = {'SB91':'senate bill','tRump':'trump','utmterm':'utm term','FakeNews':'fake news','Gʀᴇat':'great','ʙᴏᴛtoᴍ':'bottom','washingtontimes':'washington times','garycrum':'gary crum','htmlutmterm':'html utm term','RangerMC':'car','TFWs':'tuition fee waiver','SJWs':'social justice warrior','Koncerned':'concerned','Vinis':'vinys','Yᴏᴜ':'you','Trumpsters':'trump','Trumpian':'trump','bigly':'big league','Trumpism':'trump','Yoyou':'you','Auwe':'wonder','Drumpf':'trump','utmterm':'utm term','Brexit':'british exit','utilitas':'utilities','ᴀ':'a', '😉':'wink','😂':'joy','😀':'stuck out tongue', 'theguardian':'the guardian','deplorables':'deplorable', 'theglobeandmail':'the globe and mail', 'justiciaries': 'justiciary','creditdation': 'Accreditation','doctrne':'doctrine','fentayal': 'fentanyl','designation-': 'designation','CONartist' : 'con-artist','Mutilitated' : 'Mutilated','Obumblers': 'bumblers','negotiatiations': 'negotiations','dood-': 'dood','irakis' : 'iraki','cooerate': 'cooperate','COx':'cox','racistcomments':'racist comments','envirnmetalists': 'environmentalists',}

# ref: https://www.kaggle.com/adityaecdrid/public-version-text-cleaning-vocab-65
contraction_mapping = {
    "Trump's" : 'trump is',"'cause": 'because',',cause': 'because',';cause': 'because',"ain't": 'am not','ain,t': 'am not',
    'ain;t': 'am not','ain´t': 'am not','ain’t': 'am not',"aren't": 'are not',
    'aren,t': 'are not','aren;t': 'are not','aren´t': 'are not','aren’t': 'are not',"can't": 'cannot',"can't've": 'cannot have','can,t': 'cannot','can,t,ve': 'cannot have',
    'can;t': 'cannot','can;t;ve': 'cannot have',
    'can´t': 'cannot','can´t´ve': 'cannot have','can’t': 'cannot','can’t’ve': 'cannot have',
    "could've": 'could have','could,ve': 'could have','could;ve': 'could have',"couldn't": 'could not',"couldn't've": 'could not have','couldn,t': 'could not','couldn,t,ve': 'could not have','couldn;t': 'could not',
    'couldn;t;ve': 'could not have','couldn´t': 'could not',
    'couldn´t´ve': 'could not have','couldn’t': 'could not','couldn’t’ve': 'could not have','could´ve': 'could have',
    'could’ve': 'could have',"didn't": 'did not','didn,t': 'did not','didn;t': 'did not','didn´t': 'did not',
    'didn’t': 'did not',"doesn't": 'does not','doesn,t': 'does not','doesn;t': 'does not','doesn´t': 'does not',
    'doesn’t': 'does not',"don't": 'do not','don,t': 'do not','don;t': 'do not','don´t': 'do not','don’t': 'do not',
    "hadn't": 'had not',"hadn't've": 'had not have','hadn,t': 'had not','hadn,t,ve': 'had not have','hadn;t': 'had not',
    'hadn;t;ve': 'had not have','hadn´t': 'had not','hadn´t´ve': 'had not have','hadn’t': 'had not','hadn’t’ve': 'had not have',"hasn't": 'has not','hasn,t': 'has not','hasn;t': 'has not','hasn´t': 'has not','hasn’t': 'has not',
    "haven't": 'have not','haven,t': 'have not','haven;t': 'have not','haven´t': 'have not','haven’t': 'have not',"he'd": 'he would',
    "he'd've": 'he would have',"he'll": 'he will',
    "he's": 'he is','he,d': 'he would','he,d,ve': 'he would have','he,ll': 'he will','he,s': 'he is','he;d': 'he would',
    'he;d;ve': 'he would have','he;ll': 'he will','he;s': 'he is','he´d': 'he would','he´d´ve': 'he would have','he´ll': 'he will',
    'he´s': 'he is','he’d': 'he would','he’d’ve': 'he would have','he’ll': 'he will','he’s': 'he is',"how'd": 'how did',"how'll": 'how will',
    "how's": 'how is','how,d': 'how did','how,ll': 'how will','how,s': 'how is','how;d': 'how did','how;ll': 'how will',
    'how;s': 'how is','how´d': 'how did','how´ll': 'how will','how´s': 'how is','how’d': 'how did','how’ll': 'how will',
    'how’s': 'how is',"i'd": 'i would',"i'll": 'i will',"i'm": 'i am',"i've": 'i have','i,d': 'i would','i,ll': 'i will',
    'i,m': 'i am','i,ve': 'i have','i;d': 'i would','i;ll': 'i will','i;m': 'i am','i;ve': 'i have',"isn't": 'is not',
    'isn,t': 'is not','isn;t': 'is not','isn´t': 'is not','isn’t': 'is not',"it'd": 'it would',"it'll": 'it will',"It's":'it is',
    "it's": 'it is','it,d': 'it would','it,ll': 'it will','it,s': 'it is','it;d': 'it would','it;ll': 'it will','it;s': 'it is','it´d': 'it would','it´ll': 'it will','it´s': 'it is',
    'it’d': 'it would','it’ll': 'it will','it’s': 'it is',
    'i´d': 'i would','i´ll': 'i will','i´m': 'i am','i´ve': 'i have','i’d': 'i would','i’ll': 'i will','i’m': 'i am',
    'i’ve': 'i have',"let's": 'let us','let,s': 'let us','let;s': 'let us','let´s': 'let us',
    'let’s': 'let us',"ma'am": 'madam','ma,am': 'madam','ma;am': 'madam',"mayn't": 'may not','mayn,t': 'may not','mayn;t': 'may not',
    'mayn´t': 'may not','mayn’t': 'may not','ma´am': 'madam','ma’am': 'madam',"might've": 'might have','might,ve': 'might have','might;ve': 'might have',"mightn't": 'might not','mightn,t': 'might not','mightn;t': 'might not','mightn´t': 'might not',
    'mightn’t': 'might not','might´ve': 'might have','might’ve': 'might have',"must've": 'must have','must,ve': 'must have','must;ve': 'must have',
    "mustn't": 'must not','mustn,t': 'must not','mustn;t': 'must not','mustn´t': 'must not','mustn’t': 'must not','must´ve': 'must have',
    'must’ve': 'must have',"needn't": 'need not','needn,t': 'need not','needn;t': 'need not','needn´t': 'need not','needn’t': 'need not',"oughtn't": 'ought not','oughtn,t': 'ought not','oughtn;t': 'ought not',
    'oughtn´t': 'ought not','oughtn’t': 'ought not',"sha'n't": 'shall not','sha,n,t': 'shall not','sha;n;t': 'shall not',"shan't": 'shall not',
    'shan,t': 'shall not','shan;t': 'shall not','shan´t': 'shall not','shan’t': 'shall not','sha´n´t': 'shall not','sha’n’t': 'shall not',
    "she'd": 'she would',"she'll": 'she will',"she's": 'she is','she,d': 'she would','she,ll': 'she will',
    'she,s': 'she is','she;d': 'she would','she;ll': 'she will','she;s': 'she is','she´d': 'she would','she´ll': 'she will',
    'she´s': 'she is','she’d': 'she would','she’ll': 'she will','she’s': 'she is',"should've": 'should have','should,ve': 'should have','should;ve': 'should have',
    "shouldn't": 'should not','shouldn,t': 'should not','shouldn;t': 'should not','shouldn´t': 'should not','shouldn’t': 'should not','should´ve': 'should have',
    'should’ve': 'should have',"that'd": 'that would',"that's": 'that is','that,d': 'that would','that,s': 'that is','that;d': 'that would',
    'that;s': 'that is','that´d': 'that would','that´s': 'that is','that’d': 'that would','that’s': 'that is',"there'd": 'there had',
    "there's": 'there is','there,d': 'there had','there,s': 'there is','there;d': 'there had','there;s': 'there is',
    'there´d': 'there had','there´s': 'there is','there’d': 'there had','there’s': 'there is',
    "they'd": 'they would',"they'll": 'they will',"they're": 'they are',"they've": 'they have',
    'they,d': 'they would','they,ll': 'they will','they,re': 'they are','they,ve': 'they have','they;d': 'they would','they;ll': 'they will','they;re': 'they are',
    'they;ve': 'they have','they´d': 'they would','they´ll': 'they will','they´re': 'they are','they´ve': 'they have','they’d': 'they would','they’ll': 'they will',
    'they’re': 'they are','they’ve': 'they have',"wasn't": 'was not','wasn,t': 'was not','wasn;t': 'was not','wasn´t': 'was not',
    'wasn’t': 'was not',"we'd": 'we would',"we'll": 'we will',"we're": 'we are',"we've": 'we have','we,d': 'we would','we,ll': 'we will',
    'we,re': 'we are','we,ve': 'we have','we;d': 'we would','we;ll': 'we will','we;re': 'we are','we;ve': 'we have',
    "weren't": 'were not','weren,t': 'were not','weren;t': 'were not','weren´t': 'were not','weren’t': 'were not','we´d': 'we would','we´ll': 'we will',
    'we´re': 'we are','we´ve': 'we have','we’d': 'we would','we’ll': 'we will','we’re': 'we are','we’ve': 'we have',"what'll": 'what will',"what're": 'what are',"what's": 'what is',
    "what've": 'what have','what,ll': 'what will','what,re': 'what are','what,s': 'what is','what,ve': 'what have','what;ll': 'what will','what;re': 'what are',
    'what;s': 'what is','what;ve': 'what have','what´ll': 'what will',
    'what´re': 'what are','what´s': 'what is','what´ve': 'what have','what’ll': 'what will','what’re': 'what are','what’s': 'what is',
    'what’ve': 'what have',"where'd": 'where did',"where's": 'where is','where,d': 'where did','where,s': 'where is','where;d': 'where did',
    'where;s': 'where is','where´d': 'where did','where´s': 'where is','where’d': 'where did','where’s': 'where is',
    "who'll": 'who will',"who's": 'who is','who,ll': 'who will','who,s': 'who is','who;ll': 'who will','who;s': 'who is',
    'who´ll': 'who will','who´s': 'who is','who’ll': 'who will','who’s': 'who is',"won't": 'will not','won,t': 'will not','won;t': 'will not',
    'won´t': 'will not','won’t': 'will not',"wouldn't": 'would not','wouldn,t': 'would not','wouldn;t': 'would not','wouldn´t': 'would not',
    'wouldn’t': 'would not',"you'd": 'you would',"you'll": 'you will',"you're": 'you are','you,d': 'you would','you,ll': 'you will',
    'you,re': 'you are','you;d': 'you would','you;ll': 'you will',
    'you;re': 'you are','you´d': 'you would','you´ll': 'you will','you´re': 'you are','you’d': 'you would','you’ll': 'you will','you’re': 'you are',
    '´cause': 'because','’cause': 'because',"you've": "you have","could'nt": 'could not',
    "havn't": 'have not',"here’s": "here is",'i""m': 'i am',"i'am": 'i am',"i'l": "i will","i'v": 'i have',"wan't": 'want',"was'nt": "was not","who'd": "who would",
    "who're": "who are","who've": "who have","why'd": "why would","would've": "would have","y'all": "you all","y'know": "you know","you.i": "you i",
    "your'e": "you are","arn't": "are not","agains't": "against","c'mon": "common","doens't": "does not",'don""t': "do not","dosen't": "does not",
    "dosn't": "does not","shoudn't": "should not","that'll": "that will","there'll": "there will","there're": "there are",
    "this'll": "this all","u're": "you are", "ya'll": "you all","you'r": "you are","you’ve": "you have","d'int": "did not","did'nt": "did not","din't": "did not","dont't": "do not","gov't": "government",
    "i'ma": "i am","is'nt": "is not","‘I":'I',
    'ᴀɴᴅ':'and','ᴛʜᴇ':'the','ʜᴏᴍᴇ':'home','ᴜᴘ':'up','ʙʏ':'by','ᴀᴛ':'at','…and':'and','civilbeat':'civil beat',\
    'TrumpCare':'Trump care','Trumpcare':'Trump care', 'OBAMAcare':'Obama care','ᴄʜᴇᴄᴋ':'check','ғᴏʀ':'for','ᴛʜɪs':'this','ᴄᴏᴍᴘᴜᴛᴇʀ':'computer',\
    'ᴍᴏɴᴛʜ':'month','ᴡᴏʀᴋɪɴɢ':'working','ᴊᴏʙ':'job','ғʀᴏᴍ':'from','Sᴛᴀʀᴛ':'start','gubmit':'submit','CO₂':'carbon dioxide','ғɪʀsᴛ':'first',\
    'ᴇɴᴅ':'end','ᴄᴀɴ':'can','ʜᴀᴠᴇ':'have','ᴛᴏ':'to','ʟɪɴᴋ':'link','ᴏғ':'of','ʜᴏᴜʀʟʏ':'hourly','ᴡᴇᴇᴋ':'week','ᴇɴᴅ':'end','ᴇxᴛʀᴀ':'extra',\
    'Gʀᴇᴀᴛ':'great','sᴛᴜᴅᴇɴᴛs':'student','sᴛᴀʏ':'stay','ᴍᴏᴍs':'mother','ᴏʀ':'or','ᴀɴʏᴏɴᴇ':'anyone','ɴᴇᴇᴅɪɴɢ':'needing','ᴀɴ':'an','ɪɴᴄᴏᴍᴇ':'income',\
    'ʀᴇʟɪᴀʙʟᴇ':'reliable','ғɪʀsᴛ':'first','ʏᴏᴜʀ':'your','sɪɢɴɪɴɢ':'signing','ʙᴏᴛᴛᴏᴍ':'bottom','ғᴏʟʟᴏᴡɪɴɢ':'following','Mᴀᴋᴇ':'make',\
    'ᴄᴏɴɴᴇᴄᴛɪᴏɴ':'connection','ɪɴᴛᴇʀɴᴇᴛ':'internet','financialpost':'financial post', 'ʜaᴠᴇ':' have ', 'ᴄaɴ':' can ', 'Maᴋᴇ':' make ', 'ʀᴇʟɪaʙʟᴇ':' reliable ', 'ɴᴇᴇᴅ':' need ',
    'ᴏɴʟʏ':' only ', 'ᴇxᴛʀa':' extra ', 'aɴ':' an ', 'aɴʏᴏɴᴇ':' anyone ', 'sᴛaʏ':' stay ', 'Sᴛaʀᴛ':' start', 'SHOPO':'shop',
    }

In [6]:
# Load training set
train = pd.read_csv("/Users/elenabg/DetoxiPy/train.csv")

In [7]:
# Generate new columns

train['total_length'] = train['comment_text'].apply(len)
train['capitals'] = train['comment_text'].apply(lambda comment: \
                               sum(1 for c in comment if c.isupper()))
train['caps_ratio'] = train.apply(lambda row: \
                      float(row['capitals'])/float(row['total_length']),axis=1)
train['num_exclamation_marks'] = train['comment_text'].apply(lambda \
                                                             comment: comment.count('!'))
train['excl_ratio'] = train.apply(lambda row: \
                      float(row['num_exclamation_marks'])/float(row['total_length']),axis=1)
train['num_question_marks'] = train['comment_text'].apply(lambda comment: comment.count('?'))

train['quest_ratio'] = train.apply(lambda row: \
                      float(row['num_question_marks'])/float(row['total_length']),axis=1)
train['num_punctuation'] = train['comment_text'].apply(lambda \
                                            comment: sum(comment.count(w) for w in '.,;:'))

train['punct_ratio'] = train.apply(lambda row: \
                      float(row['num_punctuation'])/float(row['total_length']),axis=1)
train['num_symbols'] = train['comment_text'].apply(lambda comment:\
                sum(comment.count(w) for w in '*&$%'))
train['symb_ratio'] = train.apply(lambda row: \
                      float(row['num_symbols'])/float(row['total_length']),axis=1)
train['num_words'] = train['comment_text'].apply(lambda comment: len(comment.split()))
train['num_unique_words'] = train['comment_text'].apply(lambda comment: len(set(w for \
                                                        w in comment.split())))
train['unique_ratio'] = train['num_unique_words'] / train['num_words']
train['num_smilies'] = train['comment_text'].apply(lambda comment: \
                           sum(comment.count(w) for w in SPL_SEQ_DICT["emojis"] ))
train['prop_nouns_num'] = train['comment_text'].apply(lambda comment: \
               sum(comment.lower().count(w) for w in SPL_SEQ_DICT["proper nouns"] ))

### Cleaning functions

In [2]:
def build_vocab(texts):
    sentences = texts.apply(lambda x: x.split()).values
    vocab = {}
    for sentence in sentences:
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

def known_contractions(embed):
    known = []
    for contract in contraction_mapping:
        if contract in embed:
            known.append(contract)
    return known

def clean_contractions(text, mapping):
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    text = ' '.join([mapping[t] if t in mapping else t for t in text.split(" ")])
    return text

def add_lower(embedding, vocab):
    '''Since we are not going to get rid of uppercase, 
    add lowercase versions of words to embedding'''
    count = 0
    for word in vocab:
        if word in embedding and word.lower() not in embedding:  
            embedding[word.lower()] = embedding[word]
            count += 1
    print(f"Added {count} words to embedding")


def check_unknown_punct(embed, punct):
    '''Similar to lowercase, add punctuation to 
    embeddings'''
    unknown = ''
    for p in punct:
        if p not in embed:
            unknown += p
            unknown += ' '
    return unknown

def clean_special_chars(text, punct, mapping):
    for p in mapping:
        text = text.replace(p, mapping[p])
    for p in punct:
        text = text.replace(p, f' {p} ')
    specials = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''}  # Other special characters that I have to deal with in last
    for s in specials:
        text = text.replace(s, specials[s])
    return text

def correct_spelling(x, dic):
    '''
    Very simple correction for commonly mispelled words/terms in the text
    '''
    for word in dic.keys():
        x = x.replace(word, dic[word])
    return x

### Embed coverage functions

In [3]:
def load_embed(file):
    def get_coefs(word,*arr): 
        return word, np.asarray(arr, dtype='float32')
    if file == '../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec':
        embeddings_index = gensim.models.KeyedVectors.load_word2vec_format(crawl)
    else:
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(file, encoding='latin'))
    return embeddings_index

def check_coverage(vocab, embeddings_index):
    '''Check how many words are known in 
    the built-in glove embedding'''
    
    known_words = {}
    unknown_words = {}
    nb_known_words = 0
    nb_unknown_words = 0
    for word in vocab.keys():
        try:
            known_words[word] = embeddings_index[word]
            nb_known_words += vocab[word]
        except:
            unknown_words[word] = vocab[word]
            nb_unknown_words += vocab[word]
            pass

    print('Found embeddings for {:.2%} of vocab'.format(len(known_words) / len(vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(nb_known_words / (nb_known_words + nb_unknown_words)))
    unknown_words = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]

    return unknown_words

### Reduce memory usage

In [13]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

### Check embeddings between cleaning stages

In [4]:
# Load glove embeddings from file (can be dowloaded here: https://www.kaggle.com/takuok/glove840b300dtxt#glove.840B.300d.txt)

file = "/Users/elenabg/Documents/6Q/AML/Project/glove.840B.300d.txt"
embed_glove = load_embed(file)

In [6]:
# Save embedding
pickle.dump(embed_glove, open("/Users/elenabg/Documents/6Q/AML/Project/glvembed.p", 'wb'))

In [12]:
#1. Initial vocabulary from raw text
vocab = build_vocab(train['comment_text']) 

In [15]:
#1. Check initial coverage

print("Glove : ")
oov_glove = check_coverage(vocab, embed_glove)

Glove : 
Found embeddings for 15.76% of vocab
Found embeddings for  89.60% of all text


In [18]:
#2. Add (missing) lower-case to embedding

print("Glove : ")
add_lower(embed_glove, vocab)

# Check Result
oov_glove[:10]

Glove : 
Added 24243 words to embedding


[("isn't", 39964),
 ("That's", 37640),
 ("won't", 29397),
 ("he's", 24353),
 ("Trump's", 23453),
 ("aren't", 20528),
 ("wouldn't", 19544),
 ('Yes,', 19043),
 ('that,', 18283),
 ("wasn't", 18153)]

In [19]:
# 3. Punctuation

print("Glove (Unkown punc %):")
print(check_unknown_punct(embed_glove, punct))

Glove (Unkown punc %):
“ ” ’ ∞ θ ÷ α • à − β ∅ ³ π ‘ ₹ ´ ° £ € × ™ √ ² — – 


In [22]:
train['treated_comment'] = train['comment_text'].apply(lambda x: clean_special_chars(x,
                                                                    punct, punct_mapping))
new_vocab = build_vocab(train['treated_comment'])

print("Glove : ")
oov_glove = check_coverage(new_vocab, embed_glove)
oov_glove[:10]

Glove : 
Found embeddings for 62.64% of vocab
Found embeddings for  99.72% of all text


[('tRump', 2525),
 ('Brexit', 1740),
 ('theglobeandmail', 1351),
 ('Québec', 1331),
 ('Drumpf', 1186),
 ('deplorables', 1026),
 ('SB91', 781),
 ('theguardian', 735),
 ('Trumpcare', 570),
 ('✰', 550)]

In [23]:
# 4 Contractions

print("- Known Contractions -")
print("   Glove :")
print(known_contractions(embed_glove))

- Known Contractions -
   Glove :
["'cause", "can't", "didn't", "doesn't", "don't", "i'd", "i'll", "i'm", "i've", "It's", "it's", "ma'am", "that's", "you'll", "you're", 'you.i', "c'mon", "d'int"]


In [25]:
train['treated_comment'] = train['treated_comment'].apply(lambda x: clean_contractions(x,
                                                        contraction_mapping))
new_vocab = build_vocab(train['treated_comment'])
print("Glove : ")
oov_glove = check_coverage(new_vocab, embed_glove)

Glove : 
Found embeddings for 62.65% of vocab
Found embeddings for  99.72% of all text


In [26]:
# 5. Fixed spelling
train['treated_comment'] = train['treated_comment'].apply(lambda x: correct_spelling(x,
                                        mispell_dict))
new_vocab = build_vocab(train['treated_comment'])
print("Glove : ")
oov_glove = check_coverage(new_vocab, embed_glove)

Glove : 
Found embeddings for 62.66% of vocab
Found embeddings for  99.74% of all text


### Measure "difficulty" of initial/final training datasets

In [27]:
df = train.sample(frac=0.003)

In [31]:
# initial texts
sents = df["comment_text"].values
labels = df["target"].values
print(report.get_difficulty_report(sents, labels))

----> Building bag of words representations...
[-------------------------------] : 5414 of 5415, 100.0% : Est. 0.0 mins Remaining               ] : 154 of 5415, 2.8% : Est. 0.3 mins Remaining    ] : 222 of 5415, 4.1% : Est. 0.3 mins Remaining] : 294 of 5415, 5.4% : Est. 0.3 mins Remaining                         ] : 372 of 5415, 6.9% : Est. 0.2 mins Remaining             ] : 435 of 5415, 8.0% : Est. 0.3 mins Remaining                    ] : 492 of 5415, 9.1% : Est. 0.3 mins Remaining                          ] : 576 of 5415, 10.6% : Est. 0.2 mins Remaining                      ] : 654 of 5415, 12.1% : Est. 0.2 mins Remaining     ] : 730 of 5415, 13.5% : Est. 0.2 mins Remaining   ] : 794 of 5415, 14.7% : Est. 0.2 mins Remaining-----                         ] : 854 of 5415, 15.8% : Est. 0.2 mins Remaining                ] : 935 of 5415, 17.3% : Est. 0.2 mins Remaining             ] : 1013 of 5415, 18.7% : Est. 0.2 mins Remaining             ] : 1083 of 5415, 20.0% : Est. 0.2 mins Remaini

In [30]:
# clean text difficulty
sents_fin = df["treated_comment"].values
print(report.get_difficulty_report(sents_fin, labels))

----> Building bag of words representations...
[-------------------------------] : 5414 of 5415, 100.0% : Est. 0.0 mins Remaining                       ] : 87 of 5415, 1.6% : Est. 0.6 mins Remaining                    ] : 147 of 5415, 2.7% : Est. 0.5 mins Remaining-                            ] : 212 of 5415, 3.9% : Est. 0.4 mins Remaining ] : 284 of 5415, 5.2% : Est. 0.4 mins Remaining              ] : 343 of 5415, 6.3% : Est. 0.3 mins Remaining    ] : 414 of 5415, 7.6% : Est. 0.3 mins Remaining           ] : 477 of 5415, 8.8% : Est. 0.3 mins Remaining                        ] : 618 of 5415, 11.4% : Est. 0.3 mins Remaining                     ] : 696 of 5415, 12.9% : Est. 0.3 mins Remaining      ] : 769 of 5415, 14.2% : Est. 0.3 mins Remaining  ] : 836 of 5415, 15.4% : Est. 0.3 mins Remaining------                        ] : 915 of 5415, 16.9% : Est. 0.3 mins Remaining---                        ] : 993 of 5415, 18.3% : Est. 0.2 mins Remaining                       ] : 1071 of 5415, 19

In [7]:
# Save clean training set
pickle.dump(train, open("/Users/elenabg/Documents/6Q/AML/Project/clean_train.p", 'wb'))