In [1]:
import emoji
import os
import pandas as pd
import re


In [2]:
def correct_ampersands(match):
    """Put spaces around '&' in a matched token unless it looks like an acronym.

    Meant to be passed to ``re.sub`` as the replacement callback.

    Args:
        match: regex match object; its full text (group 0) is inspected.

    Returns:
        The matched text unchanged when every '&'-separated piece is
        upper-case (e.g. ``AT&T``); otherwise the pieces joined with ' & '.
    """
    token = match.group(0)
    pieces = token.split('&')
    for piece in pieces:
        # One piece that is not upper-case (an empty piece counts too)
        # is enough to treat the token as ordinary text.
        if not piece.isupper():
            return ' & '.join(pieces)
    return token

        
def correct_slashsplitted(match):
    """Return the matched text with every '/' and '\\' turned into a space.

    Meant to be passed to ``re.sub`` as the replacement callback.

    Args:
        match: regex match object; its full text (group 0) is rewritten.
    """
    text = match.group(0)
    for separator in ('/', '\\'):
        text = text.replace(separator, ' ')
    return text

        
    
def remove_links(phrase):
    """Delete link tokens from ``phrase``.

    A whitespace-delimited token is removed entirely when it contains
    ``http`` (any case) or a ``t.co/`` short link that starts on a word
    boundary. The surrounding whitespace is left in place.

    Args:
        phrase: text to clean.

    Returns:
        ``phrase`` with every link token replaced by the empty string.
    """
    # Requiring the "/" and a word boundary keeps ordinary text with a missing
    # space after a full stop (e.g. "wait.Come") from being mistaken for a
    # t.co link and wiped out.
    return re.sub(r"\S*(?:http|\bt\.co/)\S*", "", phrase, flags=re.IGNORECASE)

def _strip_emoji(text):
    """Return ``text`` with all emoji removed, on both old and new ``emoji`` releases."""
    # emoji 2.0 removed get_emoji_regexp(); replace_emoji() is the supported successor.
    if hasattr(emoji, 'replace_emoji'):
        return emoji.replace_emoji(text, replace='')
    return emoji.get_emoji_regexp().sub('', text)


def global_processing(df, max_num_mentions=4, max_hashtags=4):
    '''
    Global preprocessing/data munging

    Cleans the ``full_text`` column and drops rows that look like spam.
    Per row, in order:
      1. typographic quotes are replaced by their ASCII equivalents;
      2. the HTML entities ``&amp``, ``&gt``, ``&lt`` (with any trailing
         ``;``) are decoded;
      3. tokens containing '&' are passed through ``correct_ampersands``;
      4. emoji are removed;
      5. links are removed with ``remove_links``.
    Then rows are dropped when a single line holds more than
    ``max_num_mentions`` '@' characters, or when the text has a run of more
    than ``max_hashtags`` consecutive whitespace-terminated hashtags.
    The number of rows matched by each filter is printed.

    Args:
        df: DataFrame with a string ``full_text`` column. It is not modified.
        max_num_mentions: largest tolerated number of '@' on one line.
        max_hashtags: largest tolerated run of consecutive hashtags.

    Returns:
        A new DataFrame with cleaned text, spam rows removed and a fresh
        0..n-1 index.
    '''

    def _clean_text(x):
        x = x.replace("’", "'").replace("”", '"').replace("“", '"')
        x = re.sub(r'&amp;*', '&', x, flags=re.IGNORECASE)
        x = re.sub(r'&gt;*', '>', x, flags=re.IGNORECASE)
        x = re.sub(r'&lt;*', '<', x, flags=re.IGNORECASE)
        x = re.sub(r'\S*&\S+', correct_ampersands, x)
        x = _strip_emoji(x)
        return remove_links(x)

    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()
    df['full_text'] = df['full_text'].apply(_clean_text)

    spam_patterns = (
        r"(\@.*){%d,}" % (max_num_mentions + 1),
        r"(\#\w+\s){%d,}" % (max_hashtags + 1),
    )
    for pattern in spam_patterns:
        spam_re = re.compile(pattern)
        is_spam = df['full_text'].apply(lambda x: spam_re.search(x) is not None).astype(bool)
        print(int(is_spam.sum()))
        # A boolean mask is correct for any index; the previous positional
        # lookup by label only worked on a default RangeIndex.
        df = df.loc[~is_spam].reset_index(drop=True)

    return df

# Get Data

In [3]:
# Load the raw CSV, then show the number of distinct values per column
# followed by the column dtypes and the frame's shape.
df = pd.read_csv("final_dataset_Elizabeth.csv")
display(df.nunique())
df.dtypes, df.shape

id_str        22198
created_at    20577
full_text     23753
dtype: int64

(id_str         int64
 created_at    object
 full_text     object
 dtype: object, (27970, 3))

In [4]:
# Keep a single row per id_str and renumber the index from 0.
df = df.drop_duplicates(subset=['id_str']).reset_index(drop=True)
df.shape

(22198, 3)

In [5]:
# Clean the text and drop spam rows; the two printed numbers are the rows
# matched by the mention filter and by the hashtag filter, in that order.
df = global_processing(df)


4685
32


In [6]:
# Rows/columns left after global_processing.
df.shape

(17481, 3)

# Decontracting

In [7]:
# Ten most frequent contraction-like tokens (first match per row only).
# NOTE(review): inside [...] the '|' is a literal pipe, so the class matches ', | or ".
df['full_text'].apply(lambda x:re.search("[a-zA-Z]{1,7}[\'|\"][a-zA-Z]{1,3}", x)).dropna().apply(lambda x:x.group(0)).value_counts()[:10]

don't      693
I'm        559
It's       400
didn't     357
it's       356
you're     270
can't      242
That's     235
You're     230
doesn't    207
Name: full_text, dtype: int64

In [8]:
def decontract(phrase):
    """Expand common English contractions in ``phrase``.

    Matching is case-insensitive and the replacements are lower-case, so
    e.g. ``CAN'T`` becomes ``can not``. Rules are applied in the order
    listed. ``'s`` is always expanded to `` is``, possessives included.

    Args:
        phrase: text to rewrite.

    Returns:
        ``phrase`` with the listed contractions expanded.
    """
    rewrite_rules = (
        # specific: irregular forms must be handled before the generic suffixes
        (r"can\'t", "can not"),
        (r"won\'t", "will not"),
        (r"let\'s", "let us"),
        (r"y\'all", "you all"),
        # general: replacements start with a space on purpose
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in rewrite_rules:
        phrase = re.sub(pattern, expansion, phrase, flags=re.IGNORECASE)
    return phrase

In [9]:
# Expand contractions in every row of full_text.
df['full_text'] = df['full_text'].apply(decontract)

In [10]:
# Same scan as before decontracting, to see which contraction-like tokens remain.
# NOTE(review): inside [...] the '|' is a literal pipe, so the class matches ', | or ".
df['full_text'].apply(lambda x:re.search("[a-zA-Z]{1,7}[\'|\"][a-zA-Z]{1,3}", x)).dropna().apply(lambda x:x.group(0)).value_counts()[:10]

ma'am      6
your'e     4
Ma'am      2
D'Ale      1
Your'e     1
his"bef    1
is"lik     1
AIN"T      1
F'ing      1
MA'AM      1
Name: full_text, dtype: int64

# Tags

In [12]:
# Ten most frequent @handles, counting only the first handle in each row.
# The pattern is a raw string so the regex escapes are not parsed as
# (invalid) Python string escapes.
(
    df['full_text']
    .apply(lambda x: re.search(r"\@\w+", x, flags=re.IGNORECASE))
    .dropna()
    .apply(lambda x: x.group(0))
    .value_counts()[:10]
)

@ewarren           8926
@ProudResister      221
@cbszak             141
@BernieSanders      137
@thehill            118
@MarkYoungTruth      94
@RepSchakowsky       75
@ryangrim            63
@willclog18          62
@DisastrouslyH       51
Name: full_text, dtype: int64

In [17]:
def remove_tags(phrase):
    """Remove the handles ``@ewarren`` and ``@TeamWarren`` from ``phrase``.

    Matching ignores case. A handle is removed only when it is complete,
    i.e. not followed by another word character, so a longer handle such as
    ``@ewarren2020`` is left intact instead of being cut down to ``2020``.

    Args:
        phrase: text to clean.

    Returns:
        ``phrase`` with both handles deleted; surrounding whitespace is kept.
    """
    return re.sub(r"@(?:ewarren|TeamWarren)(?!\w)", "", phrase, flags=re.IGNORECASE)

In [18]:
# Strip the handles targeted by remove_tags from every row.
df['full_text'] = df['full_text'].apply(remove_tags)

In [19]:
# Frequency of the first remaining @handle in each row.
# The pattern is a raw string so the regex escapes are not parsed as
# (invalid) Python string escapes.
(
    df['full_text']
    .apply(lambda x: re.search(r"\@\w+", x))
    .dropna()
    .apply(lambda x: x.group(0))
    .str.strip()
    .value_counts()
)

@ProudResister     251
@BernieSanders     226
@cbszak            141
@thehill           118
@RepSchakowsky     116
                  ... 
@58littleflower      1
@BScottAnderson      1
@mmillernc17         1
@Paulie_Sigh         1
@historianed_        1
Name: full_text, Length: 3403, dtype: int64

In [20]:
# Shape check: removing handles edits text only, so no rows should be lost.
df.shape

(17481, 3)

# Hashtags

In [21]:
# Twenty most frequent hashtags, counting only the first hashtag in each row.
# The pattern is a raw string so the regex escapes are not parsed as
# (invalid) Python string escapes.
# NOTE(review): the (elizabeth|warren) group is optional ({0,1}), so this
# matches every hashtag, not only the name-related ones.
(
    df['full_text']
    .apply(lambda x: re.search(r"\#\w*(elizabeth|warren){0,1}\w*", x, re.IGNORECASE))
    .dropna()
    .apply(lambda x: x.group(0))
    .value_counts()[:20]
)

#Warren2020                     95
#MedicareForAll                 33
#TeamWarren                     32
#Pocahontas                     24
#Trump2020                      20
#ElizabethWarren                19
#MAGA                           16
#Democrats                      14
#WinWithWarren                  14
#M4A                            14
#AMJoy                          13
#FATCA                          13
#Thanksgiving                   13
#1                              13
#100DaysOfCode                  12
#Bernie2020                     12
#LiberalismIsAMentalDisorder    11
#DreamBigFightHard              11
#Fauxcahontas                   11
#ADOS                           10
Name: full_text, dtype: int64

In [28]:
def remove_hashtags(phrase):
    """Remove the campaign hashtags listed below from ``phrase``.

    Patterns are applied in order, longest first, so that the bare
    ``#Warren`` rule only sees what the more specific rules left behind.
    The bracketed classes accept an upper- or lower-case first letter.

    Args:
        phrase: text to clean.

    Returns:
        ``phrase`` with the matching hashtag text deleted.
    """
    patterns = (
        r"#[Ww]arren2020",
        r"#WinWithWarren",
        r"#[Tt]eamWarren",
        # Previously written [E|l], which missed "#elizabeth..." entirely.
        r"#[Ee]lizabeth[Ww]arren",
        r"#[Ww]arren",
    )
    for pattern in patterns:
        phrase = re.sub(pattern, "", phrase)
    return phrase

In [29]:
# Strip the hashtags targeted by remove_hashtags from every row.
df['full_text'] = df['full_text'].apply(remove_hashtags)

In [30]:
# Twenty most frequent first-hashtags per row after the removal step.
# The pattern is a raw string so the regex escapes are not parsed as
# (invalid) Python string escapes.
# NOTE(review): the (elizabeth|warren) group is optional ({0,1}), so this
# matches every hashtag, including a bare '#'.
(
    df['full_text']
    .apply(lambda x: re.search(r"\#\w*(elizabeth|warren){0,1}\w*", x))
    .dropna()
    .apply(lambda x: x.group(0))
    .value_counts()[:20]
)

#MedicareForAll                 34
#Pocahontas                     24
#Trump2020                      20
#                               19
#MAGA                           16
#DreamBigFightHard              15
#Democrats                      15
#M4A                            14
#FATCA                          13
#AMJoy                          13
#Thanksgiving                   13
#1                              13
#100DaysOfCode                  12
#Bernie2020                     12
#Fauxcahontas                   11
#LiberalismIsAMentalDisorder    11
#ADOS                           10
#iacaucus                        9
#BigStructuralChange             9
#Medicare4All                    9
Name: full_text, dtype: int64

# (Obvious) Mentions

In [32]:
# Ten most frequent whitespace-delimited tokens containing "elizabeth" or
# "warren" (any case), counting only the first such token in each row.
# The pattern is a raw string so the regex escapes are not parsed as
# (invalid) Python string escapes.
# NOTE(review): '|' inside [...] is a literal pipe, not alternation.
(
    df['full_text']
    .apply(lambda x: re.search(r"\s+\w*([E|e]lizabeth|[W|w]arren){1,}\w*\s+", x, re.IGNORECASE))
    .dropna()
    .apply(lambda x: x.group(0))
    .str.strip()
    .value_counts()[:10]
)

Elizabeth    240
warren        29
WARREN         5
elizabeth      4
warrens        3
ELIZABETH      3
WARRENS        1
ewarren        1
wArReN         1
WARren         1
Name: full_text, dtype: int64

In [36]:
# Distinct name-mention strings found in the data (first match per row,
# case-sensitive, surrounding whitespace included), most frequent first.
# The pattern is a raw string so the regex escapes are not parsed as
# (invalid) Python string escapes.
temp = (
    df['full_text']
    .apply(lambda x: re.search(r"\s+\w*([E|e]lizabeth|[W|w]arren){1,}\w*\s+", x))
    .dropna()
    .apply(lambda x: x.group(0))
    .value_counts()
    .index
)

def remove_mentions(phrase):
    """Remove whitespace-delimited words containing "Elizabeth" or "Warren".

    A word is removed when it contains ``Elizabeth``/``elizabeth`` or
    ``Warren``/``warren``, is preceded by whitespace and is followed by
    whitespace. The leading whitespace goes with the word; the trailing
    whitespace is only looked at, not consumed. This keeps the neighbouring
    words separated and lets two names in a row both be removed.

    A name at the very start or end of ``phrase`` is not touched.

    Args:
        phrase: text to clean.

    Returns:
        ``phrase`` with the matching words deleted.
    """
    return re.sub(r"\s+\w*(?:[Ee]lizabeth|[Ww]arren)+\w*(?=\s)", "", phrase)

In [37]:
# Strip in-text name mentions from every row.
df['full_text'] = df['full_text'].apply(remove_mentions)

In [38]:
# Sanity check: no whitespace-delimited name mention should be left
# (case-sensitive scan), so an empty Series is the expected result.
# The pattern is a raw string so the regex escapes are not parsed as
# (invalid) Python string escapes.
(
    df['full_text']
    .apply(lambda x: re.search(r"\s+\w*([E|e]lizabeth|[W|w]arren){1,}\w*\s+", x))
    .dropna()
    .apply(lambda x: x.group(0))
    .value_counts()
)

Series([], Name: full_text, dtype: int64)

In [39]:
# Write the processed frame to disk, leaving out the index column.
df.to_csv('processed_final_Elizabeth.csv', index=False)