In [453]:
import emoji
import os
import pandas as pd
import re


In [454]:
def correct_ampersands(match):
    '''
    Replacement callback for ``re.sub``: space out the ``&`` in a matched token.

    The matched text is split on ``&``. If every piece is upper-case
    (e.g. ``AT&T``) the token is returned untouched; otherwise the pieces
    are re-joined with ``' & '``.
    '''
    token = match.group(0)
    pieces = token.split('&')
    # Acronym-like tokens keep their compact form.
    if not all(piece.isupper() for piece in pieces):
        return ' & '.join(pieces)
    return token
        
def correct_multiplemarks(match):
    '''
    Replacement callback for ``re.sub``: collapse a run of ``?``/``!`` marks.

    Returns the first character of the match when it is ``?`` or ``!``.
    Any other match is returned unchanged; the previous version fell
    through and implicitly returned ``None``, which would discard the
    matched text when used as an ``re.sub`` callback.
    '''
    marks = match.group(0)
    if marks.startswith(('?', '!')):
        return marks[0]
    return marks
        
        
def correct_slashsplitted(match):
    '''
    Replacement callback for ``re.sub``: turn every ``/`` and ``\\`` in the
    matched text into a single space. Other characters are left as they are.
    '''
    token = match.group(0)
    without_slashes = token.replace('/', ' ')
    return without_slashes.replace('\\', ' ')

        
def global_processing(df):
    '''
    Global preprocessing/data munging of the ``full_text`` column.

    Applies, in order, to every value of ``df['full_text']``:
    link removal, decoding of the ``&amp;``/``&gt;``/``&lt;`` entities,
    spacing of ampersands, collapsing of repeated ``?``/``!`` marks,
    splitting of slash-joined words and emoji removal.

    The column is overwritten on the frame that was passed in, and that
    same frame is returned.
    '''
    # Ordered (pattern, replacement) pairs. Raw strings keep the regex
    # escapes out of Python's own string-escape processing.
    substitutions = [
        (re.compile(r'http\S+'), ''),                     # remove links
        (re.compile(r'&amp;*'), '&'),                     # HTML entities
        (re.compile(r'&gt;*'), '>'),
        (re.compile(r'&lt;*'), '<'),
        (re.compile(r'\S*&\S+'), correct_ampersands),
        (re.compile(r'[!?]{2,}'), correct_multiplemarks),
        # Separator class is backslash, slash or underscore. The previous
        # literal '[\\|\/|\_]' really matched '|', '/' and '_', so it never
        # matched the backslash that correct_slashsplitted splits on.
        (re.compile(r'[a-zA-Z]{2,}[\\/_][A-Za-z]{2,}'), correct_slashsplitted),
    ]

    # emoji.get_emoji_regexp() no longer exists in emoji >= 2.0; prefer
    # replace_emoji when the installed version provides it.
    if hasattr(emoji, 'replace_emoji'):
        def strip_emoji(text):
            return emoji.replace_emoji(text, replace='')
    else:
        emoji_pattern = emoji.get_emoji_regexp()

        def strip_emoji(text):
            return emoji_pattern.sub('', text)

    def clean(text):
        # Run every substitution on one text, then drop emoji last.
        for pattern, replacement in substitutions:
            text = pattern.sub(replacement, text)
        return strip_emoji(text)

    df['full_text'] = df['full_text'].apply(clean)

    # standardize candidate names/hashtags/mentions
    #df['full_text'] = df['full_text'].apply(lambda x:re.sub("[B|b]ern\S*\s+", " BernieSanders ", x))
    #df['full_text'] = df['full_text'].apply(lambda x:re.sub("\S*[S|s]ander\S*\s+", " BernieSanders ", x))
    #df['full_text'] = df['full_text'].apply(lambda x:re.sub("\S*[W|w]arren\S*\s+", " ElizabethWarren ", x))
    #df['full_text'] = df['full_text'].apply(lambda x:re.sub("\S*[E|e]lizabeth\S*\s+", " ElizabethWarren ", x))
    #df['full_text'] = df['full_text'].apply(lambda x:re.sub("\S*[B|b]iden\S*\s+", " JoeBiden ", x))
    #df['full_text'] = df['full_text'].apply(lambda x:re.sub("\S*[B|b]iden\S*\s+", " JoeBiden ", x))

    return df

# Get Data

In [455]:
# Read each candidate's CSV in turn and show the number of unique values per column.
# NOTE: `df` is rebound on every iteration, so after the loop it holds only the
# last file read ('Elizabeth'); the `df.dtypes` line below describes that frame.
for name in ['Bernie', 'Joe', 'Elizabeth']:
    df = pd.read_csv("final_dataset_{}.csv".format(name))
    display(df.nunique())
df.dtypes

id_str        19608
created_at    17124
full_text     20916
dtype: int64

id_str        15682
created_at    12750
full_text     16715
dtype: int64

id_str        15639
created_at    14472
full_text     17217
dtype: int64

id_str         int64
created_at    object
full_text     object
dtype: object

In [456]:
# Lower-case every tweet in place; the `full_text` column is overwritten.
df['full_text'] = df['full_text'].str.lower()

# Decontracting

In [457]:
# Ten most frequent "word + quote mark + 1-3 letters" tokens (first hit per tweet).
# Raw string, and the quote class lists only quote characters: a '|' inside
# [...] is a literal pipe, not an alternation.
(
    df['full_text']
    .apply(lambda text: re.search(r"""[a-zA-Z]{1,7}[“”’'"][a-zA-Z]{1,3}""", text))
    .dropna()
    .apply(lambda hit: hit.group(0))
    .value_counts()[:10]
)

don't     575
it's      530
don’t     497
it’s      416
i’m       379
i'm       378
you're    297
that's    279
you’re    272
that’s    248
Name: full_text, dtype: int64

In [458]:
# Print the first five tweets, each followed by a `--` separator line.
df['full_text'].head(5).apply(lambda text: print(text, end='\n--\n'))

these dem sens. r running against #trump in #2020elections: 

- @ewarren 
- @kamalaharris 
- @michaelbennet 
- @amyklobuchar 
- @corybooker 
- @berniesanders 

should these sens. recuse themselves from sitting as a juror at senate #impeachment trial?

https://t.co/cwchhau5tw

--
.@ewarren’s granddaughter, lavinia, gives great pep talks.

we hope your thanksgiving is filled with friends, family, chosen family, and loved ones who inspire you to keep fighting. https://t.co/mwr8aty8oj

--
democrats doing the “attack the leftist policies” thing should be aware that they are providing bipartisan legitimacy to these talking points. long history of this. @realdonaldtrump will use the attacks against whoever the nominee is, from @joebiden or @petebuttigieg to @ewarren.

--
@satchelmose @ewarren i can't understand her point. she is claiming the rich are being subsidized by the broader "we", but the top 1% pay 37% of federal income taxes and the bottom 50% pay only 3%. what is she saying?

--
@ew

0    None
1    None
2    None
3    None
4    None
Name: full_text, dtype: object

In [459]:
def decontract(phrase):
    '''
    Expand common English contractions in ``phrase``.

    Both the straight (') and the typographic (’) apostrophe are handled.
    Matching is case-sensitive. Every ``'s`` is expanded to `` is``, so
    possessives are expanded too.

    Suffix rules only fire when the apostrophe directly follows a word
    character and the suffix ends at a word boundary, so quoted words such
    as 'strong' are left alone. The apostrophe class no longer contains
    '|', which inside [...] is a literal pipe rather than an alternation.
    '''
    rules = (
        # specific
        (r"\bcan[’']t\b", "can not"),
        (r"\bwon[’']t\b", "will not"),
        (r"\blet[’']s\b", "let us"),
        # general (note the leading space in each replacement)
        (r"n[’']t\b", " not"),
        (r"(?<=\w)[’']re\b", " are"),
        (r"(?<=\w)[’']s\b", " is"),
        (r"(?<=\w)[’']d\b", " would"),
        (r"(?<=\w)[’']ll\b", " will"),
        (r"(?<=\w)[’']t\b", " not"),
        (r"(?<=\w)[’']ve\b", " have"),
        (r"(?<=\w)[’']m\b", " am"),
    )
    # Order matters: the specific forms must be rewritten before the
    # general "n't" rule sees them.
    for pattern, replacement in rules:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

In [460]:
#from unicodedata import normalize
#normalize('NFKD', "I’m he's they're let's warren's.\n You will be there _0").encode('ascii','ignore').decode('utf')

In [461]:
# Expand contractions in every tweet, overwriting the column.
df['full_text'] = df['full_text'].map(decontract)

In [462]:
# Ten most frequent "word + quote mark + 1-3 letters" tokens (first hit per tweet).
# Raw string, and the quote class lists only quote characters: a '|' inside
# [...] is a literal pipe, not an alternation.
(
    df['full_text']
    .apply(lambda text: re.search(r"""[a-zA-Z]{1,7}[“”’'"][a-zA-Z]{1,3}""", text))
    .dropna()
    .apply(lambda hit: hit.group(0))
    .value_counts()[:10]
)

y'all         25
y’all         25
ma’am          8
your'e         6
ma'am          3
nat'l          3
g’fat          2
allowed”is     2
f'n            2
his"bef        2
Name: full_text, dtype: int64

In [463]:
# Print the first five tweets, each followed by a `--` separator line.
df['full_text'].head(5).apply(lambda text: print(text, end='\n--\n'))

these dem sens. r running against #trump in #2020elections: 

- @ewarren 
- @kamalaharris 
- @michaelbennet 
- @amyklobuchar 
- @corybooker 
- @berniesanders 

should these sens. recuse themselves from sitting as a juror at senate #impeachment trial?

https://t.co/cwchhau5tw

--
.@ewarren is granddaughter, lavinia, gives great pep talks.

we hope your thanksgiving is filled with friends, family, chosen family, and loved ones who inspire you to keep fighting. https://t.co/mwr8aty8oj

--
democrats doing the “attack the leftist policies” thing should be aware that they are providing bipartisan legitimacy to these talking points. long history of this. @realdonaldtrump will use the attacks against whoever the nominee is, from @joebiden or @petebuttigieg to @ewarren.

--
@satchelmose @ewarren i can not understand her point. she is claiming the rich are being subsidized by the broader "we", but the top 1% pay 37% of federal income taxes and the bottom 50% pay only 3%. what is she saying?

--


0    None
1    None
2    None
3    None
4    None
Name: full_text, dtype: object

# Links

In [464]:
# First token per tweet that contains "t.co" or "http".
# Raw string avoids invalid escape sequences; the group is non-capturing
# because only the whole match is used.
(
    df['full_text']
    .apply(lambda text: re.search(r"\w*(?:t\.co|http)\S*", text))
    .dropna()
    .apply(lambda hit: hit.group(0))
)

0        https://t.co/cwchhau5tw
1        https://t.co/mwr8aty8oj
8        https://t.co/kzwvhpmkvm
14       https://t.co/kvynacndh8
15       https://t.co/wvr035tu4o
                  ...           
12180    https://t.co/v7d0thdr8b
12187    https://t.co/pjqholwlza
12192    https://t.co/xyk7hbwlpx
12199    https://t.co/b4rqwmi3lp
12213    https://t.co/gfne1eeznh
Name: full_text, Length: 1918, dtype: object

In [465]:
def remove_links(phrase):
    '''
    Delete every whitespace-delimited token of ``phrase`` that contains
    ``t.co`` or ``http``, and return the result.

    The surrounding whitespace is left in place. The pattern is written as
    a raw string so that ``\\S`` and ``\\.`` are not treated as (invalid)
    Python string escapes; the regex itself is unchanged.
    '''
    return re.sub(r"\S*(?:t\.co|http)\S*", "", phrase)

In [466]:
# Strip link tokens from every tweet, overwriting the column.
df['full_text'] = df['full_text'].map(remove_links)

In [467]:
# Check for leftover tokens containing "t.co" or "http".
# Raw string avoids invalid escape sequences in the pattern.
(
    df['full_text']
    .apply(lambda text: re.search(r"\S*(?:t\.co|http)\S*", text))
    .dropna()
    .apply(lambda hit: hit.group(0))
)

Series([], Name: full_text, dtype: object)

# Mentions

In [468]:
# Number of tweets containing at least one @mention (only the first hit per tweet is taken).
# '@' needs no escaping, and the raw string keeps '\w' out of Python's escape handling.
(
    df['full_text']
    .apply(lambda text: re.search(r"@\w+", text))
    .dropna()
    .apply(lambda hit: hit.group(0))
    .count()
)

21411

In [469]:
# Frequency of the first @mention found in each tweet.
# '@' needs no escaping, and the raw string keeps '\w' out of Python's escape handling.
(
    df['full_text']
    .apply(lambda text: re.search(r"@\w+", text))
    .dropna()
    .apply(lambda hit: hit.group(0))
    .str.strip()
    .value_counts()
)

@ewarren           9075
@saracarterdc       316
@proudresister      280
@berniesanders      180
@joebiden           102
                   ... 
@russonpolitics       1
@katyturnbc           1
@rstormc              1
@david_darmofal       1
@thebondfreak         1
Name: full_text, Length: 3361, dtype: int64

In [470]:
def remove_mentions(phrase, handle="ewarren"):
    '''
    Remove every ``@<handle>`` mention from ``phrase`` and return the result.

    Parameters
    ----------
    phrase : str
        Text to clean.
    handle : str, default "ewarren"
        Account name without the leading ``@``. Matching is case-sensitive.

    Only complete mentions are removed. The previous pattern had no boundary
    after the handle, so "@ewarrenfan" was cut down to "fan".
    '''
    # (?!\w) stops the match from being a prefix of a longer handle.
    pattern = r"@" + re.escape(handle) + r"(?!\w)"
    return re.sub(pattern, "", phrase)

In [471]:
# Drop the targeted mention from every tweet, overwriting the column.
df['full_text'] = df['full_text'].map(remove_mentions)

In [472]:
# Frequency of the first @mention found in each tweet.
# '@' needs no escaping, and the raw string keeps '\w' out of Python's escape handling.
(
    df['full_text']
    .apply(lambda text: re.search(r"@\w+", text))
    .dropna()
    .apply(lambda hit: hit.group(0))
    .str.strip()
    .value_counts()
)

@proudresister     317
@saracarterdc      316
@berniesanders     294
@teamwarren        253
@joebiden          124
                  ... 
@katj512             1
@russonpolitics      1
@smartassjen         1
@katyturnbc          1
@thebondfreak        1
Name: full_text, Length: 3530, dtype: int64

# Hashtags

In [473]:
# Frequency of the first #hashtag found in each tweet.
# '#' needs no escaping, and the raw string keeps '\w' out of Python's escape handling.
(
    df['full_text']
    .apply(lambda text: re.search(r"#\w+", text))
    .dropna()
    .apply(lambda hit: hit.group(0))
    .value_counts()
)

#warren2020        115
#realid             47
#teamwarren         40
#medicareforall     34
#trump2020          31
                  ... 
#grimreaper          1
#thesquad            1
#lockthemup          1
#dearworld           1
#fairworkweek        1
Name: full_text, Length: 847, dtype: int64

In [474]:
def remove_hashtags(phrase, keyword="warren"):
    '''
    Remove every hashtag whose text contains ``keyword`` and return the result.

    Parameters
    ----------
    phrase : str
        Text to clean.
    keyword : str, default "warren"
        Substring a hashtag must contain to be removed. Matching is
        case-sensitive.

    Hashtags that do not contain ``keyword`` are left untouched.
    '''
    # Raw-string pieces keep '\w' out of Python's own escape processing.
    pattern = r"#\w*" + re.escape(keyword) + r"\w*"
    return re.sub(pattern, "", phrase)

In [475]:
# Drop the targeted hashtags from every tweet, overwriting the column.
df['full_text'] = df['full_text'].map(remove_hashtags)

In [476]:
# Frequency of the first #hashtag found in each tweet.
# '#' needs no escaping, and the raw string keeps '\w' out of Python's escape handling.
(
    df['full_text']
    .apply(lambda text: re.search(r"#\w+", text))
    .dropna()
    .apply(lambda hit: hit.group(0))
    .value_counts()
)

#realid               47
#medicareforall       35
#trump2020            31
#maga                 26
#pocahontas           25
                      ..
#fighter               1
#theirishman           1
#resistthebullshit     1
#troop65113            1
#communism             1
Name: full_text, Length: 823, dtype: int64

In [477]:
# Frequency of the first word containing "elizabeth" in each tweet.
# Raw string keeps '\w' out of Python's escape handling.
(
    df['full_text']
    .apply(lambda text: re.search(r"\w*elizabeth\w*", text))
    .dropna()
    .apply(lambda hit: hit.group(0))
    .value_counts()
)

elizabeth          321
elizabethdoyle       1
elizabethmay         1
elizabethforma       1
elizabethmusgr3      1
lyingelizabeth       1
Name: full_text, dtype: int64

In [478]:
# Frequency of the first word containing "elizabeth" in each tweet
# (the same probe as the preceding cell).
# Raw string keeps '\w' out of Python's escape handling.
(
    df['full_text']
    .apply(lambda text: re.search(r"\w*elizabeth\w*", text))
    .dropna()
    .apply(lambda hit: hit.group(0))
    .value_counts()
)

elizabeth          321
elizabethdoyle       1
elizabethmay         1
elizabethforma       1
elizabethmusgr3      1
lyingelizabeth       1
Name: full_text, dtype: int64

In [479]:
# Ten most frequent words containing "warren" (first hit per tweet).
# Raw string keeps '\w' out of Python's escape handling.
(
    df['full_text']
    .apply(lambda text: re.search(r"\w*warren\w*", text))
    .dropna()
    .apply(lambda hit: hit.group(0))
    .value_counts()[:10]
)

warren             1104
teamwarren          339
senwarren           142
iaforwarren         132
illinois4warren      73
warrenfan4           24
michaelrwarren       17
womenwithwarren       9
artistswarren         8
nhforwarren           8
Name: full_text, dtype: int64

In [480]:
# Distinct whitespace-delimited tokens starting with "bern"/"Bern" (first hit per tweet).
# [Bb] replaces [B|b]: inside a character class '|' is a literal pipe.
(
    df['full_text']
    .apply(lambda text: re.search(r"\s+[Bb]ern\w*\s+", text))
    .dropna()
    .apply(lambda hit: hit.group(0))
    .unique()
)

array([' bernie ', '  bernie ', ' berners\n', '\n\nbernie ', ' bernie\n',
       ' bernie \n', '\nbernie ', ' bern ', ' \n\nbernie ', ' bernanke ',
       ' bernie  ', ' bernese ', ' berniebros\n', ' berners '],
      dtype=object)

In [481]:
# Distinct whitespace-delimited tokens starting with "sander"/"Sander" (first hit per tweet).
# [Ss] replaces [S|s]: inside a character class '|' is a literal pipe.
(
    df['full_text']
    .apply(lambda text: re.search(r"\s+[Ss]ander\w*\s+", text))
    .dropna()
    .apply(lambda hit: hit.group(0))
    .unique()
)

array([' sanders ', ' sander ', '  sanders ', ' sanderswarren ',
       '\n\nsanders ', ' sanders\n', ' \nsanders '], dtype=object)

In [482]:
# Distinct whitespace-delimited tokens starting with "warren"/"Warren" (first hit per tweet).
# [Ww] replaces [W|w]: inside a character class '|' is a literal pipe.
(
    df['full_text']
    .apply(lambda text: re.search(r"\s+[Ww]arren\w*\s+", text))
    .dropna()
    .apply(lambda hit: hit.group(0))
    .unique()
)

array([' warren ', ' warren\n', '  warren ', ' \nwarren ', '\n\nwarren ',
       ' warren \n', ' warrensanders ', ' warren\n\n', '\nwarren ',
       ' warrens ', ' warren   \n', ' warren  \n', '  warren\n',
       '    warren ', '   warren ', ' warren  '], dtype=object)

In [483]:
# Distinct whitespace-delimited tokens starting with "elizabeth"/"Elizabeth" (first hit per tweet).
# [Ee] replaces [E|e]: inside a character class '|' is a literal pipe.
(
    df['full_text']
    .apply(lambda text: re.search(r"\s+[Ee]lizabeth\w*\s+", text))
    .dropna()
    .apply(lambda hit: hit.group(0))
    .unique()
)

array([' elizabeth ', ' elizabeth\n', '  elizabeth ', '\n\nelizabeth ',
       ' \n\nelizabeth ', '  \n\nelizabeth ', '   elizabeth ',
       '\n elizabeth ', ' \nelizabeth ', '  elizabeth  ', '\nelizabeth '],
      dtype=object)

In [484]:
# Distinct whitespace-delimited tokens starting with "biden"/"Biden" (first hit per tweet).
# [Bb] replaces [B|b]: inside a character class '|' is a literal pipe.
(
    df['full_text']
    .apply(lambda text: re.search(r"\s+[Bb]iden\w*\s+", text))
    .dropna()
    .apply(lambda hit: hit.group(0))
    .unique()
)

array([' biden ', ' bidens ', ' bidenville \n\n', ' biden \n', '  biden ',
       '\n\nbiden '], dtype=object)

In [485]:
# Same "biden" probe, but leading whitespace is optional (\s*), so hits
# glued to a preceding character such as '#' or '@' are included.
# [Bb] replaces [B|b]: inside a character class '|' is a literal pipe.
(
    df['full_text']
    .apply(lambda text: re.search(r"\s*[Bb]iden\w*\s+", text))
    .dropna()
    .apply(lambda hit: hit.group(0))
    .unique()
)

array(['biden ', 'biden\n', 'biden  ', 'biden \n\n', 'biden2020 ',
       ' biden ', 'biden    ', 'biden \n', 'biden\n\n', 'biden  \n\n',
       ' bidens ', 'bidenbrigade ', 'bidenbrigade  ', ' bidenville \n\n',
       'biden2020\n', ' biden \n', 'biden2020 \n', 'biden   \n',
       'biden \n \n', '  biden ', 'biden\n ', '\n\nbiden '], dtype=object)

In [486]:
# Distinct "word<sep>word" tokens, where <sep> is a backslash, slash or underscore
# (first hit per tweet).
# The previous literal '[\\|\/|\_]' actually matched '|', '/' and '_': the doubled
# backslash collapsed into an escape for the pipe, so backslashes never matched.
(
    df['full_text']
    .apply(lambda text: re.search(r'[a-zA-Z]{2,}[\\/_][A-Za-z]{2,}', text))
    .dropna()
    .apply(lambda hit: hit.group(0))
    .unique()
)

array(['strom_annette', 'foundation/chinese', 'colo_trumpette',
       'brock_mchugebig', 'alyssa_milano', 'bernard_cecilia',
       'warren/bernie', 'republicans/ruskies', 'credits/packet',
       'aged_and', 'nick_zen', 'teacher/professor', 'jim_jordan',
       'see/hear', 'man/husband', 'australia/canada', 'space_expln',
       'morning_joe', 'jamie_crane', 'angelo_back', 'armon_dillo',
       'nasty_woman', 'thetruth_tx', 'millionaires/billionaires',
       'anders_aslund', 'agnes_gibboney', 'make/amend', 'pibble_racingdc',
       'pharma/healthcare', 'news/bad', 'lin_manuel', 'rich_indestin',
       'fact/abuses', 'life/liberty', 'wade_snowden', 'poormans_word',
       'andresha_bass', 'jimmy_dore', 'healthcare/climate', 'mboksr_maga',
       'mel_faith', 'debra_hendler', 'rock_valkyrie', 'laurent_weppe',
       'helen_manfred', 'mn_for', 'naep_nces', 'wendy_soxy', 'bee_dottie',
       'unrest_mag', 'ciaag_lauren', 'trump/putin', 'abogado_avocado',
       'nico_lang', 'funding/enf

In [488]:
# Sanity check of the slash/backslash splitting idiom on a literal sample.
# The backslash in the sample is written as '\\' because '\c' is not a valid
# string escape; the string value is unchanged.
' '.join(' '.join('a/b\\c'.split('/')).split('\\'))

'a b c'

In [8]:
def funct(a):
    '''Return ``a * 2`` (for a string this repeats it twice).'''
    return a * 2

In [9]:
# Call the helper with a string argument; the value is shown as the cell output.
funct("a")

'aa'