In [3]:
import emoji
import os
import pandas as pd
import re


In [329]:
def correct_ampersands(match):
    _str = match.group(0)
    splitted = re.split('&', _str)
    if all([str.isupper(i) for i in splitted]):
        return _str
    else:
        return ' & '.join(splitted)
        
def correct_multiplemarks(match):
    _str = match.group(0)
    if _str.startswith('?'):
        return '?'
    elif _str.startswith('!'):
        return '!'
        
        
def correct_slashsplitted(match):
    _str = match.group(0)
    return ' '.join(' '.join(_str.split('/')).split('\\'))

        
def global_processing(df):
    '''
    Global preprocessing/data munging
    '''
    
    # remove links
    df['full_text'] = df['full_text'].apply(lambda x:re.sub('&amp;*', '&', x), )
    df['full_text'] = df['full_text'].apply(lambda x:re.sub('&gt;*', '>', x), )
    df['full_text'] = df['full_text'].apply(lambda x:re.sub('&lt;*', '<', x), )
    df['full_text'] = df['full_text'].apply(lambda x:re.sub('\S*&\S+', correct_ampersands, x), )
    df['full_text'] = df['full_text'].apply(lambda x:re.sub('[!?]{2,}', correct_multiplemarks, x), )
    df['full_text'] = df['full_text'].apply(lambda x:re.sub('[a-zA-Z]{2,}[\\|\/|\_][A-Za-z]{2,}', correct_slashsplitted, x), )
    df['full_text'] = df['full_text'].apply(lambda x:emoji.get_emoji_regexp().sub('', x), )
    
    # standardize candidate names/hashtags/mentions
    #df['full_text'] = df['full_text'].apply(lambda x:re.sub("[B|b]ern\S*\s+", " BernieSanders ", x))
    #df['full_text'] = df['full_text'].apply(lambda x:re.sub("\S*[S|s]ander\S*\s+", " BernieSanders ", x))
    #df['full_text'] = df['full_text'].apply(lambda x:re.sub("\S*[W|w]arren\S*\s+", " ElizabethWarren ", x))
    #df['full_text'] = df['full_text'].apply(lambda x:re.sub("\S*[E|e]lizabeth\S*\s+", " ElizabethWarren ", x))
    #df['full_text'] = df['full_text'].apply(lambda x:re.sub("\S*[B|b]iden\S*\s+", " JoeBiden ", x))
    #df['full_text'] = df['full_text'].apply(lambda x:re.sub("\S*[B|b]iden\S*\s+", " JoeBiden ", x))

    return df

# Get Data

In [330]:
for name in ['Bernie', 'Joe', 'Elizabeth']:
    df = pd.read_csv("final_dataset_{}.csv".format(name))
    display(df.nunique())
df.dtypes

id_str        22307
created_at    19492
full_text     23593
dtype: int64

id_str        15714
created_at    12774
full_text     16747
dtype: int64

id_str        15639
created_at    14472
full_text     17217
dtype: int64

id_str         int64
created_at    object
full_text     object
dtype: object

In [331]:
df['full_text'] = df['full_text'].str.lower()

In [332]:
df = global_processing(df)

# Decontracting

In [333]:
df['full_text'].apply(lambda x:re.search("[a-zA-Z]{1,7}[\“|\’|\'|\"|\”][a-zA-Z]{1,3}", x)).dropna().apply(lambda x:x.group(0)).value_counts()[:10]

don't     575
it's      530
don’t     497
it’s      416
i’m       379
i'm       378
you're    297
that's    279
you’re    272
that’s    248
Name: full_text, dtype: int64

In [334]:
df['full_text'][:5].apply(lambda x:print(x, end = '\n--\n'))

these dem sens. r running against #trump in #2020elections: 

- @ewarren 
- @kamalaharris 
- @michaelbennet 
- @amyklobuchar 
- @corybooker 
- @berniesanders 

should these sens. recuse themselves from sitting as a juror at senate #impeachment trial?

https://t.co cwchhau5tw

--
.@ewarren’s granddaughter, lavinia, gives great pep talks.

we hope your thanksgiving is filled with friends, family, chosen family, and loved ones who inspire you to keep fighting. https://t.co mwr8aty8oj

--
democrats doing the “attack the leftist policies” thing should be aware that they are providing bipartisan legitimacy to these talking points. long history of this. @realdonaldtrump will use the attacks against whoever the nominee is, from @joebiden or @petebuttigieg to @ewarren.

--
@satchelmose @ewarren i can't understand her point. she is claiming the rich are being subsidized by the broader "we", but the top 1% pay 37% of federal income taxes and the bottom 50% pay only 3%. what is she saying?

--
@ew

0    None
1    None
2    None
3    None
4    None
Name: full_text, dtype: object

In [335]:
def decontract(phrase):
    
    # specific
    phrase = re.sub(r"can[\’|\']t", "can not", phrase)
    phrase = re.sub(r"won[\’|\']t", "will not", phrase)
    phrase = re.sub(r"let[\’|\']s", "let us", phrase)
    

    # general
    phrase = re.sub(r"n[\’|\']t", " not", phrase) #notice the spaces
    phrase = re.sub(r"[\’|\']re", " are", phrase)
    phrase = re.sub(r"[\’|\']s", " is", phrase)
    phrase = re.sub(r"[\’|\']d", " would", phrase)
    phrase = re.sub(r"[\’|\']ll", " will", phrase)
    phrase = re.sub(r"[\’|\']t", " not", phrase)
    phrase = re.sub(r"[\’|\']ve", " have", phrase)
    phrase = re.sub(r"[\’|\']m", " am", phrase)
    return phrase

In [336]:
#from unicodedata import normalize
#normalize('NFKD', "I’m he's they're let's warren's.\n You will be there _0").encode('ascii','ignore').decode('utf')

In [337]:
df['full_text'] = df['full_text'].apply(decontract)

In [338]:
df['full_text'].apply(lambda x:re.search("[a-zA-Z]{1,7}[\“|\’|\'|\"|\”][a-zA-Z]{1,3}", x)).dropna().apply(lambda x:x.group(0)).value_counts()[:10]

y’all          25
y'all          25
ma’am           8
your'e          6
ma'am           3
nat'l           3
allowed”is      2
his"bef         2
tudents"wer     2
ne'er           2
Name: full_text, dtype: int64

In [339]:
df['full_text'][:5].apply(lambda x:print(x, end = '\n--\n'))

these dem sens. r running against #trump in #2020elections: 

- @ewarren 
- @kamalaharris 
- @michaelbennet 
- @amyklobuchar 
- @corybooker 
- @berniesanders 

should these sens. recuse themselves from sitting as a juror at senate #impeachment trial?

https://t.co cwchhau5tw

--
.@ewarren is granddaughter, lavinia, gives great pep talks.

we hope your thanksgiving is filled with friends, family, chosen family, and loved ones who inspire you to keep fighting. https://t.co mwr8aty8oj

--
democrats doing the “attack the leftist policies” thing should be aware that they are providing bipartisan legitimacy to these talking points. long history of this. @realdonaldtrump will use the attacks against whoever the nominee is, from @joebiden or @petebuttigieg to @ewarren.

--
@satchelmose @ewarren i can not understand her point. she is claiming the rich are being subsidized by the broader "we", but the top 1% pay 37% of federal income taxes and the bottom 50% pay only 3%. what is she saying?

--


0    None
1    None
2    None
3    None
4    None
Name: full_text, dtype: object

# Links

In [340]:
df['full_text'].apply(lambda x:re.search("\w*(t\.co|http)\S*", x)).dropna().apply(lambda x:x.group(0))

0                   https://t.co
1                   https://t.co
8                   https://t.co
14                  https://t.co
15                  https://t.co
                  ...           
12180    https://t.co/v7d0thdr8b
12187               https://t.co
12192               https://t.co
12199    https://t.co/b4rqwmi3lp
12213               https://t.co
Name: full_text, Length: 1918, dtype: object

In [341]:
def remove_links(phrase):
    
    phrase = re.sub("\S*(t\.co|http)\S*", "", phrase)
    
    return phrase

In [342]:
df['full_text'] = df['full_text'].apply(remove_links)

In [343]:
df['full_text'].apply(lambda x:re.search("\S*(t\.co|http)\S*", x)).dropna().apply(lambda x:x.group(0))

Series([], Name: full_text, dtype: object)

# Tags

In [344]:
df['full_text'].apply(lambda x:re.search("\@\w+", x)).dropna().apply(lambda x:x.group(0)).count()

21411

In [345]:
df['full_text'].apply(lambda x:re.search("\@\w+", x)).dropna().apply(lambda x:x.group(0)).str.strip().value_counts()

@ewarren            9075
@saracarterdc        316
@proudresister       280
@berniesanders       180
@joebiden            102
                    ... 
@birdie4bernie20       1
@beeluvedb             1
@nickuniejewski        1
@peacekeeper2019       1
@therisingkn1ght       1
Name: full_text, Length: 3361, dtype: int64

In [346]:
def remove_tags(phrase):
    
    phrase = re.sub("\@ewarren", "", phrase)
    
    return phrase

In [347]:
df['full_text'] = df['full_text'].apply(remove_tags)

In [348]:
df['full_text'].apply(lambda x:re.search("\@\w+", x)).dropna().apply(lambda x:x.group(0)).str.strip().value_counts()

@proudresister      317
@saracarterdc       316
@berniesanders      294
@teamwarren         253
@joebiden           124
                   ... 
@maryvictoryfarm      1
@senategop            1
@rollieg2             1
@nofascistlies        1
@foxeesrldy           1
Name: full_text, Length: 3530, dtype: int64

In [349]:
df.shape

(21411, 3)

In [350]:
index_spammy = df['full_text'].apply(lambda x:re.search("(\@.*){5,}", x)).dropna().apply(lambda x:x.group(0)).index

In [351]:
len(index_spammy)

3133

In [352]:
df = df.iloc[[i for i in df.index if i not in index_spammy]].reindex()

# Hashtags

In [353]:
df['full_text'].apply(lambda x:re.search("\#\w*warren\w*", x)).dropna().apply(lambda x:x.group(0)).value_counts()

#warren2020                           156
#teamwarren                            43
#elizabethwarren                       35
#winwithwarren                         19
#warren                                13
#neverwarren                            7
#allinforwarren                         6
#warrensinsaneclownposse                3
#presidentelizabethwarren               3
#catsforwarren                          2
#warrenisafool                          2
#womenwithwarren                        2
#weakwarren                             2
#chicagoforwarren                       2
#warren4me                              2
#ilforwarren                            2
#madampresidentwarren                   2
#presidentwarren                        2
#wearwarren                             2
#warrenmemeteam                         2
#warren20never                          2
#wiforwarren                            2
#elizabethwarren2020                    2
#atlantawithwarren                

In [354]:
def remove_hashtags(phrase):

    for i in ['#warren2020', '#teamwarren', '#elizabethwarren', '#winwithwarren', '#warren']:
        phrase = re.sub(i, "", phrase)
    
    return phrase

In [355]:
df['full_text'] = df['full_text'].apply(remove_hashtags)

In [356]:
df['full_text'].apply(lambda x:re.search("\#\w*warren\w*", x)).dropna().apply(lambda x:x.group(0)).value_counts()

#neverwarren                          7
#allinforwarren                       6
#presidentelizabethwarren             3
#iwannabelikewarren                   2
#madampresidentwarren                 2
#opestatesforwarren                   2
#ilforwarren                          2
#chicagoforwarren                     2
#atlantawithwarren                    2
#rolloverwarren                       2
#dogsforwarren                        2
#catsforwarren                        2
#wiforwarren                          2
#weakwarren                           2
#poetsforwarren                       2
#wrongagainwarren                     2
#ohioforwarren                        2
#womenwithwarren                      2
#fl4warren                            2
#wearwarren                           2
#presidentwarren                      2
#elizbethwarrenliesabouteverything    1
#kushnerforwarren2020                 1
#nc4warren                            1
#isupportelizabethwarren              1


# (Obvious) Mentions

In [357]:
df['full_text'].apply(lambda x:re.search("\s+\w*elizabeth\w*\s+", x)).dropna().apply(lambda x:x.group(0)).value_counts()

 elizabeth         209
  elizabeth         27
 \n\nelizabeth       3
\n\nelizabeth        2
 elizabeth\n         2
 \nelizabeth         2
  elizabeth          1
   elizabeth         1
\n elizabeth         1
\nelizabeth          1
Name: full_text, dtype: int64

In [358]:
df['full_text'].apply(lambda x:re.search("(team|sen)*warren", x)).dropna().apply(lambda x:x.group(0)).value_counts()

warren        1405
teamwarren     318
senwarren      114
Name: full_text, dtype: int64

In [359]:
df['full_text'].apply(lambda x:re.search("(team|sen)*warren", x)).dropna()\
.apply(lambda x:re.sub("(team|sen)*warren", "", x.group(0)))

28        
31        
32        
38        
47        
        ..
21356     
21362     
21366     
21367     
21372     
Name: full_text, Length: 1837, dtype: object

In [360]:
def remove_mentions(phrase):
    
    phrase = re.sub("\s+\w*elizabeth\w*\s+", "", phrase)
    phrase = re.sub("(team|sen)*warren", "", phrase)
    
    return phrase

In [361]:
df['full_text'] = df['full_text'].apply(remove_mentions)

In [362]:
df['full_text'].apply(lambda x:re.search("\s+\w*elizabeth\w*\s+", x)).dropna().apply(lambda x:x.group(0)).value_counts()

Series([], Name: full_text, dtype: int64)

In [363]:
df['full_text'].apply(lambda x:re.search("(team|sen)*warren", x)).dropna().apply(lambda x:x.group(0)).value_counts()

Series([], Name: full_text, dtype: int64)

In [364]:
df.to_csv('processed_final_Elizabeth.csv', index=False)

In [480]:
df['full_text'].apply(lambda x:re.search("\s+[B|b]ern\w*\s+", x)).dropna().apply(lambda x:x.group(0)).unique()

array([' bernie ', '  bernie ', ' berners\n', '\n\nbernie ', ' bernie\n',
       ' bernie \n', '\nbernie ', ' bern ', ' \n\nbernie ', ' bernanke ',
       ' bernie  ', ' bernese ', ' berniebros\n', ' berners '],
      dtype=object)

In [481]:
df['full_text'].apply(lambda x:re.search("\s+[S|s]ander\w*\s+", x)).dropna().apply(lambda x:x.group(0)).unique()

array([' sanders ', ' sander ', '  sanders ', ' sanderswarren ',
       '\n\nsanders ', ' sanders\n', ' \nsanders '], dtype=object)

In [482]:
df['full_text'].apply(lambda x:re.search("\s+[W|w]arren\w*\s+", x)).dropna().apply(lambda x:x.group(0)).unique()

array([' warren ', ' warren\n', '  warren ', ' \nwarren ', '\n\nwarren ',
       ' warren \n', ' warrensanders ', ' warren\n\n', '\nwarren ',
       ' warrens ', ' warren   \n', ' warren  \n', '  warren\n',
       '    warren ', '   warren ', ' warren  '], dtype=object)

In [483]:
df['full_text'].apply(lambda x:re.search("\s+[E|e]lizabeth\w*\s+", x)).dropna().apply(lambda x:x.group(0)).unique()

array([' elizabeth ', ' elizabeth\n', '  elizabeth ', '\n\nelizabeth ',
       ' \n\nelizabeth ', '  \n\nelizabeth ', '   elizabeth ',
       '\n elizabeth ', ' \nelizabeth ', '  elizabeth  ', '\nelizabeth '],
      dtype=object)

In [484]:
df['full_text'].apply(lambda x:re.search("\s+[B|b]iden\w*\s+", x)).dropna().apply(lambda x:x.group(0)).unique()

array([' biden ', ' bidens ', ' bidenville \n\n', ' biden \n', '  biden ',
       '\n\nbiden '], dtype=object)

In [485]:
df['full_text'].apply(lambda x:re.search("\s*[B|b]iden\w*\s+", x)).dropna().apply(lambda x:x.group(0)).unique()

array(['biden ', 'biden\n', 'biden  ', 'biden \n\n', 'biden2020 ',
       ' biden ', 'biden    ', 'biden \n', 'biden\n\n', 'biden  \n\n',
       ' bidens ', 'bidenbrigade ', 'bidenbrigade  ', ' bidenville \n\n',
       'biden2020\n', ' biden \n', 'biden2020 \n', 'biden   \n',
       'biden \n \n', '  biden ', 'biden\n ', '\n\nbiden '], dtype=object)

In [486]:
df['full_text'].apply(lambda x:re.search('[a-zA-Z]{2,}[\\|\/|\_][A-Za-z]{2,}', x)).dropna().apply(lambda x:x.group(0)).unique() 

array(['strom_annette', 'foundation/chinese', 'colo_trumpette',
       'brock_mchugebig', 'alyssa_milano', 'bernard_cecilia',
       'warren/bernie', 'republicans/ruskies', 'credits/packet',
       'aged_and', 'nick_zen', 'teacher/professor', 'jim_jordan',
       'see/hear', 'man/husband', 'australia/canada', 'space_expln',
       'morning_joe', 'jamie_crane', 'angelo_back', 'armon_dillo',
       'nasty_woman', 'thetruth_tx', 'millionaires/billionaires',
       'anders_aslund', 'agnes_gibboney', 'make/amend', 'pibble_racingdc',
       'pharma/healthcare', 'news/bad', 'lin_manuel', 'rich_indestin',
       'fact/abuses', 'life/liberty', 'wade_snowden', 'poormans_word',
       'andresha_bass', 'jimmy_dore', 'healthcare/climate', 'mboksr_maga',
       'mel_faith', 'debra_hendler', 'rock_valkyrie', 'laurent_weppe',
       'helen_manfred', 'mn_for', 'naep_nces', 'wendy_soxy', 'bee_dottie',
       'unrest_mag', 'ciaag_lauren', 'trump/putin', 'abogado_avocado',
       'nico_lang', 'funding/enf

In [488]:
' '.join(' '.join('a/b\c'.split('/')).split('\\'))

'a b c'

In [8]:
funct = lambda a:a*2

In [9]:
funct("a")

'aa'