In [1]:
import emoji
import os
import pandas as pd
import re


In [2]:
def correct_ampersands(match):
    _str = match.group(0)
    splitted = re.split('&', _str, flags=re.IGNORECASE)
    if all([str.isupper(i) for i in splitted]):
        return _str
    else:
        return ' & '.join(splitted)

        
def correct_slashsplitted(match):
    _str = match.group(0)
    return ' '.join(' '.join(_str.split('/')).split('\\'))

        
    
def remove_links(phrase):
    
    phrase = re.sub("\S*(t\.co|http)\S*", "", phrase, flags=re.IGNORECASE)
    
    return phrase

def global_processing(df):
    '''
    Global preprocessing/data munging
    '''
    
    
    df['full_text'] = df['full_text'].apply(lambda x:re.sub("\’", "\'", x, flags=re.IGNORECASE))
    df['full_text'] = df['full_text'].apply(lambda x:re.sub("\”", "\"", x, flags=re.IGNORECASE))
    df['full_text'] = df['full_text'].apply(lambda x:re.sub("\“", "\"", x, flags=re.IGNORECASE))
    
    
    df['full_text'] = df['full_text'].apply(lambda x:re.sub('&amp;*', '&', x, flags=re.IGNORECASE), )
    df['full_text'] = df['full_text'].apply(lambda x:re.sub('&gt;*', '>', x,flags=re.IGNORECASE), )
    df['full_text'] = df['full_text'].apply(lambda x:re.sub('&lt;*', '<', x, flags=re.IGNORECASE), )
    df['full_text'] = df['full_text'].apply(lambda x:re.sub('\S*&\S+', correct_ampersands, x, flags=re.IGNORECASE), )
    
    df['full_text'] = df['full_text'].apply(lambda x:emoji.get_emoji_regexp().sub('', x), )
    
    df['full_text'] = df['full_text'].apply(remove_links)
    
    max_num_mentions = 4
    spammy_indices_mentions = df['full_text'].apply(lambda x:re.search("(\@.*){%d,}"%(max_num_mentions+1), x, flags = re.IGNORECASE)).dropna().index
    print(len(spammy_indices_mentions))
    df = df.iloc[[i for i in df.index if i not in spammy_indices_mentions]]
    df = df.reset_index(drop=True)
    
    max_hashtags = 4
    spammy_indices_hashtags = df['full_text'].apply(lambda x:re.search("(\#\w+\s){%d,}"%(max_hashtags+1), x)).dropna().apply(lambda x:x.group(0)).index
    print(len(spammy_indices_hashtags))
    df = df.iloc[[i for i in df.index if i not in spammy_indices_hashtags]]
    
    
    df = df.reset_index(drop=True)
    

    return df

# Get Data

In [3]:
df = pd.read_csv("final_dataset_Bernie.csv")
display(df.nunique())
df.dtypes, df.shape

id_str        26500
created_at    23020
full_text     27756
dtype: int64

(id_str         int64
 created_at    object
 full_text     object
 dtype: object, (32486, 3))

In [4]:
df = df.drop_duplicates(subset=['id_str']).reset_index(drop=True)
df.shape

(26500, 3)

In [5]:
df = global_processing(df)


7039
51


In [6]:
df.shape

(19410, 3)

# Decontracting

In [7]:
df['full_text'].apply(lambda x:re.search("[a-zA-Z]{1,7}[\'|\"][a-zA-Z]{1,3}", x)).dropna().apply(lambda x:x.group(0)).value_counts()[:10]

don't      865
I'm        552
It's       484
it's       416
That's     332
can't      315
you're     282
doesn't    279
isn't      216
You're     213
Name: full_text, dtype: int64

In [8]:
def decontract(phrase):
    
    # specific
    phrase = re.sub(r"can\'t", "can not", phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"won\'t", "will not", phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"let\'s", "let us", phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"let\'s", "let us", phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"y\'all", "you all", phrase, flags=re.IGNORECASE)
    

    # general
    phrase = re.sub(r"n\'t", " not", phrase, flags=re.IGNORECASE) #notice the spaces
    phrase = re.sub(r"\'re", " are", phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"\'s", " is", phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"\'d", " would", phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"\'ll", " will", phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"\'t", " not", phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"\'ve", " have", phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"\'m", " am", phrase, flags=re.IGNORECASE)
    return phrase

In [9]:
df['full_text'] = df['full_text'].apply(decontract)

In [10]:
df['full_text'].apply(lambda x:re.search("[a-zA-Z]{1,7}[\'|\"][a-zA-Z]{1,3}", x)).dropna().apply(lambda x:x.group(0)).value_counts()[:10]

y'kno          2
D'Arr          1
heimers"Tru    1
x'r            1
help'g         1
I"m            1
f'ing          1
VT'r           1
I'g            1
BS'er          1
Name: full_text, dtype: int64

# Tags

In [11]:
df['full_text'].apply(lambda x:re.search("\@\w+", x, flags=re.IGNORECASE)).dropna().apply(lambda x:x.group(0)).value_counts()[:10]

@BernieSanders    7311
@SaraCarterDC      637
@ofcltarrtarr      290
@cards_fan75       148
@cenkuygur         138
@julianzelizer     103
@IlhanMN            89
@ewarren            77
@JoshuaWick6        76
@Reed2242           53
Name: full_text, dtype: int64

In [12]:
def remove_tags(phrase):
    
    phrase = re.sub("\@BernieSanders", "", phrase)
    
    return phrase

In [13]:
df['full_text'] = df['full_text'].apply(remove_tags)

In [14]:
df['full_text'].apply(lambda x:re.search("\@\w+", x)).dropna().apply(lambda x:x.group(0)).str.strip().value_counts()

@SaraCarterDC       637
@ofcltarrtarr       291
@SharylAttkisson    185
@cards_fan75        148
@cenkuygur          138
                   ... 
@lindamo787           1
@xthewriter           1
@JohnStossel          1
@tylerevansokay       1
@DennyHeck            1
Name: full_text, Length: 3913, dtype: int64

In [15]:
df.shape

(19410, 3)

# Hashtags

In [21]:
df['full_text'].apply(lambda x:re.search("\#\w*([B|b]ernie|[S|a]nders){0,1}\w*", x, re.IGNORECASE)).dropna().apply(lambda x:x.group(0)).value_counts()[:20]

#Bernie2020           171
#NotMeUs               61
#MedicareForAll        61
#GreenNewDeal          26
#YangGang              24
#BoycottMSNBC          24
#IFightForThem         16
#BernieSanders         15
#SocialismKills        15
#M4A                   15
#Trump2020             14
#BernieSanders2020     14
#Bernie                14
#MAGA                  13
#BigUs                 13
#UnionsForAll          10
#Trump                  9
#FreedomDividend        9
#BernieBeatsTrump       8
#HumanityFirst          7
Name: full_text, dtype: int64

In [22]:
def remove_hashtags(phrase):

    for i in ['#[B|b]ernie([S|s]anders){0,1}(2020){0,1}', '#FeelTheBern', "#[T|t]eamBernie"]:
        phrase = re.sub(i, "", phrase)
    
    return phrase

In [23]:
df['full_text'] = df['full_text'].apply(remove_hashtags)

In [24]:
df['full_text'].apply(lambda x:re.search("\#\w*([B|b]lizabeth|[S|a]nders){0,1}\w*", x)).dropna().apply(lambda x:x.group(0)).value_counts()[:20]

#NotMeUs            88
#MedicareForAll     66
#GreenNewDeal       30
#YangGang           25
#BoycottMSNBC       25
#IFightForThem      16
#BigUs              16
#SocialismKills     15
#Trump2020          15
#M4A                15
#MAGA               13
#UnionsForAll       10
#FreedomDividend     9
#Trump               9
#WeThePeople         7
#1                   7
#HumanityFirst       7
#Thanksgiving        7
#Hypocrites          7
#hypocrite           6
Name: full_text, dtype: int64

# (Obvious) Mentions

In [27]:
df['full_text'].apply(lambda x:re.search("\s+\w*[B|b]ernie([S|s]anders){0,1}\w*\s+", x, re.IGNORECASE)).dropna().apply(lambda x:x.group(0)).str.strip().value_counts()[:10]

Bernie           1377
bernie             60
BERNIE             14
Bernies            10
bernies             2
BernieSanders       2
berniebruh          1
Dinobernie          1
BernieThe           1
BernieI             1
Name: full_text, dtype: int64

In [28]:
temp = df['full_text'].apply(lambda x:re.search("\s+\w*[B|b]ernie([S|s]anders){0,1}\w*\s+", x)).dropna().apply(lambda x:x.group(0)).value_counts().index

def remove_mentions(phrase):
    
    for to_replace in temp:
        phrase = re.sub(to_replace, "", phrase)
    
    return phrase

In [29]:
df['full_text'] = df['full_text'].apply(remove_mentions)

In [30]:
df['full_text'].apply(lambda x:re.search("\s+\w*[B|b]ernie([S|s]anders){0,1}\w*\s+", x)).dropna().apply(lambda x:x.group(0)).value_counts()

 Berniebros     1
 berniecrat     1
Name: full_text, dtype: int64

In [31]:
df.to_csv('processed_final_Bernie.csv', index=False)