In [1]:
import emoji
import os
import pandas as pd
import re


In [2]:
def correct_ampersands(match):
    _str = match.group(0)
    splitted = re.split('&', _str, flags=re.IGNORECASE)
    if all([str.isupper(i) for i in splitted]):
        return _str
    else:
        return ' & '.join(splitted)

        
def correct_slashsplitted(match):
    _str = match.group(0)
    return ' '.join(' '.join(_str.split('/')).split('\\'))

        
    
def remove_links(phrase):
    
    phrase = re.sub("\S*(t\.co|http)\S*", "", phrase, flags=re.IGNORECASE)
    
    return phrase

def global_processing(df):
    '''
    Global preprocessing/data munging
    '''
    
    
    df['full_text'] = df['full_text'].apply(lambda x:re.sub("\’", "\'", x, flags=re.IGNORECASE))
    df['full_text'] = df['full_text'].apply(lambda x:re.sub("\”", "\"", x, flags=re.IGNORECASE))
    df['full_text'] = df['full_text'].apply(lambda x:re.sub("\“", "\"", x, flags=re.IGNORECASE))
    
    
    df['full_text'] = df['full_text'].apply(lambda x:re.sub('&amp;*', '&', x, flags=re.IGNORECASE), )
    df['full_text'] = df['full_text'].apply(lambda x:re.sub('&gt;*', '>', x,flags=re.IGNORECASE), )
    df['full_text'] = df['full_text'].apply(lambda x:re.sub('&lt;*', '<', x, flags=re.IGNORECASE), )
    df['full_text'] = df['full_text'].apply(lambda x:re.sub('\S*&\S+', correct_ampersands, x, flags=re.IGNORECASE), )
    
    df['full_text'] = df['full_text'].apply(lambda x:emoji.get_emoji_regexp().sub('', x), )
    
    df['full_text'] = df['full_text'].apply(remove_links)
    
    max_num_mentions = 4
    spammy_indices_mentions = df['full_text'].apply(lambda x:re.search("(\@.*){%d,}"%(max_num_mentions+1), x, flags = re.IGNORECASE)).dropna().index
    print(len(spammy_indices_mentions))
    df = df.iloc[[i for i in df.index if i not in spammy_indices_mentions]]
    df = df.reset_index(drop=True)
    
    max_hashtags = 4
    spammy_indices_hashtags = df['full_text'].apply(lambda x:re.search("(\#\w+\s){%d,}"%(max_hashtags+1), x)).dropna().apply(lambda x:x.group(0)).index
    print(len(spammy_indices_hashtags))
    df = df.iloc[[i for i in df.index if i not in spammy_indices_hashtags]]
    
    
    df = df.reset_index(drop=True)
    

    return df

# Get Data

In [3]:
df = pd.read_csv("final_dataset_Joe.csv")
display(df.nunique())
df.dtypes, df.shape

id_str        22619
created_at    18221
full_text     23614
dtype: int64

(id_str         int64
 created_at    object
 full_text     object
 dtype: object, (25895, 3))

In [4]:
df = df.drop_duplicates(subset=['id_str']).reset_index(drop=True)
df.shape

(22619, 3)

In [5]:
df = global_processing(df)


3486
43


In [6]:
df.shape

(19090, 3)

# Decontracting

In [7]:
df['full_text'].apply(lambda x:re.search("[a-zA-Z]{1,7}[\'|\"][a-zA-Z]{1,3}", x)).dropna().apply(lambda x:x.group(0)).value_counts()[:10]

I'm       508
don't     507
That's    341
It's      302
it's      294
can't     284
you're    262
He's      252
You're    244
he's      238
Name: full_text, dtype: int64

In [8]:
def decontract(phrase):
    
    # specific
    phrase = re.sub(r"can\'t", "can not", phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"won\'t", "will not", phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"let\'s", "let us", phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"let\'s", "let us", phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"y\'all", "you all", phrase, flags=re.IGNORECASE)
    

    # general
    phrase = re.sub(r"n\'t", " not", phrase, flags=re.IGNORECASE) #notice the spaces
    phrase = re.sub(r"\'re", " are", phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"\'s", " is", phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"\'d", " would", phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"\'ll", " will", phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"\'t", " not", phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"\'ve", " have", phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"\'m", " am", phrase, flags=re.IGNORECASE)
    return phrase

In [9]:
df['full_text'] = df['full_text'].apply(decontract)

In [10]:
df['full_text'].apply(lambda x:re.search("[a-zA-Z]{1,7}[\'|\"][a-zA-Z]{1,3}", x)).dropna().apply(lambda x:x.group(0)).value_counts()[:10]

ma'am        3
f'n          2
get'em       2
O'Bid        2
if"thi       1
Smok'n       1
Jolt'n       1
troops"he    1
Keep'em      1
ne'er        1
Name: full_text, dtype: int64

# Tags

In [11]:
df['full_text'].apply(lambda x:re.search("\@\w+", x, flags=re.IGNORECASE)).dropna().apply(lambda x:x.group(0)).value_counts()

@JoeBiden           8841
@RealCandaceO       1782
@Jali_Cat            212
@petluvers4Trump     193
@natashakorecki      189
                    ... 
@CrowleyAntmarga       1
@bigchaz               1
@TinaLNeal             1
@rachelbruno           1
@beinlibertarian       1
Name: full_text, Length: 3061, dtype: int64

In [12]:
df['full_text'].apply(lambda x:re.search("\@\w+", x)).dropna().apply(lambda x:x.group(0)).str.strip().value_counts()

@JoeBiden           8841
@RealCandaceO       1782
@Jali_Cat            212
@petluvers4Trump     193
@natashakorecki      189
                    ... 
@CrowleyAntmarga       1
@bigchaz               1
@TinaLNeal             1
@rachelbruno           1
@beinlibertarian       1
Name: full_text, Length: 3061, dtype: int64

In [13]:
def remove_tags(phrase):
    
    phrase = re.sub("\@[J|e]oe[B|b]iden", "", phrase)
    
    return phrase

In [14]:
df['full_text'] = df['full_text'].apply(remove_tags)

In [15]:
df['full_text'].apply(lambda x:re.search("\@\w+", x)).dropna().apply(lambda x:x.group(0)).str.strip().value_counts()

@RealCandaceO       1783
@realDonaldTrump     362
@NRA                 240
@Jali_Cat            212
@petluvers4Trump     193
                    ... 
@Cy_Lanced             1
@mikemcdonnell         1
@kennethlang           1
@sumskrwdemo           1
@KevinJWatt1           1
Name: full_text, Length: 3252, dtype: int64

In [16]:
df.shape

(19090, 3)

# Hashtags

In [18]:
df['full_text'].apply(lambda x:re.search("\#\w*([J|j]oe|[B|b]iden){0,1}\w*", x, re.IGNORECASE)).dropna().apply(lambda x:x.group(0)).value_counts()[:20]

#Trump2020             94
#NoMalarkey            75
#Biden2020             56
#HunterBiden           47
#MAGA                  41
#QuidProJoe            32
#Bernie2020            25
#CreepyJoeBiden        24
#QuidProQuoJoe         22
#SleepyJoe             16
#                      16
#KAG2020               15
#Ukraine               15
#JoeBiden              14
#Trump2020Landslide    14
#TeamJoe               14
#CreepyJoe             13
#KAG                   13
#Democrats             12
#1                     12
Name: full_text, dtype: int64

In [19]:
def remove_hashtags(phrase):

    for i in ['#[B|b]iden2020', '#[J|j]oeBiden', "#[T|t]eamJoe", "#[J|j]oe", "#[B|b]iden"]:
        phrase = re.sub(i, "", phrase)
    
    return phrase

In [20]:
df['full_text'] = df['full_text'].apply(remove_hashtags)

In [21]:
df['full_text'].apply(lambda x:re.search("\#\w*([J|j]oe|[B|b]iden){0,1}\w*", x)).dropna().apply(lambda x:x.group(0)).value_counts()[:20]

#Trump2020                    95
#NoMalarkey                   80
#HunterBiden                  48
#MAGA                         41
#QuidProJoe                   32
#Bernie2020                   25
#CreepyJoeBiden               24
#QuidProQuoJoe                22
#Ukraine                      18
#KAG2020                      16
#SleepyJoe                    16
#                             16
#Trump2020Landslide           14
#CreepyJoe                    13
#KAG                          13
#1                            12
#Trump2020LandslideVictory    12
#Democrats                    12
#malarkey                     12
#MAGA2020                     11
Name: full_text, dtype: int64

# (Obvious) Mentions

In [22]:
df['full_text'].apply(lambda x:re.search("\s+\w*([J|j]oe|[B|b]iden){1,}\w*\s+", x, re.IGNORECASE)).dropna().apply(lambda x:x.group(0)).str.strip().value_counts()[:10]

Joe       1138
Biden     1064
joe        120
JOE         40
Bidens      39
biden       28
Joey        17
BIDEN       14
Joes         8
bidens       3
Name: full_text, dtype: int64

In [23]:
temp = df['full_text'].apply(lambda x:re.search("\s+\w*([J|j]oe|[B|b]iden){1,}\w*\s+", x)).dropna().apply(lambda x:x.group(0)).value_counts()[:5].index

def remove_mentions(phrase):
    
    for to_replace in temp:
        phrase = re.sub(to_replace, "", phrase)
    
    return phrase

In [24]:
df['full_text'] = df['full_text'].apply(remove_mentions)

In [25]:
df['full_text'].apply(lambda x:re.search("\s+\w*([J|j]oe|[B|b]iden){1,}\w*\s+", x)).dropna().apply(lambda x:x.group(0)).value_counts()

 Joeis              56
 Bidens             37
 joe\n              31
 biden              24
 Biden\n            23
                    ..
\nJoeis              1
 Joeresponsible      1
 Joetalking          1
  Joebragged         1
 Joemeant            1
Name: full_text, Length: 180, dtype: int64

In [26]:
df.to_csv('processed_final_Joe.csv', index=False)