In [127]:
import emoji
import numpy as np
import os
import pandas as pd
import re

# Column names, in order, for the '::::'-separated fields of each raw record.
field_names = [
    'retweet_count',
    'author_screen_name',
    'author_followers_count',
    'author_following_count',
    'created_at',
    'id_str',
    'full_text',
]



In [2]:
!pip install pandas emoji



In [148]:
# `name` is a candidate's first name: 'Bernie', 'Joe' or 'Elizabeth'.
# Returns a DataFrame in which rows with a duplicate `id_str` have been removed.


def remove_duplicates(name, directory='.', extension='.txt', columns=None):
    """Load every raw record file for ``name`` and drop repeated ``id_str`` rows.

    Each line of a raw file is one record whose fields are separated by
    '::::'; the literal marker 'backslashN' inside a line stands for a newline
    and is expanded before splitting.

    Parameters
    ----------
    name : str
        Substring a file name must contain to be loaded (e.g. 'Bernie').
    directory : str, optional
        Directory that is scanned for raw files. Defaults to the current
        working directory.
    extension : str, optional
        Only files whose name ends with this suffix are read. This keeps
        other files that merely contain ``name`` (for instance an exported
        'dataset_<name>.csv' living in the same directory) from being parsed
        as raw input.
    columns : list of str, optional
        Column names for the parsed fields; must include 'id_str'. Defaults
        to the module-level ``field_names``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``id_str`` (first occurrence wins; files are
        visited in sorted name order). All values are strings, and the last
        field keeps the line terminator read from the file.
    """
    if columns is None:
        columns = field_names
    columns = list(columns)

    # The free-text field comes last, so capping the number of splits keeps a
    # literal '::::' inside the text from producing extra fields.
    max_splits = len(columns) - 1

    raw_data = []
    for file_name in sorted(os.listdir(directory)):
        if name not in file_name or not file_name.endswith(extension):
            continue
        print('Processing', file_name)
        # Explicit encoding: the text is not ASCII-only, and the platform
        # default encoding is not UTF-8 everywhere.
        with open(os.path.join(directory, file_name), 'r', encoding='utf-8') as fhandle:
            for line in fhandle:
                raw_data.append(line.replace('backslashN', '\n').split('::::', max_splits))

    df = pd.DataFrame(data=raw_data, columns=columns)
    return df.drop_duplicates(subset=['id_str'])

def correct_ampersands(match):
    """Replacement callback for ``re.sub`` that spaces out '&' in a token.

    The matched text is split on '&'. When every resulting piece is
    upper-case according to ``str.isupper`` (e.g. 'AT&T'), the text is
    returned unchanged; otherwise the pieces are re-joined with ' & '
    (e.g. 'rock&roll' -> 'rock & roll').
    """
    token = match.group(0)
    pieces = token.split('&')
    for piece in pieces:
        # An empty piece or one without cased characters is not upper-case.
        if not piece.isupper():
            return ' & '.join(pieces)
    return token
        
def correct_multiplemarks(match):
    """Replacement callback for ``re.sub`` that shrinks a run of marks to one.

    Returns '?' when the matched text starts with '?', '!' when it starts
    with '!', and None when it starts with neither.
    """
    leading = match.group(0)[:1]
    if leading in ('?', '!'):
        return leading
    return None
        
        
def correct_slashsplitted(match):
    """Replacement callback for ``re.sub`` that separates slash-joined words.

    When the matched text contains a numeric expression before a slash
    (optional '$' signs, then digits and/or dots, then '/', then at least one
    non-space character, e.g. '24/7' or '$15/hr'), it is returned unchanged.
    Otherwise every '/' is replaced by a single space
    (e.g. 'and/or' -> 'and or').
    """
    token = match.group(0)
    # Raw string: '\$' and '\S' are regex escapes, not Python string escapes.
    if re.search(r'\$*[0-9.]+/\S+', token) is not None:
        return token
    return token.replace('/', ' ')

        

        
def preprocess_text(candidate_name):
    """Load the de-duplicated records for a candidate and clean ``full_text``.

    The cleaning steps are applied to every ``full_text`` value in this order:

    1. remove anything starting with 'http' up to the next whitespace;
    2. turn the HTML entities '&amp;', '&gt;' and '&lt;' back into
       '&', '>' and '<';
    3. space out '&' inside tokens via ``correct_ampersands``;
    4. collapse runs of two or more '!'/'?' via ``correct_multiplemarks``;
    5. split slash-joined words via ``correct_slashsplitted``;
    6. remove emoji.

    Parameters
    ----------
    candidate_name : str
        Passed through to ``remove_duplicates`` to select the raw files.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``remove_duplicates`` with a cleaned
        ``full_text`` column.
    """
    # emoji 2.x no longer provides get_emoji_regexp(); replace_emoji() is its
    # documented successor. Fall back to the regexp on old releases, compiling
    # it once instead of once per row.
    if hasattr(emoji, 'replace_emoji'):
        def strip_emoji(text):
            return emoji.replace_emoji(text, replace='')
    else:
        emoji_pattern = emoji.get_emoji_regexp()

        def strip_emoji(text):
            return emoji_pattern.sub('', text)

    def clean(text):
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'&amp;*', '&', text)
        text = re.sub(r'&gt;*', '>', text)
        text = re.sub(r'&lt;*', '<', text)
        text = re.sub(r'\S*&\S+', correct_ampersands, text)
        text = re.sub(r'[!?]{2,}', correct_multiplemarks, text)
        # NOTE(review): this step previously re-used '[!?]{2,}', which cannot
        # match once the line above has collapsed every such run, so
        # correct_slashsplitted was never called. A slash-joined token is the
        # presumed intent -- confirm the exact pattern.
        text = re.sub(r'\S+/\S+', correct_slashsplitted, text)
        return strip_emoji(text)

    df = remove_duplicates(candidate_name)
    df['full_text'] = df['full_text'].apply(clean)
    return df
    

In [156]:
# Build, inspect and export the cleaned dataset of each candidate.
for name in ('Bernie', 'Joe', 'Elizabeth'):
    df = preprocess_text(name)

    # Quick checks: preview, non-null count per column, distinct ids.
    display(df.head())
    display(df.count())
    display(df['id_str'].nunique())

    # Number of distinct ids per calendar day of `created_at`.
    created_on = pd.to_datetime(df['created_at']).dt.date
    display(df.groupby(created_on)['id_str'].nunique())

    output_path = 'dataset_{}.csv'.format(name)
    df.to_csv(output_path, index=False)
    print('-----------')
    

Processing Bernie.txt
Processing Bernie10.txt
Processing Bernie2.txt
Processing Bernie3.txt
Processing Bernie4.txt
Processing Bernie5.txt
Processing Bernie6.txt
Processing Bernie7.txt
Processing Bernie8.txt
Processing Bernie9.txt


Unnamed: 0,retweet_count,author_screen_name,author_followers_count,author_following_count,created_at,id_str,full_text
0,0,ghostriderr74,24,85,2019-11-24 00:35:01,1198399587649613824,@BernieSanders Not -- in my town we have medic...
1,0,michaelfrank17,2078,2202,2019-11-24 00:35:01,1198399586403942400,This week I heard @MMFlint say that Nancy Pelo...
2,6,savemain_st,46654,18548,2019-11-24 00:35:00,1198399585049284672,@ninaturner @PortiaABoulger @BernieSanders @jj...
3,0,GeoffWaters5,15,21,2019-11-24 00:35:00,1198399582847340544,@stro1786 @BernieSanders Is that a good thing ...
4,0,FLOURNOYFarrell,3505,4994,2019-11-24 00:34:57,1198399569853325312,@KelticSC @thekaraboudjan @SallyAlbright @Lewi...


retweet_count             5059
author_screen_name        5059
author_followers_count    5059
author_following_count    5059
created_at                5059
id_str                    5059
full_text                 5059
dtype: int64

5059

created_at
2019-11-23    1322
2019-11-24    1665
2019-11-25      25
2019-11-27     987
2019-11-28     768
2019-11-29     292
Name: id_str, dtype: int64

-----------
Processing Joe.txt
Processing Joe10.txt
Processing Joe2.txt
Processing Joe3.txt
Processing Joe4.txt
Processing Joe5.txt
Processing Joe6.txt
Processing Joe7.txt
Processing Joe8.txt
Processing Joe9.txt


Unnamed: 0,retweet_count,author_screen_name,author_followers_count,author_following_count,created_at,id_str,full_text
0,0,rickrowse2,173,255,2019-11-24 20:55:40,1198706776062996480,@JoeBiden Beats going back to the crooked swam...
1,0,FlanAg97,5047,5439,2019-11-24 20:55:40,1198706772086861830,@PeepsPowertothe @JoeBiden @BernieSanders @Sen...
2,0,juniort22163107,186,362,2019-11-24 20:55:37,1198706761064157184,@JoeBiden congrats grandpa enjoy the new grand...
3,0,PostonRuss,1444,2177,2019-11-24 20:55:32,1198706739794894848,@JoeBiden You damn sure can’t! I can’t wait #T...
4,0,are_marines,1316,1746,2019-11-24 20:55:27,1198706717703507968,@JoeBiden and son are corrupt losers \n


retweet_count             3778
author_screen_name        3778
author_followers_count    3778
author_following_count    3778
created_at                3778
id_str                    3778
full_text                 3778
dtype: int64

3778

created_at
2019-11-24    2676
2019-11-25      26
2019-11-28    1003
2019-11-29      73
Name: id_str, dtype: int64

-----------
Processing Elizabeth.txt
Processing Elizabeth10.txt
Processing Elizabeth2.txt
Processing Elizabeth3.txt
Processing Elizabeth4.txt
Processing Elizabeth5.txt
Processing Elizabeth6.txt
Processing Elizabeth7.txt
Processing Elizabeth8.txt
Processing Elizabeth9.txt


Unnamed: 0,retweet_count,author_screen_name,author_followers_count,author_following_count,created_at,id_str,full_text
0,0,MadonnaMadsen,17987,17467,2019-11-24 16:43:35,1198643333716795392,@no_silenced @ewarren @RepPressley Democrats a...
1,0,natsanpol,322,480,2019-11-24 16:43:28,1198643305493479424,@ewarren @MayaRudolph as @KamalaHarris was als...
2,0,DavidBelfiore1,13,292,2019-11-24 16:43:22,1198643280453484545,@TeamWarren @ewarren Anybody is better than Wa...
3,0,zaxxken,33,47,2019-11-24 16:43:22,1198643278687551488,@CNNPolitics @cnnbrk Goodbye @ewarren @BernieS...
4,0,binsaudigifts,172,269,2019-11-24 16:43:19,1198643267702853639,@ryangrim @ewarren Who on earth are you?\n


retweet_count             4437
author_screen_name        4437
author_followers_count    4437
author_following_count    4437
created_at                4437
id_str                    4437
full_text                 4437
dtype: int64

4437

created_at
2019-11-24    2364
2019-11-25      24
2019-11-27     740
2019-11-28    1235
2019-11-29      74
Name: id_str, dtype: int64

-----------
