In [32]:
import gzip
import numpy as np
import pandas as pd
import re
import string
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jiechengxu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [33]:
# read data from ziped data source
file_name = 'realdonaldtrump.csv.gz'
df = pd.read_csv(file_name, compression='gzip')
df.head()

Unnamed: 0,id,link,content,date,retweets,favorites,mentions,hashtags
0,1698308935,https://twitter.com/realDonaldTrump/status/169...,Be sure to tune in and watch Donald Trump on L...,2009-05-04 13:54:25,510,917,,
1,1701461182,https://twitter.com/realDonaldTrump/status/170...,Donald Trump will be appearing on The View tom...,2009-05-04 20:00:10,34,267,,
2,1737479987,https://twitter.com/realDonaldTrump/status/173...,Donald Trump reads Top Ten Financial Tips on L...,2009-05-08 08:38:08,13,19,,
3,1741160716,https://twitter.com/realDonaldTrump/status/174...,New Blog Post: Celebrity Apprentice Finale and...,2009-05-08 15:40:15,11,26,,
4,1773561338,https://twitter.com/realDonaldTrump/status/177...,"""My persona will never be that of a wallflower...",2009-05-12 09:07:28,1375,1945,,


In [34]:
# data integrity 
df.isna().sum()

id               0
link             0
content          0
date             0
retweets         0
favorites        0
mentions     22966
hashtags     37769
dtype: int64

In [35]:
df = df.drop(['link','mentions','hashtags'], axis=1)
df.head()

Unnamed: 0,id,content,date,retweets,favorites
0,1698308935,Be sure to tune in and watch Donald Trump on L...,2009-05-04 13:54:25,510,917
1,1701461182,Donald Trump will be appearing on The View tom...,2009-05-04 20:00:10,34,267
2,1737479987,Donald Trump reads Top Ten Financial Tips on L...,2009-05-08 08:38:08,13,19
3,1741160716,New Blog Post: Celebrity Apprentice Finale and...,2009-05-08 15:40:15,11,26
4,1773561338,"""My persona will never be that of a wallflower...",2009-05-12 09:07:28,1375,1945


In [36]:
# clean and fix data in content column
content = df.content
content

0        Be sure to tune in and watch Donald Trump on L...
1        Donald Trump will be appearing on The View tom...
2        Donald Trump reads Top Ten Financial Tips on L...
3        New Blog Post: Celebrity Apprentice Finale and...
4        "My persona will never be that of a wallflower...
                               ...                        
43347    Joe Biden was a TOTAL FAILURE in Government. H...
43348    Will be interviewed on @ seanhannity tonight a...
43349                           pic.twitter.com/3lm1spbU8X
43350                           pic.twitter.com/vpCE5MadUz
43351                           pic.twitter.com/VLlc0BHW41
Name: content, Length: 43352, dtype: object

In [37]:
# change line into lower case, remove punctuations, bracekets, urls, and extra space
from nltk.corpus import stopwords

def clean_line(line):
    line = str(line).lower()
    line = re.sub('\[.*?\]', '', line)
    line = re.sub('https?://\S+|www\.\S+', '', line)
    line = re.sub('<.*?>+', '', line)
    line = re.sub('[%s]' % re.escape(string.punctuation), '', line)
    line = re.sub('\n', '', line)
    line = re.sub('\w*\d\w*', '', line)
    line = re.sub('\'','', line)
    line = line.strip().split()
    return ' '.join(line)

df['content'] = df['content'].apply(lambda x: clean_line(x))

# remove empty content after preprocessing
df = df[df.content != '']
df.content

0        be sure to tune in and watch donald trump on l...
1        donald trump will be appearing on the view tom...
2        donald trump reads top ten financial tips on l...
3        new blog post celebrity apprentice finale and ...
4        my persona will never be that of a wallflower ...
                               ...                        
43344                                               true …
43345    a great woman her son is looking down from hea...
43346    approval rating in the republican party thank you
43347    joe biden was a total failure in government he...
43348    will be interviewed on seanhannity tonight at ...
Name: content, Length: 42940, dtype: object

In [38]:
from textblob import TextBlob
df['sentiment'] = np.array([TextBlob(content).sentiment.polarity for content in df['content']])

# round to 2 digits
df['sentiment'] = df.sentiment.apply(lambda x: round(x,2))

In [39]:
# verify the result
for i in df['content'][df.sentiment == -1][:10]:
    print(i)

obama care is already having a devastating impact on our economy
the worst employee in todays trumpvlog
you know what is the worst part of barackobamas tuesday speech playing class warfarewe paid for it with our tax dollars
the mullahs are laughing at what they think is a very stupid president barackobama has asked for iran to return the drone timetogettough
the economy is in terrible shape barackobama is manipulating the job numbers to hide the truth
pathetic barackobama did not want to veto keystone himselfso he lobbied the democrats in the senate to defeat it
terrible wind farms are provided permits by the us government which causes the programmatic killing of bald eagles
just cancelled my subscription to usatoday boring newspaper with no mojomust be losing a fortune founder cont
price of corn has jumped over this will cause a jump in food prices perhaps beyond what weve ever seen nasty for the economy
pathetic barackobama is sweetening his offer to the taliban read the art of the d

In [40]:
df

Unnamed: 0,id,content,date,retweets,favorites,sentiment
0,1698308935,be sure to tune in and watch donald trump on l...,2009-05-04 13:54:25,510,917,0.17
1,1701461182,donald trump will be appearing on the view tom...,2009-05-04 20:00:10,34,267,0.14
2,1737479987,donald trump reads top ten financial tips on l...,2009-05-08 08:38:08,13,19,0.11
3,1741160716,new blog post celebrity apprentice finale and ...,2009-05-08 15:40:15,11,26,0.14
4,1773561338,my persona will never be that of a wallflower ...,2009-05-12 09:07:28,1375,1945,0.00
...,...,...,...,...,...,...
43344,1273061883399098368,true …,2020-06-16 20:16:20,26783,154978,0.35
43345,1273080720794279937,a great woman her son is looking down from hea...,2020-06-16 21:31:11,26468,112140,0.65
43346,1273095002563006472,approval rating in the republican party thank you,2020-06-16 22:27:56,44472,313534,0.00
43347,1273405198698975232,joe biden was a total failure in government he...,2020-06-17 19:00:32,23402,116377,-0.16


# Word Count

In [50]:
word_count = pd.Series(' '.join(df.content).split()).value_counts()
stop_set = set(stopwords.words('English'))
filtered = [i for i in word_count.index if i not in stop_set]
word_count = word_count[filtered]
word_count

realdonaldtrump    8569
great              6623
trump              4950
…                  4045
thank              3019
                   ... 
skyeshepard           1
policyif              1
espndrlou             1
baptist               1
nsc                   1
Length: 40335, dtype: int64

In [60]:
# store data file
word_count_df = pd.DataFrame(word_count).reset_index()
word_count_df.columns = ['words', 'count']
print(word_count_df.head())
word_count_df.to_csv('word_count.csv')
df.to_csv('processed_data.csv')

             words  count
0  realdonaldtrump   8569
1            great   6623
2            trump   4950
3                …   4045
4            thank   3019
