In [2]:
import gzip
import numpy as np
import pandas as pd
import re
import string
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jiechengxu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# load the gzip-compressed tweet archive into a DataFrame and preview the first rows
file_name = 'realdonaldtrump.csv.gz'
df = pd.read_csv(filepath_or_buffer=file_name, compression='gzip')
df.head(n=5)

Unnamed: 0,id,link,content,date,retweets,favorites,mentions,hashtags
0,1698308935,https://twitter.com/realDonaldTrump/status/169...,Be sure to tune in and watch Donald Trump on L...,2009-05-04 13:54:25,510,917,,
1,1701461182,https://twitter.com/realDonaldTrump/status/170...,Donald Trump will be appearing on The View tom...,2009-05-04 20:00:10,34,267,,
2,1737479987,https://twitter.com/realDonaldTrump/status/173...,Donald Trump reads Top Ten Financial Tips on L...,2009-05-08 08:38:08,13,19,,
3,1741160716,https://twitter.com/realDonaldTrump/status/174...,New Blog Post: Celebrity Apprentice Finale and...,2009-05-08 15:40:15,11,26,,
4,1773561338,https://twitter.com/realDonaldTrump/status/177...,"""My persona will never be that of a wallflower...",2009-05-12 09:07:28,1375,1945,,


In [4]:
# data integrity: count the missing values in every column
df.isnull().sum()

id               0
link             0
content          0
date             0
retweets         0
favorites        0
mentions     22966
hashtags     37769
dtype: int64

In [5]:
# discard the columns that the rest of the notebook does not use
df = df.drop(columns=['link', 'mentions', 'hashtags'])

In [6]:
# keep an untouched copy of the raw tweet text; the 'content' column is
# cleaned in place below and this copy is assigned back to it later on
content = df['content'].copy()

In [7]:
# change line into lower case; remove punctuation, brackets, urls, and extra spaces
from nltk.corpus import stopwords

def clean_line(line):
    """Normalize one piece of text into lowercase, space-separated words.

    Steps, in order: lowercase; drop ``[bracketed]`` spans; drop URLs
    (``http(s)://...`` and ``www....``); drop ``<angle-bracket>`` spans;
    strip every character in ``string.punctuation``; drop any token that
    contains a digit; collapse all whitespace runs into single spaces.

    Parameters
    ----------
    line : object
        Text to clean. It is coerced with ``str()`` first.

    Returns
    -------
    str
        The cleaned text, or ``''`` when nothing survives the cleaning.
    """
    line = str(line).lower()
    # raw strings keep the regex escapes (\[, \S, \w, \d) from being parsed
    # as invalid Python string escapes
    line = re.sub(r'\[.*?\]', '', line)
    line = re.sub(r'https?://\S+|www\.\S+', '', line)
    line = re.sub(r'<.*?>+', '', line)
    # string.punctuation already contains the apostrophe, so no separate
    # quote-stripping pass is needed afterwards
    line = re.sub('[%s]' % re.escape(string.punctuation), '', line)
    line = re.sub(r'\w*\d\w*', '', line)
    # newlines are not deleted: split() treats them as ordinary whitespace,
    # so words on adjacent lines stay separate instead of being glued together
    return ' '.join(line.split())

# clean every tweet; clean_line is passed directly, no lambda wrapper needed
df['content'] = df['content'].apply(clean_line)

# remove rows whose content is empty after preprocessing; .copy() makes the
# filtered result an independent frame, so adding columns to it later does
# not trigger pandas' SettingWithCopyWarning
df = df[df['content'] != ''].copy()

In [8]:
from textblob import TextBlob

# score each cleaned tweet with TextBlob's polarity and keep 2 decimal digits
polarity_scores = [
    round(TextBlob(text).sentiment.polarity, 2)
    for text in df['content']
]
df['sentiment'] = np.array(polarity_scores)

# Aggregate & Word Count

In [8]:
# NLTK ships the list under the lowercase file id 'english'; the capitalised
# spelling only resolves on case-insensitive filesystems
stop_set = set(stopwords.words('english'))
# extra tokens to ignore on top of the standard stop words
stop_set.update({'...', '-', 'us'})

In [12]:
# collapse each timestamp to a 'YYYY-MM' string so rows can be grouped by month
df['date'] = pd.to_datetime(df['date']).dt.strftime('%Y-%m')
df

Unnamed: 0,id,content,date,retweets,favorites,sentiment
0,1698308935,be sure to tune in and watch donald trump on l...,2009-05,510,917,0.17
1,1701461182,donald trump will be appearing on the view tom...,2009-05,34,267,0.14
2,1737479987,donald trump reads top ten financial tips on l...,2009-05,13,19,0.11
3,1741160716,new blog post celebrity apprentice finale and ...,2009-05,11,26,0.14
4,1773561338,my persona will never be that of a wallflower ...,2009-05,1375,1945,0.00
...,...,...,...,...,...,...
43344,1273061883399098368,true …,2020-06,26783,154978,0.35
43345,1273080720794279937,a great woman her son is looking down from hea...,2020-06,26468,112140,0.65
43346,1273095002563006472,approval rating in the republican party thank you,2020-06,44472,313534,0.00
43347,1273405198698975232,joe biden was a total failure in government he...,2020-06,23402,116377,-0.16


In [18]:
# monthly averages of the numeric metrics, rounded to 2 digits.
# The columns are selected explicitly: the frame still holds the text column
# 'content', which makes a bare groupby(...).mean() raise TypeError on
# pandas >= 2.0, and averaging 'id' is meaningless (it was dropped anyway).
df_grouped = (
    df.groupby('date')[['retweets', 'favorites', 'sentiment']]
    .mean()
    .round(2)
)
df_grouped

Unnamed: 0_level_0,retweets,favorites,sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-05,117.38,185.48,0.14
2009-06,30.36,51.18,0.17
2009-07,16.20,19.60,0.09
2009-08,54.57,69.57,0.10
2009-09,47.00,57.67,0.07
...,...,...,...
2020-02,19576.81,88923.88,0.15
2020-03,22318.00,103803.92,0.15
2020-04,26146.27,124365.38,0.13
2020-05,25933.11,111138.17,0.12


# store processed data

In [19]:
# write the monthly averages to disk in CSV format (the file name has no extension)
df_grouped.to_csv(path_or_buf='proceed_data_by_month')

In [11]:
# # group = df['sentiment'].groupby(df.date).mean()
# group = df.sentiment.groupby(df.date).mean()
# re_df = pd.DataFrame(group)
# re_df['word_1'] = ''
# re_df['word_2'] = ''
# re_df['word_3'] = ''
# re_df['word_4'] = ''
# re_df['word_5'] = ''
# re_df['word_6'] = ''
# re_df['word_7'] = ''
# re_df['word_8'] = ''
# re_df['word_9'] = ''
# re_df['word_10'] = ''
# re_df

Unnamed: 0_level_0,sentiment,retweets,favorites
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-05-04 13:54:25,0.17,510.0,917.0
2009-05-04 20:00:10,0.14,34.0,267.0
2009-05-08 08:38:08,0.11,13.0,19.0
2009-05-08 15:40:15,0.14,11.0,26.0
2009-05-12 09:07:28,0.00,1375.0,1945.0
...,...,...,...
2020-06-16 20:16:20,0.35,26783.0,154978.0
2020-06-16 21:31:11,0.65,26468.0,112140.0
2020-06-16 22:27:56,0.00,44472.0,313534.0
2020-06-17 19:00:32,-0.16,23402.0,116377.0


In [12]:
# for d in re_df.index:
#     df_filtered = df[df.date == d].content
#     word_count = pd.Series(' '.join(df_filtered).split()).value_counts()
#     word_count = word_count[[i for i in word_count.index if i not in stop_set and i != '…']][:10]
#     tup_li = []
#     for i in word_count.index:
#         c = word_count.loc[i]
#         tup_li.append((i,c))
#     re_df.at[d,'word_1'] = tup_li[0]
#     re_df.at[d,'word_2'] = tup_li[1]
#     re_df.at[d,'word_3'] = tup_li[2]
#     re_df.at[d,'word_4'] = tup_li[3]
#     re_df.at[d,'word_5'] = tup_li[4]
#     re_df.at[d,'word_6'] = tup_li[5]
#     re_df.at[d,'word_7'] = tup_li[6]
#     re_df.at[d,'word_8'] = tup_li[7]
#     re_df.at[d,'word_9'] = tup_li[8]
#     re_df.at[d,'word_10'] = tup_li[9]
# re_df

great        93
thank        48
president    35
new          30
mini         30
mike         28
bernie       26
hoax         25
fake         25
big          25
dtype: int64


Unnamed: 0_level_0,sentiment,word_1,word_2,word_3,word_4,word_5,word_6,word_7,word_8,word_9,word_10
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2009-05,0.137619,"(donald, 17)","(trump, 17)","(j, 10)","(like, 6)","(champion, 5)","(think, 5)","(new, 4)","(book, 4)","(keep, 3)","(know, 3)"
2009-06,0.172727,"(donald, 7)","(trump, 6)","(j, 3)","(dont, 2)","(–donald, 2)","(champion, 2)","(wishes, 2)","(birthday, 2)","(today, 2)","(trumps, 2)"
2009-07,0.090000,"(trump, 4)","(donald, 4)","(j, 2)","(like, 1)","(nj, 1)","(blog, 1)","(list, 1)","(innovation, 1)","(safe, 1)","(think, 1)"
2009-08,0.097143,"(donald, 4)","(trump, 3)","(live, 2)","(universe, 2)","(competition, 2)","(bahamas, 2)","(nbc, 2)","(trumps, 2)","(watch, 2)","(miss, 2)"
2009-09,0.073333,"(trump, 3)","(donald, 3)","(imagination, 2)","(dsrl, 1)","(race, 1)","(ten, 1)","(hysterical, 1)","(cant, 1)","(tips, 1)","(plus, 1)"
...,...,...,...,...,...,...,...,...,...,...,...
2020-02,0.149363,"(great, 93)","(thank, 48)","(president, 35)","(new, 30)","(mini, 30)","(mike, 28)","(bernie, 26)","(hoax, 25)","(fake, 25)","(big, 25)"
2020-03,0.152285,"(great, 94)","(thank, 81)","(news, 54)","(people, 44)","(fake, 34)","(coronavirus, 33)","(states, 29)","(new, 28)","(get, 26)","(working, 26)"
2020-04,0.132632,"(great, 81)","(news, 70)","(thank, 50)","(fake, 50)","(people, 38)","(states, 29)","(house, 28)","(white, 26)","(even, 26)","(get, 25)"
2020-05,0.116121,"(great, 108)","(people, 65)","(thank, 56)","(news, 41)","(big, 39)","(fake, 38)","(get, 36)","(job, 34)","(total, 33)","(would, 32)"


In [13]:
# word_count = pd.Series(' '.join(df.content).split()).value_counts()
# filtered = [i for i in word_count.index if i not in stop_set]
# word_count = word_count[filtered]
# word_count

realdonaldtrump    8569
great              6623
trump              4950
…                  4045
thank              3019
                   ... 
warehouses            1
surfinscotty          1
cdcdirector           1
lillirome             1
attorneys’            1
Length: 40334, dtype: int64

In [14]:
# put the raw (uncleaned) tweet text saved earlier back into the 'content' column
# NOTE(review): assignment of a Series is expected to align on the index, so
# rows filtered out during cleaning should stay removed - confirm
df['content'] = content
df

Unnamed: 0,id,content,date,retweets,favorites,sentiment
0,1698308935,Be sure to tune in and watch Donald Trump on L...,2009-05,510,917,0.17
1,1701461182,Donald Trump will be appearing on The View tom...,2009-05,34,267,0.14
2,1737479987,Donald Trump reads Top Ten Financial Tips on L...,2009-05,13,19,0.11
3,1741160716,New Blog Post: Celebrity Apprentice Finale and...,2009-05,11,26,0.14
4,1773561338,"""My persona will never be that of a wallflower...",2009-05,1375,1945,0.00
...,...,...,...,...,...,...
43344,1273061883399098368,True!https://twitter.com/realdonaldtrump/statu...,2020-06,26783,154978,0.35
43345,1273080720794279937,A GREAT woman. Her son is looking down from he...,2020-06,26468,112140,0.65
43346,1273095002563006472,96% Approval Rating in the Republican Party. T...,2020-06,44472,313534,0.00
43347,1273405198698975232,Joe Biden was a TOTAL FAILURE in Government. H...,2020-06,23402,116377,-0.16


In [15]:
# # store data file
# word_count_df = pd.DataFrame(word_count).reset_index()
# word_count_df.columns = ['words', 'count']
# print(word_count_df.head())
# word_count_df.to_csv('word_count.csv')
# df.to_csv('processed_data.csv')
# re_df.to_csv('month_with_wordcount.csv')

             words  count
0  realdonaldtrump   8569
1            great   6623
2            trump   4950
3                …   4045
4            thank   3019
