# Project Code

In [1]:
#import libraries
import numpy as np
import pandas as pd
import re
from textblob import TextBlob as tb ##TextBlob object will allow for quick sentiment analysis
from emo_unicode import EMO_UNICODE #A set of dictionaries of emoticons : their text represenstations
from emo_unicode import UNICODE_EMO ## courtesy of NeelShah18 on GitHub
                                  

In [2]:
#read the tweet archive from tweets.csv (path is relative to the working directory) into a dataframe
##NOTE(review): no dtypes are passed, so pandas infers them; the info() output further down
##shows id_str inferred as float64 -- confirm that is acceptable before relying on exact tweet ids
tweets = pd.read_csv('tweets.csv')

In [3]:
#replace emojis with their text representation
##this method is courtesy of kaggle.com user SRK

#flip EMO_UNICODE around so that its values become the lookup keys and its keys become the values
##(this rebinds the UNICODE_EMO name that was imported from emo_unicode in the first cell)
UNICODE_EMO = {emoji: name for name, emoji in EMO_UNICODE.items()}

def convert_emojis(text, emoji_map=None):
    """Replace every emoji found in ``text`` with an underscore-joined text name.

    Each key of the mapping is searched for as a *literal* substring, so keys
    that contain regex metacharacters (for example a keycap sequence starting
    with ``*``) are handled safely instead of raising ``re.error`` or matching
    the wrong characters.

    Parameters
    ----------
    text : str
        The string to clean.
    emoji_map : dict, optional
        Mapping of emoji -> text name. Defaults to the module-level
        ``UNICODE_EMO`` dictionary when omitted.

    Returns
    -------
    str
        ``text`` with each emoji replaced by its name, where commas and colons
        are stripped from the name and its words are joined with underscores.
    """
    if emoji_map is None:
        emoji_map = UNICODE_EMO

    # Longest keys first, so a multi-character emoji sequence is replaced as a
    # whole before any shorter emoji that happens to be a part of it.
    for emot in sorted(emoji_map, key=len, reverse=True):
        name = "_".join(emoji_map[emot].replace(",", "").replace(":", "").split())
        # Plain substring replacement: neither the emoji nor its name is
        # interpreted as a regular expression / replacement template.
        text = text.replace(emot, name)
    return text

In [4]:
#create series objects of the polarity and subjectivity of the tweets
##each tweet is analysed by TextBlob only once; both scores are then read from that single result
sentiments = tweets.text.map(lambda x: tb(x).sentiment)
polarity = sentiments.map(lambda s: s.polarity)
subjectivity = sentiments.map(lambda s: s.subjectivity)

In [5]:
#attach the two sentiment series to the tweets df as new columns, then preview the first rows
tweets = tweets.assign(polarity=polarity, subjectivity=subjectivity)
tweets.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str,polarity,subjectivity
0,Twitter for iPhone,RT @SenBillCassidy: January #JobsReport:✅22500...,02-21-2020 16:06:40,896.0,0.0,True,1.230887e+18,0.068182,0.377273
1,Twitter for iPhone,RT @SteveDaines: Obama sure didn’t build this ...,02-21-2020 16:06:20,2086.0,0.0,True,1.230886e+18,0.625,0.888889
2,Twitter for iPhone,RT @JohnBoozman: Our servicemembers stand read...,02-21-2020 16:06:10,521.0,0.0,True,1.230886e+18,0.2,0.5
3,Twitter for iPhone,RT @RoyBlunt: ▶️ Unemployment is at a nearly 5...,02-21-2020 16:05:52,623.0,0.0,True,1.230886e+18,0.1,0.4
4,Twitter for iPhone,RT @JimInhofe: Happy 79th birthday to @USCGRes...,02-21-2020 16:05:25,533.0,0.0,True,1.230886e+18,0.75,0.75


In [6]:
#summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
tweets.describe()

Unnamed: 0,retweet_count,favorite_count,id_str,polarity,subjectivity
count,15383.0,15383.0,15383.0,15384.0,15384.0
mean,5482.033739,15590.625886,7.114691e+17,0.174146,0.417105
std,11110.767803,40191.784946,2.868582e+17,0.345875,0.308958
min,0.0,0.0,4629117000.0,-1.0,0.0
25%,27.0,24.0,5.431409e+17,0.0,0.066667
50%,802.0,177.0,6.396227e+17,0.087197,0.466667
75%,7208.0,8272.5,8.240796e+17,0.38017,0.644974
max,369530.0,814012.0,1.230887e+18,1.0,1.0


In [7]:
#column names, non-null counts and dtypes -- useful for spotting missing values and wrongly inferred types
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15384 entries, 0 to 15383
Data columns (total 9 columns):
source            15384 non-null object
text              15384 non-null object
created_at        15383 non-null object
retweet_count     15383 non-null float64
favorite_count    15383 non-null float64
is_retweet        15383 non-null object
id_str            15383 non-null float64
polarity          15384 non-null float64
subjectivity      15384 non-null float64
dtypes: float64(5), object(4)
memory usage: 1.1+ MB


In [8]:
#parse created_at into datetimes and derive a year column from it
##the year column is what we group tweets by when making our comparisons
##nullable Int64 keeps the year an integer even where created_at is missing
tweets = (
    tweets
    .assign(created_at=lambda df: pd.to_datetime(df['created_at']))
    .assign(year=lambda df: df['created_at'].dt.year.astype('Int64'))
)
tweets.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str,polarity,subjectivity,year
0,Twitter for iPhone,RT @SenBillCassidy: January #JobsReport:✅22500...,2020-02-21 16:06:40,896.0,0.0,True,1.230887e+18,0.068182,0.377273,2020
1,Twitter for iPhone,RT @SteveDaines: Obama sure didn’t build this ...,2020-02-21 16:06:20,2086.0,0.0,True,1.230886e+18,0.625,0.888889,2020
2,Twitter for iPhone,RT @JohnBoozman: Our servicemembers stand read...,2020-02-21 16:06:10,521.0,0.0,True,1.230886e+18,0.2,0.5,2020
3,Twitter for iPhone,RT @RoyBlunt: ▶️ Unemployment is at a nearly 5...,2020-02-21 16:05:52,623.0,0.0,True,1.230886e+18,0.1,0.4,2020
4,Twitter for iPhone,RT @JimInhofe: Happy 79th birthday to @USCGRes...,2020-02-21 16:05:25,533.0,0.0,True,1.230886e+18,0.75,0.75,2020


In [19]:
#build one boolean mask per topic: True where the tweet text matches that topic's pattern
##NOTE(review): the patterns are case-sensitive and unanchored, so an all-caps mention is not matched
##and a keyword also matches inside a longer word -- confirm this is the intended behaviour
russia, iran, nkorea = (
    tweets['text'].str.contains(pattern)
    for pattern in (
        'russia|Russia|moscow|Moscow|putin|Putin',
        'iran|Iran|tehran|Tehran|Nuclear Deal|Rouhani',
        'north korea|North Korea|DPRK|Pyongyang|pyongyang|Jong-Un|Jong-Il',
    )
)

In [18]:
#number of North Korea tweets per year, in chronological order
tweets.loc[nkorea, 'year'].value_counts().sort_index()

2013    10
2014     2
2017    34
2019     3
Name: year, dtype: int64