In [81]:
import csv
import json
import os
import re #regular expression
import string

import pandas as pd
import preprocessor as p
import tweepy
from textblob import TextBlob
from tweepy.streaming import StreamListener

In [64]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [3]:
# Twitter credentials for the app.
# Read from environment variables so real secrets are never saved inside the notebook.
# Each value falls back to '' (the previous hard-coded placeholder) when the variable is unset.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', '')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', '')
access_key = os.environ.get('TWITTER_ACCESS_KEY', '')
access_secret = os.environ.get('TWITTER_ACCESS_SECRET', '')

In [6]:
# Pass the Twitter credentials to tweepy: build the OAuth handler from the consumer
# key/secret, attach the access token pair, and create the API client that the
# search helpers below use through the module-level name `api`.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)

In [41]:
# Relative path of a CSV file for the COVID-19 tweets.
# NOTE(review): no visible cell reads or writes this path — confirm it is still needed.
covid19_tweets = "../Data/covid19/covid19_data.csv"

In [39]:
def _search_and_dump(keyword, lang, count, output_path, **search_filters):
    """Run one paginated search, write the raw tweet JSON to ``output_path`` and return it."""
    # The search `q` parameter is a single query string. OR-join a list/tuple of
    # terms so a collection of hashtags matches tweets containing any of them.
    if isinstance(keyword, (list, tuple)):
        query = " OR ".join(keyword)
    else:
        query = keyword

    cursor = tweepy.Cursor(api.search, q=query, lang=lang, **search_filters)
    # Keep the raw JSON payload of every status so it can be serialised as-is.
    tweets_arr = [status._json for status in cursor.items(count)]

    with open(output_path, "w") as output:
        json.dump(tweets_arr, output)
    return tweets_arr


def searchAndWrite_tweets(keyword, lang, count, output_path="../Data/covid19/data.json"):
    """Search tweets matching ``keyword`` and save them as JSON.

    Uses the module-level ``api`` client.

    Args:
        keyword: Query string, or a list/tuple of terms that are OR-joined.
        lang: Language code forwarded to the search call.
        count: Maximum number of tweets to pull from the cursor.
        output_path: File that receives the JSON array (overwritten on each call).

    Returns:
        List of raw tweet dicts (each status' ``_json`` payload).
    """
    return _search_and_dump(keyword, lang, count, output_path)


def searchAndWrite_date_tweets(keyword, date_since, lang, count,
                               output_path="../Data/covid19/data.json"):
    """Search tweets matching ``keyword`` with a ``since`` date filter and save them as JSON.

    Uses the module-level ``api`` client.

    Args:
        keyword: Query string, or a list/tuple of terms that are OR-joined.
        date_since: Value forwarded to the search call as ``since``.
        lang: Language code forwarded to the search call.
        count: Maximum number of tweets to pull from the cursor.
        output_path: File that receives the JSON array (overwritten on each call).

    Returns:
        List of raw tweet dicts (each status' ``_json`` payload).
    """
    return _search_and_dump(keyword, lang, count, output_path, since=date_since)

In [29]:
# Hashtags to search for, and the date string passed to the search as `date_since`.
search_words = ["#coronavirus", "#COVID19", "#CoronavirusOutbreak"]
date_since="2020-05-01"

In [40]:
# Only searchAndWrite_date_tweets accepts `date_since`; passing it to
# searchAndWrite_tweets raised TypeError. The hashtags are OR-joined because the
# search query is a single string, not a list.
tweets_arr = searchAndWrite_date_tweets(
    keyword=" OR ".join(search_words),
    date_since=date_since,
    lang="en",
    count=200,
)

In [53]:
# [author screen name, creation timestamp] for every fetched tweet.
a = [
    [status['user']['screen_name'], status['created_at']]
    for status in tweets_arr
]

In [1]:
# One row per tweet; field order matches the column list used to build tweets_df.
# NOTE(review): clean_tweets is not defined in any visible cell — confirm where it comes from.
tweet_details = [
    [
        status['id'],
        status['created_at'],
        status['truncated'],
        status['text'],
        clean_tweets(p.api.clean(status['text'])),
        status['user']['screen_name'],
        status['source'],
        status['user']['followers_count'],
        status['user']['friends_count'],
        status['retweeted'],
        status['retweet_count'],
        status['user']['location'],
    ]
    for status in tweets_arr
]

NameError: name 'tweets_arr' is not defined

In [131]:
# Tabulate the per-tweet rows; the column order mirrors the field order in tweet_details.
tweets_df = pd.DataFrame(
    tweet_details,
    columns=[
        'id',
        'created_at',
        'truncated',
        'text',
        'text_clean',
        'screen_name',
        'source',
        'user_followers_count',
        'user_friends_count',
        'isRetweeted',
        'retweet_count',
        'user_location',
    ],
)

In [132]:
def getSubjectivity(text):
    """Return the subjectivity component of TextBlob's sentiment for ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity

def getPolarity(text):
    """Return the polarity component of TextBlob's sentiment for ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

def getSentiment(text):
    """Label ``text`` 'Positive', 'Negative' or 'Neutre' by the sign of its sentiment score.

    The score is the first element of TextBlob's sentiment result; a score of
    exactly zero yields 'Neutre'.
    """
    score = TextBlob(text).sentiment[0]
    if score > 0:
        return 'Positive'
    if score < 0:
        return 'Negative'
    return 'Neutre'

In [133]:
# Derive 'Subjectivity', 'Polarity' and 'Sentiment' from the cleaned text, one value per tweet.
cleaned_text = tweets_df['text_clean']
tweets_df['Subjectivity'] = cleaned_text.map(getSubjectivity)
tweets_df['Polarity'] = cleaned_text.map(getPolarity)
tweets_df['Sentiment'] = cleaned_text.map(getSentiment)

In [None]:
# Preview the first rows (head() defaults to five).
tweets_df.head()

In [None]:
# Total tweets
print('Total tweets this period:', len(tweets_df.index), '\n')

# Retweets
# Rank and average on the numeric 'retweet_count'. 'isRetweeted' holds the tweet's
# `retweeted` flag, so sorting or averaging it does not measure how often a tweet
# was retweeted.
tweet_df = tweets_df.sort_values(by='retweet_count', ascending=False)
tweet_df = tweet_df.reset_index(drop=True)
print('Mean retweets:', round(tweet_df['retweet_count'].mean(), 2), '\n')
print('Top 5 RTed tweets:')
print('------------------')
# Cap at the number of rows so fewer than five tweets does not raise IndexError.
for i in range(min(5, len(tweet_df))):
    print(tweet_df['text_clean'].iloc[i], '-', tweet_df['retweet_count'].iloc[i])
print('\n')

In [None]:
# Hashtags & mentions: count how often each hashtag / mention occurs in the raw tweet text.
tag_dict = {}
mention_dict = {}

# Build the punctuation-stripping table once instead of once per word.
strip_punctuation = str.maketrans("", "", string.punctuation)

# Read the text straight from tweets_df. The previous loop took positions from the
# separately sorted tweet_df and used them to index tweets_df, coupling this cell
# to another frame for no benefit (the counts do not depend on row order).
for tweet_text in tweets_df['text']:
    for word in tweet_text.lower().split():
        # A lone '#' or '@' carries no tag/mention name.
        if len(word) < 2:
            continue
        if word[0] == '#':
            counts = tag_dict
        elif word[0] == '@':
            counts = mention_dict
        else:
            continue

        # Stripping punctuation also removes the leading '#' / '@' from the key.
        key = word.translate(strip_punctuation)
        if key:  # skip tokens that were nothing but punctuation, e.g. "#!!"
            counts[key] = counts.get(key, 0) + 1

In [None]:
# The 10 most popular tags and counts
# Sort on the count with a lambda key: `operator` is not imported in the visible
# cells, so operator.itemgetter raised NameError here.
top_tags = dict(sorted(tag_dict.items(), key=lambda item: item[1], reverse=True)[:10])
top_tags_sorted = sorted(top_tags.items(), key=lambda x: x[1])[::-1]
print('Top 10 hashtags:')
print('----------------')
for tag in top_tags_sorted:
    print(tag[0], '-', str(tag[1]))

# The 10 most popular mentions and counts
top_mentions = dict(sorted(mention_dict.items(), key=lambda item: item[1], reverse=True)[:10])
top_mentions_sorted = sorted(top_mentions.items(), key=lambda x: x[1])[::-1]
print('\nTop 10 mentions:')
print('----------------')
for mention in top_mentions_sorted:
    print(mention[0], '-', str(mention[1]))

In [None]:
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

# Word cloud of all cleaned tweet text, joined into one space-separated string.
# NOTE(review): WordCloud is not imported in any visible cell — confirm the import exists.
allWords = ' '.join(tweets_df['text_clean'])
cloud_builder = WordCloud(width=500, height=300, random_state=21, max_font_size=110)
wordCloud = cloud_builder.generate(allWords)

plt.imshow(wordCloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
# Plotting Sentiment analysis: polarity (x) against subjectivity (y), one point per tweet.
plt.figure(figsize=(8,6))
# A single vectorised scatter call replaces one plt.scatter call per row. It also
# removes the label-based tweets_df[col][i] lookups, which only work while the
# frame keeps a default 0..n-1 index.
plt.scatter(tweets_df["Polarity"], tweets_df["Subjectivity"], color='Blue')
plt.title('Sentiment Analysis')
plt.xlabel('Polarity')
plt.ylabel('Subjectivity')
plt.show()

In [None]:
# Cleaned text of the tweets labelled 'Positive'
ptweets = tweets_df.loc[tweets_df['Sentiment'] == 'Positive', 'text_clean']

# Share of positive tweets as a percentage, rounded to one decimal (displayed as the cell output)
round((len(ptweets) / len(tweets_df)) * 100, 1)

In [None]:
# Cleaned text of the tweets labelled 'Negative'
ntweets = tweets_df.loc[tweets_df['Sentiment'] == 'Negative', 'text_clean']

# Share of negative tweets as a percentage, rounded to one decimal (displayed as the cell output)
round((len(ntweets) / len(tweets_df)) * 100, 1)