In [None]:
!pip install tweet-preprocessor
!pip install textblob
!pip install tweet-preprocessor

Collecting tweet-preprocessor
  Downloading https://files.pythonhosted.org/packages/17/9d/71bd016a9edcef8860c607e531f30bd09b13103c7951ae73dd2bf174163c/tweet_preprocessor-0.6.0-py3-none-any.whl
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.6.0


In [None]:
import sys
import os
import json
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
#Import the necessary methods from tweepy library  

#install tweepy if you don't have it
#!pip install tweepy
import tweepy
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream

#sentiment analysis package
#!pip install textblob
from textblob import TextBlob

#general text pre-processor
#!pip install nltk
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize

#tweet pre-processor 
#!pip install tweet-preprocessor
import preprocessor as ppr

import twitter_credentials
import re
import string

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
class tweetsearch():
    '''
    Search Twitter for a keyword and collect the matching English tweets.

    Every tweet is stored as one row holding the raw text, a cleaned text,
    the TextBlob sentiment of the cleaned text and some tweet metadata.
    The rows are returned as a pandas DataFrame and can be kept in a CSV
    file so that repeated runs accumulate tweets.

    You can build on it to extend the functionality for more
    sophisticated analysis.
    '''

    #default column layout; get_tweets fills the columns in exactly this order
    DEFAULT_COLS = ['id', 'created_at', 'source', 'original_text', 'clean_text',
                    'sentiment', 'polarity', 'subjectivity', 'lang',
                    'favorite_count', 'retweet_count', 'original_author',
                    'possibly_sensitive', 'hashtags',
                    'user_mentions', 'place', 'place_coord_boundaries']

    #happy emoticons
    _EMOTICONS_HAPPY = frozenset([
        ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
        ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
        '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
        'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
        '<3'
        ])

    #sad emoticons
    _EMOTICONS_SAD = frozenset([
        ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
        ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
        ':c', ':{', '>:\\', ';('
        ])

    #combined sad and happy emoticons
    _EMOTICONS = _EMOTICONS_HAPPY | _EMOTICONS_SAD

    #emoji patterns, compiled once for all calls
    _EMOJI_PATTERN = re.compile("["
                 u"\U0001F600-\U0001F64F"  # emoticons
                 u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                 u"\U0001F680-\U0001F6FF"  # transport & map symbols
                 u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                 u"\U00002702-\U000027B0"
                 u"\U000024C2-\U0001F251"
                 "]+", flags=re.UNICODE)

    #topic words that are dropped in addition to the NLTK English stop words
    _EXTRA_STOP_WORDS = ['#coronavirus', '#coronavirusoutbreak', '#coronavirusPandemic',
                         '#covid19', '#covid_19', '#epitwitter', '#ihavecorona', 'amp',
                         'coronavirus', 'covid19', 'clean_text']

    #single punctuation characters, used to recognise punctuation-only tokens
    _PUNCTUATION = frozenset(string.punctuation)

    #lower-cased stop-word set, built on first use by _get_stop_words
    _stop_words_cache = None

    def __init__(self,cols=None,auth=None):
        '''
        cols : list of column names for the result table; must name as many
               columns as DEFAULT_COLS. Defaults to DEFAULT_COLS.
        auth : tweepy authentication handler. When omitted, one is built from
               the keys in the twitter_credentials module.
        '''
        if cols is not None:
            self.cols = cols
        else:
            self.cols = list(self.DEFAULT_COLS)

        if auth is None:
            #This handles Twitter authentication and the connection to the Twitter API
            #NOTE(review): the attribute names (including the 'ACCES' spelling) are kept
            #exactly as this notebook references them; presumably they match
            #twitter_credentials.py - verify
            auth=OAuthHandler(twitter_credentials.CONSUMER_KEY,twitter_credentials.CONSUMER_SECRET)
            auth.set_access_token(twitter_credentials.ACCES_TOKEN,twitter_credentials.ACCES_TOKEN_SECRET)

        self.auth = auth
        self.api= tweepy.API(self.auth)

    def _get_stop_words(self):
        '''Return the lower-cased stop-word set, building it on first use.'''
        if self._stop_words_cache is None:
            words = set(stopwords.words('english'))
            words.update(self._EXTRA_STOP_WORDS)
            self._stop_words_cache = frozenset(word.lower() for word in words)
        return self._stop_words_cache

    def clean_tweets(self,twitter_text):
        '''
        Return twitter_text reduced to its content words, joined by single spaces.

        The text is first passed through tweet-preprocessor, then colons,
        non-ASCII characters and emojis are stripped. Only after that is the
        text tokenised, so the clean-up actually reaches the result. Tokens
        that are stop words (compared case-insensitively), emoticons or made
        up of punctuation only are dropped.
        '''
        #use pre processor
        tweet = ppr.clean(twitter_text)

        #after preprocessing the colon symbol is left behind where a mention was removed
        tweet = re.sub(r':', '', tweet)
        #NOTE(review): this pattern looks like a mis-encoded ellipsis; a real ellipsis
        #character is non-ASCII and is therefore handled by the next substitution
        tweet = re.sub(r'‚Ä¶', '', tweet)

        #replace consecutive non-ASCII characters with a space
        tweet = re.sub(r'[^\x00-\x7F]+',' ', tweet)

        #remove emojis from tweet
        tweet = self._EMOJI_PATTERN.sub(r'', tweet)

        stop_words = self._get_stop_words()

        #tokenise the cleaned text and check tokens against stop words, emoticons and punctuation
        filtered_tweet = []
        for w in word_tokenize(tweet):
            if w.lower() in stop_words or w in self._EMOTICONS:
                continue
            #also true for the empty token and for runs such as '...'
            if all(ch in self._PUNCTUATION for ch in w):
                continue
            filtered_tweet.append(w)

        return ' '.join(filtered_tweet)

    def _load_rows(self, csvfile):
        '''Return the rows already stored in csvfile as a list of {column: value} dictionaries.'''
        if csvfile is None or not os.path.exists(csvfile) or os.path.getsize(csvfile) == 0:
            return []
        stored = pd.read_csv(csvfile)
        #keep exactly the known columns; this also discards the unnamed index
        #column of files that were written together with their index
        return stored.reindex(columns=self.cols).to_dict('records')

    def _search(self, keyword):
        '''Yield the raw JSON dictionary of every status found for keyword, page by page.'''
        #NOTE(review): count and include_rts are passed on unchanged - confirm that the
        #search endpoint of the installed tweepy version honours them
        cursor = tweepy.Cursor(self.api.search, q=keyword,count=200, include_rts=False)
        for page in cursor.pages():
            for status in page:
                yield status._json

    def _row_from_status(self, status):
        '''Turn one raw status dictionary into a {column: value} row.'''
        #calculate sentiment on the cleaned text
        filtered_tweet = self.clean_tweets(status['text'])
        sentiment = TextBlob(filtered_tweet).sentiment

        hashtags = ", ".join([hashtag_item['text'] for hashtag_item in status['entities']['hashtags']])
        mentions = ", ".join([mention['screen_name'] for mention in status['entities']['user_mentions']])

        #a status without a place has no bounding box
        try:
            xyz = status['place']['bounding_box']['coordinates']
            coordinates = [coord for loc in xyz for coord in loc]
        except (KeyError, TypeError):
            coordinates = None

        try:
            location = status['user']['location']
        except (KeyError, TypeError):
            location = ''

        #values in column order: 'place' receives the user location and
        #'place_coord_boundaries' the bounding box coordinates
        values = [status['id'], status['created_at'],
                  status['source'], status['text'], filtered_tweet,
                  sentiment, sentiment.polarity, sentiment.subjectivity, status['lang'],
                  status['favorite_count'], status['retweet_count'],
                  status['user']['screen_name'],
                  status.get('possibly_sensitive'),
                  hashtags, mentions,
                  location, coordinates]

        if len(values) != len(self.cols):
            raise ValueError('%d columns passed, passed data had %d columns'
                             % (len(self.cols), len(values)))
        return dict(zip(self.cols, values))

    def get_tweets(self,keyword, csvfile=None):
        '''
        Search for keyword and return all collected tweets as a DataFrame.

        keyword : search query handed to the Twitter search.
        csvfile : optional path of a CSV file. If the file exists its rows are
                  loaded first; at the end the complete table is written back
                  (header once, no index column). With None nothing is read
                  or written.

        Only tweets whose language is 'en' are kept. A tweet whose id is
        already in the table is not added again; only its favorite and
        retweet counts are refreshed.

        Returns a DataFrame with the columns self.cols holding the stored
        rows followed by the newly found tweets.
        '''
        rows = self._load_rows(csvfile)
        #tweet id -> position in rows, to recognise tweets that are already known
        position = {row.get('id'): i for i, row in enumerate(rows)}

        try:
            for status in self._search(keyword):
                #filter by language
                if status['lang'] != 'en':
                    continue

                #known tweet: update the counters only
                seen_at = position.get(status['id'])
                if seen_at is not None:
                    rows[seen_at]['favorite_count'] = status['favorite_count']
                    rows[seen_at]['retweet_count'] = status['retweet_count']
                    continue

                position[status['id']] = len(rows)
                rows.append(self._row_from_status(status))
        finally:
            #build and save the table even when the search stops early with an
            #error, so that the tweets collected so far are not lost
            df_final = pd.DataFrame(rows, columns=self.cols)
            if csvfile is not None:
                df_final.to_csv(csvfile, columns=self.cols, index=False, encoding='utf-8')

        return df_final
      
   


In [None]:
#search query that is handed to get_tweets as the keyword
covid_keywords = '#COVID19Ethiopia OR #COVID19Africa'  #hashtag based search
#CSV file passed to get_tweets, which reads existing rows from it and writes the results to it
tweets_file='ethiopia_covid19_23june2020.csv'



#get data on keywords

ts = tweetsearch()
df = ts.get_tweets(covid_keywords, csvfile=tweets_file) 
#get_tweets also saves the collected tweets to tweets_file

In [None]:
#preview the first five rows of the table returned by get_tweets
df.head(5)

Unnamed: 0,id,created_at,source,original_text,clean_text,sentiment,polarity,subjectivity,lang,favorite_count,retweet_count,original_author,possibly_sensitive,hashtags,user_mentions,place,place_coord_boundaries
0,1275067907710443524,Mon Jun 22 14:07:33 +0000 2020,"<a href=""http://twitter.com/download/android"" ...",RT @Amref_Worldwide: Why wear a mask? Is it ne...,Why wear mask Is necessary Is required everyon...,"(0.13333333333333333, 0.5666666666666667)",0.133333,0.566667,en,0,484,LGwara,,,"Amref_Worldwide, daktari1",,Nairobi
