# Tweet data analysis 1

## import necessary libraries

In [1]:
import tweepy
import json
import pandas as pd
import numpy as np
import time
import altair as alt
from IPython.display import display

import twitter_credentials

## Use authentication credential to connect Twitter API

In [10]:
def twitter_api_authentication():
    """ authenticate with crediential and connect to Twitter API """
    
    auth = tweepy.OAuthHandler(twitter_credentials.CONSUMER_KEY, twitter_credentials.CONSUMER_SECRET)
    auth.set_access_token(twitter_credentials.ACCESS_TOKEN, twitter_credentials.ACCESS_TOKEN_SECRET)
    api = tweepy.API(auth)
    
    return api

In [11]:
api = twitter_api_authentication()

## load tweets data and creat dataframe

In [2]:
def create_tweet_df():

    # load tweets data
    path = 'oprah_retrieved_tweets_data.json'
    with open(path, 'r') as jf:
        tweets = json.load(jf)
    
    # create tweet dataframe with specified columns
    key_list =['created_at','full_text','retweet_count', 'favorite_count','lang']
    df = pd.DataFrame(tweets, columns= key_list)
    df['created_at'] = pd.to_datetime(df['created_at'])
    return df

In [3]:
tweets_df = create_tweet_df()
print(tweets_df.shape)
tweets_df.head()

(3000, 5)


Unnamed: 0,created_at,full_text,retweet_count,favorite_count,lang
0,2022-03-27 17:21:54+00:00,@LizToo2010 @nada0971 Yes Liz such a loss to a...,9,83,en
1,2022-03-27 17:12:56+00:00,"RT @franklinleonard: Whatever happens tonight,...",19,0,en
2,2022-03-27 17:01:53+00:00,Surreal! You know why?? Cause the “Good Lord W...,93,993,en
3,2022-03-25 23:34:45+00:00,RT @OprahDaily: .@Oprah is returning to #TheCo...,80,0,en
4,2022-03-19 01:05:49+00:00,"RT @Essence: Tamela Mann, Deon Cole and David ...",63,0,en


## heatmap of when tweets are sent

In [4]:
def create_weekday_hour_df(tweets_dataframe):
    """
    Create a dataframe that counts the number of tweets per weekday (Mon-Sun) per hour of the day (0-23)
    """
    day_hour = tweets_dataframe

    keys = [0,1,2,3,4,5,6]
    values = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
    date_dict = dict(zip(keys,values))
    
    day_hour = day_hour.loc[:,['created_at',]]
    day_hour['hour'] = day_hour['created_at'].dt.hour.astype('str')
    day_hour['weekday'] = day_hour['created_at'].dt.weekday.map(date_dict)
    
    day_hour = day_hour.groupby(['weekday','hour']).count().reset_index()
    day_hour.rename(columns={'created_at':'number of tweets'}, inplace = True)
    
    return day_hour


def plot_weekday_hour(weekday_hour_df):
    """
    A visualization of the number of tweets per weekday (Mon-Sun) and per hour of the day (0-23). 
    """
    chart = alt.Chart(pd.DataFrame([])).mark_rect()

    hour_order = [str(i) for i in range(0,24)]
    weekday_order = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
    
    chart = alt.Chart(weekday_hour_df).mark_rect().encode(
        x = alt.X('hour:O', sort= hour_order),
        y = alt.Y('weekday:O', sort = weekday_order),
        color = 'number of tweets:Q')
    
    display(chart)

In [5]:
# use large json_tweets data (loaded from oprah_retrieved_tweets_data) to do vis
weekday_hour_df = create_weekday_hour_df(tweets_df)
plot_weekday_hour(weekday_hour_df)

## bar chart of when tweets are created

In [6]:
def plot_tweet_creation_date_count(tweets_dataframe):

    # prepare data
    keys = [0,1,2,3,4,5,6]
    values = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
    date_dict = dict(zip(keys,values))

    df = tweets_dataframe.loc[:,['created_at',]]
    df['weekday'] = df['created_at'].dt.weekday.map(date_dict)
    df['date'] = df['created_at'].dt.date
    df['date'] = pd.to_datetime(df['date'])
    df['date'] = df['date'].dt.strftime('%m/%d/%Y')

    # make left chart
    left = alt.Chart(df).mark_bar().encode(
        x = alt.X('weekday:O', sort = values),
        y = alt.Y('count():Q', title = 'number of tweets'))
    
    # make right chart
    # sort string date first by year, then month and day
    date = df.date.to_list()
    date_order = sorted(date, key = lambda x: (x.split('/')[-1], x.split('/')[0], x.split('/')[1]))

    right = alt.Chart(df).mark_bar().encode(
            x = alt.X('date:O', 
                      sort = date_order
                     ),
            y = alt.Y('count(date):Q', title = 'number of tweets')
        ).properties(width = 700)
    
    # final chart
    chart = (left | right)
    display(chart)

In [7]:
# to see the date vis of Oprah's latest 100 tweets 
plot_tweet_creation_date_count(tweets_df[:100])

## Bar chart of where followers are

In [8]:
def get_followers(user_handle, num_tweets):
    
    def limit_handled(cursor):
        """
        Handle twitter rate limits.
        If the rate limit is reached, print the error message and exit the procedure.
        Print a short summary of the total number of tweets retrieved and time spent.
        """
        n=0
        while True:
            print(".", end="")
            try:
                yield cursor.next()
                n += 1
            except Exception as e:
                if tweepy.TooManyRequests:  #if TRUE this means we hit a rate limit error
                    print(f"Reached rate limits after {n} iterations.")   
                    print(f'Error message: {e}')
                break
        return None
            
    followers = []
    for follower in limit_handled(tweepy.Cursor(api.get_followers, screen_name = user_handle).items(num_tweets)):
        followers.append(follower)
    
    return followers
            
            
def plot_followers_locations(followers):
    """
    A visualization for the locations of followers of the user handle.
    Note: Filtered out followers who haven't specified a location.
    """
    chart = alt.Chart(pd.DataFrame([])).mark_rect()

    # get all locations from follower user object
    locations = []
    for follower in followers:
        location = follower._json['location']
        locations.append(location)
    
    # make a dataframe
    followers_df = pd.DataFrame(locations, columns = ['location'])
    followers_df['location'] = followers_df.location.replace('', np.nan)
    followers_df.dropna(inplace = True)
    
    # make chart
    chart = alt.Chart(followers_df).mark_bar().encode(
            x = 'location:O',
            y = 'count():Q'
        ).properties(width = 700, height = 200)

    display(chart)
    return followers_df


In [12]:
followers = get_followers('Oprah', 300)
df = plot_followers_locations(followers)

.............................................................................................................................................................................................................................................................................................................Reached rate limits after 300 iterations.
Error message: 
