## Imports

In [221]:
#Standard library
import json
import os

#Library used to access and use twitter API
import tweepy

#Libraries used to process data
import jsonlines
import pandas as pd

## Twitter API info

In [216]:
#Twitter developer-account credentials used by fetch_tweets.
#Each value is taken from an environment variable when that variable is set, so
#real keys never need to be saved inside the notebook. The masked literal is only
#a fallback placeholder: export the variables (or replace the placeholders with
#your own details) before fetching tweets.
consumerKey = os.environ.get('TWITTER_CONSUMER_KEY', '*************************')
consumerSecret = os.environ.get('TWITTER_CONSUMER_SECRET', '**************************************************')
accessKey = os.environ.get('TWITTER_ACCESS_KEY', '**************************************************')
accessSecret = os.environ.get('TWITTER_ACCESS_SECRET', '*********************************************')

## Code to fetch tweets

In [224]:
def fetch_tweets(name, batch_size=50):
    '''
    Fetches the tweets from the given handle, one page at a time, until the
    timeline returns no more tweets

    Inputs -
    1) name (string) : The name of the twitter handle you want to fetch tweets from
    2) batch_size (int) : How many tweets to request per API call (default 50)
    Output
    1) tweet_data (list) : The list of tweepy.models.Status objects corresponding to the tweets from the handle,
                           in the order the API returned them. Empty if the handle has no tweets.

    Uses the module-level consumerKey, consumerSecret, accessKey and accessSecret for authorisation.
    '''

    #Code required to authorise twitter API access
    authorise = tweepy.OAuthHandler(consumer_key=consumerKey, consumer_secret=consumerSecret)
    authorise.set_access_token(accessKey, accessSecret)
    api = tweepy.API(authorise)

    #Initialise list to store raw tweets fetched
    tweet_data = []

    #Upper bound on the tweet id for the next page; None means "start from the newest tweet"
    max_id = None

    while True:
        if max_id is None:
            batch = api.user_timeline(name, count=batch_size)
        else:
            #max_id is inclusive, so it is set to (last id seen - 1) to avoid fetching duplicates
            batch = api.user_timeline(name, count=batch_size, max_id=max_id)

        #An empty page means every tweet has been seen. This has to be checked
        #BEFORE indexing batch[-1], otherwise an empty page raises IndexError
        if not batch:
            break

        tweet_data.extend(batch)

        #Update for next batch
        max_id = batch[-1].id - 1

    return tweet_data

In [None]:
#Download every tweet the timeline endpoint returns for the handle (needs valid API credentials and network access)
data = fetch_tweets("midasIIITD")

## Analysing fetched data (Not part of code, just part of how I went through the process)

In [222]:
#Check what kind of object each fetched tweet is
type(data[0])

tweepy.models.Status

In [223]:
#Number of tweets fetched
len(data)

334

In [103]:
#Look at the repr of the first fetched Status object
data[0]

Status(_api=<tweepy.api.API object at 0x7f15976f8a58>, _json={'created_at': 'Sun Apr 07 11:43:24 +0000 2019', 'id': 1114856195335106560, 'id_str': '1114856195335106560', 'text': 'We request all students whose interview are scheduled today to join at the time given to them and not before or aft… https://t.co/7cnrlj1b9Q', 'truncated': True, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/7cnrlj1b9Q', 'expanded_url': 'https://twitter.com/i/web/status/1114856195335106560', 'display_url': 'twitter.com/i/web/status/1…', 'indices': [117, 140]}]}, 'source': '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 1021355762575073281, 'id_str': '1021355762575073281', 'name': 'MIDAS IIITD', 'screen_name': 'midasIIITD', 'location': 'New Delhi, India', 'description

In [24]:
#The raw payload of a tweet is available as a dict on the Status object's _json attribute
data[9]._json

{'created_at': 'Wed Apr 03 18:31:53 +0000 2019',
 'id': 1113509442849525760,
 'id_str': '1113509442849525760',
 'text': 'RT @stanfordnlp: What’s new in @Stanford CS224N Natural Language Processing with Deep Learning for 2019? Question answering—1D CNNs—subword…',
 'truncated': False,
 'entities': {'hashtags': [],
  'symbols': [],
  'user_mentions': [{'screen_name': 'stanfordnlp',
    'name': 'Stanford NLP Group',
    'id': 118263124,
    'id_str': '118263124',
    'indices': [3, 15]},
   {'screen_name': 'Stanford',
    'name': 'Stanford University',
    'id': 18036441,
    'id_str': '18036441',
    'indices': [31, 40]}],
  'urls': []},
 'source': '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>',
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'in_reply_to_screen_name': None,
 'user': {'id': 1021355762575073281,
  'id_str': '1021355762575073281',
  'name': 'MIDAS IIITD',
  'screen_na

In [161]:
#List the top-level fields available in a tweet's JSON
data[9]._json.keys()

dict_keys(['created_at', 'id', 'id_str', 'text', 'truncated', 'entities', 'extended_entities', 'source', 'in_reply_to_status_id', 'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place', 'contributors', 'retweeted_status', 'is_quote_status', 'retweet_count', 'favorite_count', 'favorited', 'retweeted', 'possibly_sensitive', 'lang'])

In [169]:
#Media attachments are listed under entities -> media; each entry carries a "type"
data[9]._json["entities"]["media"][0]["type"]

'photo'

In [189]:
#created_at is a space-separated string; the piece at index 5 is the year
data[9]._json['created_at'].split(" ")[5]

'2019'

In [28]:
#Serialise one tweet's JSON dict to a string, the same call store_tweet_data applies to every tweet
json_str = json.dumps(data[9]._json)

In [29]:
#Inspect the serialised string
json_str

'{"created_at": "Wed Apr 03 18:31:53 +0000 2019", "id": 1113509442849525760, "id_str": "1113509442849525760", "text": "RT @stanfordnlp: What\\u2019s new in @Stanford CS224N Natural Language Processing with Deep Learning for 2019? Question answering\\u20141D CNNs\\u2014subword\\u2026", "truncated": false, "entities": {"hashtags": [], "symbols": [], "user_mentions": [{"screen_name": "stanfordnlp", "name": "Stanford NLP Group", "id": 118263124, "id_str": "118263124", "indices": [3, 15]}, {"screen_name": "Stanford", "name": "Stanford University", "id": 18036441, "id_str": "18036441", "indices": [31, 40]}], "urls": []}, "source": "<a href=\\"http://twitter.com\\" rel=\\"nofollow\\">Twitter Web Client</a>", "in_reply_to_status_id": null, "in_reply_to_status_id_str": null, "in_reply_to_user_id": null, "in_reply_to_user_id_str": null, "in_reply_to_screen_name": null, "user": {"id": 1021355762575073281, "id_str": "1021355762575073281", "name": "MIDAS IIITD", "screen_name": "midasIIITD", "locati

## Code to store data in .jsonl format

In [225]:
def store_tweet_data(tweet_data, filename='data.jsonl'):
    '''
    Appends the tweets to a file in the .jsonl or jsonlines format

    Inputs
    1) tweet_data (list) : The list of tweepy.models.Status objects corresponding to the tweets from the handle
    2) filename (str) : Path of the file to write to (default: data.jsonl)

    Notes
    - The whole list is written as ONE line: a JSON array whose items are the
      JSON-encoded strings of the individual tweets. extract_tweet_info decodes
      exactly this layout.
    - The file is opened in append mode, so every call adds another line;
      calling this twice with the same tweets stores them twice.
    '''

    #Convert the Status objects data to strings that are in json form
    json_data = [json.dumps(status._json) for status in tweet_data]

    #Append the batch to the file as a single jsonlines record
    with open(filename, 'ab') as f, jsonlines.Writer(f) as writer:
        writer.write(json_data)

In [105]:
#Append the fetched tweets to data.jsonl
store_tweet_data(data)

In [200]:
def extract_tweet_info(file):
    '''
    Extracts the required info from the given .jsonl file

    Inputs
    1) file (str) : The filename of the .jsonl file
    Outputs
    1) data_df (pandas.Dataframe): One row per tweet with the columns
       Text, Time, Date, Month, Year, No. Favorites, No. Retweets, No. Images.
       Time/Date/Month/Year are the string pieces of the tweet's "created_at" field.
       No. Images is the number of attached photos, or missing when the tweet has none.
    '''

    columns = ["Text", "Time", "Date", "Month", "Year",
               "No. Favorites", "No. Retweets", "No. Images"]
    records = []

    #Open file and run through the tweets' json
    with jsonlines.open(file) as reader:
        for line in reader:
            for tweet in _iter_tweet_dicts(line):
                #Example of a "created_at" value : 'Wed Apr 03 18:31:53 +0000 2019'
                #-> [weekday, month, day, time, utc offset, year]
                created_parts = tweet["created_at"].split(" ")

                records.append({
                    "Text": tweet["text"],
                    "Time": created_parts[3],
                    "Date": created_parts[2],
                    "Month": created_parts[1],
                    "Year": created_parts[5],
                    "No. Favorites": tweet["favorite_count"],
                    "No. Retweets": tweet["retweet_count"],
                    "No. Images": _count_photos(tweet),
                })

    #Passing columns keeps the column order (and the columns themselves) even when the file is empty
    return pd.DataFrame(records, columns=columns)


def _iter_tweet_dicts(line):
    '''
    Yields one tweet dict for every tweet held in a decoded .jsonl line
    '''
    #store_tweet_data writes a whole batch as one line holding a list of JSON
    #strings; a plain .jsonl file has one tweet object per line. Accept both.
    items = line if isinstance(line, list) else [line]
    for item in items:
        yield json.loads(item) if isinstance(item, str) else item


def _count_photos(tweet):
    '''
    Returns the number of photos attached to the tweet dict, or None if there are none
    '''
    #Per the Twitter API docs, "entities" -> "media" lists only a single item even
    #when several photos are attached; the complete list is in "extended_entities".
    #Prefer that when present and fall back to "entities" otherwise.
    #Not all tweets have a "media" attribute, some tweets have no media at all.
    for section in ("extended_entities", "entities"):
        media_entities = (tweet.get(section) or {}).get("media")
        if media_entities:
            photo_count = sum(1 for media in media_entities if media["type"] == 'photo')
            return photo_count or None
    return None

                
    

In [201]:
#Read data.jsonl back and build the per-tweet summary DataFrame
data_extracted = extract_tweet_info("data.jsonl")

In [203]:
#Preview the first rows of the extracted table
data_extracted.head()

Unnamed: 0,Text,Time,Date,Month,Year,No. Favorites,No. Retweets,No. Images
0,We request all students whose interview are sc...,11:43:24,7,Apr,2019,0,1,
1,"Other queries: ""none of the Tweeter Apis give ...",06:55:19,7,Apr,2019,3,2,
2,"Other queries: ""do we have to make two differe...",06:53:38,7,Apr,2019,4,1,
3,"Other queries: ""If using Twitter api, it does ...",05:32:27,7,Apr,2019,4,1,
4,Response to some queries asked by students on ...,05:29:40,7,Apr,2019,6,1,


# References 

Tweepy docs : https://buildmedia.readthedocs.org/media/pdf/tweepy/v3.2.0/tweepy.pdf

Status object structure : https://gist.github.com/dev-techmoe/ef676cdd03ac47ac503e856282077bf2

StackOverflow -
Convert Tweepy Status object into JSON : https://stackoverflow.com/questions/27900451/convert-tweepy-status-object-into-json
