## Top User Tweets Parser

*Prepared by:*  
**Jude Michael Teves**  
Data Scientist / Machine Learning Engineer, Asian Development Bank

## Preliminaries

### Import Packages

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
sns.set_style("darkgrid")

import glob, re, json

from itertools import chain
from datetime import datetime

import warnings
warnings.filterwarnings("ignore")

### Understanding Data Structure

Below is the typical structure for a tweet. Note that there are cases wherein some attributes will be present/missing in some tweets.

In [11]:
date = '2019_03'
files = glob.glob(f'./top users/{date}_*')
filepath = files[0]

with open(filepath) as file:
    s = file.read().splitlines()
    data = json.loads(s[0])
    
# data['id'], data['user']['id']
data['user']

{'id': 1020615708894363648,
 'id_str': '1020615708894363648',
 'name': 'Dr.Sunil Kumar Meena',
 'screen_name': 'DrSunilKumar_',
 'location': 'Manila City, National Capital ',
 'url': 'https://linktr.ee/Drsunilkumar_',
 'description': 'F.V. President of Federation Medical Student Association Philippines @IFMSA |Doctor of Medicine| Founder @IIMACouncil | Politically Centrist | Athiest!',
 'translator_type': 'none',
 'derived': {'locations': [{'country': 'Philippines',
    'country_code': 'PH',
    'locality': 'Manila',
    'region': 'National Capital Region',
    'full_name': 'Manila, National Capital Region, Philippines',
    'geo': {'coordinates': [120.9822, 14.6042], 'type': 'point'}}]},
 'protected': False,
 'verified': False,
 'followers_count': 19536,
 'friends_count': 14810,
 'listed_count': 4,
 'favourites_count': 15245,
 'statuses_count': 5327,
 'created_at': 'Sat Jul 21 10:25:21 +0000 2018',
 'utc_offset': None,
 'time_zone': None,
 'geo_enabled': True,
 'lang': None,
 'contrib

### Defining important functions

In [12]:
def extract_tweets_from_user_file(filepath:str) -> list:
    '''
    Parameters
    ---
    filepath : str
        path for the json file containing a specific user's tweets
    
    Returns
    ---
    tweets : list
        list of tweets
    '''
    tweets = []
    with open(filepath) as file:
        s = file.read().splitlines()
        for line in s:
            data = json.loads(line) 
            tweets.append([data['created_at'], 
                           data['user']['id_str'], 
                           data['user']['name'], 
                           data['user']['screen_name'], 
                           data['text'], 
                           data['retweeted']]) 
            
    return tweets

def user_tweets_to_df(tweets:list) -> pd.DataFrame:
    '''
    Parameters
    ---
    tweets : list
        list of tweets
    
    Returns
    ---
    df : pd.DataFrame
        DataFrame representation of tweets
    '''
    df = pd.DataFrame(tweets, columns=['time','userid','name','screen_name','text','retweet'])
    df.time = pd.to_datetime(df.time)
    # df = df[~df.retweet] # does not matter coz we don't get retweets here.
    df.reset_index(drop=True, inplace=True)
    return df

def process_users_tweet_folder_to_dataframe(folder_path:str, apply_threshold=False, threshold:int=50) -> pd.DataFrame:
    '''
    Parameters
    ---
    folder_path : str
        folder path of JSON files of different users' tweets
    apply_threshold : bool
        set a limit to the number of tweets
    threshold : int
        limit to the number of tweets
    
    Returns
    ---
    df : pd.DataFrame
        DataFrame representation of tweets
    '''
    files = glob.glob(folder_path)
    
    df = pd.DataFrame()
    for filepath in files:
        tweets = extract_tweets_from_user_file(filepath)
        user_df = user_tweets_to_df(tweets)
        if apply_threshold and (user_df.shape[0] > threshold):
            user_df = user_df.sample(n=threshold)
        df = df.append(user_df)
        
    return df

## Parse and save tweets 

In [16]:
date = '2019_03'
folder_path = f'./top users/{date}_*'
df = process_users_tweet_folder_to_dataframe(folder_path)
print(f'DataFrame Shape: {df.shape}')
display(df.head())

# df_2019 = df.sort_values('screen_name').copy()
df_2019 = df.copy()
df_2019.to_csv('top_tweets_2019_03.csv', index=False)

DataFrame Shape: (4338, 6)


Unnamed: 0,time,userid,name,screen_name,text,retweet
0,2019-03-27 08:49:13+00:00,1020615708894363648,Dr.Sunil Kumar Meena,DrSunilKumar_,India in space:-\n1962: INCOSPAR by Jawaharlal...,False
1,2019-03-26 14:21:00+00:00,1020615708894363648,Dr.Sunil Kumar Meena,DrSunilKumar_,RT @NewsHtn: Exclusive interview with #Raghura...,False
2,2019-03-26 01:30:19+00:00,1020615708894363648,Dr.Sunil Kumar Meena,DrSunilKumar_,A salesman at our door! After selling us jumla...,False
3,2019-03-25 01:54:17+00:00,1020615708894363648,Dr.Sunil Kumar Meena,DrSunilKumar_,RT @dhruv_rathee: Read the truth behind how Mo...,False
4,2019-03-23 01:07:43+00:00,1020615708894363648,Dr.Sunil Kumar Meena,DrSunilKumar_,#Dubai Burj Khalifa sends peace &amp; respect ...,False


In [17]:
date = '2020_03'
folder_path = f'./top users/{date}_*'
df = process_users_tweet_folder_to_dataframe(folder_path)
print(f'DataFrame Shape: {df.shape}')
display(df.head())

# df_2020 = df.sort_values('screen_name').copy()
df_2020 = df.copy()
df_2020.to_csv('top_tweets_2020_03.csv', index=False)

DataFrame Shape: (15284, 6)


Unnamed: 0,time,userid,name,screen_name,text,retweet
0,2020-03-31 17:58:25+00:00,1020615708894363648,Dr.Sunil Kumar Meena,DrSunilKumar_,RT @SINGHSAKSHI15: Dear @DrSJaishankar and @PM...,False
1,2020-03-31 17:30:00+00:00,1020615708894363648,Dr.Sunil Kumar Meena,DrSunilKumar_,RT @Drsunil0198: All international indian stud...,False
2,2020-03-31 17:10:26+00:00,1020615708894363648,Dr.Sunil Kumar Meena,DrSunilKumar_,The 538 new cases and 10 Deaths reported today...,False
3,2020-03-31 17:05:14+00:00,1020615708894363648,Dr.Sunil Kumar Meena,DrSunilKumar_,This is to inform you that students are studyi...,False
4,2020-03-31 17:02:52+00:00,1020615708894363648,Dr.Sunil Kumar Meena,DrSunilKumar_,RT @Sandra68043039: Indian Students stuck in U...,False
