## 0. Dataset loading

In [95]:
# DATASET FOUND AT: https://www.kaggle.com/datasets/prathamsharma123/farmers-protest-tweets-dataset-raw-json

import json

DATA_PATH = "farmers-protest-tweets-2021-03-5.json"

def load_tweets(path=DATA_PATH):
    """Generator that reads a JSON Lines file and yields one parsed object per line.

    Args:
        path (str): Path of the JSON Lines file to read (one JSON object per line).
            Defaults to DATA_PATH.

    Yields:
        dict: the JSON object found on the current line, loaded as a python dict.

    """

    # Open the file the caller asked for (not unconditionally DATA_PATH), so the
    # `path` argument is actually honoured.
    with open(path, "r", encoding='utf-8') as f:
        for line in f:
            # Blank / whitespace-only lines (e.g. a trailing newline at the end of
            # the file) carry no object and would make json.loads raise.
            if not line.strip():
                continue
            yield json.loads(line)


## 1. Top 10 most retweeted tweets

In [96]:
## QUERY: "retweetCount": <int>

import operator
import heapq


def retrieve_most_retweeted(n_highests: int=10, tweets=None):
    """Finds the most retweeted tweets, ranked by their "retweetCount" field.

    Args:
        n_highests (int): How many of the most retweeted tweets to return.
        tweets (iterable of dict, optional): Tweets to rank. Each one must provide
            the keys "retweetCount", "content" and "user" -> "displayname".
            When omitted, the tweets are streamed from the dataset file
            through load_tweets().

    Returns:
        list: returns a list of tuple triples of user displayname,
              retweet count and content, sorted from the highest
              retweet count to the lowest.

    """

    if tweets is None:
        tweets = load_tweets()

    # heapq.nlargest consumes the iterable once and only keeps the best
    # n_highests candidates around, so the whole dataset is never held in memory.
    # [tweet_1, tweet_2, ... , tweet_N]
    most_retweeted_tweets = heapq.nlargest(
        n_highests, tweets, key=operator.itemgetter("retweetCount")
    )

    # [(displayname_1, retweet_count_1, content_1), ... (displayname_N, retweet_count_N, content_N)]
    return [
        (tweet["user"]["displayname"], tweet["retweetCount"], tweet["content"])
        for tweet in most_retweeted_tweets
    ]
    

### Example

In [97]:
# topten_most_retweeted = retrieve_most_retweeted()
# print(f"Top {len(topten_most_retweeted)} most retweeted found!")

In [98]:
# topten_most_retweeted

## 2. Top 10 users by number of tweets posted

In [99]:
## QUERY: "user" (object) --> {"id": my_id, "name": my_name, ...}

from collections import defaultdict, Counter

def retrieve_most_active_users(n_highests: int=10, tweets=None):
    """Finds most active users, by number of tweets they authored.

    Users are told apart by their "id"; the displayname reported for a user is
    the one carried by the last of their tweets that was read.

    Args:
        n_highests (int): The first n highest users by number of tweets to return.
        tweets (iterable of dict, optional): Tweets to count. Each one must provide
            "user" -> "id" and "user" -> "displayname". When omitted, the tweets
            are streamed from the dataset file through load_tweets().

    Returns:
        list: returns a list of tuple pairs of displayname and counter, sorted
              from the most active user to the least active one.

    """

    if tweets is None:
        tweets = load_tweets()

    # {uid_1: counter_1, uid_2: counter_2, ...}
    tweet_count_by_uid = Counter()

    # {uid_1: displayname_1, uid_2: displayname_2, ...}
    displayname_by_uid = dict()

    for tweet in tweets:
        user = tweet["user"]
        user_id = user["id"]

        # Count by id rather than by displayname: the mapping below is
        # overwritten on every tweet, so a user's displayname may vary.
        tweet_count_by_uid[user_id] += 1
        displayname_by_uid[user_id] = user["displayname"]

    # [(name_1: str, counter_1: int), (name_2, counter_2), ... , (name_N, counter_N)]
    return [
        (displayname_by_uid[user_id], n_tweets)
        for user_id, n_tweets in tweet_count_by_uid.most_common(n_highests)
    ]


### Example

In [100]:
# most_active_users = retrieve_most_active_users()
# print(f"Top {len(most_active_users)} most active users found!")

In [101]:
# most_active_users

## 3. Top 10 days with most tweets

In [102]:
## QUERY: "date": str --> "2021-03-30T03:33:46+00:00"

from collections import defaultdict, Counter

def retrieve_most_active_days(n_highests: int=10, tweets=None):
    """Finds the days with the most tweets, by number of tweets.

    The day is the part of the tweet's "date" string that precedes the "T"
    separator (e.g. "2021-03-30" for "2021-03-30T03:33:46+00:00"); it is taken
    as written, without any timezone conversion.

    Args:
        n_highests (int): The first n days to return, by highest activity.
        tweets (iterable of dict, optional): Tweets to count. Each one must provide
            a "date" string. When omitted, the tweets are streamed from the
            dataset file through load_tweets().

    Returns:
        list: returns a list of tuple pairs of date and counter, sorted
              from the most active day to the least active one.

    """

    if tweets is None:
        tweets = load_tweets()

    # {date_1: counter_1, date_2: counter_2, ...} with dates as "%Y-%m-%d" strings
    tweet_count_by_day = Counter(
        tweet["date"].split("T")[0] for tweet in tweets
    )

    # [(date_1: str, counter_1: int), (date_2, counter_2), ... , (date_N, counter_N)]
    return tweet_count_by_day.most_common(n_highests)


### Example

In [103]:
# most_active_days = retrieve_most_active_days()
# print(f"Top {len(most_active_days)} most active days found!")

In [104]:
# most_active_days

## 4. Top 10 most used hashtags

In [105]:
## QUERY: "renderedContent": str --> "Example tweet #myHashtag end of message"

from collections import defaultdict, Counter
import re

def retrieve_most_used_hashtags(n_highests: int=10, tweets=None):
    """Finds most used hashtags, by number of occurrences.

    Hashtags are extracted from each tweet's "renderedContent" text. Every
    occurrence counts (a hashtag repeated inside one tweet is counted each
    time) and the comparison is case-sensitive.

    Args:
        n_highests (int): The first n hashtags to return, by highest activity.
        tweets (iterable of dict, optional): Tweets to scan. Each one must provide
            a "renderedContent" string. When omitted, the tweets are streamed
            from the dataset file through load_tweets().

    Returns:
        list: returns a list of tuple pairs of hashtag (without the leading "#")
              and counter, sorted from the most used hashtag to the least used one.

    """

    if tweets is None:
        tweets = load_tweets()

    # A "#" followed by one or more word characters; only the part after
    # the "#" is captured.
    hashtag_pattern = re.compile(r"#(\w+)")

    # {hashtag_1: counter_1, hashtag_2: counter_2, ...}
    hashtags_activity_counter = Counter()

    for tweet in tweets:
        # findall returns an empty list for tweets without hashtags, which
        # update() treats as a no-op, so no explicit emptiness check is needed.
        hashtags_activity_counter.update(
            hashtag_pattern.findall(tweet["renderedContent"])
        )

    # [(hashtag_1: str, counter_1: int), (hashtag_2, counter_2), ... , (hashtag_N, counter_N)]
    return hashtags_activity_counter.most_common(n_highests)

# REGEX MATCHER SOURCE: https://stackoverflow.com/questions/2527892/parsing-a-tweet-to-extract-hashtags-into-an-array

### Example

In [106]:
# most_used_hashtags = retrieve_most_used_hashtags()
# print(f"Top {len(most_used_hashtags)} most used hashtags found!")

In [107]:
# most_used_hashtags

## MAIN

Remember: you need to download the dataset and place it in the same directory as this notebook.
The expected filename is `farmers-protest-tweets-2021-03-5.json`; it can be changed through `DATA_PATH` in **0. Dataset loading**.

In [108]:
def main(func_number: int):
    """Prints the menu of available queries, then runs the selected one.

    The menu is always printed first. If func_number is not one of the listed
    options, a message is printed and nothing is executed; otherwise the chosen
    query is called with its default arguments and every item of its result is
    printed on its own line.

    Args:
        func_number (int): The menu number (starting at 1) of the query to run.

    Returns:
        None

    """

    # Single source of truth for the menu: the option numbers follow the
    # position in this tuple and the labels come from each function's name,
    # so the two can never drift apart.
    queries = (
        retrieve_most_retweeted,
        retrieve_most_active_users,
        retrieve_most_active_days,
        retrieve_most_used_hashtags,
    )
    functions = dict(enumerate(queries, start=1))

    for key, function in functions.items():
        print(f"{key}: {function.__name__}")

    # Exact type check on purpose: it rejects bool (True == 1 would otherwise
    # select option 1) and short-circuits before unhashable values reach the
    # dict membership test.
    if type(func_number) is not int or func_number not in functions:
        print(f"{func_number} not a valid option: 1-{len(functions)}")
        return

    function = functions[func_number]
    print("\n", f"Executing {function.__name__} ... \n")
    for r in function():
        print(r)

In [110]:
# Available options for `func_number`:
# 1: retrieve_most_retweeted
# 2: retrieve_most_active_users
# 3: retrieve_most_active_days
# 4: retrieve_most_used_hashtags

# Call main with the integer of the desired function from the list above.
main(func_number=2)

1: retrieve_most_retweeted
2: retrieve_most_active_users
3: retrieve_most_active_days
4: retrieve_most_used_hashtags

 Executing retrieve_most_active_users ... 

('Harjot Singh', 7134)
('ਕਿਸਾਨAndolajivi ravinder kaur चोकीदार ही कातिल है', 2091)
('Jaspal Kaur Bains.For love of Punjab,Sikhi & India', 1991)
('Jot', 1841)
('Theinactiveactivist', 1806)
('🍊raman🚜', 1722)
('Jaz 🇨🇦🌾ਗਰਮ ਖਿਆਲੀ 📌', 1502)
('JassG', 1460)
('Kisan Bot🚜🌾', 1453)
('mohd khaliquzzama', 1446)
