## 0. Dataset loading

In [2]:
# DATASET FOUND AT: https://www.kaggle.com/datasets/prathamsharma123/farmers-protest-tweets-dataset-raw-json

import json

DATA_PATH = "data.json"

def load_tweets(path=DATA_PATH):
    """Generator that lazily yields the tweets of a JSON Lines file, one per line.

    Args:
        path (str): Path to the file to read; each non-blank line must hold
            one JSON document. Defaults to ``DATA_PATH``.

    Yields:
        dict: The next line decoded with ``json.loads`` (a dict when the
            line holds a JSON object, as the tweet queries in this notebook
            expect).

    Raises:
        OSError: If ``path`` cannot be opened (raised on first iteration,
            because the body of a generator runs lazily).
        json.JSONDecodeError: If a non-blank line is not valid JSON.

    """

    # Open the file the caller asked for, not unconditionally DATA_PATH.
    with open(path, "r", encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no tweet and would make json.loads raise, so skip them.
            if not line.strip():
                continue
            yield json.loads(line)


## 1. Top 10 most retweeted tweets

In [27]:
## QUERY: "retweetCount": <int>

import operator
import heapq


def retrieve_most_retweeted(n_highests: int=10):
    """Returns the tweets of the dataset that were retweeted the most.

    Args:
        n_highests (int): How many of the most retweeted tweets to return.

    Returns:
        list: (user displayname, retweet count, content) tuples, ordered
              from the most to the least retweeted tweet.

    """

    def by_retweets(tweet):
        # Ranking key: the tweet's "retweetCount" field.
        return tweet["retweetCount"]

    # nlargest streams through the load_tweets() generator, so the whole
    # dataset never has to be materialised as a list.
    ranked_tweets = heapq.nlargest(n_highests, load_tweets(), key=by_retweets)

    # Keep only the three fields the caller needs from each tweet.
    summary = []
    for tweet in ranked_tweets:
        author = tweet["user"]["displayname"]
        summary.append((author, tweet["retweetCount"], tweet["content"]))
    return summary
    

### Example

In [28]:
# Run the query with its default n_highests=10 and keep the result so the
# following cell can print it.
topten_most_retweeted = retrieve_most_retweeted()
# Report how many tweets were returned.
print(f"Top {len(topten_most_retweeted)} most retweeted found!")

Top 10 most retweeted found!


In [42]:
# Legend describing the two-line layout of every entry printed below.
print("USER : RETWEETS")
print("CONTENT", "\n")

# Each entry is a (displayname, retweet count, content) tuple.
for tweet in topten_most_retweeted:
    print(f"{tweet[0]} : {tweet[1]}")
    print(tweet[2], "\n")

USER : RETWEETS
CONTENT 

Rihanna : 315547
why aren’t we talking about this?! #FarmersProtest https://t.co/obmIlXhK9S 

Greta Thunberg : 103957
We stand in solidarity with the #FarmersProtest in India.
 https://t.co/tqvR0oHgo0 

Greta Thunberg : 67694
I still #StandWithFarmers and support their peaceful protest.
No amount of hate, threats or violations of human rights will ever change that. #FarmersProtest 

Mia K. : 35921
“Paid actors,” huh? Quite the casting director, I hope they’re not overlooked during awards season. I stand with the farmers. #FarmersProtest https://t.co/moONj03tN0 

Mia K. : 26972
What in the human rights violations is going on?! They cut the internet around New Delhi?! #FarmersProtest https://t.co/a5ml1P2ikU 

JuJu Smith-Schuster : 23251
Happy to share that I’ve donated $10,000 to provide medical assistance to the farmers in need in India to help save lives during these times. I hope we can prevent any additional life from being lost. 🙏🏾 #FarmersProtest https://t

## 2. Top 10 users by number of tweets posted

In [None]:
## QUERY: "user" (object) --> {"id": my_id, "name": my_name, ...}

from collections import defaultdict, Counter

def retrieve_most_active_users(n_highests: int=10):
    """Returns the users that posted the most tweets in the dataset.

    Args:
        n_highests (int): How many of the most active users to return.

    Returns:
        list: (displayname, number of tweets) pairs, ordered from the most
              to the least active user.

    """

    # {uid_1: n_tweets_1, uid_2: n_tweets_2, ...}
    tweets_per_uid = Counter()

    # {uid_1: displayname_1, ...}; users are counted by id, and the
    # displayname kept is the one on the last tweet read for that id.
    name_of_uid = {}

    for tweet in load_tweets():
        author = tweet["user"]
        uid = author["id"]
        tweets_per_uid[uid] += 1
        name_of_uid[uid] = author["displayname"]

    # Swap each user id for its displayname in the final ranking.
    ranking = []
    for uid, n_tweets in tweets_per_uid.most_common(n_highests):
        ranking.append((name_of_uid[uid], n_tweets))
    return ranking


### Example

In [19]:
# Run the query with its default n_highests=10 and keep the ranking so the
# following cell can display it.
most_active_users = retrieve_most_active_users()
# Report how many users were returned.
print(f"Top {len(most_active_users)} most active users found!")

Top 10 most retweeted found!


In [20]:
# Bare expression: displayed as the cell output, a list of
# (displayname, number of tweets) pairs.
most_active_users

[('Harjot Singh', 7134),
 ('ਕਿਸਾਨAndolajivi ravinder kaur चोकीदार ही कातिल है', 2091),
 ('Jaspal Kaur Bains.For love of Punjab,Sikhi & India', 1991),
 ('Jot', 1841),
 ('Theinactiveactivist', 1806),
 ('🍊raman🚜', 1722),
 ('Jaz 🇨🇦🌾ਗਰਮ ਖਿਆਲੀ 📌', 1502),
 ('JassG', 1460),
 ('Kisan Bot🚜🌾', 1453),
 ('mohd khaliquzzama', 1446)]

## 3. Top 10 days with most tweets

In [37]:
## QUERY: "date": <str>
## e.g. "date": "2021-03-30T03:33:46+00:00"

from datetime import datetime
from collections import defaultdict, Counter

def retrieve_most_active_days(n_highests: int=10):
    """Returns the days on which the most tweets of the dataset were posted.

    Args:
        n_highests (int): How many of the most active days to return.

    Returns:
        list: (date, number of tweets) pairs, ordered from the most to the
              least active day. Each date is the part of the tweet's
              "date" string before the "T" (e.g. '2021-03-30').

    """

    # The day is the text before the "T" separator of the "date" string, so
    # it can be cut off directly without parsing a datetime.
    # {date_1: n_tweets_1, date_2: n_tweets_2, ...}
    tweets_per_day = Counter(
        tweet["date"].partition("T")[0] for tweet in load_tweets()
    )

    # [(date_1: str, n_tweets_1: int), ... , (date_n, n_tweets_n)]
    return tweets_per_day.most_common(n_highests)


In [38]:
# Run the query with its default n_highests=10 and keep the ranking so the
# following cell can display it.
most_active_days = retrieve_most_active_days()
# Report how many days were returned.
print(f"Top {len(most_active_days)} most active days found!")

Top 10 most active days found!


In [39]:
# Bare expression: displayed as the cell output, a list of
# (date, number of tweets) pairs.
most_active_days

[('2021-02-03', 83866),
 ('2021-02-04', 58607),
 ('2021-02-05', 33332),
 ('2021-02-02', 28548),
 ('2021-02-06', 22420),
 ('2021-02-07', 11325),
 ('2021-02-09', 9320),
 ('2021-02-08', 8920),
 ('2021-02-10', 7973),
 ('2021-02-11', 5698)]

## 4. Top 10 most used hashtags