In [1]:
# Importing required libraries

import pandas as pd
from pandas.io.json import json_normalize
import warnings
warnings.filterwarnings("ignore")
# Read JSON file containing tweets data and removce tweets not in English

raw_tweets = pd.read_json(r'farmers-protest-tweets-2021-03-5.json', lines=True)
raw_tweets = raw_tweets[raw_tweets['lang']=='en']

In [2]:
# Normalize 'user' field

users = json_normalize(raw_tweets['user'])
users.drop(['description', 'linkTcourl'], axis=1, inplace=True)
users.rename(columns={'id':'userId', 'url':'profileUrl'}, inplace=True)

# Transform 'raw_tweets' DataFrame

# Add column for 'userId'
user_id = []
for user in raw_tweets['user']:
    uid = user['id']
    user_id.append(uid)
raw_tweets['userId'] = user_id

In [3]:
# Remove less important columns
cols = ['url', 'date', 'renderedContent', 'id', 'userId', 'replyCount', 'retweetCount', 'likeCount', 'quoteCount', 'source', 'media', 'retweetedTweet', 'quotedTweet', 'mentionedUsers']
tweets = raw_tweets[cols]
tweets.rename(columns={'id':'tweetId', 'url':'tweetUrl'}, inplace=True)

# Convert to DataFrame, remove duplicates and keep only English tweets
tweets = pd.DataFrame(tweets)
tweets.drop_duplicates(subset=['tweetId'], inplace=True)

In [4]:
top_tweets = tweets.nlargest(10, 'retweetCount')
top_tweets.head()

Unnamed: 0,tweetUrl,date,renderedContent,tweetId,userId,replyCount,retweetCount,likeCount,quoteCount,source,media,retweetedTweet,quotedTweet,mentionedUsers
408128,https://twitter.com/rihanna/status/13566258896...,2021-02-02 15:29:51+00:00,why aren’t we talking about this?! #FarmersPro...,1356625889602199552,79293791,163065,315547,944307,45832,"<a href=""http://twitter.com/download/iphone"" r...",,,,
395142,https://twitter.com/GretaThunberg/status/13566...,2021-02-02 20:04:01+00:00,We stand in solidarity with the #FarmersProtes...,1356694884615340037,1006419421244678144,49793,103957,319363,13815,"<a href=""http://twitter.com/download/iphone"" r...",,,,
266196,https://twitter.com/GretaThunberg/status/13572...,2021-02-04 10:59:01+00:00,I still #StandWithFarmers and support their pe...,1357282507616645122,1006419421244678144,39596,67694,234676,10587,"<a href=""http://twitter.com/download/iphone"" r...",,,,
366579,https://twitter.com/miakhalifa/status/13568483...,2021-02-03 06:14:01+00:00,"“Paid actors,” huh? Quite the casting director...",1356848397899112448,2835653131,15569,35921,139959,5681,"<a href=""http://twitter.com/download/iphone"" r...",[{'previewUrl': 'https://pbs.twimg.com/media/E...,,,
372793,https://twitter.com/miakhalifa/status/13568277...,2021-02-03 04:51:48+00:00,What in the human rights violations is going o...,1356827705161879553,2835653131,9082,26972,99227,4606,"<a href=""http://twitter.com/download/iphone"" r...",[{'previewUrl': 'https://pbs.twimg.com/media/E...,,,


In [8]:
users_tweets = tweets[['tweetId', 'tweetUrl','renderedContent', 'userId']].merge(users[['userId','displayname']], on='userId')
top_users = users_tweets.groupby(['displayname']).size().reset_index(name='tweets').sort_values(by='tweets', ascending=False)
top_users.head(10)

Unnamed: 0,displayname,tweets
21541,Harjot Singh,50905719
76395,ਕਿਸਾਨAndolajivi ravinder kaur चोकीदार ही कातिल है,4359744
26144,"Jaspal Kaur Bains.For love of Punjab,Sikhi & I...",3964081
27464,Jot,3389564
59968,Theinactiveactivist,3250809
79541,🍊raman🚜,2965284
26757,Jaz 🇨🇦🌾ਗਰਮ ਖਿਆਲੀ 📌,2223081
26308,JassG,2125764
29904,Kisan Bot🚜🌾,2111209
69678,mohd khaliquzzama,2090916


Código extraído de https://www.kaggle.com/code/prathamsharma123/clean-raw-json-tweets-data