In [2]:
import pandas as pd
import sys
sys.path.append('../utils')
from ETL_utils import*

# Understanding the data

## TwiBot-22 Dataset structure
The TwiBot-22 Twitter bot detection benchmark contains `tweet_i.json` (with i from 0 to 8), `user.json`, `list.json`, `hashtag.json`, `split.csv`, `label.csv` and `edge.csv`, representing the four entity types (tweet, user, list, and hashtag), train/dev/test split, ground-truth labels, and edges in the TwiBot-22 network.

- `tweet_i.json`
This file contains tweet information. The fields of each tweet are identical to those retrievable through the Twitter API.

The tweet information of TwiBot-22 is too large for a single file, so it is split across `tweet_0.json` to `tweet_8.json`.

- `user.json`
This file contains user information. The fields of each user are identical to those retrievable through the Twitter API.

- `list.json`
This file contains list information. The fields of each list are identical to those retrievable through the Twitter API.

- `hashtag.json`
This file contains hashtag information. The fields of each hashtag are identical to those retrievable through the Twitter API.

- `split.csv`
This file contains data split information, where the first column (id) is the user id and the second column (split) is the corresponding split (train, valid or test).

- `label.csv`
This file contains the ground truth labels, where the first column (id) is the user id and the second column (label) is the corresponding label (human or bot).

- `edge.csv`
This file contains the relations between the entities that appear in `node.json`. Each entry contains a source_id, a target_id and a relation type.

There are 14 possible relations in the TwiBot-22 graph: `pinned`, `following`, `own`, `post`, `followers`, `membership`, `retweeted`, `like`, `followed`, `quoted`, `discuss`, `replied`, `mentioned`, `contain`

# Create Dataset of tweets per user

In [6]:
# Preview the TwiBot-22 list entities; the parsed frame is the cell's output.
list_json_path = "../Data/Twi22/list.json"
pd.read_json(list_json_path)

Unnamed: 0,id,name,created_at,description,follower_count,member_count,private,owner_id
0,l1128774,StimulatingBroadband.com,2009-10-30 14:38:00+00:00,Subscribe to our Twitter firehose of 500 telec...,35,452,False,32745876
1,l733341248057180166,Local news,2016-05-19 16:59:13+00:00,Follow for breaking news and updates in North ...,3,47,False,316508410
2,l7983816,Futerrans,2010-02-26 18:01:03+00:00,Futerra staff past and present,6,25,False,23102054
3,l7008265,Beautiful People,2010-02-07 13:40:48+00:00,People who are strong minded and stand up for ...,28,179,False,23246523
4,l6807414,celebrities,2010-02-03 09:56:17+00:00,,7,6,False,16454856
...,...,...,...,...,...,...,...,...
21865,l863566424375209984,Deep learning,2017-05-14 01:27:34+00:00,,0,0,False,107770718
21866,l10667576,divulgacion & educacion,2010-04-16 10:03:29+00:00,,29,373,False,29024460
21867,l90172505,Post Graphics Staff,2013-05-23 15:20:46+00:00,The visual journalists who work in The Washing...,67,32,False,87968068
21868,l702528800886648832,General science,2016-02-24 16:21:33+00:00,,0,4,False,19732234


In [8]:
# Preview the TwiBot-22 user entities; the parsed frame is the cell's output.
user_json_path = "../Data/Twi22/user.json"
pd.read_json(user_json_path)

Unnamed: 0,created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld
0,2020-01-16 02:02:55+00:00,Theoretical Computer Scientist. See also https...,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",u1217628182611927040,"Cambridge, MA",Boaz Barak,,https://pbs.twimg.com/profile_images/125226236...,False,"{'followers_count': 7316, 'following_count': 2...",https://t.co/BoMip9FF17,boazbaraktcs,False,
1,2014-07-02 17:56:46+00:00,creative _,,u2664730894,🎈,olawale 💨,,https://pbs.twimg.com/profile_images/147837638...,False,"{'followers_count': 123, 'following_count': 10...",,wale_io,False,
2,2020-05-30 12:10:45+00:00,👽,,u1266703520205549568,,panagiota_.b,,https://pbs.twimg.com/profile_images/142608606...,False,"{'followers_count': 3, 'following_count': 62, ...",,b_panagiota,False,
3,2019-01-26 13:52:49+00:00,mama to maya. ABIM research pathway fellow @UV...,"{'description': {'mentions': [{'start': 43, 'e...",u1089159225148882949,"Charlottesville, VA","Jacqueline Hodges, MD MPH",,https://pbs.twimg.com/profile_images/130229171...,False,"{'followers_count': 350, 'following_count': 57...",,jachodges_md,False,
4,2009-04-30 19:01:42+00:00,Father / SWT Alumnus / Longhorn Fan,,u36741729,United States,Matthew Stubblefield,,https://pbs.twimg.com/profile_images/145808462...,True,"{'followers_count': 240, 'following_count': 29...",,Matthew_Brody,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,2013-02-05 14:50:17+00:00,イラストACは高品質イラストアート/年賀状等が全無料DL可能♪AIベクター・EPS形式素材全...,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",u1151138281,↓利用者600万人無料素材サイト↓　商用利用編集ＯＫ表記不要,フリー素材集かわいい無料イラストAC/おしゃれフレーム枠★IllustACイラストレーター,1.301109e+18,https://pbs.twimg.com/profile_images/139750409...,False,"{'followers_count': 1877, 'following_count': 2...",https://t.co/L6PE11Blkl,Illustratorjpn,False,
999996,2013-04-09 12:09:34+00:00,next➬未定 紫･緑ﾃﾞｨｯｷ 色々な曲聴きます,"{'url': {'urls': [{'start': 0, 'end': 23, 'url...",u1339035361,OKAYAMA CITY,りょうやん,6.067787e+17,https://pbs.twimg.com/profile_images/135842413...,False,"{'followers_count': 13952, 'following_count': ...",https://t.co/NjDtATyqGc,_y3oa,False,
999997,2011-06-16 20:09:29+00:00,Heart of a lion with a Mind of a maniac. Louis...,,u318636852,"Lake Charles, LA",Gavin Cecchini,,https://pbs.twimg.com/profile_images/781352355...,False,"{'followers_count': 13743, 'following_count': ...",,GavinCecchini2,True,
999998,2009-05-30 00:25:19+00:00,"Marketplace Minister, Christ follower, Indepen...","{'url': {'urls': [{'start': 0, 'end': 23, 'url...",u43443354,Rockhampton Australia,Martin Allan,,https://pbs.twimg.com/profile_images/131699997...,False,"{'followers_count': 2460, 'following_count': 2...",https://t.co/r3R5Bkng9m,MartinfromOz,False,


In [10]:
# Preview one of the TwiBot-22 tweet shards.
# NOTE(review): the file name carries a "-003" suffix, unlike the
# `tweet_8.json` naming described in the dataset notes — confirm the path.
tweet_shard_path = "../Data/Twi22/tweet_8-003.json"
pd.read_json(tweet_shard_path)

In [11]:
# Tweets DataFrame: run the tweet extraction helper on the train and test
# split files and stack the two results into a single frame.
# Both splits are read from the Twi20 directory, like every other table in
# this notebook (the train path previously pointed at Twi22 by mistake).
# NOTE(review): `make_tweets_df` comes from ETL_utils; the displayed result
# has one row per tweet with columns `tweet` and `ID` — confirm against the helper.
df_tweets_train = make_tweets_df("../Data/Twi20/train.json")
df_tweets_test = make_tweets_df("../Data/Twi20/test.json")

# Stack train on top of test and renumber the rows 0..n-1 so the index is
# unique across both splits.
df_tweets = pd.concat([df_tweets_train, df_tweets_test], axis=0)
df_tweets = df_tweets.reset_index(drop=True)

In [12]:
# Display the combined (train + test) tweets frame built in the previous cell.
df_tweets

Unnamed: 0,tweet,ID
0,RT @CarnivalCruise: 🎉 Are you ready to see wha...,17461978
1,Who has time for receipts? Not me. @epson rece...,17461978
2,Steady wants to encourage you to invest in you...,17461978
3,"Good one, @rishid. But let’s see if y'all can ...",17461978
4,#lsunationalchamps\n,17461978
...,...,...
1598323,"Man, the 70s was a bad-looking decade. Take th...",3385331674
1598324,RT @RobinsonCano: The RC22 DREAM School is sta...,3385331674
1598325,RT @JonHeymanCBS: this is last season for @Don...,3385331674
1598326,RT @whitesox: #SoxSurprise! @BoJackson takes o...,3385331674


In [11]:
# Count the distinct user IDs that appear in the tweets frame.
df_tweets['ID'].nunique() #number of users

9461

# Get domain of interest of each user

In [13]:
# ID/domain table: run the domain extraction helper on each split file,
# then stack both results into one frame with a fresh 0..n-1 index.
df_id_domain_train, df_id_domain_test = (
    create_id_domain_df(split_path)
    for split_path in ("../Data/Twi20/train.json", "../Data/Twi20/test.json")
)
df_id_domain = pd.concat(
    [df_id_domain_train, df_id_domain_test], axis=0
).reset_index(drop=True)

In [14]:
# The 'domain' column holds comma-separated domain names per user; split each
# entry on ', ' and collect the distinct individual domains.
domains_per_user = df_id_domain['domain'].str.split(', ')
all_domains = domains_per_user.explode().unique()
all_domains

array(['Politics', 'Business', 'Entertainment', 'Sports'], dtype=object)

In [16]:
# Display the combined (train + test) ID/domain frame.
df_id_domain

Unnamed: 0,ID,domain
0,17461978,"Politics, Business, Entertainment"
1,1297437077403885568,Politics
2,17685258,"Politics, Entertainment, Sports"
3,15750898,Politics
4,1659167666,Politics
...,...,...
9456,452754350,Sports
9457,850435801687183360,Sports
9458,2188795745,Sports
9459,940687680,Sports


# Create Dataset of profile for each user

In [25]:
# Columns selected when building the per-user profile DataFrame
# (passed as `usecols` to the profile loader).
# 'url' and 'entities' are not selected.
USECOLS = (
    # Identity
    ['id', 'id_str', 'name', 'screen_name']
    # Location and free-text description
    + ['location', 'profile_location', 'description']
    # Account flags, counters and settings
    + [
        'protected',
        'followers_count',
        'friends_count',
        'listed_count',
        'created_at',
        'favourites_count',
        'utc_offset',
        'time_zone',
        'geo_enabled',
        'verified',
        'statuses_count',
        'lang',
        'contributors_enabled',
        'is_translator',
        'is_translation_enabled',
    ]
    # Profile appearance
    + [
        'profile_background_color',
        'profile_background_image_url',
        'profile_background_image_url_https',
        'profile_background_tile',
        'profile_image_url',
        'profile_image_url_https',
        'profile_link_color',
        'profile_sidebar_border_color',
        'profile_sidebar_fill_color',
        'profile_text_color',
        'profile_use_background_image',
    ]
    # Profile defaults
    + ['has_extended_profile', 'default_profile', 'default_profile_image']
)

In [26]:
# Profile table: run the profile extraction helper on each split file,
# restricted to the USECOLS fields, then stack both results into one frame
# with a fresh 0..n-1 index.
df_profile_train, df_profile_test = (
    make_profile_df(split_path, usecols=USECOLS)
    for split_path in ("../Data/Twi20/train.json", "../Data/Twi20/test.json")
)
df_profile = pd.concat(
    [df_profile_train, df_profile_test], axis=0
).reset_index(drop=True)

In [39]:
# 'id_str' is redundant when it is an exact copy of 'id'; drop it in that case.
id_str_is_duplicate = (
    'id_str' in df_profile.columns
    and df_profile['id'].equals(df_profile['id_str'])
)
if id_str_is_duplicate:
    df_profile.drop(columns=['id_str'], inplace=True)

# Rename the 'id' column to 'ID'.
df_profile.rename(columns={'id': 'ID'}, inplace=True)

In [41]:
# Show the first rows of the profile frame.
df_profile.head()

Unnamed: 0,ID,name,screen_name,location,profile_location,description,protected,followers_count,friends_count,listed_count,...,profile_image_url,profile_image_url_https,profile_link_color,profile_sidebar_border_color,profile_sidebar_fill_color,profile_text_color,profile_use_background_image,has_extended_profile,default_profile,default_profile_image
0,17461978,SHAQ,SHAQ,"Orlando, FL","{'id': '55b4f9e5c516e0b6', 'url': 'https://api...","VERY QUOTATIOUS, I PERFORM RANDOM ACTS OF SHAQ...",False,15349596,692,45568,...,http://pbs.twimg.com/profile_images/1673907275...,https://pbs.twimg.com/profile_images/167390727...,2FC2EF,181A1E,252429,666666,True,False,False,False
1,1297437077403885568,Jennifer Fishpaw,JenniferFishpaw,,,,False,0,44,0,...,http://pbs.twimg.com/profile_images/1297437406...,https://pbs.twimg.com/profile_images/129743740...,1DA1F2,C0DEED,DDEEF6,333333,True,True,True,False
2,17685258,Brad Parscale,parscale,Florida,,Owner @ Parscale Strategy. Senior Advisor Digi...,False,762839,475,3201,...,http://pbs.twimg.com/profile_images/1295453225...,https://pbs.twimg.com/profile_images/129545322...,AB2316,FFFFFF,FFFFFF,666666,False,False,False,False
3,15750898,FOX 13 Tampa Bay,FOX13News,"Tampa, FL",,Bringing you the important stuff like breaking...,False,327587,4801,1744,...,http://pbs.twimg.com/profile_images/1293193013...,https://pbs.twimg.com/profile_images/129319301...,0B2F8A,FFFFFF,E8EEF0,333333,True,False,False,False
4,1659167666,Vonte The Plug 🎤🔌,VonteThePlugNC,"Jacksonville Beach, FL","{'id': '5e281c17a74c170f', 'url': 'https://api...",MOTIVATION 3 OUT NOW 🔥 Singles: ‘Lil Shawdy’ &...,False,13324,647,44,...,http://pbs.twimg.com/profile_images/1181662400...,https://pbs.twimg.com/profile_images/118166240...,1DA1F2,C0DEED,DDEEF6,333333,True,False,True,False


In [28]:
# Count the distinct users in the profile frame.
# The identifier column is named 'ID' at this point (it was renamed from 'id'
# in an earlier cell), so looking up 'id' would raise a KeyError on a
# top-to-bottom run.
df_profile['ID'].nunique()

9461

# Get following and followers for each user

In [33]:
# For ID, followers, and following DataFrame
df_id_neighbor_train = create_id_neighbor_df("../Data/Twi20/train.json")
df_id_neighbor_test = create_id_neighbor_df("../Data/Twi20/test.json")
df_id_neighbor = pd.concat([df_id_neighbor_train, df_id_neighbor_test], axis=0)
df_id_neighbor = df_id_neighbor.reset_index(drop=True)
df_id_neighbor

Unnamed: 0,ID,followers,following
0,17461978,,
1,1297437077403885568,[],"[170861207, 23970102, 47293791, 29458079, 1799..."
2,17685258,"[1275068515666386945, 2535843469, 129365759103...","[46464108, 21536398, 18643437, 589490020, 1363..."
3,15750898,"[855194021458739200, 1267566832598290432, 1290...","[2324715174, 24030137, 2336676015, 192684124, ..."
4,1659167666,"[893137540185718785, 1063858543, 26665819, 241...","[1628313708, 726405625, 130868956, 26652768, 3..."
...,...,...,...
9456,452754350,"[2308703630, 230020648, 20673104, 818336445102...","[2924422992, 2365623499, 3383893516, 304921770..."
9457,850435801687183360,"[333490198, 905966469929979904, 12875470492238...","[704144006129692674, 953363306244227072, 84551..."
9458,2188795745,"[249907794, 4843189571, 694904945393426432, 29...","[66762778, 2981733093, 186186153, 198600462, 7..."
9459,940687680,"[942435278, 280899355, 1262431498751184896, 13...","[559791853, 1008065499136249856, 107059213, 36..."


# Create the labels

In [34]:
# For ID and label DataFrame
df_id_label_train = create_id_label_df("../Data/Twi20/train.json")
df_id_label_test = create_id_label_df("../Data/Twi20/test.json")
df_id_label = pd.concat([df_id_label_train, df_id_label_test], axis=0)
df_id_label = df_id_label.reset_index(drop=True)

In [35]:
# Display the combined (train + test) ID/label frame.
df_id_label

Unnamed: 0,ID,label
0,17461978,0
1,1297437077403885568,1
2,17685258,0
3,15750898,0
4,1659167666,1
...,...,...
9456,452754350,1
9457,850435801687183360,1
9458,2188795745,1
9459,940687680,1


# Save datasets

In [None]:
# Write every derived table to Parquet, one file per table, in this order.
parquet_targets = [
    ('../Data/Twi20/ETL/Twi20_label.parquet', df_id_label),
    ('../Data/Twi20/ETL/Twi20_tweets.parquet', df_tweets),
    ('../Data/Twi20/ETL/Twi20_neighbor.parquet', df_id_neighbor),
    ('../Data/Twi20/ETL/Twi20_domain.parquet', df_id_domain),
    ('../Data/Twi20/ETL/Twi20_profile.parquet', df_profile),
]
for parquet_path, frame in parquet_targets:
    frame.to_parquet(parquet_path)