# Network baseline

The original baseline of our network was taken from *Kaggle* and is available at
[https://www.kaggle.com/datasets/bwandowando/ukraine-russian-crisis-twitter-dataset-1-2-m-rows](https://www.kaggle.com/datasets/bwandowando/ukraine-russian-crisis-twitter-dataset-1-2-m-rows). 

# Network source

In [None]:
# Standard library
import os
import time
from pathlib import Path

# Third-party
import networkx as nx
import pandas as pd
import tweepy
from dotenv import load_dotenv

# Read the API credentials from the local environment file.
load_dotenv("./.env.local")
BEARER_TOKEN = os.getenv("BEARER_TOKEN1")

# Tweepy client reference: https://docs.tweepy.org/en/stable/client.html
# wait_on_rate_limit=True asks Tweepy to wait when a rate limit is reached.
client = tweepy.Client(BEARER_TOKEN, wait_on_rate_limit=True)

## Preprocessing of the Kaggle dataset

* Keep only English-language tweets
* Keep only the columns relevant to our analysis: userid, tweetid, text, hashtags
* Remove duplicate tweets (rows with identical text)
* Draw a sample from each daily file (10k tweets per day)
    * Each sample is saved in the csvdataframes folder

In [None]:
print("Creating folders")
# Output directories used by the later steps; existing ones are left as-is.
for folder_name in ("csvdataframes", "edgelists", "csvdataframes_wOgIds"):
    Path(folder_name).mkdir(exist_ok=True)

In [None]:
print("Preprocessing the data")
# Upper bound on the number of tweets kept per daily file.
SAMPLE_SIZE = 10000
# iterdir() yields entries in arbitrary order; sorting makes the day index
# assigned to each input file reproducible across runs.
for idx, day in enumerate(sorted(Path("dataset").iterdir())):
    full_dataset = pd.read_csv(day, compression='gzip', low_memory=False)
    # Keep English tweets and only the columns needed by the analysis.
    df_en = full_dataset.loc[
        full_dataset['language'] == 'en',
        ["userid", "tweetid", "text", "hashtags"],
    ]
    df_no_duplicate = df_en.drop_duplicates(subset='text', keep='first')
    # sample() raises ValueError when asked for more rows than are available,
    # so cap the request for days with fewer unique English tweets.
    df_sampled = df_no_duplicate.sample(min(SAMPLE_SIZE, len(df_no_duplicate)))
    # The index is written on purpose: it is read back as the "Unnamed: 0"
    # column, which the later steps use as the merge key.
    df_sampled.to_csv(Path("csvdataframes") / f"day_{idx}.csv")
    print(".", end='', flush=True)

## Retrieve original tweet id

Because the baseline dataset may contain retweets, we need to get the original
tweet in order to retrieve the list of people who liked and retweeted it.
The sample is augmented with the original tweet ID and saved in the csvdataframes_wOgIds folder.

In [None]:
def get_tweets(ids):
    """Look up a batch of tweets by ID, expanding their referenced tweets.

    Args:
        ids: Tweet IDs forwarded to ``client.get_tweets``.

    Returns:
        The response object returned by ``client.get_tweets``.
    """
    requested_expansions = ['referenced_tweets.id', 'referenced_tweets.id.author_id']
    return client.get_tweets(ids=ids, expansions=requested_expansions)


def get_original_tweets_ids(ids):
    """Resolve each tweet of a batch to the tweet it references, if any.

    Args:
        ids: DataFrame (typically a slice of a larger one) holding at least
            the ``tweetid`` and ``Unnamed: 0`` columns.

    Returns:
        A ``(frame, found)`` tuple. When the lookup returns no data the result
        is ``(None, False)``. Otherwise ``found`` is True and ``frame`` has one
        row per returned tweet with the columns ``author_id``,
        ``original_tweet_id`` and ``Unnamed: 0`` (the row key of the input row
        the tweet belongs to).
    """
    ids_list = ids["tweetid"].tolist()
    tweets = get_tweets(ids_list)
    if not tweets.data:
        return None, False

    # Pair every requested tweet ID with its row key. Rows are matched by
    # tweet ID instead of by position or index alignment because `ids` is a
    # slice whose index does not start at 0 for later batches, and the
    # response may omit some of the requested tweets.
    row_key_by_tweet_id = dict(zip(ids_list, ids["Unnamed: 0"].tolist()))

    rows = []
    for tweet in tweets.data:
        row_key = row_key_by_tweet_id.get(tweet.id)
        if row_key is None:
            # Not one of the requested tweets; nothing to attach it to.
            continue
        referenced_tweet = tweet.get('referenced_tweets')
        # NOTE(review): author_id is read from the looked-up tweet itself, not
        # from the referenced tweet -- confirm this is the intended author.
        rows.append({
            "author_id": tweet.get('author_id'),
            # Fall back to the tweet's own ID when it references no other tweet.
            "original_tweet_id": referenced_tweet[0].id if referenced_tweet else tweet.id,
            "Unnamed: 0": row_key,
        })

    # Build the frame once instead of concatenating inside the loop.
    return_df = pd.DataFrame(
        rows, columns=["author_id", "original_tweet_id", "Unnamed: 0"])
    return return_df, True

In [None]:
# By batch of 100 tweets, get the original tweet ID.
# The listing is sorted so files are processed in a reproducible order
# (iterdir() yields entries in arbitrary order).
for file in sorted(Path("csvdataframes").iterdir()):
    print(file)
    df = pd.read_csv(file)
    # Per-batch results, concatenated once after the loop.
    batch_frames = []
    # Split df into batches of 100 rows.
    for i in range(0, len(df), 100):
        print(f"{file} Iteration {i}-{i+100} / {len(df)}")
        df_100 = df[i:i+100]
        original_tweet_ids_df, empty_check = get_original_tweets_ids(df_100)
        if not empty_check:
            continue
        batch_frames.append(original_tweet_ids_df)

    if not batch_frames:
        continue
    temp_df = pd.concat(batch_frames, ignore_index=True)
    if temp_df.empty:
        continue
    df = pd.merge(df, temp_df, on="Unnamed: 0")
    # Reuse the input file name so day_N in the output always corresponds to
    # day_N in the input, whatever order the directory was listed in.
    df.to_csv(Path("csvdataframes_wOgIds") / file.name, index=False)

## Retrieve likers and retweeters

Retrieve the likers and retweeters of each original tweet and create an
edge list. The results are saved in the edgelists folder.

In [None]:
def get_retweeters(id, pagination_token=None):
    """Fetch a single page of the users who retweeted tweet *id*.

    Args:
        id: Tweet ID forwarded to ``client.get_retweeters``.
        pagination_token: Token of the page to fetch; None for the first page.

    Returns:
        ``(user_ids, next_token)`` for a page with data, where ``next_token``
        is None if the response carries none; ``(None, None)`` when the page
        has no data.
    """
    print(".", end='', flush=True)
    response = client.get_retweeters(id, pagination_token=pagination_token)
    following_page_token = response.meta.get('next_token')

    if response.data:
        return [user.id for user in response.data], following_page_token
    return None, None



def get_all_retweeters(id):
    """Collect the IDs of the users who retweeted tweet *id*, across all pages.

    Args:
        id: Tweet ID passed to ``get_retweeters``.

    Returns:
        A list of user IDs; empty when the first page has no data.
    """
    retweeter_ids = []
    next_token = None
    while True:
        page_ids, next_token = get_retweeters(id, pagination_token=next_token)
        if page_ids is None:
            break
        retweeter_ids.extend(page_ids)
        # No token means there is no further page. Looping again would pass
        # pagination_token=None and fetch the first page again, endlessly.
        if not next_token:
            break
    return retweeter_ids


def get_linkin_users(id, pagination_token=None):
    """Fetch a single page of the users who liked tweet *id*.

    Args:
        id: Tweet ID forwarded to ``client.get_liking_users``.
        pagination_token: Token of the page to fetch; None for the first page.

    Returns:
        ``(user_ids, next_token)`` for a page with data, where ``next_token``
        is None if the response carries none; ``(None, None)`` when the page
        has no data.
    """
    print(".", end='', flush=True)
    response = client.get_liking_users(id, pagination_token=pagination_token)
    following_page_token = response.meta.get('next_token')

    if response.data:
        return [user.id for user in response.data], following_page_token
    return None, None


def get_all_linkin_users(id):
    """Collect the IDs of the users who liked tweet *id*, across all pages.

    Args:
        id: Tweet ID passed to ``get_linkin_users``.

    Returns:
        A list of user IDs; empty when the first page has no data.
    """
    liker_ids = []
    next_token = None
    while True:
        page_ids, next_token = get_linkin_users(id, pagination_token=next_token)
        if page_ids is None:
            break
        liker_ids.extend(page_ids)
        # No token means there is no further page. Looping again would pass
        # pagination_token=None and fetch the first page again, endlessly.
        if not next_token:
            break
    return liker_ids


def create_retweeters_edgelist():
    """Write one retweeter edge list CSV per file in ``csvdataframes_wOgIds``.

    For every row of an input file, the retweeters of ``original_tweet_id``
    are fetched and one ``(user_id, author_id)`` row is emitted per retweeter.
    The result for the file at position ``idx`` of the sorted directory
    listing is saved as ``edgelists/retweeters_{idx}.csv``.
    """
    print("Creating edgelist for retweets")
    # Sorted so the index given to each input file is reproducible
    # (iterdir() yields entries in arbitrary order).
    for idx, file in enumerate(sorted(Path("csvdataframes_wOgIds").iterdir())):
        print(file)
        df = pd.read_csv(file)

        # Per-tweet frames, concatenated once after the loop rather than
        # growing a DataFrame row block by row block.
        frames = []
        for tweetId, author_id in zip(df['original_tweet_id'], df['author_id']):
            print(tweetId, end='')
            retweeter_ids = get_all_retweeters(tweetId)
            if retweeter_ids:
                frames.append(pd.DataFrame(
                    {"user_id": retweeter_ids, "author_id": author_id}))
            print("Done")

        if frames:
            edgelists_df = pd.concat(frames, ignore_index=True)
        else:
            # Still write a header-only file when no edge was found.
            edgelists_df = pd.DataFrame(columns=["user_id", "author_id"])
        edgelists_df.to_csv(Path(f"edgelists/retweeters_{idx}.csv"), index=False)

    
def create_liking_edgelist():
    """Write one liker edge list CSV per file in ``csvdataframes_wOgIds``.

    For every row of an input file, the users who liked ``original_tweet_id``
    are fetched and one ``(user_id, author_id)`` row is emitted per liker.
    The result for the file at position ``idx`` of the sorted directory
    listing is saved as ``edgelists/liking_{idx}.csv``.
    """
    print("Creating edgelist for likes")
    # Sorted so the index given to each input file is reproducible
    # (iterdir() yields entries in arbitrary order).
    for idx, file in enumerate(sorted(Path("csvdataframes_wOgIds").iterdir())):
        print(file)
        df = pd.read_csv(file)

        # Per-tweet frames, concatenated once after the loop rather than
        # growing a DataFrame row block by row block.
        frames = []
        for tweetId, author_id in zip(df['original_tweet_id'], df['author_id']):
            print(tweetId, end='')
            liker_ids = get_all_linkin_users(tweetId)
            if liker_ids:
                frames.append(pd.DataFrame(
                    {"user_id": liker_ids, "author_id": author_id}))
            print("Done")

        if frames:
            edgelists_df = pd.concat(frames, ignore_index=True)
        else:
            # Still write a header-only file when no edge was found.
            edgelists_df = pd.DataFrame(columns=["user_id", "author_id"])
        edgelists_df.to_csv(Path(f"edgelists/liking_{idx}.csv"), index=False)

In [None]:
# Build both edge lists: retweets first, then likes.
for build_edgelist in (create_retweeters_edgelist, create_liking_edgelist):
    build_edgelist()