<a href="https://colab.research.google.com/github/DDPSscipo/Civica_Netzero/blob/main/Twitter_Networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install ndjson

Note: you may need to restart the kernel to use updated packages.


In [None]:
### Merge the ndjson export files (if there are several) and load them as a list of Python dicts
import ndjson
import os

def merge_ndjson_files(directory):
    """Load every ``.ndjson`` file in *directory* and merge them into one list.

    Items are de-duplicated on their ``"item_id"`` value: the first occurrence
    is kept and later ones are dropped.  Files are visited in sorted filename
    order so that the result (including which duplicate wins) is the same on
    every run; ``os.listdir`` alone returns names in arbitrary order.

    Args:
        directory: Path of the folder to scan (sub-folders are not visited).

    Returns:
        A list with the parsed items, in first-seen order.

    Raises:
        KeyError: If an item has no ``"item_id"`` key.
        OSError: If *directory* or one of its files cannot be read.
    """
    merged_data = []
    seen = set()

    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".ndjson"):
            continue
        filepath = os.path.join(directory, filename)
        # JSON text is UTF-8; do not rely on the platform default encoding,
        # which would garble or reject emoji / non-Latin tweet text.
        with open(filepath, "r", encoding="utf-8") as file:
            data = ndjson.load(file)
            for item in data:
                tweet_id = item["item_id"]
                # Keep only the first copy of each tweet
                if tweet_id in seen:
                    continue
                seen.add(tweet_id)
                merged_data.append(item)
    return merged_data

# Load and de-duplicate everything found in the local "Twitter" folder
merged_data = merge_ndjson_files(directory="./Twitter")

In [None]:
### Clean Twitter data

def _quoted_tweet_fields(tweet):
    """Return ``(tweet_id, user_id, screen_name)`` of the tweet quoted by *tweet*.

    All three values are ``None`` unless ``tweet["quoted_status_result"]["result"]``
    exists and its ``"__typename"`` is ``"Tweet"``.
    """
    quoted = (tweet.get("quoted_status_result") or {}).get("result")
    # .get() so that a result without "__typename" is skipped, not a KeyError
    if not quoted or quoted.get("__typename") != "Tweet":
        return None, None, None
    quoted_user = quoted["core"]["user_results"]["result"]
    return (
        quoted["rest_id"],
        quoted_user["rest_id"],
        quoted_user["legacy"]["screen_name"],
    )


def clean_data(raw_data):
    """Flatten raw tweet items into one flat dict per tweet.

    Args:
        raw_data: Iterable of items shaped like
            ``{"item_id": ..., "data": {"core": ..., "legacy": ..., ...}}``.

    Returns:
        A list of dicts, one per input item, all with the same keys in the
        same order: ``tweet_id``, ``views_count``, ``favorite_count``,
        ``retweet_count``, ``full_text``, ``created_at``, ``hashtags`` (list
        of hashtag texts), ``user_id``, ``user_followers_count``,
        ``user_location``, ``user_screen_name``, ``conversation_id``,
        ``quoted_tweet_id``, ``quoted_tweet_user_id``,
        ``quoted_tweet_user_name`` and ``mentioned_list`` (the raw
        ``user_mentions`` entries).  ``views_count`` and the three
        ``quoted_*`` values are ``None`` when the item does not carry them.

    Raises:
        KeyError: If an item lacks one of the mandatory author or ``legacy``
            fields.
    """
    cleaned = []

    for item in raw_data:
        tweet = item["data"]
        legacy = tweet["legacy"]
        entities = legacy["entities"]

        # The author's metadata
        author = tweet["core"]["user_results"]["result"]
        author_legacy = author["legacy"]

        # A missing "views" object is treated like a missing count instead of
        # aborting the whole run with a KeyError.
        views_count = (tweet.get("views") or {}).get("count")

        quoted_tweet_id, quoted_tweet_user_id, quoted_tweet_user_name = (
            _quoted_tweet_fields(tweet)
        )

        cleaned.append({
            "tweet_id": item["item_id"],
            "views_count": views_count,
            "favorite_count": legacy["favorite_count"],
            "retweet_count": legacy["retweet_count"],
            "full_text": legacy["full_text"],
            "created_at": legacy["created_at"],
            "hashtags": [tag["text"] for tag in entities["hashtags"]],
            "user_id": author["rest_id"],
            "user_followers_count": author_legacy["followers_count"],
            "user_location": author_legacy["location"],
            # Screen name is the user's name as shown in the profile URL
            "user_screen_name": author_legacy["screen_name"],
            "conversation_id": legacy["conversation_id_str"],
            "quoted_tweet_id": quoted_tweet_id,
            "quoted_tweet_user_id": quoted_tweet_user_id,
            "quoted_tweet_user_name": quoted_tweet_user_name,
            "mentioned_list": entities["user_mentions"],
        })
    return cleaned

# Flatten the merged raw items into one record per tweet
cleaned_data = clean_data(raw_data=merged_data)

In [None]:
# Output to csv format
import csv

# Change name to 2021 or 2023
output_csv_path = "./data_202X.csv"

# Column names are taken from the first cleaned record; with no records the
# file ends up with an empty header line only.
headers = list(cleaned_data[0].keys()) if cleaned_data else []

# encoding="utf-8": tweet text and locations contain emoji / non-Latin
# characters that the platform default encoding may be unable to write.
# newline="": let the csv module control line endings itself.
with open(output_csv_path, "w", newline="", encoding="utf-8") as output_csv_file:
    csv_writer = csv.DictWriter(output_csv_file, fieldnames=headers)

    # Header row followed by one row per cleaned tweet
    csv_writer.writeheader()
    csv_writer.writerows(cleaned_data)

In [None]:
### Co-hashtag Network

from itertools import combinations

# Change name to 2021 or 2023
output_node = './cohash_nodes_202X.csv'
output_edge = './cohash_edge_202X.csv'

headers_node = ['Id', 'Label']
headers_edge = ['Source', 'Target']

nodes = set()   # every distinct hashtag text
edges = []      # one {"Source", "Target"} dict per hashtag pair per tweet

for item in cleaned_data:
    # Drop repeats inside a single tweet (keeping first-seen order): a hashtag
    # used twice in one tweet would otherwise yield a self-loop and duplicate
    # edges to every other hashtag of that tweet.
    hashtags = list(dict.fromkeys(item["hashtags"] or []))
    nodes.update(hashtags)
    # Every unordered pair of hashtags that co-occur in this tweet is an edge
    edges.extend(
        {"Source": tag1, "Target": tag2}
        for tag1, tag2 in combinations(hashtags, 2)
    )

# utf-8: hashtags may contain non-ASCII characters that the platform default
# encoding cannot write.
with open(output_node, 'w', newline='', encoding='utf-8') as output_node_file:
    csv_writer = csv.DictWriter(output_node_file, fieldnames=headers_node)
    csv_writer.writeheader()
    # Sorted so the node file is identical from run to run (set order is not)
    csv_writer.writerows(
        {"Id": node, "Label": f"#{node}"} for node in sorted(nodes)
    )

with open(output_edge, 'w', newline='', encoding='utf-8') as output_edge_file:
    csv_writer = csv.DictWriter(output_edge_file, fieldnames=headers_edge)
    csv_writer.writeheader()
    csv_writer.writerows(edges)


In [None]:
### User Interaction Network

# Change name to 2021 or 2023
output_node = './Gephi_nodes_202X.csv'
output_edge = './Gephi_edge_202X.csv'

headers_node = ['Id', 'Label']
headers_edge = ['Source', 'Target', 'Relation']

# user id -> screen name. A dict (not one row per tweet) so that a user who
# posted several tweets appears in the node table exactly once.
node_labels = {}
edge_data = []

for item in cleaned_data:
    user_id = item["user_id"]
    # The author's own screen name always takes precedence over a name that
    # was only seen in a quote or mention.
    node_labels[user_id] = item["user_screen_name"]

    quoted_tweet_user_id = item["quoted_tweet_user_id"]
    if quoted_tweet_user_id:
        edge_data.append({
            "Source": quoted_tweet_user_id,
            "Target": user_id,
            "Relation": "Quoted by",
        })
        # Give the quoted account a labelled node as well; otherwise it only
        # exists as a bare id in the edge table.
        quoted_name = item.get("quoted_tweet_user_name")
        if quoted_name:
            node_labels.setdefault(quoted_tweet_user_id, quoted_name)

    for mentioned in item["mentioned_list"]:
        mentioned_id = mentioned["id_str"]
        edge_data.append({
            "Source": mentioned_id,
            "Target": user_id,
            "Relation": "Mentioned by",
        })
        # assumes mention entries carry "screen_name" next to "id_str";
        # entries without it simply get no node row — TODO confirm
        mentioned_name = mentioned.get("screen_name")
        if mentioned_name:
            node_labels.setdefault(mentioned_id, mentioned_name)

# utf-8 so the files do not depend on the platform default encoding
with open(output_node, 'w', newline='', encoding='utf-8') as output_node_file:
    csv_writer = csv.DictWriter(output_node_file, fieldnames=headers_node)
    csv_writer.writeheader()
    csv_writer.writerows(
        {"Id": node_id, "Label": label} for node_id, label in node_labels.items()
    )

with open(output_edge, 'w', newline='', encoding='utf-8') as output_edge_file:
    csv_writer = csv.DictWriter(output_edge_file, fieldnames=headers_edge)
    csv_writer.writeheader()
    csv_writer.writerows(edge_data)


In [None]:
### Check whether some of the influential 2023 users also appear as authors in the 2021 data

# NOTE(review): 'lanEnglishman' starts with a lowercase "l" — possibly meant
# to be a capital "I" ('IanEnglishman'); confirm against the actual handle.
red_users_to_highlight = ['rec777777', 'LordOfFreedom2','MikeTho0495078'
                          ,'PaulssonChris','lanEnglishman','NorthBritannia']
blue_users_to_highlight = ['ScienceBlog3', 'Prof_Dr_SG', 'Fossil_Herb'
                           , 'Royalacresrod', 'Sasha67Oz', 'peblackstock']

# One set for both groups: constant-time membership tests
users_to_highlight = set(red_users_to_highlight) | set(blue_users_to_highlight)

# Screen names are compared exactly (case-sensitive).
# The previous loop added matches to an undefined name (`activeIn2021`) and
# would have raised NameError on the first match.
active_in_2021 = {
    item["user_screen_name"] for item in cleaned_data
} & users_to_highlight

print(active_in_2021) # The original author noted that this printed an empty set