# TwiBot-22 Data Cleaning
In this notebook, we explore the TwiBot-22 dataset and extract clean CSVs for our model training. TwiBot-22 is the largest and most comprehensive Twitter dataset to date. Because the files are very large, we read and extract the desired data in a way that does not overwhelm the hardware.  
For this project, we only need three files from the dataset: the `user` file, the `label` file and one of the `tweet` files. After downloading, we make sure they are named:
- user.json
- label.csv
- tweet.json
 
Then, we follow these procedures:

1. Reading and loading the labels.
2. Reading and loading the users' metadata.
3. Merging users with labels and filtering out unwanted columns.
4. Streaming tweets using `ijson` to read the data item by item. This way, we do not need to load the entire file into memory at once.
5. Building the `posts` dataframe by matching users with their tweets, keeping only users who have at least a chosen minimum number of tweets.
6. Downloading the profile images of the users kept in the previous step and updating the users dataframe with the local path of each profile image.
7. Exporting `users.csv` and `posts.csv`.

## Section 1: Imports & Configuration

In [None]:
import os, json, requests
from PIL import Image
from io import BytesIO
from tqdm.auto import tqdm
from collections import defaultdict
import pandas as pd
import numpy as np
from pathlib import Path

# --- Paths and configuration --- #
# The project root is taken to be the parent of the current working directory.
ROOT = Path.cwd().parent.resolve()
DATA_DIR = ROOT / "data" / "twibot22"

# Output folders; each one is created right away (no error if it already exists).
PROCESSED_DIR = DATA_DIR / "processed"
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
IMAGE_DIR = DATA_DIR / "profile_images"
IMAGE_DIR.mkdir(parents=True, exist_ok=True)

# --- Locations of the downloaded files (rename the files on disk if needed) --- #
LABEL_PATH = DATA_DIR / "label.csv"
USER_PATH  = DATA_DIR / "user.json"
TWEET_PATH = DATA_DIR / "tweet.json"

# --- Balancing & sampling --- #
TARGET_USER_PER_CLASS = 2_000  # Number of users to sample for humans and for bots, each
MAX_TWEETS_PER_USER   = 50     # Per-user cap that keeps the dataset manageable
MIN_TWEETS_PER_USER   = 5      # Users with fewer tweets than this are dropped

# Echo the resolved locations so a wrong working directory is easy to spot.
print("Label path:    ", LABEL_PATH)
print("User json:     ", USER_PATH)
print("Tweet json:    ", TWEET_PATH)
print("Processed dir: ", PROCESSED_DIR)

def space():
    """Print a 100-dash horizontal rule, padded with blank lines, to separate output sections."""
    rule = "-" * 100
    print("\n", rule, "\n")

## Section 2: Loading labels (human vs bot)

In [None]:
# --- Loading labels --- #
labels_df = pd.read_csv(LABEL_PATH)

print("Raw label head:")
display(labels_df.head())

space()

print("\nLabel value counts:")
print(labels_df["label"].value_counts())

# --- Keeping only the two known classes and encoding them: 'human' -> 0, 'bot' -> 1 --- #
# Rows carrying any other label value are discarded before the numeric column is added.
label_map = {"human": 0, "bot": 1}
valid_labels = list(label_map)
labels_df = (
    labels_df
    .loc[labels_df["label"].isin(valid_labels)]
    .assign(label_num=lambda frame: frame["label"].map(label_map))
)

space()

print("\nAfter filtering to {human, bot}:")
print(labels_df["label"].value_counts())
print("\nTotal labled users:", len(labels_df))

## Section 3: Loading user metadata

In [None]:
# --- Loading user metadata --- #
# The status line ends with "\r" so the next print overwrites it once loading is done.
print("Loading user.json (this might take a bit)...", end="\r")

# user.json is parsed in one go; it is a single JSON array: [ {...}, {...}, ... ]
with USER_PATH.open("r", encoding="utf-8") as user_file:
    users_raw = json.load(user_file)

# The trailing spaces blank out what is left of the longer status line above.
print(f"Total users in user.json: {len(users_raw)}           ")

# --- Converting to DataFrame (one row per user object) --- #
users_df = pd.DataFrame(users_raw)

print("\nUser keys:", list(users_df.columns[:30]))

space()

display(users_df.head())

## Section 4: Merging labels with users
Here, we only keep users who both exist in `user.json` and have a label in `label.csv`

In [None]:
# --- Merging labels & users --- #
# labels_df: columns [id, label, label_num]
# users_df:  columns [id, username, description, ...]
# An inner join on "id" keeps only users present in both frames.
merged_users = pd.merge(labels_df, users_df, on="id", how="inner")

print("Users with both metadata and label:", len(merged_users))
print(merged_users["label"].value_counts())

space()

# --- Keeping only a few user fields --- #
# Any wanted column that is absent from the merged frame is skipped instead of raising.
keep_cols = [
    col
    for col in ("id", "label", "label_num", "username", "description", "created_at", "profile_image_url")
    if col in merged_users.columns
]
merged_users = merged_users.loc[:, keep_cols].copy()

print("\nMerged user sample:")
display(merged_users.head())

## Section 5: Selecting balanced user subset
We balance **by user**, not by tweet.  
We pick up to **2000 humans** and **2000 bots** (`TARGET_USER_PER_CLASS` each), or fewer if the dataset does not contain that many.

In [None]:
# --- Selecting balanced user subset --- #
# Split the labelled users by their numeric class (0 = human, 1 = bot).
humans = merged_users.loc[merged_users["label_num"].eq(0)]
bots   = merged_users.loc[merged_users["label_num"].eq(1)]

print("Available human users:", len(humans))
print("Available bot users:", len(bots))

space()

# Never ask for more users than a class actually has.
humans_num = min(TARGET_USER_PER_CLASS, len(humans))
bots_num   = min(TARGET_USER_PER_CLASS, len(bots))

print(f"Sampling {humans_num} humans and {bots_num} bots")

# A fixed seed keeps the sample reproducible; an empty class is passed through unchanged.
if humans_num > 0:
    humans_sample = humans.sample(n=humans_num, random_state=42)
else:
    humans_sample = humans

if bots_num > 0:
    bots_sample = bots.sample(n=bots_num, random_state=42)
else:
    bots_sample = bots

# Stack both classes, then shuffle the rows so the classes are interleaved.
selected_users = (
    pd.concat([humans_sample, bots_sample])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

space()

print("Selected users:", len(selected_users))
print(selected_users["label"].value_counts())

# Set of ids for fast membership tests while streaming tweets.
selected_user_ids = set(selected_users["id"].tolist())

## Section 6: Stream tweets from tweet.json

In [None]:
# --- Tweet streaming helper for json array --- #
import ijson

def iter_tweet_array(path):
    """
    Lazily yield the elements of a large JSON file whose top level is an array.

    ijson parses the file incrementally, so only one element is materialised
    at a time instead of the whole file.

    Parameters
    ----------
    path : str or path-like
        Location of the JSON file to stream.

    Yields
    ------
    object
        Each non-null element of the top-level array, in file order
        (for the tweet file these are expected to be dicts).
    """
    # Binary mode on purpose: ijson consumes bytes and decodes them itself.
    # Text-mode handles are deprecated in ijson and cost an extra re-encoding pass.
    with open(path, "rb") as f:
        for tweet in ijson.items(f, "item"):
            # JSON `null` entries carry no data; skip them.
            if tweet is None:
                continue
            yield tweet

# --- Using helper to stream tweets --- #
# Collects up to MAX_TWEETS_PER_USER tweets for every selected user while
# reading the tweet file one item at a time.
tweets_records = []               # one dict per kept tweet: id (author), tweet_id, text
tweet_counts = defaultdict(int)   # author id -> number of tweets kept so far

total_lines = 0                   # tweets scanned
kept_tweets = 0                   # tweets kept
users_full = 0                    # selected users that have reached the per-user cap
n_selected = len(selected_user_ids)
progress_every = 100_000          # print a progress line every this many kept tweets

print("Streaming tweets from:", TWEET_PATH)
print(f"Target max tweet per user: {MAX_TWEETS_PER_USER}")

for tweet in iter_tweet_array(TWEET_PATH):
    total_lines += 1

    raw_author_id = tweet.get("author_id")
    text = tweet.get("text")

    # Test the raw value: once the 'u' prefix is added the id can never be None.
    if raw_author_id is None or text is None:
        continue

    author_id = 'u' + str(raw_author_id)  # To match the id in users and labels files.

    if author_id not in selected_user_ids:
        continue

    # .get() instead of indexing, so a lookup never inserts a zero-count key.
    if tweet_counts.get(author_id, 0) >= MAX_TWEETS_PER_USER:
        # Already reached per-user cap
        continue

    tweets_records.append({
        "id":  author_id,
        "tweet_id": tweet.get("id", ""),
        "text":     text
    })
    tweet_counts[author_id] += 1
    kept_tweets += 1

    # A user crosses the cap exactly once, so a running counter stays exact.
    if tweet_counts[author_id] == MAX_TWEETS_PER_USER:
        users_full += 1

    # Check progress occasionally
    if kept_tweets % progress_every == 0:
        # tweet_counts only holds users with at least one kept tweet.
        print(f"Processed {total_lines} lines, kept {kept_tweets} tweets | "
              f"users with any tweets: {len(tweet_counts)}, full: {users_full}")

    # Early stopping: checked after every kept tweet, so the scan ends as soon as
    # ALL selected users are capped rather than only at a progress checkpoint.
    if users_full == n_selected:
        print("All selected users reached the max tweet cap. Stopping early.")
        break

print("\nDone straming.")
print("Total lines scanned:", total_lines)
print("Total tweets kept:", kept_tweets)
print("Distinct users with tweets:", len(tweet_counts))

## Section 7: Building posts.csv and filtering users by `MIN_TWEETS_PER_USER`

In [None]:
# --- Building posts DataFrame and filter users --- #
# Columns are given explicitly so the frame keeps its schema even when no tweet
# was collected; otherwise posts_df['id'] below would raise a KeyError.
posts_df = pd.DataFrame(tweets_records, columns=["id", "tweet_id", "text"])
print("Raw posts_df shape:", posts_df.shape)
display(posts_df.head())

space()

# --- Counting tweets per user --- #
tweet_counts_series = posts_df['id'].value_counts()
print("\nTweets per user (summary):")
display(tweet_counts_series.describe())

space()

# --- Keeping only users with at least MIN_TWEETS_PER_USER tweets --- #
eligible_user_ids = set(tweet_counts_series[tweet_counts_series >= MIN_TWEETS_PER_USER].index)
print(f"\nUsers with at least {MIN_TWEETS_PER_USER} tweets:", len(eligible_user_ids))

space()

# --- Filtering posts --- #
posts_df = posts_df[posts_df['id'].isin(eligible_user_ids)].reset_index(drop=True)
print("Filtered posts_df shape:", posts_df.shape)

# --- Filtering user metadata to those with sufficient tweets --- #
# .copy() gives an independent frame, so adding columns later does not touch selected_users.
final_users = selected_users[selected_users["id"].isin(eligible_user_ids)].copy()
print("Final users count:", len(final_users))
print(final_users["label"].value_counts())

space()

display(final_users.head())

## Section 8: Downloading profile images

In [None]:
def download_image(url, save_path):
    """
    Download an image and store it as an RGB JPEG at `save_path` (best effort).

    The image is first written to a temporary sibling file and only renamed to
    `save_path` once it has been saved completely. A failed attempt therefore
    never leaves a truncated file behind that a later "already downloaded"
    check would mistake for a valid image.

    Parameters
    ----------
    url : str
        Address of the image to fetch.
    save_path : str or path-like
        Destination of the JPEG file.

    Returns
    -------
    bool
        True if the image was fetched (HTTP 200), decoded and saved;
        False on any other status code or on any error.
    """
    save_path = Path(save_path)
    tmp_path = save_path.with_name(save_path.name + ".part")
    try:
        resp = requests.get(url, timeout=5)
        if resp.status_code != 200:
            return False

        # Convert to RGB because JPEG cannot store palette or alpha modes.
        img = Image.open(BytesIO(resp.content)).convert("RGB")
        img.save(tmp_path, format="JPEG")
        tmp_path.replace(save_path)   # rename only after a complete write
        return True
    except Exception:
        # Deliberately broad: one bad URL or corrupt image must not abort the
        # whole batch. Remove whatever was partially written.
        try:
            tmp_path.unlink()
        except OSError:
            pass
        return False

# --- Downloading images for all final users --- #
# Re-run safety: the end of this cell drops "profile_image_url" from final_users.
# On a second execution the URLs are therefore read back from selected_users,
# which final_users was filtered from (same index labels, same row order).
# Without this, a re-run would find no URL and overwrite every path with None.
if "profile_image_url" in final_users.columns:
    url_source = final_users
else:
    url_source = selected_users.loc[final_users.index]

local_paths = []   # one entry per row of final_users: local file path, or None

print("Downloading TwiBot profile images...")

for _, row in tqdm(url_source.iterrows(), total=len(url_source)):
    user_id = row["id"]
    url = row.get("profile_image_url", None)

    # Missing column, None or NaN: nothing to download for this user.
    if not isinstance(url, str):
        local_paths.append(None)
        continue

    save_path = IMAGE_DIR / f"{user_id}.jpg"

    if save_path.exists():     # Skipping if image is already downloaded
        local_paths.append(str(save_path))
        continue

    # Trying to download
    ok = download_image(url, save_path)
    local_paths.append(str(save_path) if ok else None)

print("Download Completed.")

# --- Adding resolved local image paths --- #
final_users["profile_image_path"] = local_paths

# --- Removing the original URL column (Optional, can be commented) --- #
final_users = final_users.drop(columns=["profile_image_url"], errors="ignore")

## Section 9: Saving cleaned users and posts

In [None]:
# --- Writing the cleaned tables to the processed folder --- #
users_out_path = PROCESSED_DIR / "users.csv"
posts_out_path = PROCESSED_DIR / "posts.csv"

# index=False: the row index carries no information and is left out of the files.
final_users.to_csv(users_out_path, index=False)
posts_df.to_csv(posts_out_path, index=False)

print("Saved users to:", users_out_path)
print("Saved posts to:", posts_out_path)

space()

# --- Short recap of what was exported --- #
print("\nFinal summary:")
print("Users:", final_users.shape[0])
print("Posts:", posts_df.shape[0])
print("Label distribution (users):")
print(final_users["label"].value_counts())