### ***DATASET PREPARATION***

In [2]:
import pandas as pd
import numpy as np

# ============================================================
# STEP 1: STRATIFIED SAMPLING DARI LABELS
# ============================================================
print("="*70)
print("CREATING STRATIFIED SAMPLE")
print("="*70)

# Load labels
labels = pd.read_csv('x_dataset/label.csv')
print(f"Original dataset: {len(labels):,} users")

# Target sample size
SAMPLE_SIZE = 20000  # Adjust: 10000, 20000, or 50000

# Stratified sampling (maintain bot:human ratio)
bot_ratio = 0.14  # 14% bots
n_bots = int(SAMPLE_SIZE * bot_ratio)
n_humans = SAMPLE_SIZE - n_bots

print(f"\nTarget sample: {SAMPLE_SIZE:,} users")
print(f"  - Bots: {n_bots:,}")
print(f"  - Humans: {n_humans:,}")

# Sample
bots = labels[labels['label'] == 'bot'].sample(n=n_bots, random_state=42)
humans = labels[labels['label'] == 'human'].sample(n=n_humans, random_state=42)
sampled_labels = pd.concat([bots, humans]).reset_index(drop=True)

# Save sampled labels
sampled_labels.to_csv('final_x_dataset/sampled_labels.csv', index=False)
print(f"\n‚úÖ Saved to 'sampled_labels.csv'")

# Get sampled user IDs
sampled_user_ids = set(sampled_labels['id'].values)
print(f"\n‚úÖ {len(sampled_user_ids):,} users sampled")

# ============================================================
# STEP 2: FILTER EDGES (POST RELATIONS) FOR SAMPLED USERS
# ============================================================
print("\n" + "="*70)
print("FILTERING TWEETS FOR SAMPLED USERS")
print("="*70)

# Load edges in chunks to avoid memory issues
chunk_size = 100000
filtered_edges = []

print("Processing edge.csv in chunks...")
for i, chunk in enumerate(pd.read_csv('x_dataset/edge.csv', 
                                       names=['source_id', 'relation', 'target_id'],
                                       chunksize=chunk_size)):
    # Filter post relations for sampled users
    post_chunk = chunk[(chunk['relation'] == 'post') & 
                       (chunk['source_id'].isin(sampled_user_ids))]
    filtered_edges.append(post_chunk)
    
    if (i+1) % 10 == 0:
        print(f"  Processed {(i+1)*chunk_size:,} edges...")

# Combine filtered edges
sampled_edges = pd.concat(filtered_edges).reset_index(drop=True)
sampled_edges.to_csv('final_x_dataset/sampled_edges.csv', index=False)
print(f"\n‚úÖ Filtered tweets: {len(sampled_edges):,}")
print(f"‚úÖ Saved to 'sampled_edges.csv'")

# ============================================================
# STEP 3: TWEETS PER USER STATISTICS (SAMPLED)
# ============================================================
print("\n" + "="*70)
print("TWEETS PER USER DISTRIBUTION (SAMPLED)")
print("="*70)

tweets_per_user = sampled_edges.groupby('source_id').size()
print(f"Users with tweets: {len(tweets_per_user):,}")
print(f"\nStatistics:")
print(tweets_per_user.describe())

print(f"\nüìå Users with ‚â•2 tweets: {(tweets_per_user >= 2).sum():,} ({(tweets_per_user >= 2).sum()/len(tweets_per_user)*100:.1f}%)")
print(f"üìå Users with ‚â•5 tweets: {(tweets_per_user >= 5).sum():,} ({(tweets_per_user >= 5).sum()/len(tweets_per_user)*100:.1f}%)")
print(f"üìå Users with ‚â•10 tweets: {(tweets_per_user >= 10).sum():,} ({(tweets_per_user >= 10).sum()/len(tweets_per_user)*100:.1f}%)")
print(f"üìå Users with 1 tweet: {(tweets_per_user == 1).sum():,} ({(tweets_per_user == 1).sum()/len(tweets_per_user)*100:.1f}%)")

# ============================================================
# STEP 4: FINAL USABLE DATASET
# ============================================================
print("\n" + "="*70)
print("FINAL USABLE DATASET")
print("="*70)

# Users with ‚â•2 tweets (can calculate TIE)
users_with_min_tweets = set(tweets_per_user[tweets_per_user >= 2].index)
final_user_ids = users_with_min_tweets.intersection(sampled_user_ids)

final_labels = sampled_labels[sampled_labels['id'].isin(final_user_ids)]
print(f"‚úÖ Final users (‚â•2 tweets): {len(final_user_ids):,}")
print(f"  - Bots: {(final_labels['label'] == 'bot').sum():,}")
print(f"  - Humans: {(final_labels['label'] == 'human').sum():,}")

# Save final user list
final_labels.to_csv('final_x_dataset/final_labels.csv', index=False)
print(f"\n‚úÖ Saved to 'final_labels.csv'")

# Save final tweet IDs for extraction
final_edges = sampled_edges[sampled_edges['source_id'].isin(final_user_ids)]
final_tweet_ids = set(final_edges['target_id'].values)
print(f"‚úÖ Total tweets to extract: {len(final_tweet_ids):,}")

# Save tweet IDs for next step
with open('final_tweet_ids.txt', 'w') as f:
    for tid in final_tweet_ids:
        f.write(f"{tid}\n")
print(f"‚úÖ Saved tweet IDs to 'final_tweet_ids.txt'")

CREATING STRATIFIED SAMPLE
Original dataset: 1,000,000 users

Target sample: 20,000 users
  - Bots: 2,800
  - Humans: 17,200

‚úÖ Saved to 'sampled_labels.csv'

‚úÖ 20,000 users sampled

FILTERING TWEETS FOR SAMPLED USERS
Processing edge.csv in chunks...
  Processed 1,000,000 edges...
  Processed 2,000,000 edges...
  Processed 3,000,000 edges...
  Processed 4,000,000 edges...
  Processed 5,000,000 edges...
  Processed 6,000,000 edges...
  Processed 7,000,000 edges...
  Processed 8,000,000 edges...
  Processed 9,000,000 edges...
  Processed 10,000,000 edges...
  Processed 11,000,000 edges...
  Processed 12,000,000 edges...
  Processed 13,000,000 edges...
  Processed 14,000,000 edges...
  Processed 15,000,000 edges...
  Processed 16,000,000 edges...
  Processed 17,000,000 edges...
  Processed 18,000,000 edges...
  Processed 19,000,000 edges...
  Processed 20,000,000 edges...
  Processed 21,000,000 edges...
  Processed 22,000,000 edges...
  Processed 23,000,000 edges...
  Processed 24,000

In [5]:
import pandas as pd
import random

print("="*70)
print("CREATING ULTRA-SMALL SAMPLE (MEMORY-SAFE)")
print("="*70)

# ============================================================
# REDUCED SAMPLE: 2,000 USERS (instead of 20,000)
# ============================================================
SAMPLE_SIZE = 2000  # Much smaller!

labels = pd.read_csv('x_dataset/label.csv')
print(f"Original dataset: {len(labels):,} users")

# Stratified sampling
bot_ratio = 0.14
n_bots = int(SAMPLE_SIZE * bot_ratio)
n_humans = SAMPLE_SIZE - n_bots

print(f"\nTarget sample: {SAMPLE_SIZE:,} users")
print(f"  - Bots: {n_bots:,}")
print(f"  - Humans: {n_humans:,}")

# Sample
bots = labels[labels['label'] == 'bot'].sample(n=n_bots, random_state=42)
humans = labels[labels['label'] == 'human'].sample(n=n_humans, random_state=42)
sampled_labels = pd.concat([bots, humans]).reset_index(drop=True)

# Save
sampled_labels.to_csv('final_x_dataset/sampled_labels_small.csv', index=False)
print(f"\n‚úÖ Saved to 'sampled_labels_small.csv'")

sampled_user_ids = set(sampled_labels['id'].values)

# ============================================================
# FILTER EDGES
# ============================================================
print("\n" + "="*70)
print("FILTERING TWEETS FOR SMALL SAMPLE")
print("="*70)

chunk_size = 50000
filtered_edges = []

for i, chunk in enumerate(pd.read_csv('x_dataset/edge.csv', 
                                       names=['source_id', 'relation', 'target_id'],
                                       chunksize=chunk_size)):
    post_chunk = chunk[(chunk['relation'] == 'post') & 
                       (chunk['source_id'].isin(sampled_user_ids))]
    filtered_edges.append(post_chunk)
    
    if (i+1) % 20 == 0:
        print(f"  Processed {(i+1)*chunk_size:,} edges...")

sampled_edges = pd.concat(filtered_edges).reset_index(drop=True)
sampled_edges.to_csv('final_x_dataset/sampled_edges_small.csv', index=False)

print(f"\n‚úÖ Filtered tweets: {len(sampled_edges):,}")

# Statistics
tweets_per_user = sampled_edges.groupby('source_id').size()
print(f"\nüìä Users with tweets: {len(tweets_per_user):,}")
print(f"üìä Users with ‚â•2 tweets: {(tweets_per_user >= 2).sum():,}")

# Save final tweet IDs
final_tweet_ids = set(sampled_edges['target_id'].values)
with open('final_tweet_ids_small.txt', 'w') as f:
    for tid in final_tweet_ids:
        f.write(f"{tid}\n")

print(f"‚úÖ Saved {len(final_tweet_ids):,} tweet IDs")
print("\n‚úÖ SMALL SAMPLE READY!")

CREATING ULTRA-SMALL SAMPLE (MEMORY-SAFE)
Original dataset: 1,000,000 users

Target sample: 2,000 users
  - Bots: 280
  - Humans: 1,720

‚úÖ Saved to 'sampled_labels_small.csv'

FILTERING TWEETS FOR SMALL SAMPLE
  Processed 1,000,000 edges...
  Processed 2,000,000 edges...
  Processed 3,000,000 edges...
  Processed 4,000,000 edges...
  Processed 5,000,000 edges...
  Processed 6,000,000 edges...
  Processed 7,000,000 edges...
  Processed 8,000,000 edges...
  Processed 9,000,000 edges...
  Processed 10,000,000 edges...
  Processed 11,000,000 edges...
  Processed 12,000,000 edges...
  Processed 13,000,000 edges...
  Processed 14,000,000 edges...
  Processed 15,000,000 edges...
  Processed 16,000,000 edges...
  Processed 17,000,000 edges...
  Processed 18,000,000 edges...
  Processed 19,000,000 edges...
  Processed 20,000,000 edges...
  Processed 21,000,000 edges...
  Processed 22,000,000 edges...
  Processed 23,000,000 edges...
  Processed 24,000,000 edges...
  Processed 25,000,000 edges.

In [3]:
import ijson
import pandas as pd
import gc
from tqdm import tqdm

print("="*70)
print("EXTRACTING TWEETS (IJSON STREAMING - FOR VERY LARGE FILES)")
print("="*70)

# ============================================================
# STEP 1: LOAD TWEET IDs TO EXTRACT
# ============================================================
print("\n[1/3] Loading tweet IDs to extract...")
with open('final_x_dataset/final_tweet_ids_small.txt', 'r') as f:
    final_tweet_ids = set(line.strip() for line in f)
print(f"‚úÖ Loaded {len(final_tweet_ids):,} tweet IDs to extract")

# ============================================================
# STEP 2: STREAMING EXTRACTION WITH IJSON
# ============================================================
print("\n[2/3] Extracting tweets with ijson (streaming parser)...")
print("‚ö†Ô∏è This may take 20-60 minutes for 11GB file...")
print("‚è≥ Processing... (no progress bar due to streaming)")

file_path = 'x_dataset/tweet_0.json'
extracted_tweets = []
tweets_processed = 0
tweets_found = 0

def extract_tweet_data(tweet):
    """Extract relevant data from a tweet object"""
    try:
        entities = tweet.get('entities', {})
        mentions = entities.get('user_mentions', [])
        hashtags = entities.get('hashtags', [])
        urls = entities.get('urls', [])
        
        return {
            'tweet_id': tweet.get('id'),
            'user_id': tweet.get('author_id'),
            'created_at': tweet.get('created_at'),
            'text': tweet.get('text', ''),
            'retweet_count': tweet.get('public_metrics', {}).get('retweet_count', 0),
            'reply_count': tweet.get('public_metrics', {}).get('reply_count', 0),
            'like_count': tweet.get('public_metrics', {}).get('like_count', 0),
            'quote_count': tweet.get('public_metrics', {}).get('quote_count', 0),
            'mention_count': len(mentions) if mentions else 0,
            'hashtag_count': len(hashtags) if hashtags else 0,
            'url_count': len(urls) if urls else 0
        }
    except Exception as e:
        return None

# Open file with ijson (streaming mode)
with open(file_path, 'rb') as f:
    # Parse as array items (each tweet is an item in the array)
    parser = ijson.items(f, 'item')
    
    for tweet in parser:
        tweets_processed += 1
        
        # Check if this tweet is in our list
        tweet_id = tweet.get('id')
        if tweet_id in final_tweet_ids:
            tweet_data = extract_tweet_data(tweet)
            if tweet_data:
                extracted_tweets.append(tweet_data)
                tweets_found += 1
                
                # Progress update every 1000 tweets
                if tweets_found % 1000 == 0:
                    print(f"\r  ‚úÖ Found: {tweets_found:,} / {len(final_tweet_ids):,} tweets ({tweets_found/len(final_tweet_ids)*100:.1f}%)", end='', flush=True)
        
        # Progress update every 100k tweets processed
        if tweets_processed % 100000 == 0:
            print(f"\n  ‚è≥ Processed: {tweets_processed:,} tweets, Found: {tweets_found:,}", flush=True)
            gc.collect()
        
        # Early stop if found all
        if tweets_found == len(final_tweet_ids):
            print(f"\n‚úÖ Found all tweets! Stopping early.")
            break

print(f"\n\n‚úÖ Extraction complete!")
print(f"   Tweets processed: {tweets_processed:,}")
print(f"   Tweets found: {tweets_found:,}")
print(f"   Coverage: {tweets_found / len(final_tweet_ids) * 100:.1f}%")

# ============================================================
# STEP 3: CONVERT TO DATAFRAME & SAVE
# ============================================================
print("\n[3/3] Converting to DataFrame and saving...")

if len(extracted_tweets) == 0:
    print("\n‚ùå ERROR: No tweets extracted!")
    raise ValueError("No tweets found - check ID format or file structure")

tweets_df = pd.DataFrame(extracted_tweets)
tweets_df['created_at'] = pd.to_datetime(tweets_df['created_at'])
tweets_df = tweets_df.sort_values(['user_id', 'created_at']).reset_index(drop=True)
tweets_df['text_length'] = tweets_df['text'].str.len()

print(f"‚úÖ DataFrame created: {tweets_df.shape}")

# ============================================================
# STATISTICS
# ============================================================
print("\n" + "="*70)
print("EXTRACTION STATISTICS")
print("="*70)

print(f"\nüìä Tweets extracted: {len(tweets_df):,}")
print(f"üìä Unique users: {tweets_df['user_id'].nunique():,}")

tweets_per_user = tweets_df.groupby('user_id').size()
print(f"\nüìä Tweets per user:")
print(f"   Mean:   {tweets_per_user.mean():.1f}")
print(f"   Median: {tweets_per_user.median():.0f}")
print(f"   Min:    {tweets_per_user.min()}")
print(f"   Max:    {tweets_per_user.max()}")

users_with_gte2 = (tweets_per_user >= 2).sum()
print(f"\nüìä Users with ‚â•2 tweets: {users_with_gte2:,} ({users_with_gte2/len(tweets_per_user)*100:.1f}%)")

print("\nüìã Sample:")
print(tweets_df.head())

# ============================================================
# SAVE
# ============================================================
output_file = 'final_x_dataset/extracted_tweets.csv'
tweets_df.to_csv(output_file, index=False)
print(f"\n‚úÖ Saved to '{output_file}'")

del extracted_tweets, tweets_df
gc.collect()

print("\n" + "="*70)
print("‚úÖ EXTRACTION COMPLETE!")
print("="*70)

EXTRACTING TWEETS (IJSON STREAMING - FOR VERY LARGE FILES)

[1/3] Loading tweet IDs to extract...
‚úÖ Loaded 182,328 tweet IDs to extract

[2/3] Extracting tweets with ijson (streaming parser)...
‚ö†Ô∏è This may take 20-60 minutes for 11GB file...
‚è≥ Processing... (no progress bar due to streaming)

  ‚è≥ Processed: 100,000 tweets, Found: 200

  ‚è≥ Processed: 200,000 tweets, Found: 280

  ‚è≥ Processed: 300,000 tweets, Found: 480

  ‚è≥ Processed: 400,000 tweets, Found: 680

  ‚è≥ Processed: 500,000 tweets, Found: 880

  ‚è≥ Processed: 600,000 tweets, Found: 959
  ‚úÖ Found: 1,000 / 182,328 tweets (0.5%)
  ‚è≥ Processed: 700,000 tweets, Found: 1,156

  ‚è≥ Processed: 800,000 tweets, Found: 1,360

  ‚è≥ Processed: 900,000 tweets, Found: 1,558

  ‚è≥ Processed: 1,000,000 tweets, Found: 1,790

  ‚è≥ Processed: 1,100,000 tweets, Found: 1,984
  ‚úÖ Found: 2,000 / 182,328 tweets (1.1%)
  ‚è≥ Processed: 1,200,000 tweets, Found: 2,253

  ‚è≥ Processed: 1,300,000 tweets, Found: 2,446

  ‚è≥ P

In [2]:
import pandas as pd
import json
from tqdm import tqdm

print("="*70)
print("CREATING FINAL DATASET (1 CSV FILE - LIKE OLD DATASET)")
print("="*70)

# ============================================================
# STEP 1: LOAD ALL DATA
# ============================================================
print("\n[1/4] Loading extracted tweets...")
tweets_df = pd.read_csv('final_x_dataset/extracted_tweets.csv')
# Setelah load tweets_df, tambahkan ini:
tweets_df['user_id'] = 'u' + tweets_df['user_id'].astype(str)
print(f"‚úÖ Loaded {len(tweets_df):,} tweets")
print(f"   user_id type: {tweets_df['user_id'].dtype}")

print("\n[2/4] Loading user metadata from user.json...")
with open('x_dataset/user.json', 'r', encoding='utf-8') as f:
    users_data = json.load(f)

# Convert to DataFrame
users_list = []
for user in tqdm(users_data, desc="Processing users"):
    users_list.append({
        'user_id': user['id'],  # Keep as string from JSON
        'followers_count': user['public_metrics']['followers_count'],
        'following_count': user['public_metrics']['following_count'],
        'user_tweet_count': user['public_metrics']['tweet_count'],
        'verified': user['verified'],
        'account_created_at': user['created_at']
    })

users_df = pd.DataFrame(users_list)
print(f"‚úÖ Loaded {len(users_df):,} users")
print(f"   user_id type: {users_df['user_id'].dtype}")

print("\n[3/4] Loading labels...")
labels_df = pd.read_csv('final_x_dataset/sampled_labels.csv')
labels_df.columns = ['user_id', 'label']
print(f"   label user_id type: {labels_df['user_id'].dtype}")

# Convert label to binary: bot=1, human=0
labels_df['Bot_Label'] = labels_df['label'].map({'bot': 1, 'human': 0})
print(f"‚úÖ Loaded {len(labels_df):,} labels")

# ============================================================
# FIX DATA TYPES - CONVERT ALL user_id TO STRING
# ============================================================
print("\nüîß Fixing data types...")

# Convert all user_id columns to string for consistent merging
tweets_df['user_id'] = tweets_df['user_id'].astype(str)
users_df['user_id'] = users_df['user_id'].astype(str)
labels_df['user_id'] = labels_df['user_id'].astype(str)

print(f"‚úÖ user_id standardized to string type")

# ============================================================
# STEP 2: FILTER users_df TO ONLY SAMPLED USERS (REDUCE MEMORY)
# ============================================================
print("\n[4/4] Filtering and merging data...")

# Get list of users we actually need (from tweets)
sampled_user_ids = set(tweets_df['user_id'].unique())
print(f"   Filtering users_df to {len(sampled_user_ids):,} sampled users...")

# Filter users_df to only include sampled users
users_df = users_df[users_df['user_id'].isin(sampled_user_ids)].copy()
print(f"‚úÖ Filtered users_df: {len(users_df):,} users")

# ============================================================
# STEP 3: MERGE ALL DATA (LIKE OLD DATASET FORMAT!)
# ============================================================
print("\nMerging datasets...")

# Merge tweets with user metadata
df = tweets_df.merge(users_df, on='user_id', how='left')
print(f"‚úÖ Merged with user metadata: {df.shape}")

# Merge with labels
df = df.merge(labels_df[['user_id', 'Bot_Label']], on='user_id', how='left')
print(f"‚úÖ Merged with labels: {df.shape}")

# ============================================================
# STEP 4: RENAME & REORDER COLUMNS (MATCH OLD DATASET FORMAT)
# ============================================================
print("\nRenaming columns to match old dataset format...")

# Rename columns to match old dataset
df_final = df.rename(columns={
    'user_id': 'User ID',
    'created_at': 'Created At',
    'text': 'Tweet',
    'retweet_count': 'Retweet Count',
    'mention_count': 'Mention Count',
    'followers_count': 'Follower Count',
    'verified': 'Verified',
    'hashtag_count': 'Hashtag Count',
    'url_count': 'URL Count',
    'text_length': 'Tweet Length'
}).copy()

# Ensure Bot Label column exists
if 'Bot_Label' not in df_final.columns:
    print("‚ö†Ô∏è Warning: Bot_Label not found after merge!")
    df_final['Bot Label'] = pd.NA
else:
    df_final['Bot Label'] = df_final['Bot_Label']

# Select columns to keep (only those that exist)
available_columns = [
    'User ID',
    'Created At',
    'Tweet',
    'Retweet Count',
    'Mention Count',
    'Follower Count',
    'Verified',
    'Bot Label',
    'Hashtag Count',
    'URL Count',
    'Tweet Length'
]

# Filter to only existing columns
final_columns = [col for col in available_columns if col in df_final.columns]
df_final = df_final[final_columns]

print(f"‚úÖ Columns selected: {len(final_columns)}")

# ============================================================
# STEP 5: DATA CLEANING & VALIDATION
# ============================================================
print("\n" + "="*70)
print("DATA VALIDATION & CLEANING")
print("="*70)

print(f"\nBefore cleaning: {df_final.shape}")

# Check for missing values
print("\nüìä Missing values:")
missing = df_final.isnull().sum()
print(missing[missing > 0])

# Remove rows with missing critical data
critical_cols = ['User ID', 'Created At']
if 'Bot Label' in df_final.columns:
    critical_cols.append('Bot Label')

df_final = df_final.dropna(subset=critical_cols)
print(f"\nAfter removing missing critical data: {df_final.shape}")

# Convert data types
df_final['Verified'] = df_final['Verified'].astype(int)
if 'Bot Label' in df_final.columns:
    df_final['Bot Label'] = df_final['Bot Label'].astype(int)

print(f"‚úÖ Data types converted")

# ============================================================
# STEP 6: STATISTICS
# ============================================================
print("\n" + "="*70)
print("FINAL DATASET STATISTICS")
print("="*70)

print(f"\nüìä Dataset shape: {df_final.shape}")
print(f"   Total tweets: {len(df_final):,}")
print(f"   Unique users: {df_final['User ID'].nunique():,}")

# Tweets per user distribution
tweets_per_user = df_final.groupby('User ID').size()
print(f"\nüìä Tweets per user:")
print(f"   Mean:   {tweets_per_user.mean():.1f}")
print(f"   Median: {tweets_per_user.median():.0f}")
print(f"   Min:    {tweets_per_user.min()}")
print(f"   Max:    {tweets_per_user.max()}")

# Users with ‚â•2 tweets (can calculate TIE)
users_gte2 = (tweets_per_user >= 2).sum()
print(f"\nüìä Users with ‚â•2 tweets: {users_gte2:,} ({users_gte2/len(tweets_per_user)*100:.1f}%)")

# Label distribution
if 'Bot Label' in df_final.columns:
    label_counts = df_final.groupby('User ID')['Bot Label'].first().value_counts()
    print(f"\nüìä User labels:")
    print(f"   Bots (1):   {label_counts.get(1, 0):,} ({label_counts.get(1, 0)/label_counts.sum()*100:.1f}%)")
    print(f"   Humans (0): {label_counts.get(0, 0):,} ({label_counts.get(0, 0)/label_counts.sum()*100:.1f}%)")
    
    print(f"\nüéØ Comparison with literature:")
    print(f"   vs Perdana et al. (56 users):  {users_gte2/56:.1f}x larger")
    print(f"   vs Aditya et al. (39 users):   {users_gte2/39:.1f}x larger")
    print(f"   vs Priyatno et al. (32 users): {users_gte2/32:.1f}x larger")

# Sample data
print("\nüìã Sample data (first 5 rows):")
print(df_final.head())

print("\nüìã Column info:")
print(df_final.info())

# ============================================================
# STEP 7: SAVE FINAL DATASET
# ============================================================
output_file = 'final_x_dataset/bot_detection_data_1.csv'
df_final.to_csv(output_file, index=False)
print(f"\n‚úÖ Saved to '{output_file}'")

print("\n" + "="*70)
print("‚úÖ DATASET PREPARATION COMPLETE!")
print("="*70)
print(f"\nüéØ Ready for modeling!")
print(f"   File: {output_file}")
print(f"   Format: SAME as old dataset (multiple tweets per user)")
print(f"   Can use: EXACT same code flow as before")
print(f"\nüìÅ File size: {df_final.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

CREATING FINAL DATASET (1 CSV FILE - LIKE OLD DATASET)

[1/4] Loading extracted tweets...
‚úÖ Loaded 20,123 tweets
   user_id type: object

[2/4] Loading user metadata from user.json...


Processing users: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000000/1000000 [00:12<00:00, 82819.91it/s]


‚úÖ Loaded 1,000,000 users
   user_id type: object

[3/4] Loading labels...
   label user_id type: object
‚úÖ Loaded 20,000 labels

üîß Fixing data types...
‚úÖ user_id standardized to string type

[4/4] Filtering and merging data...
   Filtering users_df to 640 sampled users...
‚úÖ Filtered users_df: 640 users

Merging datasets...
‚úÖ Merged with user metadata: (20123, 17)
‚úÖ Merged with labels: (20123, 18)

Renaming columns to match old dataset format...
‚úÖ Columns selected: 11

DATA VALIDATION & CLEANING

Before cleaning: (20123, 11)

üìä Missing values:
Series([], dtype: int64)

After removing missing critical data: (20123, 11)
‚úÖ Data types converted

FINAL DATASET STATISTICS

üìä Dataset shape: (20123, 11)
   Total tweets: 20,123
   Unique users: 640

üìä Tweets per user:
   Mean:   31.4
   Median: 40
   Min:    1
   Max:    102

üìä Users with ‚â•2 tweets: 582 (90.9%)

üìä User labels:
   Bots (1):   57 (8.9%)
   Humans (0): 583 (91.1%)

üéØ Comparison with literature:

In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import resample

# Baca dataset
df = pd.read_csv('final_x_dataset/bot_detection_data.csv')

print("=" * 60)
print("INFORMASI DATASET AWAL")
print("=" * 60)
print(f"Shape awal: {df.shape}")
print(f"\nDistribusi Bot Label:")
print(df['Bot Label'].value_counts())
print(f"\nPersentase:")
print(df['Bot Label'].value_counts(normalize=True) * 100)

# Pisahkan berdasarkan label
bot_data = df[df['Bot Label'] == 1]
non_bot_data = df[df['Bot Label'] == 0]

print(f"\nJumlah Bot (Label 1): {len(bot_data)}")
print(f"Jumlah Non-Bot (Label 0): {len(non_bot_data)}")

# ===== METODE 1: UNDERSAMPLING (Kurangi kelas mayoritas) =====
# Ambil jumlah minimum dari kedua kelas
min_samples = min(len(bot_data), len(non_bot_data))

bot_downsampled = resample(bot_data, 
                           replace=False,
                           n_samples=min_samples,
                           random_state=42)

non_bot_downsampled = resample(non_bot_data,
                               replace=False,
                               n_samples=min_samples,
                               random_state=42)

# Gabungkan dan shuffle
balanced_df_undersampling = pd.concat([bot_downsampled, non_bot_downsampled])
balanced_df_undersampling = balanced_df_undersampling.sample(frac=1, random_state=42).reset_index(drop=True)

print("\n" + "=" * 60)
print("METODE 1: UNDERSAMPLING")
print("=" * 60)
print(f"Shape setelah balancing: {balanced_df_undersampling.shape}")
print(f"\nDistribusi Bot Label:")
print(balanced_df_undersampling['Bot Label'].value_counts())
print(f"\nPersentase:")
print(balanced_df_undersampling['Bot Label'].value_counts(normalize=True) * 100)

# ===== METODE 2: OVERSAMPLING (Tambah kelas minoritas) =====
max_samples = max(len(bot_data), len(non_bot_data))

bot_upsampled = resample(bot_data,
                        replace=True,
                        n_samples=max_samples,
                        random_state=42)

non_bot_upsampled = resample(non_bot_data,
                            replace=True,
                            n_samples=max_samples,
                            random_state=42)

# Gabungkan dan shuffle
balanced_df_oversampling = pd.concat([bot_upsampled, non_bot_upsampled])
balanced_df_oversampling = balanced_df_oversampling.sample(frac=1, random_state=42).reset_index(drop=True)

print("\n" + "=" * 60)
print("METODE 2: OVERSAMPLING")
print("=" * 60)
print(f"Shape setelah balancing: {balanced_df_oversampling.shape}")
print(f"\nDistribusi Bot Label:")
print(balanced_df_oversampling['Bot Label'].value_counts())
print(f"\nPersentase:")
print(balanced_df_oversampling['Bot Label'].value_counts(normalize=True) * 100)

# ===== METODE 3: HYBRID (Kombinasi) =====
# Tentukan target jumlah untuk masing-masing kelas (misalnya rata-rata)
target_samples = (len(bot_data) + len(non_bot_data)) // 2

bot_hybrid = resample(bot_data,
                     replace=(len(bot_data) < target_samples),
                     n_samples=target_samples,
                     random_state=42)

non_bot_hybrid = resample(non_bot_data,
                         replace=(len(non_bot_data) < target_samples),
                         n_samples=target_samples,
                         random_state=42)

# Gabungkan dan shuffle
balanced_df_hybrid = pd.concat([bot_hybrid, non_bot_hybrid])
balanced_df_hybrid = balanced_df_hybrid.sample(frac=1, random_state=42).reset_index(drop=True)

print("\n" + "=" * 60)
print("METODE 3: HYBRID")
print("=" * 60)
print(f"Shape setelah balancing: {balanced_df_hybrid.shape}")
print(f"\nDistribusi Bot Label:")
print(balanced_df_hybrid['Bot Label'].value_counts())
print(f"\nPersentase:")
print(balanced_df_hybrid['Bot Label'].value_counts(normalize=True) * 100)

# ===== SIMPAN DATASET =====
print("\n" + "=" * 60)
print("MENYIMPAN DATASET")
print("=" * 60)

# Pilih metode yang ingin disimpan (sesuaikan dengan kebutuhan)
balanced_df_undersampling.to_csv('bot_detection_balanced_undersampling.csv', index=False)
print("‚úì Tersimpan: bot_detection_balanced_undersampling.csv")

balanced_df_oversampling.to_csv('bot_detection_balanced_oversampling.csv', index=False)
print("‚úì Tersimpan: bot_detection_balanced_oversampling.csv")

balanced_df_hybrid.to_csv('bot_detection_balanced_hybrid.csv', index=False)
print("‚úì Tersimpan: bot_detection_balanced_hybrid.csv")

print("\n" + "=" * 60)
print("REKOMENDASI")
print("=" * 60)
print("""
1. UNDERSAMPLING: 
   - Gunakan jika dataset sangat besar dan ingin mengurangi ukuran
   - Kehilangan data dari kelas mayoritas
   - Dataset lebih kecil, training lebih cepat

2. OVERSAMPLING:
   - Gunakan jika data minoritas terlalu sedikit
   - Mempertahankan semua data asli
   - Dataset lebih besar, risk overfitting lebih tinggi

3. HYBRID:
   - Kompromi antara undersampling dan oversampling
   - Ukuran dataset moderate
   - Balance antara efisiensi dan informasi

Untuk bot detection, biasanya UNDERSAMPLING atau HYBRID lebih baik
karena menghindari duplikasi data yang berlebihan.
""")

INFORMASI DATASET AWAL
Shape awal: (20123, 11)

Distribusi Bot Label:
Bot Label
0    18173
1     1950
Name: count, dtype: int64

Persentase:
Bot Label
0    90.309596
1     9.690404
Name: proportion, dtype: float64

Jumlah Bot (Label 1): 1950
Jumlah Non-Bot (Label 0): 18173

METODE 1: UNDERSAMPLING
Shape setelah balancing: (3900, 11)

Distribusi Bot Label:
Bot Label
1    1950
0    1950
Name: count, dtype: int64

Persentase:
Bot Label
1    50.0
0    50.0
Name: proportion, dtype: float64

METODE 2: OVERSAMPLING
Shape setelah balancing: (36346, 11)

Distribusi Bot Label:
Bot Label
1    18173
0    18173
Name: count, dtype: int64

Persentase:
Bot Label
1    50.0
0    50.0
Name: proportion, dtype: float64

METODE 3: HYBRID
Shape setelah balancing: (20122, 11)

Distribusi Bot Label:
Bot Label
1    10061
0    10061
Name: count, dtype: int64

Persentase:
Bot Label
1    50.0
0    50.0
Name: proportion, dtype: float64

MENYIMPAN DATASET
‚úì Tersimpan: bot_detection_balanced_undersampling.csv
‚úì T