In [1]:

import csv
import json

import dask
import dask.bag as db
import requests


In [2]:

# Load the JSON-lines dump lazily: Dask splits the text file into blocks of
# roughly 64MB and every line is decoded with json.loads.
# (Replace the path with your actual file path.)
b = (
    db.read_text(
        '../stream_tweetids/streamV2_tweetnet_2023-06.jsons/streamV2_tweetnet_2023-06.jsons',
        blocksize='64MB',
    )
    .map(json.loads)
)
print(f"Number of partitions: {b.npartitions}")


Number of partitions: 18


In [3]:

# Define the transformation function to extract required fields from JSON
def extract_fields(json_obj):
    """Project one decoded record onto the four fields kept from the stream.

    Parameters
    ----------
    json_obj : dict
        Decoded JSON object for a single record.

    Returns
    -------
    dict
        Keys ``tweet_id``, ``tweet_type``, ``hashtags`` and ``mentions`` in
        that order. Missing scalar fields become ``''``; missing list fields
        become a fresh empty list.
    """
    record = {key: json_obj.get(key, '') for key in ('tweet_id', 'tweet_type')}
    # The [] default is evaluated per lookup, so records never share a list.
    record.update((key, json_obj.get(key, [])) for key in ('hashtags', 'mentions'))
    return record


In [4]:

# Function to make API call
def fetch_additional_info(tweet_id, timeout=10):
    """Fetch the raw tweet-result payload for one tweet from the syndication CDN.

    Parameters
    ----------
    tweet_id : str or int
        Sent as the ``id`` query parameter.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests``.
        Without it a stalled connection would block the calling worker
        indefinitely.

    Returns
    -------
    str
        The response body as text. The HTTP status code is not checked, so
        error responses are returned like successful ones.

    Raises
    ------
    requests.exceptions.RequestException
        On connection failures, including ``Timeout`` when ``timeout`` is
        exceeded.
    """
    url = "https://cdn.syndication.twimg.com/tweet-result"
    querystring = {"id": tweet_id, "lang": "en", "token": "x"}
    # Browser-like headers copied from a Firefox request to the embed endpoint.
    # NOTE(review): advertising "br" only works if a brotli decoder is
    # installed for requests/urllib3; otherwise a brotli-compressed body
    # would come back undecoded - confirm in the target environment.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/114.0",
        "Accept": "*/*",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate, br",
        "Origin": "https://platform.twitter.com",
        "Connection": "keep-alive",
        "Referer": "https://platform.twitter.com/",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "cross-site",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
        "TE": "trailers"
    }
    # A GET carries no body, so the former empty `data` payload is dropped.
    response = requests.get(url, headers=headers, params=querystring, timeout=timeout)
    return response.text


In [None]:

def parse_api_response(api_response):
    """Decode a tweet-result JSON payload and keep the fields written to CSV.

    Parameters
    ----------
    api_response : str
        JSON text of a single object.

    Returns
    -------
    dict
        Keys ``lang``, ``favorite_count``, ``created_at``, ``text`` and
        ``parent_tweet_id``. Absent fields default to ``''`` (``0`` for
        ``favorite_count``).

    Raises
    ------
    json.JSONDecodeError
        If ``api_response`` is not valid JSON (including an empty string).
    """
    parsed_data = json.loads(api_response)
    # `.get('parent', {})` only covers a missing key; an explicit
    # `"parent": null` would yield None and break the chained lookup.
    parent = parsed_data.get('parent') or {}

    return {
        'lang': parsed_data.get('lang', ''),
        'favorite_count': parsed_data.get('favorite_count', 0),
        'created_at': parsed_data.get('created_at', ''),
        'text': parsed_data.get('text', ''),
        'parent_tweet_id': parent.get('id_str', '')
    }

In [5]:

# Final transformation and writing to a CSV file
def write_to_csv(partition, partition_number):
    """Append the records of one partition to ``output_<partition_number>.csv``.

    Parameters
    ----------
    partition : iterable of dict
        Records to write, one CSV row each. Keys outside the column list are
        dropped and missing columns are left empty.
    partition_number : int or str
        Value interpolated into the output file name.

    Returns
    -------
    None
    """
    fieldnames = ['tweet_id', 'tweet_type', 'hashtags', 'mentions', 'lang', 'favorite_count', 'created_at', 'text', 'parent_tweet_id']
    # Explicit utf-8: the platform default encoding can fail on non-ASCII
    # hashtags or tweet text.
    with open(f'output_{partition_number}.csv', 'a', newline='', encoding='utf-8') as f:
        # extrasaction='ignore' keeps a record with extra keys from raising
        # ValueError and aborting the whole partition.
        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
        # In append mode the stream is positioned at the end of the file, so
        # position 0 means the file is new or empty and still needs a header.
        if f.tell() == 0:
            writer.writeheader()
        for item in partition:
            # Perform your final transformations and API calls here
            writer.writerow(item)

In [None]:

# Build one delayed task per partition so that each call receives its own
# partition number. Bag.map_partitions(write_to_csv) passes only the
# partition, which leaves the required `partition_number` argument unfilled
# and makes every task fail with a TypeError.
write_tasks = [
    dask.delayed(write_to_csv)(partition, partition_number)
    for partition_number, partition in enumerate(b.to_delayed())
]
# Bags run on the multiprocessing scheduler by default; keep that here.
dask.compute(*write_tasks, scheduler='processes')

# To resume a run, only if you know the first 5 partitions have already been
# written, add `if partition_number >= 5` to the comprehension above. The
# numbering then still matches the existing output_<n>.csv files.

