# Twitter Data Processing Pipeline with Dask and Incremental Writes


## Introduction

This notebook provides a complete pipeline for processing tweet data, optimized for large datasets using Dask. 
It also includes incremental writes to a CSV file to save progress.


In [1]:

import dask.bag as db
import json
import csv


In [13]:

# Read the JSONL file
b = db.read_text('../stream_tweetids/streamV2_tweetnet_2023-06.jsons/streamV2_tweetnet_2023-06.jsons').map(json.loads)


In [3]:

# Define the transformation function
def extract_fields(json_obj):
    tweet_id = json_obj.get('tweet_id', '')
    tweet_type = json_obj.get('tweet_type', '')
    hashtags = json_obj.get('hashtags', [])
    mentions = json_obj.get('mentions', [])
    return {
        'tweet_id': tweet_id,
        'tweet_type': tweet_type,
        'hashtags': hashtags,
        'mentions': mentions
    }


In [5]:

# API call
import requests

def fetch_additional_info(tweet_id):
    url = "https://cdn.syndication.twimg.com/tweet-result"
    querystring = {"id": tweet_id, "lang": "en", "token": "x"}
    payload = ""
    headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/114.0",
    "Accept": "*/*",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "Origin": "https://platform.twitter.com",
    "Connection": "keep-alive",
    "Referer": "https://platform.twitter.com/",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "cross-site",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache",
    "TE": "trailers"
    }
    response = requests.request("GET", url, data=payload, headers=headers, params=querystring)
    return response.text


In [6]:

def parse_api_response(api_response):
    parsed_data = json.loads(api_response)
    lang = parsed_data.get('lang', '')
    favorite_count = parsed_data.get('favorite_count', 0)
    created_at = parsed_data.get('created_at', '')
    text = parsed_data.get('text', '')
    parent_tweet_id = parsed_data.get('parent', {}).get('id_str', '')
    
    return {
        'lang': lang,
        'favorite_count': favorite_count,
        'created_at': created_at,
        'text': text,
        'parent_tweet_id': parent_tweet_id
    }


In [7]:

# Initialize a CSV writer and write the header
with open('output.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['tweet_id', 'tweet_type', 'hashtags', 'mentions', 'lang', 'favorite_count', 'created_at', 'text', 'parent_tweet_id'])
    writer.writeheader()

# Function to write a single row to the CSV file
def write_row_to_csv(row):
    with open('output.csv', 'a', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['tweet_id', 'tweet_type', 'hashtags', 'mentions', 'lang', 'favorite_count', 'created_at', 'text', 'parent_tweet_id'])
        writer.writerow(row)

# Function to process a single JSON object (this includes the API call)
def process_json_object(json_obj):
    # Extract initial fields
    row = extract_fields(json_obj)
    
    # Fetch additional info from API (You'll have to add your API logic)
    api_response = fetch_additional_info(row['tweet_id'])
    
    # Parse the API response
    additional_info = parse_api_response(api_response)
    
    # Merge initial data and additional info
    row.update(additional_info)
    
    # Write the row to CSV
    write_row_to_csv(row)


In [14]:
# Create partitions (for example, 1000 tweets per partition)
n_partition = 100
bag = b.repartition(npartitions=b.npartitions // n_partition)

def process_partition(partition):
    results = []
    for json_obj in partition:
        row = extract_fields(json_obj)
        api_response = fetch_additional_info(row['tweet_id'])
        additional_info = parse_api_response(api_response)
        row.update(additional_info)
        results.append(row)
    return results

# Apply the function to each partition
processed_partitions = bag.map_partitions(process_partition).compute

ZeroDivisionError: division by zero