# Twitter Data Processing Pipeline with Dask and Incremental Writes


## Introduction

This notebook provides a complete pipeline for processing tweet data, optimized for large datasets using Dask. 
It also includes incremental writes to a CSV file to save progress.


In [25]:
import warnings
import pandas as pd
import json
import csv
import requests
from typing import Dict, List, Any
import logging
import numpy as np

In [15]:
warnings.filterwarnings(action='ignore', category=FutureWarning)
warnings.filterwarnings(action='ignore', category=FutureWarning, module='pyspark')

In [16]:
# Initialize Spark session
# Configure logging
logging.basicConfig(level=logging.INFO)
# spark = SparkSession.builder \
#     .appName("Twitter Data Processing") \
#     .getOrCreate()


In [17]:
data_name = 'streamV2_tweetnet_2023-06'

In [18]:
# Read the JSONL file
df = pd.read_json(f'../data/{data_name}.jsons', lines=True)

In [19]:
# Define the transformation function
def extract_fields(json_obj):
    tweet_id = json_obj.get('tweet_id', '')
    tweet_type = json_obj.get('tweet_type', '')
    hashtags = json_obj.get('hashtags', [])
    mentions = json_obj.get('mentions', [])
    return {
        'tweet_id': tweet_id,
        'tweet_type': tweet_type,
        'hashtags': hashtags,
        'mentions': mentions
    }


In [31]:
# API call
import requests

def fetch_additional_info(tweet_id):
    url = "https://cdn.syndication.twimg.com/tweet-result"
    querystring = {"id": tweet_id, "lang": "en", "token": "x"}
    payload = ""
    headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/114.0",
    "Accept": "*/*",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "Origin": "https://platform.twitter.com",
    "Connection": "keep-alive",
    "Referer": "https://platform.twitter.com/",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "cross-site",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache",
    "TE": "trailers"
    }
    try:
        response = requests.request("GET", url, data=payload, headers=headers, params=querystring)
        if response.status_code != 200:
            print(f"Failed to fetch additional info for tweet_id {tweet_id}: {response.text}")
            return None
    except Exception as e:
        logging.error(f'Failed to fetch additional info for tweet_id {tweet_id}: {e}')
        return None
    return response.text


In [32]:
# fetch_additional_info(1664013863526735874)

'{"__typename":"Tweet","in_reply_to_screen_name":"DrSinanOgan","in_reply_to_status_id_str":"1664012893799448579","in_reply_to_user_id_str":"177884455","lang":"und","favorite_count":0,"created_at":"2023-05-31T20:59:55.000Z","display_text_range":[13,22],"entities":{"hashtags":[],"urls":[],"user_mentions":[{"id_str":"177884455","indices":[0,12],"name":"Dr. Sinan Oğan","screen_name":"DrSinanOgan"}],"symbols":[]},"id_str":"1664013863526735874","text":"@DrSinanOgan Offf 🤦🤦","user":{"id_str":"1256361951862689797","name":"Ivan Karamazov","profile_image_url_https":"https://pbs.twimg.com/profile_images/1673470909993168899/vHLg8HHU_normal.jpg","screen_name":"fyodorkrmzv","verified":false,"is_blue_verified":false,"profile_image_shape":"Circle"},"edit_control":{"edit_tweet_ids":["1664013863526735874"],"editable_until_msecs":"1685568595000","is_edit_eligible":false,"edits_remaining":"5"},"conversation_count":0,"news_action_type":"conversation","parent":{"lang":"tr","reply_count":14795,"retweet_count

In [21]:

def parse_api_response(api_response):
    if not api_response:
        return {}
    try:
        parsed_data = json.loads(api_response)
    except json.JSONDecodeError:
        print(f"Failed to parse API response: {api_response}")
        return {}
    
    lang = parsed_data.get('lang', '')
    favorite_count = parsed_data.get('favorite_count', 0)
    created_at = parsed_data.get('created_at', '')
    text = parsed_data.get('text', '')
    parent_tweet_id = parsed_data.get('parent', {}).get('id_str', '')
    
    return {
        'lang': lang,
        'favorite_count': favorite_count,
        'created_at': created_at,
        'text': text,
        'parent_tweet_id': parent_tweet_id
    }


In [22]:

# # Initialize a CSV writer and write the header
# with open(f'output_{data_name}.csv', 'w', newline='') as f:
#     writer = csv.DictWriter(f, fieldnames=['tweet_id', 'tweet_type', 'hashtags', 'mentions', 'lang', 'favorite_count', 'created_at', 'text', 'parent_tweet_id'])
#     writer.writeheader()

# Function to write a single row to the CSV file
def write_row_to_csv(row):
    try:
        with open('output_{data_name}.csv', 'a', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=['tweet_id', 'tweet_type', 'hashtags', 'mentions', 'lang', 'favorite_count', 'created_at', 'text', 'parent_tweet_id'])
            writer.writerow(row)
    except Exception as e:
        print(f"Failed to write row to CSV: {e}")


# Function to process a single JSON object (this includes the API call)
def process_json_object(json_obj):
    try: 
        # Extract initial fields
        row = extract_fields(json_obj)
        
        # Fetch additional info from API (You'll have to add your API logic)
        api_response = fetch_additional_info(row['tweet_id'])
        
        # Parse the API response
        additional_info = parse_api_response(api_response)
        
        # Merge initial data and additional info
        row.update(additional_info)
        
        # Write the row to CSV
        write_row_to_csv(row)
    except Exception as e:
        print(f"Failed to process JSON object: {e}")

# Define a function to process a chunk of data
def process_chunk(df_chunk: pd.DataFrame) -> pd.DataFrame:
    results = []
    for idx, row in df_chunk.iterrows():
        row_dict = row.to_dict()
        api_response = fetch_additional_info(row_dict['tweet_id'])
        additional_info = parse_api_response(api_response)
        row_dict.update(additional_info)
        # Convert hashtags and mentions array to a comma-separated string
        row_dict['hashtags'] = ','.join(row_dict['hashtags']) if isinstance(row_dict['hashtags'], (list, tuple)) else ''
        row_dict['mentions'] = ','.join(row_dict['mentions']) if isinstance(row_dict['mentions'], (list, tuple)) else ''
        results.append(row_dict)
    result_df = pd.DataFrame(results)
    # Filter the DataFrame to only include the columns specified in the schema
    result_df = result_df[['tweet_id', 'tweet_type', 'hashtags', 'mentions', 'lang', 'favorite_count', 'created_at', 'text', 'parent_tweet_id']]
    return result_df


In [23]:
def custom_write_csv(df: pd.DataFrame, file_name: str):
    chunks = np.array_split(df, 10)  # Splitting the DataFrame into 10 chunks
    for idx, chunk in enumerate(chunks):
        try:
            # Append the chunk to the CSV file
            chunk.to_csv(file_name, mode='a' if idx > 0 else 'w', index=False, header= idx == 0)
        except Exception as e:
            print(f"Failed to write chunk: {e}")

In [33]:
# Process the DataFrame in chunks
processed_chunks = [process_chunk(chunk) for chunk in np.array_split(df, 10)]

# Concatenate the processed chunks into a single DataFrame
processed_df = pd.concat(processed_chunks, ignore_index=True)


Failed to fetch additional info for tweet_id 1664013858896326656: <!DOCTYPE html>
<html lang="en" class="dog">
  <head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Twitter / ?</title>
    <meta name="version" content="1">
    <link href="https://abs.twimg.com/favicons/favicon.ico" rel="shortcut icon" type="image/x-icon">
    <link rel="stylesheet" href="https://abs.twimg.com/errors/fullscreen_errors-39d97faf8c7cf33e502c3c015176efd9.css">
  </head>
  <body dir="auto">
    <div class="top">
      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 1519 200">
        <defs/>
        <path d="M708 103l9-5 13 1 2-2 5 1h8l1-1 42-5h7l7 3v1l12-2a9368 9368 0 0129-6l2-5 12 1 6-2 3 1 2 1v-2l13-2v1l4 2 7-1h1l17-3 1 2 4-1 18-7 3-1 4-1h9l13-1 1-4 3-2 13 1v2l22-2 10-3h6l10 5 16-5 15-3 2-1 8-4 6-2 12-1 7-2 8 5 7 1 22-9h7l5-1v-1l25-3 1 2 5 2v1h11l2 1 8-4h3l-1 1h11l10-6 6 3 9-1 4-1 1 1 1-2 

In [None]:
# Write the processed data to CSV
custom_write_csv(processed_df, f'output_{data_name}.csv')