# Twitter Data Processing Pipeline with Dask and Incremental Writes


## Introduction

This notebook provides a complete pipeline for processing tweet data, optimized for large datasets using Dask. 
It also includes incremental writes to a CSV file to save progress.


In [2]:
import warnings
import pandas as pd
import json
import csv
import requests
from typing import Dict, List, Any
import logging
import numpy as np
import concurrent.futures
import os

In [3]:
warnings.filterwarnings(action='ignore', category=FutureWarning)
# warnings.filterwarnings(action='ignore', category=FutureWarning, module='pyspark')

In [4]:
# Initialize Spark session
# Configure logging
logging.basicConfig(level=logging.INFO)
logging.basicConfig(filename='processing.log', level=logging.INFO)
# spark = SparkSession.builder \
#     .appName("Twitter Data Processing") \
#     .getOrCreate()


In [5]:
# data_name = 'streamV2_tweetnet_2023-06'

In [6]:
# Read the JSONL file
# df = pd.read_json(f'../data/{data_name}.jsons', lines=True)

In [7]:
# Define the transformation function
def extract_fields(json_obj):
    tweet_id = json_obj.get('tweet_id', '')
    tweet_type = json_obj.get('tweet_type', '')
    hashtags = json_obj.get('hashtags', [])
    mentions = json_obj.get('mentions', [])
    return {
        'tweet_id': tweet_id,
        'tweet_type': tweet_type,
        'hashtags': hashtags,
        'mentions': mentions
    }


In [8]:
# API call
def fetch_additional_info(tweet_id):
    url = "https://cdn.syndication.twimg.com/tweet-result"
    querystring = {"id": tweet_id, "lang": "en", "token": "x"}
    payload = ""
    headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/114.0",
    "Accept": "*/*",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "Origin": "https://platform.twitter.com",
    "Connection": "keep-alive",
    "Referer": "https://platform.twitter.com/",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "cross-site",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache",
    "TE": "trailers"
    }
    try:
        response = requests.request("GET", url, data=payload, headers=headers, params=querystring)
        if response.status_code != 200:
            # print(f"Failed to fetch additional info for tweet_id {tweet_id}")
            return None
    except Exception as e:
        logging.error(f'Failed to fetch additional info for tweet_id {tweet_id}')
        return None
    return response.text


In [9]:
# api_response_test = fetch_additional_info(1664013863526735874)

In [10]:

def parse_api_response(api_response):
    if not api_response:
        return {}
    try:
        parsed_data = json.loads(api_response)
    except json.JSONDecodeError:
        logging.error(f'Failed parse_api_response {api_response}')
        return {}
    
    lang = parsed_data.get('lang', '')
    favorite_count = parsed_data.get('favorite_count', 0)
    created_at = parsed_data.get('created_at', '')
    text = parsed_data.get('text', '')
    parent_tweet_id = parsed_data.get('parent', {}).get('id_str', '')
    
    return {
        'lang': lang,
        'favorite_count': favorite_count,
        'created_at': created_at,
        'text': text,
        'parent_tweet_id': parent_tweet_id
    }


In [11]:
# parsed_api_response_test = parse_api_response(api_response_test)

In [12]:
# this cell not used

# # Initialize a CSV writer and write the header
# with open(f'output_{data_name}.csv', 'w', newline='') as f:
#     writer = csv.DictWriter(f, fieldnames=['tweet_id', 'tweet_type', 'hashtags', 'mentions', 'lang', 'favorite_count', 'created_at', 'text', 'parent_tweet_id'])
#     writer.writeheader()

# Function to write a single row to the CSV file
def write_row_to_csv(row):
    try:
        with open('output_{data_name}.csv', 'a', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=['tweet_id', 'tweet_type', 'hashtags', 'mentions', 'lang', 'favorite_count', 'created_at', 'text', 'parent_tweet_id'])
            writer.writerow(row)
    except Exception as e:
        print(f"Failed to write row to CSV: {e}")


# Function to process a single JSON object (this includes the API call)
def process_json_object(json_obj):
    try: 
        # Extract initial fields
        row = extract_fields(json_obj)
        
        # Fetch additional info from API (You'll have to add your API logic)
        api_response = fetch_additional_info(row['tweet_id'])
        
        # Parse the API response
        additional_info = parse_api_response(api_response)
        
        # Merge initial data and additional info
        row.update(additional_info)
        
        # Write the row to CSV
        write_row_to_csv(row)
    except Exception as e:
        print(f"Failed to process JSON object: {e}")



In [13]:
def custom_write_csv(df: pd.DataFrame, output_path: str, data_name: str):
    file_name = os.path.join(output_path, f'output_{data_name}.csv')
    try:
        df.to_csv(file_name, mode='a', index=False, header=False)
    except Exception as e:
        logging.error(f'Failed to write chunk {e}')
        

# Define a function to process a chunk of data
def process_chunk(df_chunk: pd.DataFrame, output_path: str, data_name: str):
    logging.info(f'Processing chunk entered')
    results = []
    for idx, row in df_chunk.iterrows():
        row_dict = row.to_dict()
        logging.info(f'Processing tweet id: {row_dict["tweet_id"]}')
        api_response = fetch_additional_info(row_dict['tweet_id'])
        additional_info = parse_api_response(api_response)
        row_dict.update(additional_info)
        # Convert hashtags and mentions array to a comma-separated string
        row_dict['hashtags'] = ','.join(row_dict['hashtags']) if isinstance(row_dict['hashtags'], (list, tuple)) else ''
        row_dict['mentions'] = ','.join(row_dict['mentions']) if isinstance(row_dict['mentions'], (list, tuple)) else ''
        results.append(row_dict)
    result_df = pd.DataFrame(results)
    # Filter the DataFrame to only include the columns specified in the schema
    result_df = result_df[['tweet_id', 'tweet_type', 'hashtags', 'mentions', 'lang', 'favorite_count', 'created_at', 'text', 'parent_tweet_id']]
    custom_write_csv(result_df, output_path, data_name)  # Pass output_path and data_name to custom_write_csv

def process_data_in_parallel(df, output_path: str, data_name: str):
    with concurrent.futures.ThreadPoolExecutor() as executor:
        chunks = np.array_split(df, 10)
        # Use a lambda function to pass the output_path and data_name arguments to process_chunk
        executor.map(lambda chunk: process_chunk(chunk, output_path, data_name), chunks)


In [14]:
input_folder_path = './test_Folder'
output_folder_path = '../data/output/test_Folder'
os.makedirs(output_folder_path, exist_ok=True)  # Create output folder if it doesn't exist

def process_file(file_path, output_path):
    # Extract data_name from the file path
    data_name = os.path.basename(file_path).replace('.jsons', '')
    df = pd.read_json(file_path, lines=True)
    
    # Create the output directory if it doesn't exist
    os.makedirs(output_path, exist_ok=True)
    
    # Write the header to the output file
    output_file = os.path.join(output_path, f'output_{data_name}.csv')
    header_df = pd.DataFrame(columns=['tweet_id', 'tweet_type', 'hashtags', 'mentions', 'lang', 'favorite_count', 'created_at', 'text', 'parent_tweet_id'])
    header_df.to_csv(output_file, index=False)
    
    # Process each chunk in parallel
    process_data_in_parallel(df, output_path, data_name)

def process_all_files_in_folder(folder_path, output_folder_path):
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.jsons'):
            file_path = os.path.join(folder_path, file_name)
            logging.info(f'Processing file: {file_name}')
            process_file(file_path, output_folder_path)

In [15]:
process_all_files_in_folder(input_folder_path, output_folder_path)

INFO:root:Processing file: streamV2_tweetnet_2023-06_1.jsons
INFO:root:Processing chunk entered
INFO:root:Processing tweet id: 1664013858896326656
INFO:root:Processing chunk entered
INFO:root:Processing chunk entered
INFO:root:Processing chunk entered
INFO:root:Processing chunk entered
INFO:root:Processing tweet id: 1664013868543234048
INFO:root:Processing chunk entered
INFO:root:Processing tweet id: 1664013879880327168
INFO:root:Processing chunk entered
INFO:root:Processing chunk entered
INFO:root:Processing tweet id: 1664013888923533312
INFO:root:Processing chunk entered
INFO:root:Processing tweet id: 1664013897005780992
INFO:root:Processing chunk entered
INFO:root:Processing tweet id: 1664013905071308800
INFO:root:Processing tweet id: 1664013913027911680
INFO:root:Processing tweet id: 1664013918501576704
INFO:root:Processing tweet id: 1664013930677641216
INFO:root:Processing tweet id: 1664013936839077888
INFO:root:Processing tweet id: 1664013872343203840
INFO:root:Processing tweet i

In [None]:
# # Call the function to process all files in the specified folder
# file_path = "./test_Folder/streamV2_tweetnet_2023-06_0.jsons"
# # Extract data_name from the file path
# data_name = os.path.basename(file_path).replace('.jsons', '')
# df = pd.read_json(file_path, lines=True)
# output_file = os.path.join(output_folder_path, f'output_{data_name}.csv')  # Update this line to use output_folder_path

#     # Write the header to the output file
# header_df = pd.DataFrame(columns=['tweet_id', 'tweet_type', 'hashtags', 'mentions', 'lang', 'favorite_count', 'created_at', 'text', 'parent_tweet_id'])
# header_df.to_csv(output_file, index=False)
# process_chunk(df, output_file)

## For other one-time purposes

In [None]:
# # Split .jsons file into several equal parts
# import os
# data_name_to_be_splitted = "streamV2_tweetnet_2023-03"

# # Define the new folder name
# new_folder = f"{data_name_to_be_splitted}_splitted"

# # Ensure the folder exists, create it if not
# os.makedirs(new_folder, exist_ok=True)

# def split_file(large_file_path, lines_per_file):
#     with open(large_file_path, 'r') as file:
#         file_count = 0
#         current_line_count = 0
#         # Modify the file path to include the new folder
#         current_file = open(f'{new_folder}/{data_name_to_be_splitted}_{file_count}.jsons', 'w')
#         for line in file:
#             if current_line_count < lines_per_file:
#                 current_file.write(line)
#                 current_line_count += 1
#             else:
#                 current_file.close()
#                 file_count += 1
#                 # Modify the file path to include the new folder
#                 current_file = open(f'{new_folder}/{data_name_to_be_splitted}_{file_count}.jsons', 'w')
#                 current_file.write(line)
#                 current_line_count = 1
#         current_file.close()

# # Call the function as usual
# split_file(f'../data/{data_name_to_be_splitted}.jsons', 10000)

In [None]:
# # TEST Usage:
# folder_path = './test_Folder'
# output_folder_path = '../data/output_files'
# os.makedirs(output_folder_path, exist_ok=True)  # Create output folder if it doesn't exist

# def process_file(file_path):
#     # Extract data_name from the file path
#     data_name = os.path.basename(file_path).replace('.jsons', '')
#     df = pd.read_json(file_path, lines=True)
#     output_file = os.path.join(output_folder_path, f'output_{data_name}.csv')  # Update this line to use output_folder_path

#     # Write the header to the output file
#     header_df = pd.DataFrame(columns=['tweet_id', 'tweet_type', 'hashtags', 'mentions', 'lang', 'favorite_count', 'created_at', 'text', 'parent_tweet_id'])
#     header_df.to_csv(output_file, index=False)
    
#     # Process each chunk in parallel
#     process_data_in_parallel(df, output_file)

# def process_all_files_in_folder(folder_path):
#     for file_name in os.listdir(folder_path):
#         if file_name.endswith('.jsons'):
#             file_path = os.path.join(folder_path, file_name)
#             print(f'Processing file: {file_name}')  # Print the file name for tracking
#             process_file(file_path)

In [None]:
# Call the function to process all files in the specified folder
process_all_files_in_folder(folder_path)