In [1]:
from mpi4py import MPI
import os
import math
import json
import re

In [None]:
def extract_usefull(input_string: str):
    """Extract the ``author_id`` and ``full_name`` values from a raw record.

    The text is scanned with regular expressions instead of being parsed as
    JSON, so it does not have to be a complete or well-formed JSON object;
    leading commas or a missing closing brace are irrelevant to the search.
    The first occurrence of each key is used.

    Args:
        input_string: Raw text of one record.

    Returns:
        A tuple ``(author_id, full_name)`` of strings when both keys are
        found, otherwise ``(None, None)``. ``author_id`` is the digit string
        exactly as it appears in the text; ``full_name`` has its JSON escape
        sequences (``\\"``, ``\\uXXXX``, ...) decoded.
    """
    author_id_pattern = r'"author_id"\s*:\s*"(\d+)"'
    # Allow backslash escapes inside the value so that an escaped quote does
    # not terminate the match early and truncate the name.
    full_name_pattern = r'"full_name"\s*:\s*("(?:[^"\\]|\\.)+")'

    author_id = re.search(author_id_pattern, input_string)
    full_name = re.search(full_name_pattern, input_string)
    if not (author_id and full_name):
        return None, None

    quoted_name = full_name.group(1)
    try:
        # The capture is a complete JSON string literal (quotes included);
        # decoding it turns escape sequences into the characters they denote.
        name = json.loads(quoted_name)
    except ValueError:
        # Not a valid JSON string (e.g. an unknown escape): fall back to the
        # raw text between the quotes rather than dropping the record.
        name = quoted_name[1:-1]

    return author_id.group(1), name

def process_twitter_data(input_file: str, output_file: str, comm, rank, size):
    """Filter this rank's share of the input file into a JSON array file.

    The input is read line by line. A line equal to `` },`` or `` }`` ends
    the current record and a line equal to ``]`` ends the whole input. Every
    record whose text contains ``"homeless"`` and for which
    ``extract_usefull`` finds both fields is written to ``output_file`` as
    ``{"author_id": ..., "full_name": ...}``.

    The file is divided into ``size`` byte ranges of equal length. Records
    are assigned to ranks by the byte offset at which their terminator line
    starts: a rank other than 0 discards everything up to and including the
    first terminator line at or after its start offset (the preceding rank
    handles that record), and every rank stops after the first record whose
    terminator line starts at or after its end offset. Each complete record
    is therefore handled by exactly one rank.

    Args:
        input_file: Path of the UTF-8 encoded input file.
        output_file: Path of the JSON file to create (overwritten).
        comm: MPI communicator; accepted for interface compatibility but not
            used by this function.
        rank: Zero-based index of this process.
        size: Total number of processes sharing the file.
    """
    total_byte_size = os.path.getsize(input_file)
    byte_chunk_size = math.ceil(total_byte_size / size)
    start_location = rank * byte_chunk_size
    end_location = start_location + byte_chunk_size

    # The input is opened in binary mode: seeking a text-mode file to an
    # arbitrary byte offset is not supported and could land inside a
    # multi-byte UTF-8 character. Whole lines are decoded individually.
    with open(input_file, "rb") as f, \
            open(output_file, "w", encoding="utf-8") as output_f:
        output_f.write("[\n")

        position = 0
        # Ranks that start inside the file begin in the middle of a record
        # owned by the preceding rank, so they skip to the next terminator.
        skipping = start_location > 0
        if skipping:
            # Start one byte early and drop the remainder of that line so
            # the loop below always begins exactly on a line boundary.
            f.seek(start_location - 1)
            position = start_location - 1 + len(f.readline())

        record_lines = []
        first_output = True

        while True:
            raw_line = f.readline()
            if not raw_line:
                # End of file without a closing "]" line.
                break
            line_start = position
            position += len(raw_line)

            line = raw_line.decode("utf-8").rstrip("\r\n")
            if line == "]":
                break

            if line in (" },", " }"):
                if not skipping:
                    record = "".join(record_lines)
                    if "homeless" in record:
                        author_id, full_name = extract_usefull(record)
                        if author_id and full_name:
                            if not first_output:
                                output_f.write(",\n")
                            first_output = False
                            json.dump(
                                {"author_id": author_id, "full_name": full_name},
                                output_f,
                            )
                skipping = False
                record_lines = []
                # The record that crosses the chunk boundary is the last one
                # for this rank; the next rank skips through this same line.
                if line_start >= end_location:
                    break
            elif not skipping:
                record_lines.append(line + "\n")

        output_f.write("\n]")

if __name__ == "__main__":
    # Each MPI process filters its own share of the input file and writes
    # its results to a separate output file named after its rank.
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()

    input_file = "D:/Twitter data/twitter-huge.json"
    output_file = f"D:/Twitter data/preprocess_homeless_twitterdata_rank_{rank}.json"

    process_twitter_data(
        input_file=input_file,
        output_file=output_file,
        comm=comm,
        rank=rank,
        size=size,
    )