In [1]:
import os
import zipfile
import re
import csv

# Function to append data to a file
def append_to_file(data, filename):
    """Append one record to a CSV file, writing the header row when needed.

    Args:
        data: Mapping of column name to value. Its key order defines the
            column order of the header and of the written row.
        filename: Path of the CSV file. It is created if it does not exist.

    The header is written whenever the file is empty at the time of the
    call. That covers both a missing file and a file that already exists
    with no content (for example one that was pre-created or left behind
    by an aborted run), which would otherwise end up without a header.
    """
    with open(filename, 'a', newline='', encoding='utf-8') as file:
        # In append mode the stream starts at the end of the file, so a
        # position of 0 means there is nothing in the file yet.
        needs_header = file.tell() == 0
        writer = csv.DictWriter(file, fieldnames=list(data))
        if needs_header:
            writer.writeheader()
        writer.writerow(data)

# Attributes copied from the opening <text ...> tag, in output-column order.
_TEXT_ATTRIBUTES = (
    'msg_type',
    'datetime',
    'title',
    'thread_id',
    'comment_id',
    'topic_name_top',
    'topic_name_leaf',
)


def _read_attribute(header, name):
    """Return the value of attribute *name* in *header*, or '' if absent."""
    # The lookbehind requires the name to start right after whitespace (or at
    # the start of the string), so "comment_id" cannot match the tail of a
    # longer attribute name such as "parent_comment_id".
    match = re.search(r'(?<!\S)' + name + r'="(.*?)"', header)
    return match.group(1) if match else ''


def extract_vrt_data_from_chunk(chunk):
    """Parse one <text> ... </text> chunk of VRT markup into a flat record.

    Args:
        chunk: String holding the lines of a single <text> element.

    Returns:
        A dict with the keys msg_type, datetime, title, thread_id,
        comment_id, topic_name_top, topic_name_leaf and thread_text, in
        that order. Attribute values are taken verbatim from the <text>
        opening tag; an attribute that is not present yields an empty
        string instead of raising. thread_text is the first tab-separated
        column of every line inside the <sentence> elements, joined with
        single spaces.
    """
    # Look for attributes only on the <text ...> line so that attributes of
    # nested elements cannot be picked up by mistake. Fall back to the whole
    # chunk when no such line is found.
    header_match = re.search(r'<text\b.*', chunk)
    header = header_match.group(0) if header_match else chunk

    record = {name: _read_attribute(header, name) for name in _TEXT_ATTRIBUTES}

    # Each sentence body has one token per line; the token itself is the
    # text before the first tab.
    sentences = re.findall(r'<sentence(.*?)</sentence>', chunk, re.DOTALL)
    record['thread_text'] = ' '.join(
        ' '.join(re.findall(r'^(\S+?)\t', sentence, re.MULTILINE))
        for sentence in sentences
    )
    return record

def process_file_in_chunks(file_path, num_messages):
    """Yield one parsed record per <text> element of an uncompressed VRT file.

    The file is read line by line. Lines are collected until one containing
    '</text>' is seen, at which point the collected lines are handed to
    extract_vrt_data_from_chunk and the resulting dict is yielded. Lines
    after the last '</text>' are discarded.

    Args:
        file_path: Path of the VRT file. It is decoded as UTF-8 and
            undecodable bytes are replaced.
        num_messages: Progress-reporting interval. A status line is printed
            each time the number of yielded messages is a multiple of this
            value. It does not limit how many messages are read. A value of
            0 (or None) turns reporting off instead of raising.

    Yields:
        The dict returned by extract_vrt_data_from_chunk for each message.
    """
    pending_lines = []
    count = 0

    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        for line in file:
            pending_lines.append(line)
            if '</text>' not in line:
                continue

            yield extract_vrt_data_from_chunk(''.join(pending_lines))
            pending_lines.clear()
            count += 1
            # Guard the modulo so that an interval of 0 cannot divide by zero.
            if num_messages and count % num_messages == 0:
                print(f"{count} number of messages Done!")

def process_zip_files(zip_filenames, output_file, num_messages=None):
    """Parse every .vrt member of the given zip archives into one CSV file.

    Each archive member whose name ends in '.vrt' is read line by line.
    Lines are collected until one containing '</text>' is seen; the
    collected lines are then parsed with extract_vrt_data_from_chunk and the
    resulting record is appended to the CSV as one row.

    Args:
        zip_filenames: Iterable of paths to zip archives.
        output_file: Path of the CSV file to append to. A header row is
            written first if the file is empty when it is opened.
        num_messages: Progress-reporting interval. A status line is printed
            each time the per-member message count is a multiple of this
            value. When omitted, the module-level name num_messages is used
            if it exists (the behaviour before this parameter was added),
            otherwise 500. A value of 0 turns reporting off.
    """
    if num_messages is None:
        # Older callers configure the interval through a module-level
        # variable; keep honouring it when no argument is passed.
        num_messages = globals().get('num_messages', 500)

    # Open the output once for the whole run: reopening it for every message
    # is loop-invariant work that dominates the run time on large archives.
    with open(output_file, 'a', newline='', encoding='utf-8') as out:
        # Append mode starts at the end of the file, so 0 means it is empty.
        needs_header = out.tell() == 0
        writer = None

        for zip_filename in zip_filenames:
            with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
                for vrt_file in zip_ref.namelist():
                    if not vrt_file.endswith('.vrt'):
                        continue

                    with zip_ref.open(vrt_file, 'r') as file:
                        pending_lines = []
                        count = 0
                        for raw_line in file:
                            # Archive members are read as bytes.
                            line = raw_line.decode('utf-8', errors='replace')
                            pending_lines.append(line)
                            if '</text>' not in line:
                                continue

                            entry = extract_vrt_data_from_chunk(''.join(pending_lines))
                            pending_lines.clear()

                            if writer is None:
                                # Column order comes from the first record.
                                writer = csv.DictWriter(out, fieldnames=list(entry))
                                if needs_header:
                                    writer.writeheader()
                            writer.writerow(entry)

                            count += 1
                            if num_messages and count % num_messages == 0:
                                print(f"{count} number of messages Done from {zip_filename}/{vrt_file}!")


# Archives to parse and the CSV file that collects their messages.
zip_files = [
    'data/suomi24-2001-2017-vrt-v1-2.zip',
    'data/suomi24-2018-2020-vrt-beta.zip',
]
output_file = 'data/parsed_data.csv'
# Read by process_zip_files as its progress interval: a status line is
# printed every num_messages messages of each .vrt file. It does not cap
# how many messages are parsed.
num_messages = 500

process_zip_files(zip_filenames=zip_files, output_file=output_file)

print("Done!")


500 number of messages Done from data/suomi24-2001-2017-vrt-v1-2.zip/suomi24-2001-2017-vrt-v1-2/vrt/s24_2001.vrt!
1000 number of messages Done from data/suomi24-2001-2017-vrt-v1-2.zip/suomi24-2001-2017-vrt-v1-2/vrt/s24_2001.vrt!
1500 number of messages Done from data/suomi24-2001-2017-vrt-v1-2.zip/suomi24-2001-2017-vrt-v1-2/vrt/s24_2001.vrt!
2000 number of messages Done from data/suomi24-2001-2017-vrt-v1-2.zip/suomi24-2001-2017-vrt-v1-2/vrt/s24_2001.vrt!
2500 number of messages Done from data/suomi24-2001-2017-vrt-v1-2.zip/suomi24-2001-2017-vrt-v1-2/vrt/s24_2001.vrt!
3000 number of messages Done from data/suomi24-2001-2017-vrt-v1-2.zip/suomi24-2001-2017-vrt-v1-2/vrt/s24_2001.vrt!
3500 number of messages Done from data/suomi24-2001-2017-vrt-v1-2.zip/suomi24-2001-2017-vrt-v1-2/vrt/s24_2001.vrt!
4000 number of messages Done from data/suomi24-2001-2017-vrt-v1-2.zip/suomi24-2001-2017-vrt-v1-2/vrt/s24_2001.vrt!
4500 number of messages Done from data/suomi24-2001-2017-vrt-v1-2.zip/suomi24-200

KeyboardInterrupt: 