In [1]:
import os
import zipfile
import re
import csv
from concurrent.futures import ThreadPoolExecutor
import threading

# Function to append data to a file
def append_to_file(data, filename):
    # Check if file exists to write headers
    file_exists = False
    try:
        with open(filename, 'r') as f:
            file_exists = True
    except FileNotFoundError:
        pass

    with open(filename, 'a', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=data.keys())
        if not file_exists:
            writer.writeheader()
        writer.writerow(data)

def extract_vrt_data_from_chunk(chunk):
    # Extract attributes of <text> element
    msg_type = re.search(r'msg_type="(.*?)"', chunk).group(1)
    datetime = re.search(r'datetime="(.*?)"', chunk).group(1)
    title = re.search(r'title="(.*?)"', chunk).group(1)
    thread_id = re.search(r'thread_id="(.*?)"', chunk).group(1)
    comment_id = re.search(r'comment_id="(.*?)"', chunk).group(1)
    topic_name_top = re.search(r'topic_name_top="(.*?)"', chunk).group(1)
    topic_name_leaf = re.search(r'topic_name_leaf="(.*?)"', chunk).group(1)

    # Extract sentences
    sentences = re.findall(r'<sentence(.*?)</sentence>', chunk, re.DOTALL)
    thread_text = ' '.join([' '.join(re.findall(r'^(\S+?)\t', sentence, re.MULTILINE)) for sentence in sentences])

    return {
        'msg_type': msg_type,
        'datetime': datetime,
        'title': title,
        'thread_id': thread_id,
        'comment_id': comment_id,
        'topic_name_top': topic_name_top,
        'topic_name_leaf': topic_name_leaf,
        'thread_text': thread_text,
    }

def process_single_vrt_file(zip_ref, vrt_file, output_folder, num_messages, vrt_count, zip_filename):
    if vrt_file.split("/")[-1] in ['s24_2001.vrt', 's24_2002.vrt', 's24_2003.vrt']:
        return
    output_file = os.path.join(output_folder, f'parsed_data_{vrt_count}.csv')
    with zip_ref.open(vrt_file, 'r') as file:
        print(f"{zip_ref} started!")
        chunk = ""
        # count = 0
        for line in file:
            line = line.decode('utf-8', errors='replace')
            chunk += line
            if '</text>' in line:
                entry = extract_vrt_data_from_chunk(chunk)
                append_to_file(entry, output_file)
                chunk = ""
                # count += 1
                # if count % num_messages == 0:
                #     print(f"{count} number of messages Done from {zip_filename}/{vrt_file}!")

def process_zip_files(zip_filenames, output_folder, num_messages, max_threads=4):
# with ThreadPoolExecutor(max_workers=max_workers) as executor:
    threads = []
    vrt_count = 0
    for zip_filename in zip_filenames:
        with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
            # Submit each .vrt file to the thread pool for processing
            for vrt_file in zip_ref.namelist():
                if vrt_file.endswith('.vrt'):
                    vrt_count += 1
                    thread = threading.Thread(target=process_single_vrt_file, args=(zip_ref, vrt_file, output_folder, num_messages, vrt_count, zip_filename))
                    threads.append(thread)
                    thread.start()
                    # executor.submit(process_single_vrt_file, zip_ref, vrt_file, output_folder, num_messages, vrt_count, zip_filename)
                    # If we've reached the max number of threads, wait for all of them to finish before starting more
                    if len(threads) == max_threads:
                        for t in threads:
                            t.join()
                        threads = []

    # Wait for any remaining threads to finish
    for t in threads:
        t.join()
                            
# Paths and parameters
zip_files = ['data/suomi24-2001-2017-vrt-v1-2.zip', 'data/suomi24-2018-2020-vrt-beta.zip']
output_folder = 'C:\\Users\\aghaffar23\\Desktop\\output'
num_messages = 2000  # Change this to the number of messages you want to parse per file

# Process the zip files
process_zip_files(zip_files, output_folder, num_messages, max_threads=20)

print("Done!")


<zipfile.ZipFile filename='data/suomi24-2001-2017-vrt-v1-2.zip' mode='r'> started!
<zipfile.ZipFile filename='data/suomi24-2001-2017-vrt-v1-2.zip' mode='r'> started!
<zipfile.ZipFile filename='data/suomi24-2001-2017-vrt-v1-2.zip' mode='r'> started!
<zipfile.ZipFile filename='data/suomi24-2001-2017-vrt-v1-2.zip' mode='r'> started!
<zipfile.ZipFile filename='data/suomi24-2001-2017-vrt-v1-2.zip' mode='r'> started!
<zipfile.ZipFile filename='data/suomi24-2001-2017-vrt-v1-2.zip' mode='r'> started!
<zipfile.ZipFile filename='data/suomi24-2001-2017-vrt-v1-2.zip' mode='r'> started!
<zipfile.ZipFile filename='data/suomi24-2001-2017-vrt-v1-2.zip' mode='r'> started!
<zipfile.ZipFile filename='data/suomi24-2001-2017-vrt-v1-2.zip' mode='r'> started!
<zipfile.ZipFile filename='data/suomi24-2001-2017-vrt-v1-2.zip' mode='r'> started!
<zipfile.ZipFile filename='data/suomi24-2001-2017-vrt-v1-2.zip' mode='r'> started!
<zipfile.ZipFile filename='data/suomi24-2001-2017-vrt-v1-2.zip' mode='r'> started!
<zip

Exception in thread Thread-10 (process_single_vrt_file):
Traceback (most recent call last):
  File "C:\Users\aghaffar23\AppData\Local\Programs\Python\Python311\Lib\threading.py", line 1038, in _bootstrap_inner
    self.run()
  File "C:\Users\aghaffar23\AppData\Local\Programs\Python\Python311\Lib\threading.py", line 975, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\aghaffar23\AppData\Local\Temp\ipykernel_14004\3280777335.py", line 62, in process_single_vrt_file
  File "C:\Users\aghaffar23\AppData\Local\Temp\ipykernel_14004\3280777335.py", line 13, in append_to_file
  File "c:\Users\aghaffar23\OneDrive - Oulun yliopisto\Work\Uni\Courses\NLP\codes\venv\Lib\site-packages\IPython\core\interactiveshell.py", line 286, in _modified_open
    return io_open(file, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
OSError: [Errno 22] Invalid argument: 'C:\\Users\\aghaffar23\\Desktop\\output\\parsed_data_6.csv'
