In [1]:
import os


output_folder = "/media/ducha/SSDSHARED/VN/subs_dump/viet_subs_processed2"


def prepare_corpus(output_folder):
    def get_file(file_name):
        with open(os.path.join(output_folder, file_name), "r") as f:
            # Read each line and strip the newline character
            return set([line.strip() for line in f.readlines()])

    # List all files in the folder, that are actually vietnamese
    available_files = [
        fname for fname in os.listdir(output_folder) if fname.rsplit(".", 4)[1] == "vie"
    ]

    corpus = [
        (file_name.rsplit(".", 4)[0], get_file(file_name))
        for file_name in available_files
    ]
    return corpus


corpus = prepare_corpus(output_folder)
print(sum([len(c[1]) for c in corpus]))

7736833


# Raw Python

In [2]:
import re


def find_examples(ex, corpus, sentence_min_words=5, sentence_word_limit=30):
    found_examples = []
    regex_pattern = re.compile(rf"\b{ex}\b", re.IGNORECASE)
    for file_name, file_content in corpus:
        for line in file_content:
            # Check case insensitive, and sentence should not go over word limit
            contains_word = regex_pattern.search(line)
            under_word_limit = len(line.split()) < sentence_word_limit
            over_min_words = len(line.split()) > sentence_min_words
            if contains_word and under_word_limit and over_min_words:
                found_examples.append((file_name, line))

    # sort by Sentence length
    return sorted(found_examples, key=lambda x: len(x[1].split()), reverse=True)

In [4]:
%%time

example = "cô gái"
results = find_examples(example, corpus)
len(results)

CPU times: user 12.6 s, sys: 3.19 ms, total: 12.6 s
Wall time: 12.6 s


12752

# numpy

In [7]:
import numpy as np

corpus_np = np.array(corpus)

# Multiprocessing

## Manager and Queues

In [8]:
# Number of processes
from multiprocessing import Process, Manager, Pool
import math

# Create a Manager object to manage the shared state
num_processes = 6
manager = Manager()
result_queue = manager.Queue()

sentence_min_words = 5
sentence_word_limit = 30


def find_examples_mp(ex_pattern, corpus, result_queue):
    found_examples = []
    for file_name, file_content in corpus:
        for line in file_content:
            # Check case insensitive, and sentence should not go over word limit
            contains_word = ex_pattern.search(line)
            under_word_limit = len(line.split()) < sentence_word_limit
            over_min_words = len(line.split()) > sentence_min_words
            if contains_word and under_word_limit and over_min_words:
                found_examples.append((file_name, line))

    result_queue.put(found_examples)

In [9]:
%%time

# Create and start the processes
processes = []
split_size = len(corpus_np) // num_processes

ex_pattern = re.compile(rf"\b{example}\b", re.IGNORECASE)
for i in range(num_processes):
    split_corpus = corpus_np[i * split_size : (i + 1) * split_size]
    print("Assigning corpus length", len(split_corpus))
    p = Process(target=find_examples_mp, args=(ex_pattern, split_corpus, result_queue))
    p.start()
    processes.append(p)

# Wait for all processes to finish
for p in processes:
    p.join()

# Collect the results from the Queue and concatenate them into a single list
final_result = []
while not result_queue.empty():
    final_result.extend(result_queue.get())

final_result = sorted(final_result, key=lambda x: len(x[1].split()), reverse=True)

print(len(final_result))

Assigning corpus length 2274
Assigning corpus length 2274
Assigning corpus length 2274
Assigning corpus length 2274
Assigning corpus length 2274
Assigning corpus length 2274
12744
CPU times: user 20.5 ms, sys: 127 ms, total: 148 ms
Wall time: 2.65 s


In [10]:
set(results) - set(final_result)

{('zulu.(1964)',
  'Nhưng làm thế nào họ có thể cho phép bản thân Những cô gái trẻ với những ông già.'),
 ('zulu.(1964)',
  'Ở Châu Âu, những cô gái trẻ chấp nhận có lẽ các cô gái Zulu may mắn hơn Margareta .'),
 ('zulu.(2013)', 'Anh từng thấy cô gái này chưa?'),
 ('zulu.(2013)', 'Chúng tôi đang điều tra vụ án cô gái ở Thảo cầm viên.'),
 ('zulu.(2013)', 'Cô gái giúp việc cho nhà thờ bị Trong này sao?'),
 ('zulu.(2013)', 'Cô gái nắm được tóc hắn trong tay,'),
 ('zulu.(2013)', 'Cô đã từng thấy cô gái này chưa?'),
 ('zulu.(2013)', 'Là cô gái bị giết đúng không?')}

## Pools

In [3]:
%%time
with Pool(6) as pool:
    result = pool.map(test_process, data_splits)

len(list(result))


CPU times: user 13.3 s, sys: 3.35 s, total: 16.7 s
Wall time: 18.6 s


6

# JIT - Numba

In [8]:
import numba

@numba.jit(nopython=True)
def even_suffix(s):
    return s[-1] in {"0", "2", "4", "6", "8"}

@numba.jit(nopython=True)
def test_process(string_list):
    result = []
    for string in string_list:
        if even_suffix(string):
            result.append(string)
    return result

25000000

In [9]:
%%time

result = test_process(large_set_of_strings)
len(result)

CPU times: user 44.9 s, sys: 1.54 s, total: 46.4 s
Wall time: 46.4 s


25000000

# Multithreading

In [7]:
from concurrent.futures import ThreadPoolExecutor

def even_suffix(s):
    return s[-1] in {"0", "2", "4", "6", "8"}

def test_process(string_list):
    return [string for string in string_list if even_suffix(string)]

def main(string_list, num_threads):
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        results = list(executor.map(test_process, data_splits))
    
    return [r for result in results for r in result]


In [8]:
%%time
# Example usage
num_threads = 4
result = main(large_set_of_strings, num_threads)
print(len(result))

25000000
CPU times: user 4.75 s, sys: 159 ms, total: 4.91 s
Wall time: 4.91 s
