In [None]:
!pip install -q tqdm pyriksdagen ipywidgets

In [None]:
from pyparlaclarin.read import paragraph_iterator, speeches_with_name
from pyriksdagen.utils import protocol_iterators, download_corpus

from pathlib import Path
from queue import Queue
from lxml import etree
from tqdm.auto import tqdm

import progressbar
import pyriksdagen
import ipywidgets

import sqlite3
import re
import os

In [None]:
# One shared lxml parser for every protocol file; remove_blank_text=True is
# passed so whitespace-only text nodes are dropped while parsing.
parser = etree.XMLParser(remove_blank_text=True)

# Absolute path of the "data" directory below the current working directory.
data_dir = Path('.').resolve().joinpath('data')

In [None]:
# Minimum number of entries expected directly under ``data_dir`` before a
# partition counts as "already downloaded".
# NOTE(review): the thresholds presumably match the corpus release this
# notebook was written against -- confirm when the corpus is updated.
MIN_METADATA_FILES = 25
MIN_RECORD_DIRS = 158


def _data_dir_entries():
    """Return the entries directly under ``data_dir``.

    Returns an empty list when the directory does not exist yet, so the checks
    below trigger a download on a fresh checkout instead of raising
    ``FileNotFoundError`` from ``iterdir()``.
    """
    return list(data_dir.iterdir()) if data_dir.is_dir() else []


if sum(entry.is_file() for entry in _data_dir_entries()) < MIN_METADATA_FILES:
    print('Did not find metadata files. Downloading.')
    download_corpus(partitions=["persons"])

# Not used by the checks in this cell; kept because other cells may refer to it.
records_data_dir = data_dir / 'records'
# The directory is listed again because the download above may have changed it.
if sum(entry.is_dir() for entry in _data_dir_entries()) < MIN_RECORD_DIRS:
    print('Did not find protocols files. Downloading.')
    download_corpus(partitions=["records"])


In [None]:
# Collect the protocol paths for 1899-1941 in sorted order.  Later cells read
# the year from the second '/'-separated component of each path.
protocol_paths = protocol_iterators(corpus_root="data/", start=1899, end=1941)
protocols = sorted(protocol_paths)
print(len(protocols))

In [None]:
# Sanity check of the year parsing used by prepare_roots(): the second
# '/'-separated path component is expected to start with a four-digit year.
first_year_component = protocols[0].split('/')[1]
int(first_year_component[:4])

In [None]:
def prepare_roots(protocols):
    """Lazily parse protocol files and pair each XML root with its year.

    Args:
        protocols: iterable of '/'-separated path strings.  The second path
            component must start with a four-digit year.

    Yields:
        ``(root, year)`` tuples, where ``root`` is the root element parsed
        with the notebook-level ``parser`` and ``year`` is an ``int``.
    """
    for path in protocols:
        # Extract the year first so a malformed path fails before any parsing.
        segments = path.split('/')
        year = int(segments[1][:4])
        tree = etree.parse(path, parser)
        yield tree.getroot(), year

In [None]:
def process_root_queue(q: Queue):
    """Drain ``q`` breadth-first and yield one record per utterance element.

    Each queue item is a ``(depth, element, year)`` tuple.  An element that
    carries a ``who`` attribute is treated as an utterance and is not descended
    into further; any other element has its children queued with
    ``depth + 1``.

    Args:
        q: queue pre-filled with ``(depth, element, year)`` tuples.  It is
            consumed until empty.

    Yields:
        ``(u_id, prev, nxt, text, who, year)`` tuples.  ``u_id`` is the value
        of the attribute whose name ends in ``}id``, ``prev``/``nxt`` are the
        ``prev``/``next`` attributes (``None`` when absent), and ``text`` is
        the whitespace-normalised text of the direct children joined by blank
        lines.

    Raises:
        AssertionError: if an utterance element has no (non-empty) id.
    """
    while not q.empty():
        depth, element, year = q.get()
        who = element.get('who')

        if who is not None:
            # The id attribute is namespaced, so match on the '}id' suffix.
            u_id = next(
                (element.get(key) for key in element.keys() if key.endswith('}id')),
                None,
            )
            assert u_id, 'utterance element has no xml:id attribute'
            prev = element.get('prev')
            nxt = element.get('next')

            # A child without text has ``.text is None``; treat it as an empty
            # segment instead of letting re.sub() raise TypeError.
            text = '\n\n'.join(re.sub(r'\s+', ' ', seg.text or '') for seg in element)
            yield u_id, prev, nxt, text, who, year
            continue

        # Iterate the element directly; getchildren() is deprecated.
        for child in element:
            tag = child.tag
            if not isinstance(tag, str):
                # Comments / processing instructions have a non-string tag.
                continue
            # Skip note/seg children unless their text starts with a token
            # containing "dag" (presumably weekday headings -- TODO confirm).
            if tag.endswith(('note', 'seg')) and not re.search(r'^\S+dag', child.text or ''):
                continue
            q.put((depth + 1, child, year))
        


In [None]:
def extract_all_utterances(protocols):
    """Yield the utterance records of every protocol, one protocol at a time.

    Each protocol is parsed, walked with ``process_root_queue`` and released
    before the next one is parsed, so only a single XML tree is held in memory
    and the first records are available immediately (instead of after the
    whole corpus has been parsed and queued).

    Args:
        protocols: iterable of protocol paths accepted by ``prepare_roots``.

    Yields:
        The ``(u_id, prev, nxt, text, who, year)`` tuples produced by
        ``process_root_queue``, grouped by protocol in input order.
    """
    q = Queue()
    for root, year in prepare_roots(protocols):
        # The queue is fully drained by process_root_queue, so it can be reused.
        q.put((0, root, year))
        yield from process_root_queue(q)
    

In [None]:
# Materialise every utterance record in memory.  The progress-bar total is a
# hard-coded estimate (presumably the count from an earlier run -- TODO confirm).
all_utterances = []
utterance_stream = tqdm(extract_all_utterances(protocols), total=701_218)
all_utterances.extend(utterance_stream)

In [None]:
from itertools import islice
def batched(iterable, n, *, strict=False):
    """Split ``iterable`` into consecutive tuples of at most ``n`` items.

    Example: ``batched('ABCDEFG', 2)`` produces ``AB CD EF G``.

    Args:
        iterable: any iterable; it is consumed lazily.
        n: batch size, must be at least 1.
        strict: when true, a final batch shorter than ``n`` raises instead of
            being yielded.

    Yields:
        Tuples of up to ``n`` consecutive items.

    Raises:
        ValueError: if ``n < 1``, or if ``strict`` is set and the last batch
            is incomplete.
    """
    if n < 1:
        raise ValueError('n must be at least one')
    source = iter(iterable)
    while True:
        chunk = tuple(islice(source, n))
        if not chunk:
            return
        if strict and len(chunk) != n:
            raise ValueError('batched(): incomplete batch')
        yield chunk

In [27]:

tmp_db = './tmp.db'
BATCH_SIZE = 50_000  # rows per executemany()/commit round

# Start from a fresh database file so re-running this cell does not fail on
# already existing tables or duplicate primary keys.
if os.path.exists(tmp_db):
    os.unlink(tmp_db)

# Ceiling division: a final, partially filled batch is still one iteration,
# so the progress bar total must be rounded up rather than down.
n_batches = (len(all_utterances) + BATCH_SIZE - 1) // BATCH_SIZE

# ``with conn:`` only commits or rolls back; it does not close the connection,
# so close it explicitly once the load is finished (or has failed).
conn = sqlite3.connect(tmp_db)
try:
    with conn:
        cur = conn.cursor()
        # ``text`` gives the id column TEXT affinity; the previous ``str`` is
        # not a recognised type name and results in NUMERIC affinity.
        cur.execute('CREATE TABLE utterance (id text primary key, prev text, next text, who text, year int)')
        # Requires an SQLite build with FTS5; ``PRAGMA compile_options`` lists
        # ENABLE_FTS5 when it is available.
        cur.execute('CREATE VIRTUAL TABLE utterance_fts USING fts5(content)')
        cur.execute('CREATE index next_index on utterance(next)')
        cur.execute('CREATE index prev_index on utterance(prev)')
        cur.execute('CREATE index who_index on utterance(who)')
        cur.execute('CREATE index year_index on utterance(year)')

        for batch in tqdm(batched(all_utterances, BATCH_SIZE), total=n_batches):
            data = [
                {'id': u_id, 'prev': prev, 'next': nxt, 'content': text, 'who': who, 'year': year}
                for u_id, prev, nxt, text, who, year in batch
            ]
            # NOTE(review): the two tables are linked only by insertion order
            # (implicit rowid), so keep these two inserts in lockstep.
            cur.executemany('INSERT INTO utterance_fts (content) values (:content)', data)
            cur.executemany('INSERT INTO utterance (id, prev, next, who, year) values (:id, :prev, :next, :who, :year)', data)

            conn.commit()

        # TODO: Add speaker metadata
finally:
    conn.close()
    

15it [00:36,  2.44s/it]                                                                                                                                                                                                                       


In [37]:
from collections import Counter

# FTS5 query expressions whose hit counts are printed for comparison.
match_queries = [
    'kvinna AND kvinnor',
    'kvinna',
    'kvinnor',
    'kvinna OR kvinnor',
    'kvinn*',
]

# The sqlite3 connection context manager does not close the connection, so it
# is closed explicitly here.
conn = sqlite3.connect(tmp_db)
try:
    cur = conn.cursor()

    # Bind the query as a parameter: double-quoted text is an identifier in
    # standard SQL and is only accepted as a string literal by SQLite builds
    # that keep the legacy behaviour enabled.
    for match_query in match_queries:
        print(cur.execute('select count(*) from utterance_fts where content match ?', (match_query,)).fetchall())

    # Count every whitespace-separated token that starts with "kvinn"
    # (case-insensitive).  Punctuation stays attached to the token, so e.g.
    # "kvinnor," and "kvinnor" are counted separately.
    matching_rows = cur.execute('select content from utterance_fts where content match ?', ('kvinn*',))
    term_counter = Counter(
        token.lower()
        for (content,) in matching_rows
        for token in content.split()
        if token.lower().startswith('kvinn')
    )
finally:
    conn.close()
            
   


[(542,)]
[(2126,)]
[(3993,)]
[(5577,)]
[(10760,)]


In [38]:
# Show only the 50 most frequent variants to keep the cell output readable;
# call most_common() without an argument to get the full list.
term_counter.most_common(50)

[('kvinnliga', 4084),
 ('kvinnor', 3497),
 ('kvinnorna', 3035),
 ('kvinnan', 2212),
 ('kvinnor,', 1921),
 ('kvinna', 1698),
 ('kvinnans', 1509),
 ('kvinnornas', 1113),
 ('kvinnlig', 1015),
 ('kvinna,', 952),
 ('kvinnor.', 648),
 ('kvinnorna,', 534),
 ('kvinnors', 406),
 ('kvinnan,', 323),
 ('kvinnorna.', 306),
 ('kvinnas', 295),
 ('kvinna.', 213),
 ('kvinnligt', 188),
 ('kvinnan.', 168),
 ('kvinnliga,', 69),
 ('kvinnorösträtten', 62),
 ('kvinnoförbundet', 56),
 ('kvinnliga.', 54),
 ('kvinnohåll', 37),
 ('kvinnorösträttens', 36),
 ('kvinnorösträtten,', 33),
 ('kvinnlig,', 32),
 ('kvinnorna?', 28),
 ('kvinnor?', 25),
 ('kvinnor;', 23),
 ('kvinnoorganisationer', 22),
 ('kvinnohåll,', 20),
 ('kvinnlig.', 19),
 ('kvinn-', 19),
 ('kvinnofråga.', 18),
 ('kvinnorörelsen', 18),
 ('kvinnofrågan', 15),
 ('kvinnor.»', 15),
 ('kvinnorna:', 14),
 ('kvinnoförbundets', 14),
 ('kvinnornas.', 13),
 ('kvinnorösträtten.', 13),
 ('kvinna»', 12),
 ('kvinnoförbund', 12),
 ('kvinnor».', 11),
 ('kvinnoförening