# DateAuthorGuesser Development Notebook

In [1]:
import os
import json
import re
from collections import defaultdict
import pandas as pd
import time

from sklearn.feature_extraction.text import CountVectorizer

from joblib import load, dump

In [50]:
""" PATHS """

ROOT_PATH = os.getcwd()
JSON_PATH_HEYDUK = os.path.join(ROOT_PATH, 'Dataset', 'heyduk.json')

DATA_PATH_HEYDUK = os.path.join(ROOT_PATH, 'Dataset', 'heyduk_data')

MODELS_PATH = os.path.join(ROOT_PATH, 'models')

GUESSES_PATH = os.path.join(ROOT_PATH, 'guesses')

ANALYSIS_PATH = os.path.join(ROOT_PATH, 'analysis')

In [None]:
# TODO: When transforming this to .py files change "window_size" parameter to year_window_size, so as it is not confusing with the window_size parameter in the segmentation functions!!

## PART ONE: trsnaforming heyduk.json to xml files

In [10]:
heyduk_json = json.load(open(JSON_PATH_HEYDUK, 'r', encoding='utf-8'))

In [14]:
def extract_composition_features_tokens(composition_data:dict, composition_name:str) -> dict:
    """
    Extracts the tokens from the composition data.
    
    :param composition_data: The composition data.
    :param composition_name: The name of the composition.
    """

    out_data = {'composition': composition_name, 'year': 0, 'full_data': '', 'types': []}
    
    out_data['year'] = int(composition_data['source']['year_published'])

    for element in composition_data['body']:
        stanza_type = element['meter'][0]['type']
        out_data['types'].append(stanza_type)

        out_data['full_data'] += f'<stanza pattern="{element["rhythm_pattern"]}">\n'
        words_data = element['words']
        for word in words_data:
            out_data['full_data'] += f'\t<token>form="{word['form']}" lemma="{word['lemma']}" upos="{word['upos']}" xpos="{word['xpos']}" feats="{word['feats']}"</token>\n'
        out_data['full_data'] += '</stanza>\n'

    composition_types = list(set(out_data['types']))
    # print(composition_types)
    out_data['types'] = composition_types

    out_data['full_data'] += f'<composition_info name="{composition_name}" year="{out_data["year"]}" types="{out_data["types"]}"></composition_info>'

    return out_data
    

In [15]:
""" Extracting the tokens from the Heyduk dataset """

years = []
types = []

for key in heyduk_json:
    # print('processing', key)
    composition_test = extract_composition_features_tokens(heyduk_json[key][0], key)

    years.append(composition_test['year'])
    types.append(composition_test['types'])

    with open(os.path.join(DATA_PATH_HEYDUK, f'{key}.xml'), 'w', encoding='utf-8') as file_:
        file_.write(composition_test['full_data'])

## PART TWO: Working with XML files

### Analysing dataset

In [35]:
def extract_value(text, parameter):
    """
    Extracts the value for a given parameter from the text.
    
    :param text: The text to extract from.
    :param parameter: The parameter to search for.
    :return: The extracted value.
    """
    pattern = rf'{parameter}="([^"]*)"'
    match = re.search(pattern, text)
    return eval(match.group(1)) if match else None

In [36]:
def get_year_et_types_from_composition(composition_name:str) -> tuple:
    """
    Gets the year and types from the composition name.
    
    :param composition_name: The name of the composition.
    """

    with open(os.path.join(DATA_PATH_HEYDUK, composition_name), 'r', encoding='utf-8') as file_:
        data = file_.read()
        lines = data.split('\n')
        comp_info = lines[-1]

        year = extract_value(comp_info, 'year')
        types = extract_value(comp_info, 'types')

    return year, types

In [37]:
years = []
types = []

for composition in os.listdir(DATA_PATH_HEYDUK):
    year, type_ = get_year_et_types_from_composition(composition)
    years.append(int(year))
    types.append(type_)

print('Years:', years)
print('Types:', types)

Years: [1858, 1859, 1864, 1865, 1873, 1873, 1874, 1876, 1878, 1879, 1881, 1882, 1883, 1883, 1883, 1884, 1884, 1885, 1885, 1886, 1886, 1888, 1888, 1888, 1890, 1894, 1894, 1897, 1897, 1897, 1898, 1898, 1898, 1899, 1899, 1899, 1899, 1899, 1899, 1900, 1900, 1900, 1900, 1900, 1901, 1901, 1901, 1901, 1902, 1902, 1902, 1903, 1904, 1905, 1906, 1908, 1908, 1908, 1908, 1909, 1910, 1910, 1910, 1913, 1913, 1919, 1920, 1920, 1920, 1921, 1921, 1921, 1923]
Types: [['iamb', 'dactyl', 'trochee'], ['iamb'], ['trochee'], ['iamb'], ['trochee'], ['trochee'], ['trochee'], ['trochee'], ['trochee'], ['iamb', 'dactyl'], ['trochee'], ['iamb'], ['trochee'], ['iamb'], ['trochee'], ['trochee'], ['trochee'], ['trochee'], ['trochee'], ['trochee'], ['logaedic'], ['iamb'], ['trochee'], ['trochee'], ['iamb'], ['trochee'], ['trochee'], ['trochee'], ['iamb'], ['iamb'], ['iamb'], ['iamb'], ['trochee'], ['iamb'], ['trochee'], ['iamb'], ['trochee'], ['trochee'], ['iamb'], ['iamb'], ['iamb'], ['iamb'], ['iamb'], ['dactyl', '

In [38]:
def group_years_into_windows(years, window_size=5):
    """
    Group years into window-size-year windows and count occurrences.
    """
    # Initialize empty dictionary for results
    grouped_years = {}
    
    for year in years:
        # Calculate the start year of the window (round down to nearest 5-year interval)
        window_start = year - (year % window_size)
        # Create the window label
        window_label = f'{window_start}-{window_start + (window_size - 1)}'
        # Increment the count for this window
        grouped_years[window_label] = grouped_years.get(window_label, 0) + 1
    
    # Sort the dictionary by window start year
    return dict(sorted(grouped_years.items()))

def group_types(types):
    """
    Group types into windows and count occurrences.
    """
    grouped_types = {}
    
    for type_list in types:
        type_tuple = tuple(sorted(type_list))  # Convert list to sorted tuple to handle order
        grouped_types[type_tuple] = grouped_types.get(type_tuple, 0) + 1
    
    return grouped_types

In [39]:
group_years_into_windows(years, 10)

# NOTE: window of 10 years will be used, because the data is sparse and for some years in 5-year windows there is only one composition.

{'1850-1859': 2,
 '1860-1869': 2,
 '1870-1879': 6,
 '1880-1889': 14,
 '1890-1899': 15,
 '1900-1909': 21,
 '1910-1919': 6,
 '1920-1929': 7}

In [40]:
group_types(types)

# NOTE: do the types change the results?

{('dactyl', 'iamb', 'trochee'): 1,
 ('iamb',): 37,
 ('trochee',): 30,
 ('dactyl', 'iamb'): 1,
 ('logaedic',): 2,
 ('dactyl', 'trochee'): 1,
 ('dactyl', 'logaedic', 'trochee'): 1}

In [12]:
print(composition_test['full_data'])

	<stanza pattern="01010001010">
		<token>form="Vždy" lemma="vždy" upos="ADV" xpos="Db-------------" feats="PronType=Tot"</token>
		<token>form="pevným" lemma="pevný" upos="ADJ" xpos="AAMS7----1A----" feats="Animacy=Anim|Case=Ins|Degree=Pos|Gender=Masc|Number=Sing|Polarity=Pos"</token>
		<token>form="v" lemma="v" upos="ADP" xpos="RR--6----------" feats="AdpType=Prep|Case=Loc"</token>
		<token>form="právu" lemma="právo" upos="NOUN" xpos="NNNS6-----A----" feats="Case=Loc|Gender=Neut|Number=Sing|Polarity=Pos"</token>
		<token>form="buď" lemma="buď" upos="CCONJ" xpos="J^-------------" feats="_"</token>
		<token>form="jak" lemma="jak" upos="SCONJ" xpos="J,-------------" feats="_"</token>
		<token>form="živá" lemma="živý" upos="ADJ" xpos="AAFS1----1A----" feats="Case=Nom|Degree=Pos|Gender=Fem|Number=Sing|Polarity=Pos"</token>
		<token>form="skála" lemma="skála" upos="NOUN" xpos="NNFS1-----A----" feats="Case=Nom|Gender=Fem|Number=Sing|Polarity=Pos"</token>
		<token>form="," lemma="," upos="PUN

### Extracting features

In [41]:
def extract_token_attributes(line):
    """
    Extracts attributes from a token line.
    :param line: The token line.
    :return: A dictionary with the extracted attributes.
    """
    pattern = r'(\w+)="([^"]*)"'
    attributes = dict(re.findall(pattern, line))
    return attributes

def transform_xml_to_token_data(xml_data, feature_type: str = 'form', autosem_delexicalise: bool = False, ignore_interpunkt: bool = True):
    """
    Transforms tokens in XML data to a given type.
    :param xml_data: The XML data.
    :param feature_type: The feature type to extract (form, lemma, upos, xpos, feats).
    :param delexicalize: Whether to delexicalize the extracted features (autosemantic UPOS tokens are recorded as UPOS in all cases).
    :param ignore_interpunkt: Whether to ignore interpunkt tokens.
    """
    output = ''

    lines = xml_data.split('\n')

    for line in lines:
        if '<token' in line:
            token_attributes = extract_token_attributes(line)
            token_value = token_attributes.get(feature_type, '')
            if autosem_delexicalise and token_attributes['upos'] in ['NOUN', 'PROPN', 'VERB', 'ADJ', 'ADV', 'NUM']:
                token_value = f'UPOS_{token_attributes['upos']}'
            elif ignore_interpunkt and token_attributes['upos'] == 'PUNCT':
                continue
            elif ignore_interpunkt and token_attributes['upos'] == 'SYM':
                continue
            elif ignore_interpunkt and token_attributes['lemma'] in ['.', ',', ';', ':', '!', '?', '(', ')', '[', ']', '{', '}', '"', "'", '„', '“', '”', '–', '—', '…']:
                # NOTE: this has been added because some interpunkt tokens are not marked as PUNCT or SYM (e.g., - or — has been marked as ADP for some reason).
                continue
            output += f'\t\t<token>{token_value}</token>\n'
        else:
            output += line + '\n'

    return output

In [42]:
print(composition_test['composition'])
print(transform_xml_to_token_data(composition_test['full_data'], 'upos', autosem_delexicalise=True))

1898_Cesti_spisovatele_ceskym_delnikum_narodnim_k_1._maji_1898
<stanza pattern="01010001010">
		<token>UPOS_ADV</token>
		<token>UPOS_ADJ</token>
		<token>ADP</token>
		<token>UPOS_NOUN</token>
		<token>CCONJ</token>
		<token>SCONJ</token>
		<token>UPOS_ADJ</token>
		<token>UPOS_NOUN</token>
</stanza>
<stanza pattern="0101010001">
		<token>PRON</token>
		<token>PART</token>
		<token>PRON</token>
		<token>UPOS_NOUN</token>
		<token>UPOS_VERB</token>
		<token>PRON</token>
		<token>UPOS_VERB</token>
</stanza>
<stanza pattern="01010001010">
		<token>CCONJ</token>
		<token>UPOS_NOUN</token>
		<token>UPOS_VERB</token>
		<token>ADP</token>
		<token>PRON</token>
		<token>CCONJ</token>
		<token>UPOS_NOUN</token>
		<token>UPOS_VERB</token>
</stanza>
<stanza pattern="0101001001">
		<token>UPOS_ADV</token>
		<token>UPOS_ADJ</token>
		<token>UPOS_NOUN</token>
		<token>DET</token>
		<token>UPOS_VERB</token>
		<token>UPOS_VERB</token>
</stanza>
<stanza pattern="01010001010">
		<token>UPOS_ADV</token>

In [64]:
def assigns_year(year: int, window_size: int = 5):
    """
    Assigns a year to a window.
    :param year: The year to assign.
    :param window_size: The size of the window.
    :return: The window label.
    """
    window_start = year - (year % window_size)
    return f'{window_start}-{window_start + (window_size - 1)}'

def count_token(file_path:str):
    """
    Count the tokens in the given file.
    :param file_path: The path to the file.
    """
    # TODO: add ignore interpunkt tokens??

    with open(file_path, 'r', encoding='utf-8') as file_:
        data = file_.read()
        token_count = data.count('<token>')
    
    return token_count

def count_stanza(file_path:str):
    """
    Count the stanzas in the given file.
    :param file_path: The path to the file.
    """
    with open(file_path, 'r', encoding='utf-8') as file_:
        data = file_.read()
        stanza_count = data.count('</stanza>')
    
    return stanza_count

def count_stanza_lens(file_path:str):
    """
    Count the stanzas in the given file.
    :param file_path: The path to the file.
    """

    # TODO: add ignore interpunkt tokens??

    stanza_lens = []

    with open(file_path, 'r', encoding='utf-8') as file_:
        data = file_.read()

        stanzas = data.split('</stanza>')
        for stanza in stanzas:
            token_count = stanza.count('<token>')

            stanza_lens.append(token_count)
    
    return stanza_lens

def join_one_years_period(year:str):
    years_range = year.split('-')
    if years_range[0] == years_range[1]:
        return years_range[0]
    else:
        return year

def count_years_et_tokens(path:str, window_size:int = 10):
    """
    Count the years and tokens in the given path.
    :param path: The path to the folder, where XML files are stored.
    """
    years = defaultdict(int)

    for file_name in os.listdir(path):
        if file_name.endswith('.xml'):
            path_to_file = os.path.join(path, file_name)
            year = int(file_name.split('_')[0])
            period = assigns_year(year, window_size=window_size)
            period = join_one_years_period(period)
            token_count = count_token(path_to_file)
            years[period] += token_count
    
    return years

def count_years_et_stanzas(path:str, window_size:int = 10):
    """
    Count the years and tokens in the given path.
    :param path: The path to the folder, where XML files are stored.
    """
    years = defaultdict(int)

    for file_name in os.listdir(path):
        if file_name.endswith('.xml'):
            path_to_file = os.path.join(path, file_name)
            year = int(file_name.split('_')[0])
            period = assigns_year(year, window_size=window_size)
            period = join_one_years_period(period)
            stanza_count = count_stanza(path_to_file)
            years[period] += stanza_count
    
    return years

In [61]:
token_counts = []
stanza_counts = []
stanza_lens = []

for file_name in os.listdir(DATA_PATH_HEYDUK):
    if file_name.endswith('.xml'):
        path_to_file = os.path.join(DATA_PATH_HEYDUK, file_name)
        token_count = count_token(path_to_file)
        stanza_count = count_stanza(path_to_file)
        stanzas_lens = count_stanza_lens(path_to_file)
        
        stanza_lens.extend(stanzas_lens)
        token_counts.append(token_count)
        stanza_counts.append(stanza_count)

print('Total token count:', sum(token_counts), 'total stanza count:', sum(stanza_counts))
print('Average token count:', sum(token_counts) / len(token_counts), 'average stanza count:', sum(stanza_counts) / len(stanza_counts))
print('Median token count:', sorted(token_counts)[len(token_counts) // 2], 'median stanza count:', sorted(stanza_counts)[len(stanza_counts) // 2])
print('Max token count:', max(token_counts), 'max stanza count:', max(stanza_counts))
print('Min token count:', min(token_counts), 'min stanza count:', min(stanza_counts))
print('Average stanza length:', sum(stanza_lens) / len(stanza_lens))
print('Median stanza length:', sorted(stanza_lens)[len(stanza_lens) // 2])
print('Max stanza length:', max(stanza_lens), 'Min stanza length:', min(stanza_lens))

Total token count: 43586 total stanza count: 6572
Average token count: 597.068493150685 average stanza count: 90.02739726027397
Median token count: 185 median stanza count: 32
Max token count: 8634 max stanza count: 1255
Min token count: 13 min stanza count: 2
Average stanza length: 6.559217456734387
Median stanza length: 6
Max stanza length: 20 Min stanza length: 0


In [105]:
def join_tokens_stanzas_data(token_data:dict, stanza_data:dict):
    """
    Joins token and stanza data.
    :param token_data: The token data.
    :param stanza_data: The stanza data.
    """
    joined_data = {}
    for key in token_data:
        joined_data[key] = {'tokens': token_data[key], 'stanzas': stanza_data[key]}
        
    return joined_data

def make_csv_for_flourish(data:dict, file_name:str, fill_empties=False, joined=False, destination:str=ANALYSIS_PATH):
    """
    Makes a CSV file for Flourish.

    :param data: The data to store.
    :param file_name: The name of the file.
    :param fill_empties: Whether to fill the empty values - works only for one-year windows.
    :param destination: The destination folder.
    """

    if fill_empties:
        new_dict = {}
        start_year = int(list(data.keys())[0])
        end_year = int(list(data.keys())[-1])
        for key in range(start_year, end_year+1):
            key = str(key)
            if key not in data:
                new_dict[key] = {'tokens': 0, 'stanzas': 0}
            else:
                new_dict[key] = data[key]
        data = new_dict

    if joined:
        df = pd.DataFrame.from_dict(data, orient='index', columns=['tokens', 'stanzas'])
    else:
        df = pd.DataFrame.from_dict(data, orient='index', columns=['value'])

    df.to_csv(os.path.join(destination, f'{file_name}.csv'), header=True)

In [98]:
tokens_years_data = count_years_et_tokens(DATA_PATH_HEYDUK, window_size=1)
make_csv_for_flourish(tokens_years_data, 'tokens_years_data', fill_empties=True)

In [99]:
stanzas_years_data = count_years_et_stanzas(DATA_PATH_HEYDUK, window_size=1)
make_csv_for_flourish(stanzas_years_data, 'stanzas_years_data', fill_empties=True)

In [107]:
joined_tokens_years_data = join_tokens_stanzas_data(tokens_years_data, stanzas_years_data)
make_csv_for_flourish(joined_tokens_years_data, 'joined_tokens_stanzas_years_data', fill_empties=True, joined=True)

In [112]:
tokens_years_data_10 = count_years_et_tokens(DATA_PATH_HEYDUK, window_size=10)
make_csv_for_flourish(tokens_years_data, 'tokens_years_data_10_years')

In [113]:
stanzas_years_data_10 = count_years_et_stanzas(DATA_PATH_HEYDUK, window_size=10)
make_csv_for_flourish(stanzas_years_data, 'stanzas_years_data_10_years')

In [114]:
joined_tokens_years_data:10 = join_tokens_stanzas_data(tokens_years_data_10, stanzas_years_data_10)
make_csv_for_flourish(joined_tokens_years_data, 'joined_tokens_stanzas_years_data_10_years', joined=True)

# Training Models

## Experiment Design

- Because of the sparse data, we will use periods as targets; we will start with a window of 10 years.
- Because of the sparse data, including the very limited length of each composition, we will first try to do the leave-one-out experiment for each decade --> in sum, there will be eight itterations.
- we will do segmentation by stanzas. First, we will try what a one-length segments will do. If this does not work, we may try to do larger segments, which may result in deleting some compositions (the shortest has just 2 stanza) We may itterate through stanzas to densen the data... if this does not work, we may try to do attribution by the same avg. token length
- In the next step, we may try do itterations of the leave-one-out design, so that each of the works at our disposal is used as a test document.

## Delexicalization types (r)
Delexicalization types follow the designation created for NKP AuthorGuesser
- r-04: No delexicalisation (baseline) — original word forms are used ('forms')
- r-05: Lemmatisation — lemmas used instead of word forms ('lemma')
- r-06: Part-of-speech tags for all words ('upos')
- r-07: Morphological tags for all words ('xpos')
- r-08: Part-of-speech tags for autosemantic words, others lemmatised ('lemma' + autosem_delexicalise=True)
- r-09: Morphological tags for autosemantic words, others lemmatised --> not used here

In [186]:
def split_data_to_statza(xml_data:str):
    """
    Split the XML data into stanzas.
    :param xml_data: The XML data.
    """
    return xml_data.split('</stanza>')[:-1]

def get_tokens_from_stanza(stanza:str):
    """
    Get the tokens from a stanza.
    :param stanza: The stanza.
    """
    tokens = re.findall(r'<token>(.*?)</token>', stanza)
    return tokens

def iterate_over_stanzas(stanzas:list, window_size:int = 2, shift_size:int = 1):
    """
    Iterate over stanzas in a given window size.
    :param stanzas: The stanzas.
    :param window_size: The window size.
    """
    if shift_size > window_size:
        raise ValueError('Shift size must be less than or equal to window size.')
    for i in range(0, len(stanzas) - window_size + 1, shift_size):
        window = stanzas[i:i + window_size]
        yield window

def segment_file(file_path:str, segment_window:int = 1, shift_size:int = 1, feature_type: str = 'form', autosem_delexicalise: bool = False, ignore_interpunkt: bool = True):
    """
    Segment the file into segments of a given size.
    :param file_path: The path to the file.
    :param segment_size: The size of the segments.
    :param feature_type: The feature type to extract (form, lemma, upos, xpos, feats).
    :param autosem_delexicalise: Whether to delexicalize the extracted features (autosemantic UPOS tokens are recorded as UPOS in all cases).
    """
    with open(file_path, 'r', encoding='utf-8') as file_:
        data = file_.read()
        
        transfromed_data = transform_xml_to_token_data(data, feature_type=feature_type, autosem_delexicalise=autosem_delexicalise, ignore_interpunkt=ignore_interpunkt)

        single_stanzas = split_data_to_statza(transfromed_data)
        stanzas_tokens = [get_tokens_from_stanza(stanza) for stanza in single_stanzas]

        segments = iterate_over_stanzas(stanzas_tokens, window_size=segment_window, shift_size=shift_size)

        joined_segments = []
        for segment in segments:
            joined_segment = []
            for stanza in segment:
                joined_segment.extend(stanza)
            joined_segments.append(joined_segment)
    
    return joined_segments

In [193]:
def segment_file_to_data_et_targets(file_path:str, year_window_size=10, segment_window=2, shift_size=1, feature_type='lemma', autosem_delexicalise=True, ignore_interpunkt=True):
    """
    This function creates segments from the given file.
    """
    targets = []
    data = []

    composition_year = get_year_et_types_from_composition(file_path)[0]
    composition_period = assigns_year(composition_year, window_size=year_window_size)

    # NOTE: if the segment_window is greater than number of stanzas in the composition, the window will be set to the number of stanzas.
    # TODO: Evaluate if this is the best approach.
    stanza_count = count_stanza(file_path)
    if segment_window > stanza_count:
        segment_window = stanza_count
        if shift_size > segment_window:
            shift_size = segment_window

    segments = segment_file(file_path=file_path, segment_window=segment_window, shift_size=shift_size, feature_type=feature_type, autosem_delexicalise=autosem_delexicalise, ignore_interpunkt=ignore_interpunkt)

    for seg in segments:
        targets.append(composition_period)

        seg = ' '.join(seg)
        data.append(seg)

    # print('Number of segments:', len(data))
    # print('Number of targets:', len(targets))

    return data, targets

In [136]:
class NKPdateship:
    def __init__(self, data, targets):
        self.data = data
        self.targets = targets

        self.target_names = sorted(set(targets))

    def __len__(self):
        return len(self.data)

    def check_valid(self):
        return len(self.data) == len(self.targets)

In [129]:
def select_model(model_name:str):
    """ This function selects relevant model based on input. """
    if model_name == 'MultinomialNB':
        from sklearn.naive_bayes import MultinomialNB
        return MultinomialNB
    elif model_name == 'SVC':
        from sklearn.svm import SVC
        return SVC
    elif model_name == 'LinearSVC':
        from sklearn.svm import LinearSVC
        return LinearSVC
    elif model_name == 'KNeighborsClassifier':
        from sklearn.neighbors import KNeighborsClassifier
        return KNeighborsClassifier
    elif model_name == 'SGDClassifier':
        from sklearn.linear_model import SGDClassifier
        return SGDClassifier
    elif model_name == 'DecisionTreeClassifier':
        from sklearn.tree import DecisionTreeClassifier
        return DecisionTreeClassifier
    else:
        print('Selected model is not available in this version')
        return

In [163]:
def train_model(train_dataset:NKPdateship, model_prefix:str, model_name='LinearSVC', n_min=1, n_max=2, models_path=MODELS_PATH):
    """ 
    This function trains the model on the given dataset.

    :param train_dataset: The dataset to train on.
    :param model_name: The name of the model to use (choose from MultinomialNB, SVC, LinearSVC, KNeighborsClassifier, SGDClassifier, and DecisionTreeClassifier).
    :param n_min: The minimum n-gram range.
    :param n_max: The maximum n-gram range.
    :param models_path: The path where the models should be saved.
    """
    # TODO: implement model parameters configuration?

    model_training_start = time.time()
    print('\tTraining started')
    vectorizer = CountVectorizer(ngram_range=(n_min, n_max))

    X_train_counts = vectorizer.fit_transform(train_dataset.data)

    dump(vectorizer, os.path.join(models_path, f'{model_prefix}_vectorizer_n-{n_min},{n_max}.joblib'))

    model = select_model(model_name)

    clf = model()

    clf.fit(X_train_counts, train_dataset.targets)

    dump(clf, os.path.join(models_path, f'{model_prefix}_model_n-{n_min},{n_max}.m-{model_name}.joblib'))
    model_training_end = time.time()
    print('\tmodel trained in', model_training_end-model_training_start, 'seconds')

In [194]:
""" Training models with leave-one-out design."""

files_for_hejduk = os.listdir(DATA_PATH_HEYDUK)

print('number of files --> nuber of models:', len(files_for_hejduk))

for test_f_hejduk in files_for_hejduk:
    print('Training model with leave-one-out design for', test_f_hejduk)

    data = []
    targets = []

    for train_f_hejduk_ in files_for_hejduk:
        if train_f_hejduk_ != test_f_hejduk:
            data_, targets_ = segment_file_to_data_et_targets(os.path.join(DATA_PATH_HEYDUK, train_f_hejduk_), year_window_size=10, segment_window=10, shift_size=5, feature_type='lemma', autosem_delexicalise=True, ignore_interpunkt=True)
            data.extend(data_)
            targets.extend(targets_)
    
    Hejduk_dateship = NKPdateship(data, targets)
    
    train_model(Hejduk_dateship, model_prefix=f'LOO_{test_f_hejduk}', model_name='LinearSVC', n_min=1, n_max=2)


number of files --> nuber of models: 73
Training model with leave-one-out design for 1858_Maj.xml
	Training started




	model trained in 1.3060550689697266 seconds
Training model with leave-one-out design for 1859_Basne.xml
	Training started




	model trained in 1.343045711517334 seconds
Training model with leave-one-out design for 1864_Basne_2,_1.xml
	Training started




	model trained in 1.2143003940582275 seconds
Training model with leave-one-out design for 1865_Basne_2,_2.xml
	Training started




	model trained in 1.14583158493042 seconds
Training model with leave-one-out design for 1873_Lesni_kviti.xml
	Training started




	model trained in 1.2727625370025635 seconds
Training model with leave-one-out design for 1873_Ruch_3.xml
	Training started




	model trained in 1.2459914684295654 seconds
Training model with leave-one-out design for 1874_Almanach_na_oslavu_25leteho_trvani_Akademicko-ctenarskeho_spolku_prazskeho.xml
	Training started




	model trained in 1.2860918045043945 seconds
Training model with leave-one-out design for 1876_Cimbal_a_husle.xml
	Training started




	model trained in 1.2941851615905762 seconds
Training model with leave-one-out design for 1878_Mohamed_II..xml
	Training started




	model trained in 1.0818302631378174 seconds
Training model with leave-one-out design for 1879_Deduv_odkaz.xml
	Training started




	model trained in 0.9338333606719971 seconds
Training model with leave-one-out design for 1881_Dudak.xml
	Training started




	model trained in 1.0526113510131836 seconds
Training model with leave-one-out design for 1882_Drevorubec.xml
	Training started




	model trained in 1.2001049518585205 seconds
Training model with leave-one-out design for 1883_Oldrich_a_Bozena.xml
	Training started




	model trained in 1.2417490482330322 seconds
Training model with leave-one-out design for 1883_V_zatisi.xml
	Training started




	model trained in 0.9712679386138916 seconds
Training model with leave-one-out design for 1883_Za_volnost_a_viru.xml
	Training started




	model trained in 0.9490435123443604 seconds
Training model with leave-one-out design for 1884_Horec_a_srdecnik.xml
	Training started




	model trained in 1.4860563278198242 seconds
Training model with leave-one-out design for 1884_Na_prastkach.xml
	Training started




	model trained in 1.1990540027618408 seconds
Training model with leave-one-out design for 1885_Pisne.xml
	Training started




	model trained in 1.158383846282959 seconds
Training model with leave-one-out design for 1885_Pod_Vitkovym_kamenem.xml
	Training started




	model trained in 1.139397144317627 seconds
Training model with leave-one-out design for 1886_Bela.xml
	Training started




	model trained in 1.0613799095153809 seconds
Training model with leave-one-out design for 1886_Zavate_listy.xml
	Training started




	model trained in 1.218550682067871 seconds
Training model with leave-one-out design for 1888_Obrazky.xml
	Training started




	model trained in 1.4658854007720947 seconds
Training model with leave-one-out design for 1888_Pisen_o_bitve_u_Kressenbrunnu.xml
	Training started




	model trained in 1.454420566558838 seconds
Training model with leave-one-out design for 1888_Sipy_a_paprsky.xml
	Training started




	model trained in 1.2577331066131592 seconds
Training model with leave-one-out design for 1890_Na_vlnach.xml
	Training started




	model trained in 1.0670180320739746 seconds
Training model with leave-one-out design for 1894_Bohatyri.xml
	Training started




	model trained in 1.1012601852416992 seconds
Training model with leave-one-out design for 1894_Na_potulkach.xml
	Training started




	model trained in 1.1363894939422607 seconds
Training model with leave-one-out design for 1897_Nove_Ciganske_melodie_(in_Spisy_Adolfa_Heyduka,_svazek_1).xml
	Training started




	model trained in 1.3352584838867188 seconds
Training model with leave-one-out design for 1897_Ptaci_motivy_1,_2_(in_Spisy_Adolfa_Heyduka,_svazek_2).xml
	Training started




	model trained in 1.3933908939361572 seconds
Training model with leave-one-out design for 1897_Ptaci_motivy_3,_4_(in_Spisy_Adolfa_Heyduka,_svazek_2).xml
	Training started




	model trained in 1.03116774559021 seconds
Training model with leave-one-out design for 1898_Cesti_spisovatele_ceskym_delnikum_narodnim_k_1._maji_1898.xml
	Training started




	model trained in 1.1372098922729492 seconds
Training model with leave-one-out design for 1898_Milota_(in_Spisy_Adolfa_Heyduka,_svazek_7).xml
	Training started




	model trained in 0.8697681427001953 seconds
Training model with leave-one-out design for 1898_Tri_zkazky_(in_Spisy_Adolfa_Heyduka,_svazek_5).xml
	Training started




	model trained in 1.109358549118042 seconds
Training model with leave-one-out design for 1899_Dumy_a_dojmy_(in_Spisy_Adolfa_Heyduka,_svazek_13).xml
	Training started




	model trained in 1.2522950172424316 seconds
Training model with leave-one-out design for 1899_Rosa_a_jini_(in_Spisy_Adolfa_Heyduka,_svazek_15).xml
	Training started




	model trained in 1.1680262088775635 seconds
Training model with leave-one-out design for 1899_Za_dlouhych_veceru_(in_Spisy_Adolfa_Heyduka,_svazek_14).xml
	Training started




	model trained in 0.9775798320770264 seconds
Training model with leave-one-out design for 1899_Zpevy_posumavskeho_dudaka_1_(in_Spisy_Adolfa_Heyduka,_svazek_10).xml
	Training started




	model trained in 0.9263820648193359 seconds
Training model with leave-one-out design for 1899_Zpevy_posumavskeho_dudaka_2_(in_Spisy_Adolfa_Heyduka_11).xml
	Training started




	model trained in 1.2797601222991943 seconds
Training model with leave-one-out design for 1899_Zpevy_posumavskeho_dudaka_3_(in_Spisy_Adolfa_Heyduka,_svazek_12).xml
	Training started




	model trained in 1.304673194885254 seconds
Training model with leave-one-out design for 1900_Cerne_ruze_(in_Spisy_Adolfa_Heyduka,_svazek_21).xml
	Training started




	model trained in 1.284611463546753 seconds
Training model with leave-one-out design for 1900_Na_cerne_hodince_(in_Spisy_Adolfa_Heyduka,_svazek_16).xml
	Training started




	model trained in 0.9247102737426758 seconds
Training model with leave-one-out design for 1900_Parnasie_(in_Spisy_Adolfa_Heyduka,_svazek_17).xml
	Training started




	model trained in 1.493370532989502 seconds
Training model with leave-one-out design for 1900_V_polich_(in_Spisy_Adolfa_Heyduka,_svazek_19).xml
	Training started




	model trained in 1.081106185913086 seconds
Training model with leave-one-out design for 1900_V_zaseru_minulosti_(in_Spisy_Adolfa_Heyduka,_svazek_18).xml
	Training started




	model trained in 1.1042428016662598 seconds
Training model with leave-one-out design for 1901_Lotysske_motivy_(in_Spisy_Adolfa_Heyduka,_svazek_23).xml
	Training started




	model trained in 1.3606042861938477 seconds
Training model with leave-one-out design for 1901_Pohadky_duse_(in_Spisy_Adolfa_Heyduka,_svazek_25).xml
	Training started




	model trained in 0.9783365726470947 seconds
Training model with leave-one-out design for 1901_V_samotach_(in_Spisy_Adolfa_Heyduka,_svazek_26).xml
	Training started




	model trained in 1.210336685180664 seconds
Training model with leave-one-out design for 1901_Z_rodnych_hor_(in_Spisy_Adolfa_Heyduka,_svazek_24).xml
	Training started




	model trained in 1.3092477321624756 seconds
Training model with leave-one-out design for 1902_Biblicke_zvesti_(in_Spisy_Adolfa_Heyduka,_svazek_31).xml
	Training started




	model trained in 0.9753825664520264 seconds
Training model with leave-one-out design for 1902_Ritornely_(in_Spisy_Adolfa_Heyduka,_svazek_29).xml
	Training started




	model trained in 1.2273950576782227 seconds
Training model with leave-one-out design for 1902_Z_pouti_na_Kavkaz_(in_Spisy_Adolfa_Heyduka,_svazek_30).xml
	Training started




	model trained in 1.2673227787017822 seconds
Training model with leave-one-out design for 1903_Cestou_(in_Spisy_Adolfa_Heyduka,_svazek_34).xml
	Training started




	model trained in 1.2736279964447021 seconds
Training model with leave-one-out design for 1904_Z_deniku_toulaveho_zpevaka_(in_Spisy_Adolfa_Heyduka,_svazek_36).xml
	Training started




	model trained in 0.9837474822998047 seconds
Training model with leave-one-out design for 1905_Znelky_(in_Spisy_Adolfa_Heyduka,_svazek_40).xml
	Training started




	model trained in 1.1589412689208984 seconds
Training model with leave-one-out design for 1906_Vychod_a_Zapad_(in_Spisy_Adolfa_Heyduka,_svazek_41).xml
	Training started




	model trained in 0.9069287776947021 seconds
Training model with leave-one-out design for 1908_Apel_k_trunu_za_vrazdy_v_Cernove.xml
	Training started




	model trained in 1.2617125511169434 seconds
Training model with leave-one-out design for 1908_Sekernik_1_(in_Spisy_Adolfa_Heyduka,_svazek_3).xml
	Training started




	model trained in 1.0343375205993652 seconds
Training model with leave-one-out design for 1908_Sekernik_2_(in_Spisy_Adolfa_Heyduka,_svazek_3).xml
	Training started




	model trained in 1.0340535640716553 seconds
Training model with leave-one-out design for 1908_Skudci_a_dobrodruzi_(Spisy_Adolfa_Heyduka,_svazek_46).xml
	Training started




	model trained in 1.0175516605377197 seconds
Training model with leave-one-out design for 1909_Almanach_akademickeho_spolku_Orlican.xml
	Training started




	model trained in 1.2996163368225098 seconds
Training model with leave-one-out design for 1910_Co_hlavou_tahlo_(in_Spisy_Adolfa_Heyduka,_svazek_50).xml
	Training started




	model trained in 1.262563705444336 seconds
Training model with leave-one-out design for 1910_Oddech_v_lese.xml
	Training started




	model trained in 1.2857520580291748 seconds
Training model with leave-one-out design for 1910_Od_Tater_a_Dunaje_(in_Spisy_Adolfa_Heyduka,_svazek_49).xml
	Training started




	model trained in 0.9337501525878906 seconds
Training model with leave-one-out design for 1913_Nekolik_povesti_z_Posumavi_(in_Spisy_Adolfa_Heyduka,_svazek_54).xml
	Training started




	model trained in 1.203805685043335 seconds
Training model with leave-one-out design for 1913_Rozmanite_zvesti_a_drobne_deje_(in_Spisy_Adolfa_Heyduka,_svazek_53).xml
	Training started




	model trained in 1.1229770183563232 seconds
Training model with leave-one-out design for 1919_Slovensku_(in_Spisy_Adolfa_Heyduka,_svazek_55).xml
	Training started




	model trained in 1.2650234699249268 seconds
Training model with leave-one-out design for 1920_Obeti_presvedceni_(in_Spisy_Adolfa_Heyduka,_svazek_56).xml
	Training started




	model trained in 1.2468657493591309 seconds
Training model with leave-one-out design for 1920_Sny_kralovske_1_(in_Spisy_Adolfa_Heyduka,_svazek_57).xml
	Training started




	model trained in 1.257641315460205 seconds
Training model with leave-one-out design for 1920_Sny_kralovske_2_(in_Spisy_Adolfa_Heyduka,_svazek_58).xml
	Training started




	model trained in 1.0633068084716797 seconds
Training model with leave-one-out design for 1921_Epigramy.xml
	Training started




	model trained in 1.297778844833374 seconds
Training model with leave-one-out design for 1921_Sny_kralovske_3_(in_Spisy_Adolfa_Heyduka,_svazek_59).xml
	Training started




	model trained in 0.9222815036773682 seconds
Training model with leave-one-out design for 1921_Stesky_na_stezkach_stareho_zpevaka_(in_Spisy_Adolfa_Heyduka,_svazek_60).xml
	Training started




	model trained in 1.3918654918670654 seconds
Training model with leave-one-out design for 1923_Ciganska_melodie_(in_Spisy_Adolfa_Heyduka,_svazek_61).xml
	Training started
	model trained in 1.3314683437347412 seconds




# Evaluating models

In [167]:
def get_composition_et_vectorizer_from_model_name(model_name:str):
    """
    Get the composition and vectorizer from the model name.

    :param model_name: The model name.
    """
    end_comp = model_name.find('_model')
    composition = model_name[4:end_comp]
    vectorizer = model_name.replace('_model', '_vectorizer')
    end_vect = vectorizer.find('.m-')
    vectorizer = vectorizer[:end_vect]+'.joblib'
    
    return composition, vectorizer

In [149]:
def guess_instance_segment(model_filename:str, vectorizer_filename:str, data_to_eval:str, models_path=MODELS_PATH):
    """
    This function is used within the guess_file function as a guess of one instance.
    
    :param model_filename: quite self-explanatory...
    :param data_to_eval: input data must be a str in XML-like structure, already delexicalized (with "<token>TOKEN</token>")
    """

    model = load(os.path.join(models_path, model_filename))
    vectorizer = load(os.path.join(models_path, vectorizer_filename))

    # Create test counts
    X_guess_counts = vectorizer.transform([data_to_eval])

    # Prediction and evaluation:
    y_guess_pred = model.predict(X_guess_counts)
    
    print(f'\tGUESS: model - {model_filename}, guessed time period - {y_guess_pred[0]}')

    return y_guess_pred[0]

In [157]:
test_data, test_targets = segment_file_to_data_et_targets(os.path.join(DATA_PATH_HEYDUK, '1858_Maj.xml'))
Hejduk_dateship_test = NKPdateship(test_data, test_targets)
print(Hejduk_dateship_test.targets)

Number of segments: 7
Number of targets: 7
['1850-1859', '1850-1859', '1850-1859', '1850-1859', '1850-1859', '1850-1859', '1850-1859']


In [196]:
def evaluate_model_on_loo_compostition(model_name:str, models_path=MODELS_PATH):
    """
    This function evaluates the model on the leave-one-out composition.

    :param model_name: The name of the model file (in joblib format).
    :param models_path: The path to the models.
    """
    composition, vectorizer = get_composition_et_vectorizer_from_model_name(model_name)

    print('Evaluating model on', composition)

    # TODO: implement the r-designations for the trainnig and testing datasets. Show this on the models filenames, etc. Also, show in the models filenames the window size, segment_window, and shift_size.
    data, targets = segment_file_to_data_et_targets(os.path.join(DATA_PATH_HEYDUK, composition), year_window_size=10, segment_window=10, shift_size=5, feature_type='lemma', autosem_delexicalise=True, ignore_interpunkt=True)

    Hejduk_test_dateship = NKPdateship(data, targets)

    score = {'correct': 0, 'incorrect': 0}
    detail = defaultdict(int)

    for i, test_data in enumerate(Hejduk_test_dateship.data):
        true_period = Hejduk_test_dateship.targets[i]

        guessed_period = guess_instance_segment(model_name, vectorizer, test_data, models_path=models_path)

        if guessed_period == true_period:
            score['correct'] += 1
        else:
            score['incorrect'] += 1

        detail[(true_period, guessed_period)] += 1

    eval_score = score['correct'] / (score['correct'] + score['incorrect'])
    print(composition, 'Score:', score, 'Evaluation score:', eval_score)
    
    return score, eval_score, detail

In [197]:
models_files = os.listdir(MODELS_PATH)

results = {}
details_for_heatmap = defaultdict(int)

for possible_model in models_files:
    if '_model_n-' in possible_model:
        score, eval_score, detail = evaluate_model_on_loo_compostition(possible_model, models_path=MODELS_PATH)

        results[possible_model] = {'score': score, 'eval_score': eval_score}

        for key in detail:
            details_for_heatmap[key] += detail[key]



Evaluating model on 1858_Maj.xml
	GUESS: model - LOO_1858_Maj.xml_model_n-1,2.m-LinearSVC.joblib, guessed time period - 1880-1889
1858_Maj.xml Score: {'correct': 0, 'incorrect': 1} Evaluation score: 0.0
Evaluating model on 1859_Basne.xml
	GUESS: model - LOO_1859_Basne.xml_model_n-1,2.m-LinearSVC.joblib, guessed time period - 1900-1909
1859_Basne.xml Score: {'correct': 0, 'incorrect': 1} Evaluation score: 0.0
Evaluating model on 1864_Basne_2,_1.xml
	GUESS: model - LOO_1864_Basne_2,_1.xml_model_n-1,2.m-LinearSVC.joblib, guessed time period - 1900-1909
	GUESS: model - LOO_1864_Basne_2,_1.xml_model_n-1,2.m-LinearSVC.joblib, guessed time period - 1900-1909
1864_Basne_2,_1.xml Score: {'correct': 0, 'incorrect': 2} Evaluation score: 0.0
Evaluating model on 1865_Basne_2,_2.xml
	GUESS: model - LOO_1865_Basne_2,_2.xml_model_n-1,2.m-LinearSVC.joblib, guessed time period - 1880-1889
	GUESS: model - LOO_1865_Basne_2,_2.xml_model_n-1,2.m-LinearSVC.joblib, guessed time period - 1880-1889
1865_Basne_2

In [198]:
for key in results:
    print(key, results[key])

LOO_1858_Maj.xml_model_n-1,2.m-LinearSVC.joblib {'score': {'correct': 0, 'incorrect': 1}, 'eval_score': 0.0}
LOO_1859_Basne.xml_model_n-1,2.m-LinearSVC.joblib {'score': {'correct': 0, 'incorrect': 1}, 'eval_score': 0.0}
LOO_1864_Basne_2,_1.xml_model_n-1,2.m-LinearSVC.joblib {'score': {'correct': 0, 'incorrect': 2}, 'eval_score': 0.0}
LOO_1865_Basne_2,_2.xml_model_n-1,2.m-LinearSVC.joblib {'score': {'correct': 0, 'incorrect': 2}, 'eval_score': 0.0}
LOO_1873_Lesni_kviti.xml_model_n-1,2.m-LinearSVC.joblib {'score': {'correct': 0, 'incorrect': 1}, 'eval_score': 0.0}
LOO_1873_Ruch_3.xml_model_n-1,2.m-LinearSVC.joblib {'score': {'correct': 4, 'incorrect': 4}, 'eval_score': 0.5}
LOO_1874_Almanach_na_oslavu_25leteho_trvani_Akademicko-ctenarskeho_spolku_prazskeho.xml_model_n-1,2.m-LinearSVC.joblib {'score': {'correct': 2, 'incorrect': 6}, 'eval_score': 0.25}
LOO_1876_Cimbal_a_husle.xml_model_n-1,2.m-LinearSVC.joblib {'score': {'correct': 0, 'incorrect': 3}, 'eval_score': 0.0}
LOO_1878_Mohamed_I

In [199]:
""" Counting the segments for each period. """

segments_in_period = defaultdict(int)

for file_name in os.listdir(DATA_PATH_HEYDUK):
    if file_name.endswith('.xml'):
        data, targets = segment_file_to_data_et_targets(os.path.join(DATA_PATH_HEYDUK, file_name), year_window_size=10, segment_window=10, shift_size=5, feature_type='lemma', autosem_delexicalise=True, ignore_interpunkt=True)

        segments_in_period[targets[0]] += len(data)

In [200]:
data_for_heatmap = {'true_period': [], 'guessed_period': [], 'percentage': []}
for key in details_for_heatmap:
    data_for_heatmap['true_period'].append(key[0])
    data_for_heatmap['guessed_period'].append(key[1])
    data_for_heatmap['percentage'].append(details_for_heatmap[key]/segments_in_period[key[0]])

# NOTE: adding empty values for the heatmap
for key in segments_in_period:
    for key_ in segments_in_period:
        if (key, key_) not in details_for_heatmap:
            data_for_heatmap['true_period'].append(key)
            data_for_heatmap['guessed_period'].append(key_)
            data_for_heatmap['percentage'].append(0)

df = pd.DataFrame(data_for_heatmap, columns=['true_period', 'guessed_period', 'percentage'])
df.to_csv(os.path.join(GUESSES_PATH, 'guesses_details_.m-LinearSVC_.r-08_.n-1,2.s-10.o-5.csv'), header=True)