In [20]:
import os
import pandas as pd
import pickle

In [21]:
# Read the tab-separated word list and keep the rows whose explanation
# column says "altid sådan".
df = pd.read_csv("composite_words.csv", sep="\t")
always_true = df.loc[df["forklaring"] == "altid sådan"]
# Drop entries containing a parenthesis, and keep only entries of at most
# three whitespace-separated tokens (the spellchecker currently handles
# nothing longer than that).
composite_words = [
    entry
    for entry in always_true["ord/ordforbindelse"].values
    if "(" not in entry and len(entry.split()) < 4
]
len(composite_words)

671

In [22]:
from collections import Counter
# Histogram: number of whitespace-separated tokens -> how many entries have it.
Counter(len(phrase.split()) for phrase in composite_words)

Counter({2: 298, 1: 311, 3: 62})

In [23]:
# Entries with four or more tokens (shown to confirm the filter removed them).
[phrase for phrase in composite_words if len(phrase.split()) >= 4]

[]

In [30]:
# Partition the entries by token count (1, 2 and 3), preserving their order.
composite_one, composite_two, composite_three = (
    [phrase for phrase in composite_words if len(phrase.split()) == token_count]
    for token_count in (1, 2, 3)
)
len(composite_one), len(composite_two), len(composite_three)

(311, 298, 62)

In [63]:
# Preview the first five entries of each token-count group.
tuple(group[:5] for group in (composite_one, composite_two, composite_three))

(['actionfilm', 'afstedkomme', 'ahaoplevelse', 'ajour', 'allerbedst'],
 ['aber dabei', 'a cappella', 'a cappella-kor', 'accent aigu', 'accent grave'],
 ['a la carte',
  'a la carte-ret',
  'a la grecque-bort',
  'al den stund',
  'alt i alt'])

In [83]:
# Load the word dictionary. The context manager closes the file handle, which
# the previous `pickle.load(open(...))` form left open.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("../../Datasets/dictionary.pickle", "rb") as dictionary_file:
    dictionary = pickle.load(dictionary_file)

# Maps a written form -> the correct composite entry it should become.
composite_dict = {}

# Single-token entries map to themselves.
for comp in composite_one:
    composite_dict[comp] = comp

# Two-token entries: register the form with the space removed, unless that
# form is itself found in `dictionary` (those are printed instead).
print("Composite word two")
for comp in composite_two:
    joined = comp.replace(" ", "")
    if joined not in dictionary:
        composite_dict[joined] = comp
    else:
        print(joined)

# Three-token entries: register every way of dropping one or both spaces,
# again skipping (and printing) any variant found in `dictionary`.
print("Composite word three")
for comp in composite_three:
    word1, word2, word3 = comp.split()
    variants = (
        word1 + word2 + " " + word3,
        word1 + " " + word2 + word3,
        word1 + word2 + word3,
    )
    for variant in variants:
        if variant not in dictionary:
            composite_dict[variant] = comp
        else:
            print(variant)

Composite word two
adgangen
afhænde
billigbog
desværre
engang
førstebehandling
førsteklasses
glatis
godaften
goddag
godmorgen
godnat
iland
lilleby
lillejuleaften
lilleskole
overstyr
småbitte
stormagt
tilkende
tilstede
udeomkring
udfor
velsagtens
veltilfreds
veltilpas
vistnok
Composite word three


In [85]:
# Sanity check: number of keys currently in composite_dict.
len(composite_dict)

767

In [86]:
def turn_features_to_dicts(features):
    """Convert "Key=Value|Key=Value" feature strings into dictionaries.

    Args:
        features: Iterable with one item per word. Each item is either a
            string of ``|``-separated ``Key=Value`` pairs, or ``None`` / an
            empty string when the word has no features.

    Returns:
        A list with one dict per input item, in the same order. Items without
        features yield an empty dict. An item that has a ``VerbForm`` key but
        no ``Tense`` key gets a ``Tense`` entry filled in: the first ``Tense``
        value seen so far in ``features``, or ``"Pres"`` if none has been seen.

    Raises:
        ValueError: If a ``|``-separated part contains no ``=``.
    """
    feature_dicts = []
    # Only the first Tense encountered is remembered; later ones do not
    # replace it.
    current_tense = None
    for feature in features:
        # Treat None and "" alike; "" previously crashed on the unpack below.
        if not feature:
            feature_dicts.append({})
            continue
        feature_dict = {}
        for current_feature in feature.split("|"):
            # maxsplit=1 keeps any "=" inside the value intact.
            key, value = current_feature.split("=", 1)
            if key == "Tense" and current_tense is None:
                current_tense = value
            feature_dict[key] = value
        if "Tense" not in feature_dict and "VerbForm" in feature_dict:
            feature_dict["Tense"] = "Pres" if current_tense is None else current_tense
        feature_dicts.append(feature_dict)
    return feature_dicts

In [87]:
import stanza
from tqdm import tqdm
pos_model = stanza.Pipeline("da", processors='tokenize,pos', use_gpu=True, cache_directory='./cache', tokenize_pretokenized=True, n_process=4)
# Replace every value in composite_dict with (original value, per-word tags).
# NOTE: this cell is not idempotent. After one run the values are tuples, so
# running it again would pass tuples to the pipeline instead of strings.
# Iterate over a snapshot because the dict's values are reassigned in the loop.
for k, v in tqdm(list(composite_dict.items())):
    doc = pos_model(v)
    # Flatten the words of all sentences once so that words, features and
    # feature dicts share one index. The earlier per-sentence enumerate index
    # pointed at the wrong feature dict for any sentence after the first.
    words = [word for sentence in doc.sentences for word in sentence.words]
    features = [word.feats if word.feats else None for word in words]
    feature_dicts = turn_features_to_dicts(features)
    # One [upos, [], feature dict] triple per word.
    results = [[word.upos, [], feature_dict] for word, feature_dict in zip(words, feature_dicts)]
    composite_dict[k] = (v, results)

2023-06-19 22:47:51 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-06-19 22:47:52 INFO: Loading these models for language: da (Danish):
| Processor | Package |
-----------------------
| tokenize  | ddt     |
| pos       | ddt     |

2023-06-19 22:47:52 INFO: Using device: cpu
2023-06-19 22:47:52 INFO: Loading: tokenize
2023-06-19 22:47:52 INFO: Loading: pos
2023-06-19 22:47:52 INFO: Done loading processors!
100%|██████████| 767/767 [00:25<00:00, 29.96it/s]


In [88]:
# Sanity check: number of keys in composite_dict.
len(composite_dict)

767

In [94]:
# Persist composite_dict to disk as a pickle file.
with open("composite_dict.pickle", "wb") as out_file:
    pickle.dump(composite_dict, out_file)

In [25]:
# Load the word dictionary; the context manager closes the file handle, which
# `pickle.load(open(...))` left open.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("../../Datasets/dictionary.pickle", "rb") as dictionary_file:
    dictionary = pickle.load(dictionary_file)

In [26]:
# Load the misspellings mapping; the context manager closes the file handle,
# which `pickle.load(open(...))` left open.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("../../Datasets/misspellings_dict.pickle", "rb") as misspellings_file:
    spelling_errors = pickle.load(misspellings_file)

In [27]:
# Number of entries in the loaded spelling_errors object.
len(spelling_errors)

23732830

In [11]:
# Materialise the mapping's keys and values as two parallel lists.
keys = [*spelling_errors.keys()]
values = [*spelling_errors.values()]

In [14]:
# Write the key list and the value list to separate pickle files.
for target_path, payload in (
    ("spelling_errors_keys.pickle", keys),
    ("spelling_errors_values.pickle", values),
):
    with open(target_path, "wb") as sink:
        pickle.dump(payload, sink)

In [28]:
# Reload the key and value lists. The context managers close the file
# handles, which `pickle.load(open(...))` left open.
# NOTE: both files are looked up relative to the current working directory.
with open("spelling_errors_keys.pickle", "rb") as keys_file:
    keys = pickle.load(keys_file)
with open("spelling_errors_values.pickle", "rb") as values_file:
    values = pickle.load(values_file)

FileNotFoundError: [Errno 2] No such file or directory: 'spelling_errors_keys.pickle'

In [16]:
# Rebuild the mapping by pairing each key with the value at the same position.
spelling_errors = dict(zip(keys, values))

In [17]:
# Entry count of the rebuilt spelling_errors mapping.
len(spelling_errors)

23732830