Integrates Wortschatz frequencies #122

Merged 7 commits on Dec 15, 2019
4 changes: 3 additions & 1 deletion .gitignore
@@ -3,4 +3,6 @@ __pycache__/
.mypy_cache/
*.py[cdo]
*.egg-info/
*.log
**/tars
**/freq_tsvs
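
The two new patterns ignore the tars/ and freq_tsvs/ directories that the script below creates for the downloaded tarballs and the unpacked frequency TSVs.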
69 changes: 69 additions & 0 deletions languages/wikipron/src/frequencies/grab_wortschatz_data.py
@@ -0,0 +1,69 @@
#!/usr/bin/env python

import json
import logging
import os
import requests
import tarfile
import time

WORTSCHATZ_DICT_PATH = "wortschatz_languages.json"


# Downloads the Wortschatz tarballs, roughly 10 GB of data.
def download(data_to_grab):
to_retry = {}
os.makedirs("tars", exist_ok=True)
for language in data_to_grab:
url = data_to_grab[language]["data_url"]
with requests.get(url, stream=True) as response:
target_path = url.split("/")[-1]
logging.info("Downloading: %s", target_path)
if response.status_code == 200:
with open(f"tars/{target_path}", "wb") as f:
f.write(response.raw.read())
else:
logging.info(
"Status code %s while downloading %s",
response.status_code,
target_path,
)
to_retry[language] = data_to_grab[language]
        # 30 seconds between downloads appears not to be enough;
        # 60-70 seconds works reliably but makes the run very slow,
        # so 45 seconds is a compromise.
        time.sleep(45)
return to_retry


# Unpacks word frequency TSVs of tarballs, roughly 1 GB of data.
def unpack():
os.mkdir("freq_tsvs")
for tarball in os.listdir("tars"):
logging.info("Unpacking: %s", tarball)
with tarfile.open(name=f"tars/{tarball}", mode="r:gz") as tar_data:
for file_entry in tar_data:
if file_entry.name.endswith("words.txt"):
# Removes inconsistent path in tarballs
# so freq_tsvs has uniform contents.
file_entry.name = os.path.basename(file_entry.name)
tar_data.extract(file_entry, "freq_tsvs")


def main():
with open(WORTSCHATZ_DICT_PATH, "r") as langs:
languages = json.load(langs)

    # Hack: repeatedly attempt the downloads as a way of getting
    # around 404 responses from the Wortschatz server.
langs_to_retry = download(languages)
while langs_to_retry:
langs_to_retry = download(langs_to_retry)

unpack()


if __name__ == "__main__":
logging.basicConfig(
format="%(filename)s %(levelname)s: %(message)s", level="INFO"
)
main()
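
The download() loop above assumes wortschatz_languages.json maps each corpus identifier to its tarball URL, and merge.py later reads a "path" list of WikiPron TSV prefixes from the same entries. The sketch below shows that inferred shape as a Python literal; the key and path prefix are invented and the URL is elided, so consult the real JSON file for actual values.

# Hypothetical sketch of one wortschatz_languages.json entry,
# written as a Python literal. Key and values are illustrative only.
example_entry = {
    "eng_news_2016_1M": {
        # Tarball URL on the Wortschatz server (elided here).
        "data_url": "https://...",
        # Prefixes that merge.py completes with "_phonetic.tsv"
        # or "_phonemic.tsv".
        "path": ["../../tsv/eng"],
    }
}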
87 changes: 87 additions & 0 deletions languages/wikipron/src/frequencies/merge.py
@@ -0,0 +1,87 @@
#!/usr/bin/env python

import csv
import json
import logging
import os
import tempfile

from grab_wortschatz_data import WORTSCHATZ_DICT_PATH


def rewrite_wikipron_tsv(
wiki_tsv_affix, transcription_level, frequencies_dict
):
# Complete WikiPron TSV path.
file_to_target = wiki_tsv_affix + transcription_level
    # Tries to overwrite both the phonetic and the phonemic WikiPron
    # TSV for every Wortschatz language; WikiPron may not have both
    # for a given language, hence the FileNotFoundError handler below.
try:
        # This is written to run after remove_duplicates.sh and
        # to retain the sorted order.
with open(file_to_target, "r") as wiki_file:
wiki_tsv = csv.reader(
wiki_file, delimiter="\t", quoting=csv.QUOTE_NONE
)
with tempfile.NamedTemporaryFile(
mode="w", dir="../../tsv", delete=False
) as source:
for word, pron in wiki_tsv:
                    # Use the Wortschatz frequency if the WikiPron
                    # word has one; otherwise default to 0.
if word in frequencies_dict:
print(
f"{word}\t{pron}\t{frequencies_dict[word]}",
file=source,
)
else:
print(f"{word}\t{pron}\t0", file=source)
temp_path = source.name
os.replace(temp_path, file_to_target)
except FileNotFoundError as err:
logging.info("File not found: %s", err)


def main():
with open(WORTSCHATZ_DICT_PATH, "r") as langs:
languages = json.load(langs)

    transcription = ["_phonetic.tsv", "_phonemic.tsv"]

    for freq_file in os.listdir("freq_tsvs"):
        # For accessing the correct language in wortschatz_languages.json.
        file_to_match = freq_file.rsplit("-", 1)[0]
        logging.info("Currently working on: %s", file_to_match)
        # Reset per corpus so frequencies do not leak across languages.
        word_freq_dict = {}

with open(f"freq_tsvs/{freq_file}", "r") as tsv:
frequencies_tsv = csv.reader(
tsv, delimiter="\t", quoting=csv.QUOTE_NONE
)
for row in frequencies_tsv:
# Wortschatz TSVs are not uniformly formatted.
# Some have 3 columns, some have 4.
try:
word = row[2].lower()
freq = int(row[3])
except IndexError:
word = row[1].lower()
freq = int(row[2])
                # Filter out non-alphabetic tokens (e.g., numbers)
                # in the Wortschatz data.
                if word.isalpha():
                    word_freq_dict[word] = word_freq_dict.get(word, 0) + freq

for wiki_tsv_path in languages[file_to_match]["path"]:
for level in transcription:
rewrite_wikipron_tsv(wiki_tsv_path, level, word_freq_dict)


if __name__ == "__main__":
logging.basicConfig(
format="%(filename)s %(levelname)s: %(message)s", level="INFO"
)
main()
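
Taken together, the pipeline is two steps: grab_wortschatz_data.py downloads and unpacks the Wortschatz frequency TSVs, and merge.py then rewrites each WikiPron TSV with a third frequency column. A minimal sketch of consuming that three-column output, assuming a hypothetical merged file name:

# Reads a merged WikiPron TSV of (word, pronunciation, frequency) rows.
# "eng_phonemic.tsv" is a hypothetical file name for illustration.
import csv

with open("eng_phonemic.tsv", "r") as merged:
    reader = csv.reader(merged, delimiter="\t", quoting=csv.QUOTE_NONE)
    for word, pron, freq in reader:
        print(word, pron, int(freq))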