-
Notifications
You must be signed in to change notification settings - Fork 68
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Integrates Wortschatz frequencies #122
Merged
Merged
Changes from 5 commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
c48a389
First pass successfully integrating Wortschatz data
lfashby 3afee5d
Added comments to new scripts
lfashby e33e250
Cleaned up comments
lfashby bab7474
Final cleanup
lfashby 63490a7
added shebang
lfashby 1cebb53
changes to .gitignore and grab_wortschatz_data.py
lfashby d9acc80
added path to wortschatz dictionary as global constant
lfashby File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,4 +3,6 @@ __pycache__/ | |
.mypy_cache/ | ||
*.py[cdo] | ||
*.egg-info/ | ||
*.log | ||
*.log | ||
*.tar.gz | ||
**/freq_tsvs |
67 changes: 67 additions & 0 deletions
67
languages/wikipron/src/frequencies/grab_wortschatz_data.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
#!/usr/bin/env python | ||
|
||
import json | ||
import logging | ||
import os | ||
import requests | ||
import tarfile | ||
import time | ||
|
||
# Loaded at import time from the current working directory; main() passes
# this mapping to download(), which reads each entry's "data_url".
with open("wortschatz_languages.json", "r") as langs:
    languages = json.load(langs)
|
||
|
||
def download(data_to_grab):
    """Download the Wortschatz tarballs (roughly 10 GB of data) into "tars/".

    Args:
        data_to_grab: Dict mapping a language key to an entry that holds
            the tarball location under the "data_url" key.

    Returns:
        Dict with the subset of ``data_to_grab`` whose request did not
        come back with status 200, so the caller can try those again.
    """
    # Imported locally because only this function needs it.
    import shutil

    to_retry = {}
    os.makedirs("tars", exist_ok=True)
    for language, entry in data_to_grab.items():
        url = entry["data_url"]
        with requests.get(url, stream=True) as response:
            target_path = url.split("/")[-1]
            logging.info("Downloading: %s", target_path)
            if response.status_code == 200:
                with open(f"tars/{target_path}", "wb") as f:
                    # Copy in chunks: reading the raw stream in one call
                    # would hold the entire tarball in memory at once.
                    shutil.copyfileobj(response.raw, f)
            else:
                logging.info(
                    "Status code %s while downloading %s",
                    response.status_code,
                    target_path,
                )
                to_retry[language] = entry
        # 30 seconds appears to not be enough, 60-70 seconds works well
        # but takes a long time.
        time.sleep(45)
    return to_retry
|
||
|
||
def unpack():
    """Unpack the word frequency TSVs from the tarballs in "tars/".

    Every archive member whose name ends in "words.txt" is extracted
    into "freq_tsvs/" (roughly 1 GB of data in total). Safe to run
    again: an existing "freq_tsvs" directory is reused.
    """
    # exist_ok keeps a second run from failing on the directory that the
    # first run created (same approach download() takes for "tars").
    os.makedirs("freq_tsvs", exist_ok=True)
    for tarball in os.listdir("tars"):
        logging.info("Unpacking: %s", tarball)
        with tarfile.open(name=f"tars/{tarball}", mode="r:gz") as tar_data:
            for file_entry in tar_data:
                if file_entry.name.endswith("words.txt"):
                    # Removes inconsistent path in tarballs
                    # so freq_tsvs has uniform contents.
                    file_entry.name = os.path.basename(file_entry.name)
                    tar_data.extract(file_entry, "freq_tsvs")
|
||
|
||
def main():
    """Download every Wortschatz tarball, then unpack the frequency TSVs."""
    # Hack: the Wortschatz server intermittently answers 404, so keep
    # re-requesting whatever failed until nothing is left to retry.
    pending = languages
    while True:
        pending = download(pending)
        if not pending:
            break

    unpack()
|
||
|
||
if __name__ == "__main__":
    # Set up logging first so the whole run reports at INFO level.
    log_format = "%(filename)s %(levelname)s: %(message)s"
    logging.basicConfig(level="INFO", format=log_format)
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
#!/usr/bin/env python | ||
|
||
import csv | ||
import json | ||
import logging | ||
import os | ||
import tempfile | ||
|
||
|
||
def rewrite_wikipron_tsv(
    wiki_tsv_affix, transcription_level, frequencies_dict
):
    """Rewrite one WikiPron TSV in place with a frequency column appended.

    Written to be run after remove_duplicates.sh; rows are emitted in
    the order they are read, so sorted order is retained.

    Args:
        wiki_tsv_affix: Path prefix of the WikiPron TSV.
        transcription_level: Suffix completing the path, e.g.
            "_phonemic.tsv".
        frequencies_dict: Dict mapping a word to its frequency. Words
            that are not present are written with frequency 0.

    Raises:
        ValueError: If a row does not have exactly two columns. The
            target file is left untouched in that case.
    """
    # Complete WikiPron TSV path.
    file_to_target = wiki_tsv_affix + transcription_level
    # Will try to overwrite phonetic and phonemic WikiPron TSVs
    # for all Wortschatz languages. WikiPron may not have both a
    # phonetic and phonemic TSV for all languages.
    try:
        # Explicit UTF-8: the pronunciations are IPA and must not depend
        # on the locale's default encoding.
        wiki_file = open(file_to_target, "r", encoding="utf-8")
    except FileNotFoundError as err:
        logging.info("File not found: %s", err)
        return

    temp_path = None
    try:
        with wiki_file, tempfile.NamedTemporaryFile(
            mode="w",
            encoding="utf-8",
            # Same directory as the target so os.replace stays on one
            # filesystem and does not rely on the working directory.
            dir=os.path.dirname(file_to_target) or ".",
            delete=False,
        ) as source:
            temp_path = source.name
            wiki_tsv = csv.reader(
                wiki_file, delimiter="\t", quoting=csv.QUOTE_NONE
            )
            for word, pron in wiki_tsv:
                # Frequency is 0 when Wortschatz does not have the word.
                freq = frequencies_dict.get(word, 0)
                print(f"{word}\t{pron}\t{freq}", file=source)
        os.replace(temp_path, file_to_target)
        temp_path = None
    finally:
        # Reached with a path only on failure: do not leave the
        # half-written temporary file next to the real TSVs.
        if temp_path is not None and os.path.exists(temp_path):
            os.remove(temp_path)
|
||
|
||
def main():
    """Add Wortschatz frequencies to the matching WikiPron TSVs.

    Reads "wortschatz_languages.json" and every file in "freq_tsvs"
    from the current working directory. For each frequency file it
    builds a word -> frequency dict and rewrites the phonetic and
    phonemic WikiPron TSVs listed under that language's "path" entry.

    Raises:
        KeyError: If a frequency file has no entry in the JSON file.
        IndexError: If a frequency row has fewer than three columns.
    """
    with open("wortschatz_languages.json", "r", encoding="utf-8") as langs:
        languages = json.load(langs)

    transcription = ["_phonetic.tsv", "_phonemic.tsv"]

    for freq_file in os.listdir("freq_tsvs"):
        # For accessing correct language in wortschatz_languages.json.
        file_to_match = freq_file.rsplit("-", 1)[0]
        logging.info("Currently working on: %s", file_to_match)

        # Start from an empty dict for every file so counts from one
        # language never leak into the TSVs of the next one.
        word_freq_dict = {}
        with open(f"freq_tsvs/{freq_file}", "r", encoding="utf-8") as tsv:
            frequencies_tsv = csv.reader(
                tsv, delimiter="\t", quoting=csv.QUOTE_NONE
            )
            for row in frequencies_tsv:
                # Wortschatz TSVs are not uniformly formatted.
                # Some have 3 columns, some have 4.
                if len(row) >= 4:
                    raw_word, raw_freq = row[2], row[3]
                else:
                    raw_word, raw_freq = row[1], row[2]
                word = raw_word.lower()
                freq = int(raw_freq)
                # Filter out numbers in Wortschatz data.
                if word.isalpha():
                    # Lowercasing can merge entries, so sum their counts.
                    word_freq_dict[word] = word_freq_dict.get(word, 0) + freq

        for wiki_tsv_path in languages[file_to_match]["path"]:
            for level in transcription:
                rewrite_wikipron_tsv(wiki_tsv_path, level, word_freq_dict)
|
||
|
||
if __name__ == "__main__":
    # Set up logging first so the whole run reports at INFO level.
    log_format = "%(filename)s %(levelname)s: %(message)s"
    logging.basicConfig(level="INFO", format=log_format)
    main()
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why is this at global scope?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good question!