diff --git a/.gitignore b/.gitignore index 26209e9b..9eda7576 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,128 @@ -.DS_Store -*.pyc \ No newline at end of file +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# macOS +.DS_Store \ No newline at end of file diff --git a/setup.py b/setup.py index 86c74c46..b0611c83 100644 --- a/setup.py +++ b/setup.py @@ -5,13 +5,13 @@ setup( name = "translatepy", packages = ["translatepy"], -version = "1.2", +version = "1.3", license = "GNU General Public License v3 (GPLv3)", description = "Translate, transliterate, get the language of texts in no time with the help of multiple APIs!", author = "Anime no Sekai", author_email = "niichannomail@gmail.com", url = "https://github.com/Animenosekai/translate", -download_url = "https://github.com/Animenosekai/translate/archive/v1.2.tar.gz", +download_url = "https://github.com/Animenosekai/translate/archive/v1.3.tar.gz", keywords = ['python', 'translate', 'translation', 'google-translate', 'yandex-translate', 'bing-translate', 'reverso', 'transliteration', 'detect-language'], install_requires = ['safeIO', 'requests'], classifiers = ['Development Status :: 4 - Beta', 'License :: OSI Approved :: GNU General Public License v3 (GPLv3)', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9'], diff --git a/translatepy/__init__.py b/translatepy/__init__.py index 9e7f8e9b..2bc13f02 100644 --- a/translatepy/__init__.py +++ b/translatepy/__init__.py @@ -17,7 +17,7 @@ __copyright__ = 'Copyright 2021, translate' __credits__ = ['animenosekai'] __license__ = 'GNU General Public License v3 (GPLv3)' -__version__ = 'translatepy v1.2' +__version__ = 'translatepy v1.3' __maintainer__ = 'Anime no Sekai' __email__ = 'niichannomail@gmail.com' __status__ = 'Beta' \ No newline at end of file diff --git a/translatepy/models/languages.py b/translatepy/models/languages.py index 12333715..e9c3cd11 100644 --- a/translatepy/models/languages.py +++ b/translatepy/models/languages.py @@ -6,7 +6,6 @@ from translatepy.data.data import ALPHA2_TO_ALPHA3 from translatepy.utils.similarity import language_search from translatepy.models.exceptions import UnknownLanguage -import translatepy LANGUAGES_CACHES = {} diff --git a/translatepy/requirements.txt b/translatepy/requirements.txt index c7d09a5a..52cb3fb6 100644 --- a/translatepy/requirements.txt +++ b/translatepy/requirements.txt @@ -1,2 +1,2 @@ requests -safeIO \ No newline at end of file +safeIO>=1.2 \ No newline at end of file diff --git a/translatepy/translators/_yandex_last_tried.translatepy b/translatepy/translators/_yandex_last_tried.translatepy new file mode 100644 index 00000000..e69de29b diff --git a/translatepy/translators/_yandex_sid.translatepy b/translatepy/translators/_yandex_sid.translatepy new file mode 100644 index 00000000..e69de29b diff --git a/translatepy/translators/yandex.py b/translatepy/translators/yandex.py index b5327f6e..17658e6d 100644 --- a/translatepy/translators/yandex.py +++ b/translatepy/translators/yandex.py @@ -1,3 +1,4 @@ +from time import time from json import loads from random import randint from os.path import dirname, abspath @@ -7,6 +8,7 @@ from translatepy.models.languages import Language from translatepy.models.userAgents import USER_AGENTS +from translatepy.utils.utils import convert_to_float FILE_LOCATION = dirname(abspath(__file__)) @@ -29,26 +31,54 @@ class YandexTranslate(): """ def __init__(self, sid_refresh=False) -> None: self._base_url = "https://translate.yandex.net/api/v1/tr.json/" - self._sid_cache = TextFile(FILE_LOCATION + "/_yandex_sid.translatepy") - self._sid = self._sid_cache.read() + self._sid_cache = TextFile(FILE_LOCATION + "/_yandex_sid.translatepy", blocking=False) + self._last_tried_cache = TextFile(FILE_LOCATION + "/_yandex_last_tried.translatepy", blocking=False) + with self._sid_cache as cache: + self._sid = str(cache.read()) + with self._last_tried_cache as cache: + self._last_tried = convert_to_float(cache.read()) self._headers = self._header() + self._check_increment = 600 # defaults to 10 minutes if sid_refresh: self.refreshSID() def refreshSID(self): - data = get("https://translate.yandex.com/", headers=self._headers).text - sid_position = data.find("Ya.reqid = '") - if sid_position == -1: - return - data = data[sid_position + 12:] - self._sid = data[:data.find("';")] - self._sid_cache.write(self._sid) + """ + Refreshes the SID used for requests to Yandex Translation API + + See issue #4 for more information + Randomness is used to prevent bot detection + """ + try: + if time() - self._last_tried > self._check_increment: # if the duration between the last time we tried to get the SID and now is greater than 10 minutes for the first pass + data = get("https://translate.yandex.com/", headers=self._headers).text + sid_position = data.find("Ya.reqid = '") + if sid_position != -1: + data = data[sid_position + 12:] + self._sid = data[:data.find("';")] + self._sid_cache.write(self._sid) + + self._check_increment = self._check_increment / 2 + randint(0, 1000) / 1000 # decrementing because it might work decremented + self._last_tried = time() # maybe keep that in a file + self._last_tried_cache.write(self._last_tried) + return True + else: + self._check_increment = self._check_increment * 2 + randint(0, 1000) / 1000 # incrementing the waiting time + self._last_tried = time() # maybe keep that in a file + self._last_tried_cache.write(self._last_tried) + # else + # do nothing as we know that yandex will rate-limit us if we ping too much their website + return False + except: + return False def _header(self): """ Creates a new header + + _header might not be appropriate if the _sid is linked to the User-Agent header """ - _dict = HEADERS + _dict = HEADERS.copy() randomChoice = randint(0, 7499) _dict.update({"User-Agent": USER_AGENTS[randomChoice]}) return _dict @@ -58,31 +88,31 @@ def translate(self, text, destination_language, source_language="auto"): Translates the given text to the given language """ try: + # preparing the request if source_language is None or str(source_language) == "auto": source_language = self.language(text) if source_language is None: return None, None if isinstance(source_language, Language): source_language = source_language.yandex_translate - if self._sid.replace(" ", "") == "": - self.refreshSID() - url = self._base_url + "translate?id=" + self._sid + "-0-0&srv=tr-text&lang=" + str(source_language) +"-" + str(destination_language) + "&reason=auto&format=text" - request = get(url, headers=self._headers, data={'text': str(text), 'options': '4'}) - data = loads(request.text) - if request.status_code < 400 and data["code"] == 200: - data = loads(request.text) - return str(data["lang"]).split("-")[0], data["text"][0] - else: - self.refreshSID() - # redo everything with the new sid + # check if we have an _sid + if self._sid.replace(" ", "") == "" and not self.refreshSID(): + return None, None + + def _request(): url = self._base_url + "translate?id=" + self._sid + "-0-0&srv=tr-text&lang=" + str(source_language) +"-" + str(destination_language) + "&reason=auto&format=text" request = get(url, headers=self._headers, data={'text': str(text), 'options': '4'}) data = loads(request.text) if request.status_code < 400 and data["code"] == 200: data = loads(request.text) return str(data["lang"]).split("-")[0], data["text"][0] - else: - return None, None + return None, None + + _lang, _text = _request() + if _lang is None or _text is None: + if self.refreshSID(): + _lang, _text = _request() + return _lang, _text except: return None, None @@ -95,18 +125,22 @@ def transliterate(self, text, source_language=None): source_language = self.language(text) if source_language is None or source_language not in TRANSLIT_LANGS: return None, None - if self._sid.replace(" ", "") == "": - self.refreshSID() - request = post("https://translate.yandex.net/translit/translit?sid=" + self._sid + "&srv=tr-text", headers=self._headers, data={'text': str(text), 'lang': source_language}) - if request.status_code < 400: - return source_language, request.text[1:-1] - else: - self.refreshSID() + + if self._sid.replace(" ", "") == "" and not self.refreshSID(): + return None, None + + def _request(): request = post("https://translate.yandex.net/translit/translit?sid=" + self._sid + "&srv=tr-text", headers=self._headers, data={'text': str(text), 'lang': source_language}) if request.status_code < 400: return source_language, request.text[1:-1] else: return None, None + + _lang, _text = _request() + if _lang is None or _text is None: + if self.refreshSID(): + _lang, _text = _request() + return _lang, _text except: return None, None @@ -118,17 +152,12 @@ def spellcheck(self, text, source_language=None): if source_language is None: source_language = self.language(text) if source_language is None: - return None - if self._sid.replace(" ", "") == "": - self.refreshSID() - request = post("https://speller.yandex.net/services/spellservice.json/checkText?sid=" + self._sid + "&srv=tr-text", headers=self._headers, data={'text': str(text), 'lang': source_language, 'options': 516}) - if request.status_code < 400: - data = loads(request.text) - for correction in data: - text = text[:correction.get("pos", 0)] + correction.get("s", [""])[0] + text[correction.get("pos", 0) + correction.get("len", 0):] - return source_language, text - else: - self.refreshSID() + return None, None + + if self._sid.replace(" ", "") == "" and not self.refreshSID(): + return None, None + + def _request(): request = post("https://speller.yandex.net/services/spellservice.json/checkText?sid=" + self._sid + "&srv=tr-text", headers=self._headers, data={'text': str(text), 'lang': source_language, 'options': 516}) if request.status_code < 400: data = loads(request.text) @@ -137,6 +166,12 @@ def spellcheck(self, text, source_language=None): return source_language, text else: return None, None + + _lang, _text = _request() + if _lang is None or _text is None: + if self.refreshSID(): + _lang, _text = _request() + return _lang, _text except: return None, None @@ -147,20 +182,24 @@ def language(self, text, hint=None): try: if hint is None: hint = "en,ja" - if self._sid.replace(" ", "") == "": - self.refreshSID() + + if self._sid.replace(" ", "") == "" and not self.refreshSID(): + return None + url = self._base_url + "detect?sid=" + self._sid + "&srv=tr-text&text=" + str(text) + "&options=1&hint=" + str(hint) - request = get(url, headers=self._headers) - if request.status_code < 400 and request.json()["code"] == 200: - return loads(request.text)["lang"] - else: - self.refreshSID() - url = self._base_url + "detect?sid=" + self._sid + "&srv=tr-text&text=" + str(text) + "&options=1&hint=" + str(hint) + + def _request(): request = get(url, headers=self._headers) if request.status_code < 400 and request.json()["code"] == 200: return loads(request.text)["lang"] else: return None + + _lang = _request() + if _lang is None: + if self.refreshSID(): + _lang = _request() + return _lang except: return None diff --git a/translatepy/utils/utils.py b/translatepy/utils/utils.py new file mode 100644 index 00000000..d87a88bc --- /dev/null +++ b/translatepy/utils/utils.py @@ -0,0 +1,13 @@ +from re import compile + +POSITIVE_FLOAT_REGEX = compile("[^0-9.]") + +def convert_to_float(element): + """ + Safely converts anything to a positive float + """ + element = POSITIVE_FLOAT_REGEX.sub("", str(element)) + if element != '': + return float(element) + else: + return 0 \ No newline at end of file