Skip to content

Commit

Permalink
Update askrobot_extraction_lib.py
Browse files Browse the repository at this point in the history
  • Loading branch information
yuriy-vorontsov committed May 15, 2024
1 parent 9fb0e38 commit b42a3a1
Showing 1 changed file with 1 addition and 60 deletions.
61 changes: 1 addition & 60 deletions askrobot-python-utils/askrobot_extraction_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,65 +10,6 @@
import tiktoken
from askrobot_common_lib import ( OPENAI_EMBEDDING )

from pprint import pprint

# Canonical (symlink-resolved) directory that holds this module; the data
# file below is looked up relative to it.
SCRIPT_DIR = os.path.realpath( os.path.dirname( __file__ ) )


#
# Cities
#
# Optional city lookup table. It is read once, at import time, from
# cities.json located next to this module; when the file is absent the table
# simply stays empty. get_cities_from_text() indexes it by lower-cased
# country name and expects each value to be a list of dicts carrying 'key',
# 'label' and, optionally, 'type'.
cities_file = f"{SCRIPT_DIR}/cities.json"
cities_hash = {}
if os.path.isfile( cities_file ):
    with open( cities_file, encoding="utf8" ) as f:
        cities_hash = json.load( f )

def get_cities_from_text( country, text ):
    """Return the labels of the cities of ``country`` that are mentioned in ``text``.

    The candidate cities come from the module-level ``cities_hash`` table,
    looked up by the lower-cased ``country``. Each entry is a dict with a
    'key' (the spelling searched for), a 'label' (the value reported) and an
    optional 'type' that selects how the key is matched:

    * 'strong' - the key must appear as a standalone word, i.e. it may not be
      directly preceded or followed by a Cyrillic letter;
    * 'weak' (the default when 'type' is missing) - a plain substring match
      of the key, also tried with its hyphens replaced by spaces and with its
      hyphens removed.

    Entries with any other 'type' never match. Matching is case-insensitive.

    Args:
        country: Country name used as the key into ``cities_hash``. It is
            lower-cased but not stripped before the lookup.
        text: Text to scan.

    Returns:
        A list of unique labels in the order their entries appear in the
        table. The list is empty when either argument is not a non-blank
        string, or when the country has no cities in the table.
    """
    # Anything that is not a non-blank string yields "no cities" instead of
    # raising; text gets the same type check as country.
    if not isinstance( country, str ) or not country.strip():
        return []
    if not isinstance( text, str ) or not text.strip():
        return []

    cities = cities_hash.get( country.lower() )
    if not cities:
        return []

    text_lc = text.lower()
    city_names = []

    for city in cities:
        city_lc = city['key'].lower()
        label = city['label']
        if label in city_names:
            continue

        # Read the default locally instead of writing 'type' back into the
        # shared table on every call.
        city_type = city.get( 'type', 'weak' )

        if city_type == 'strong':
            # The key is escaped so that characters such as '.' or '(' are
            # matched literally. The lookarounds require the neighbours to be
            # the start/end of the text or a non-Cyrillic character; 'ё'/'Ё'
            # are listed explicitly because they lie outside the а-я range.
            pattern = "(?<![а-яА-ЯёЁ])" + re.escape( city_lc ) + "(?![а-яА-ЯёЁ])"
            found = re.search( pattern, text_lc ) is not None
        elif city_type == 'weak':
            found = (
                city_lc in text_lc
                or city_lc.replace("-", " ") in text_lc
                or city_lc.replace("-", "") in text_lc
            )
        else:
            found = False

        if found:
            city_names.append( label )

    return city_names



#
# Text
Expand Down Expand Up @@ -321,4 +262,4 @@ def markdown_to_pages_recursive( page_content, minimal_number_of_words = 8, head

del blocks

return pages
return pages

0 comments on commit b42a3a1

Please sign in to comment.