**GEO NAMES**

**Задача: Сопоставление произвольных гео названий с унифицированными именами geonames**:

1. Создать решение для подбора наиболее подходящих названий с geonames. Например Ереван -> Yerevan
2. На примере РФ и стран наиболее популярных для релокации - Беларусь, Армения, Казахстан, Кыргызстан, Турция, Сербия. Города с населением от 15000 человек (с возможностью масштабирования на сервере заказчика)
3. Возвращаемые поля geonameid, name, region, country, cosine similarity -  формат данных на выходе: список словарей, например [{dict_1}, {dict_2}, …. {dict_n}] где словарь - одна запись с указанными полями



In [71]:
from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL
import pandas as pd
import numpy as np
import logging
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial import distance
from transliterate import translit, get_available_language_codes
from sentence_transformers import SentenceTransformer, util, InputExample, losses, evaluation
import warnings
import torch
from torch.utils.data import DataLoader
import transformers
from sparse_dot_topn import awesome_cossim_topn
import os
# Prefer the first CUDA device when one is available, otherwise run on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Silence every warning for the rest of the notebook.
warnings.filterwarnings(action='ignore')

## Настраиваем подключение к postgresql

In [2]:
# PostgreSQL connection settings.
# Each value is taken from an environment variable so that real credentials do
# not have to be stored in the notebook; the fallbacks are the original
# local-development values, so behaviour is unchanged when nothing is exported.
host = os.environ.get('GEONAMES_DB_HOST', '127.0.0.1')
username = os.environ.get('GEONAMES_DB_USER', 'postgres')
password = os.environ.get('GEONAMES_DB_PASSWORD', '123')
database = os.environ.get('GEONAMES_DB_NAME', 'mybd')

In [3]:
# Keyword arguments for the SQLAlchemy URL, assembled from the settings above.
DATABASE = dict(
    drivername='postgresql',
    username=username,
    password=password,
    host=host,
    port=5432,
    database=database,
    query={},
)


In [4]:
# Build the connection URL from the DATABASE settings and create the engine
# that every pd.read_sql_query call below uses.
# NOTE(review): on SQLAlchemy 1.4+ URL objects are meant to be built with
# URL.create(**DATABASE) — confirm against the installed version.
engine = create_engine(URL(**DATABASE))

In [5]:
# Smoke test of the connection: fetch two rows from the country table.
query = 'SELECT * FROM countryInfo LIMIT 2'
pd.read_sql_query(sql=query, con=engine)

Unnamed: 0,iso,iso3,iso_numeric,fips,country,capital,continent,tld,currencycode,currencyname,phone,geonameid,neighbours,equivalentfipscode
0,AD,AND,20,AN,Andorra,Andorra la Vella,EU,.ad,EUR,Euro,376,3041565,"ES,FR",
1,AE,ARE,784,AE,United Arab Emirates,Abu Dhabi,AS,.ae,AED,Dirham,971,290557,"SA,OM",


Данные в БД загружены, подключение настроено.

## Собираем данные в таблицу

Список стран:

In [6]:
# ISO 3166-1 alpha-2 codes of the countries whose cities are indexed.
countries = [
    'RU',  # Russia
    'BY',  # Belarus
    'AM',  # Armenia
    'KZ',  # Kazakhstan
    'KG',  # Kyrgyzstan
    'TR',  # Turkey
    'RS',  # Serbia
    'GE',  # Georgia
]

In [7]:
# Load the full cities15000 table into memory.
query = 'SELECT * FROM cities15000'
cities = pd.read_sql_query(sql=query, con=engine)

In [8]:
# Keep only the cities of the countries listed in the spec, restricted to the
# columns used further on.
cities_filter = cities.loc[
    cities['country_code'].isin(countries),
    ['geonameid', 'name', 'asciiname', 'country_code', 'admin1_code', 'feature_code', 'population'],
]
# "<country_code>.<admin1_code>" key, later matched against admin1codesascii.code.
cities_filter = cities_filter.assign(
    admin1cod=cities_filter['country_code'] + '.' + cities_filter['admin1_code']
)

In [9]:
# Preview the first five filtered cities.
cities_filter.head(n=5)

Unnamed: 0,geonameid,name,asciiname,country_code,admin1_code,feature_code,population,admin1cod
97,174875,Kapan,Kapan,AM,8,PPLA,33160,AM.08
98,174895,Goris,Goris,AM,8,PPL,20379,AM.08
99,174972,Hats’avan,Hats'avan,AM,8,PPL,15208,AM.08
100,174979,Artashat,Artashat,AM,2,PPLA,20562,AM.02
101,174991,Ararat,Ararat,AM,2,PPL,28832,AM.02


In [10]:
# Load the country reference table.
query = 'SELECT * FROM countryinfo'
country = pd.read_sql_query(sql=query, con=engine)

In [11]:
# Only the ISO code, the GeoNames id and the country name are needed.
country = country.loc[:, ['iso', 'geonameid', 'country']]

In [12]:
# Load the first-level administrative division (region) reference table.
query = 'SELECT * FROM admin1codesascii'
admin1codesascii = pd.read_sql_query(sql=query, con=engine)

In [13]:
# Attach country attributes to every city (left join on the ISO country code).
cities_x_country = pd.merge(
    cities_filter,
    country,
    how='left',
    left_on='country_code',
    right_on='iso',
)

In [14]:
# Attach region attributes (left join on the "<country>.<admin1>" key).
cities_x_country_x_admin = pd.merge(
    cities_x_country,
    admin1codesascii,
    how='left',
    left_on='admin1cod',
    right_on='code',
)

In [15]:
# Preview the joined city / country / region table.
cities_x_country_x_admin.head(n=5)

Unnamed: 0,geonameid_x,name_x,asciiname,country_code,admin1_code,feature_code,population,admin1cod,iso,geonameid_y,country,code,name_y,name_ascii,geonameid
0,174875,Kapan,Kapan,AM,8,PPLA,33160,AM.08,AM,174982,Armenia,AM.08,Syunik,Syunik,409314
1,174895,Goris,Goris,AM,8,PPL,20379,AM.08,AM,174982,Armenia,AM.08,Syunik,Syunik,409314
2,174972,Hats’avan,Hats'avan,AM,8,PPL,15208,AM.08,AM,174982,Armenia,AM.08,Syunik,Syunik,409314
3,174979,Artashat,Artashat,AM,2,PPLA,20562,AM.02,AM,174982,Armenia,AM.02,Ararat,Ararat,409313
4,174991,Ararat,Ararat,AM,2,PPL,28832,AM.02,AM,174982,Armenia,AM.02,Ararat,Ararat,409313


In [16]:
# List the column names after both merges; the _x/_y suffixes come from
# columns that exist on both sides of a merge and are renamed in the next cell.
cities_x_country_x_admin.columns

Index(['geonameid_x', 'name_x', 'asciiname', 'country_code', 'admin1_code',
       'feature_code', 'population', 'admin1cod', 'iso', 'geonameid_y',
       'country', 'code', 'name_y', 'name_ascii', 'geonameid'],
      dtype='object')

In [17]:
# Give the merge-suffixed columns explicit city / country / region names.
fin_df = cities_x_country_x_admin.rename(
    columns=dict(
        geonameid_x='geonameid_city',
        name_x='name_city',
        geonameid_y='geonameid_country',
        country='name_country',
        name_y='name_region',
        geonameid='geonameid_region',
    )
)

In [18]:
# The answer returned to the user is built from this frame.
fin_df = fin_df.loc[
    :,
    [
        'geonameid_city', 'name_city', 'country_code',
        'admin1_code', 'feature_code', 'population',
        'geonameid_country', 'name_country', 'name_region', 'geonameid_region',
    ],
]
fin_df.head(n=5)

Unnamed: 0,geonameid_city,name_city,country_code,admin1_code,feature_code,population,geonameid_country,name_country,name_region,geonameid_region
0,174875,Kapan,AM,8,PPLA,33160,174982,Armenia,Syunik,409314
1,174895,Goris,AM,8,PPL,20379,174982,Armenia,Syunik,409314
2,174972,Hats’avan,AM,8,PPL,15208,174982,Armenia,Syunik,409314
3,174979,Artashat,AM,2,PPLA,20562,174982,Armenia,Ararat,409313
4,174991,Ararat,AM,2,PPL,28832,174982,Armenia,Ararat,409313


In [19]:
# Load alternate names, leaving out 'link' entries and names that have an end
# date (dt_to), i.e. old names.
query = """
SELECT 
a.geonameid, a.isolanguage, a.alternate_name  
FROM alternatenamesv2 as a 
where (a.isolanguage <> 'link' or a.isolanguage is null)
and a.dt_to is null
"""
alternatenames = pd.read_sql_query(sql=query, con=engine)

In [20]:
# Preview the alternate-name table.
alternatenames.head(n=5)

Unnamed: 0,geonameid,isolanguage,alternate_name
0,2994701,,Roc Mélé
1,2994701,,Roc Meler
2,3007683,,Pic des Langounelles
3,3017832,,Pic de les Abelletes
4,3017832,,Pic de la Font-Nègre


In [21]:
# Join the city frame with its alternate names; the left join keeps every city
# even when it has no alternate name.
cities_alternatenames = pd.merge(
    fin_df,
    alternatenames,
    how='left',
    left_on='geonameid_city',
    right_on='geonameid',
)

За счёт left join отбираем только записи по списку стран в ТЗ:

In [22]:
# Preview the city / alternate-name pairs.
cities_alternatenames.head(n=5)

Unnamed: 0,geonameid_city,name_city,country_code,admin1_code,feature_code,population,geonameid_country,name_country,name_region,geonameid_region,geonameid,isolanguage,alternate_name
0,174875,Kapan,AM,8,PPLA,33160,174982,Armenia,Syunik,409314,174875.0,,Qafan
1,174875,Kapan,AM,8,PPLA,33160,174982,Armenia,Syunik,409314,174875.0,es,Kapan
2,174875,Kapan,AM,8,PPLA,33160,174982,Armenia,Syunik,409314,174875.0,en,Kapan
3,174875,Kapan,AM,8,PPLA,33160,174982,Armenia,Syunik,409314,174875.0,de,Kapan
4,174875,Kapan,AM,8,PPLA,33160,174982,Armenia,Syunik,409314,174875.0,fa,کاپان


In [23]:
# Check whether every city found at least one row in the alternate-name table:
# show the cities that did not.
cities_alternatenames.loc[cities_alternatenames['alternate_name'].isna()]

Unnamed: 0,geonameid_city,name_city,country_code,admin1_code,feature_code,population,geonameid_country,name_country,name_region,geonameid_region,geonameid,isolanguage,alternate_name
22815,11238229,Obruchevo,RU,48,PPL,85616,2017370,Russia,Moscow,524894,,,
25462,323094,Aşkale,TR,25,PPL,15462,298795,Turkey,Erzurum,315367,,,
27844,7627067,Bahçelievler,TR,34,PPL,576799,298795,Turkey,Istanbul,745042,,,
27845,7628416,Sultangazi,TR,34,PPL,436935,298795,Turkey,Istanbul,745042,,,
27847,7628420,Sancaktepe,TR,34,PPL,241000,298795,Turkey,Istanbul,745042,,,
27848,7701384,Karabağlar,TR,35,PPL,458000,298795,Turkey,İzmir Province,311044,,,
27850,11238838,Merkezefendi,TR,20,PPLA2,280341,298795,Turkey,Denizli,317106,,,


In [24]:
# Cities without any alternate name fall back to their own GeoNames name.
cities_alternatenames['alternate_name'] = cities_alternatenames['alternate_name'].fillna(
    value=cities_alternatenames['name_city']
)

In [25]:
# Name the index so that it can be used as a join key when tables are merged.
cities_alternatenames.index.name = 'ID'

## Сопоставляем город по запросу

### Перевод+TF-IDF

Кол-во записей для выдачи N

In [26]:
# Number of best-matching corpus records to return for each query.
N = 1

In [27]:
# The TF-IDF corpus is the column of alternate names (one document per row).
corpus_tfidf = cities_alternatenames.loc[:, 'alternate_name']

In [28]:
# Character-level TF-IDF over 1- to 3-grams.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    analyzer='char',
)

In [29]:
# Fit the vocabulary on the corpus and vectorise it in one pass.
corpus_tfidf_transform = vectorizer.fit_transform(raw_documents=corpus_tfidf)

In [30]:
# Labelled test queries, stored as a semicolon-separated file.
test = pd.read_csv(filepath_or_buffer='geo_test.csv', sep=';')

In [31]:
# Preview the test set.
test.head(n=5)

Unnamed: 0,query,name,region,country
0,Смоленск,Smolensk,Smolensk Oblast,Russia
1,Кемерово,Kemerovo,Kuzbass,Russia
2,Бишкек,Bishkek,Bishkek,Kyrgyzstan
3,Москва,Moscow,Moscow,Russia
4,Алматы,Almaty,Almaty,Kazakhstan


In [32]:
# For every test query: transliterate it with each available language pack and
# look up the N most similar corpus records by TF-IDF cosine similarity.
# Each hit contributes one entry to the four parallel lists below.
# NOTE: `distance` rebinds the name imported from scipy.spatial at the top of
# the notebook; the name is kept because the following cells read it.
distance = []
city_index = []
city_query = []
city_translate = []
# The corpus matrix is transposed once (loop-invariant) so that
# query x corpus^T yields one similarity per corpus record.
corpus_tfidf_t = corpus_tfidf_transform.transpose()
for city in test['query'].values.tolist():
    for lang in get_available_language_codes():
        translate_text = translit(city, language_code=lang, reversed=True)
        answer = awesome_cossim_topn(vectorizer.transform([translate_text]), corpus_tfidf_t, N, 0)
        # Pair every returned score with its own corpus position. The previous
        # code always stored answer.indices[0], which is wrong for N > 1.
        for corpus_position, score in zip(answer.indices, answer.data):
            distance.append(score)
            city_index.append(corpus_position)
            city_query.append(city)
            city_translate.append(translate_text)

In [33]:
# One row per (query, transliteration, hit) collected by the search loop.
test_df = pd.DataFrame(
    data=list(zip(distance, city_index, city_query, city_translate)),
    columns=['distance', 'city_index', 'city_query', 'city_translate'],
)

На данном этапе для каждого значения из тестового набора мы нашли наиболее близкие записи. Теперь для каждой записи тестового набора выбираем город с максимальным значением "distance".

In [34]:
# Keep, for every query, the single row with the highest similarity score.
test_df_range = (
    test_df
    .sort_values(by=['distance'], ascending=False)
    .groupby('city_query')
    .head(n=1)
)

In [35]:
# Resolve the best hit of every query to its city record (joined on the 'ID'
# index) and put it next to the expected answer from the test set.
test_fin = test.merge(
    test_df_range.merge(cities_alternatenames, how='inner', left_on='city_index', right_on='ID'),
    how='inner',
    left_on='query',
    right_on='city_query',
)[['query', 'name', 'region', 'country', 'name_city', 'name_region', 'name_country', 'distance']]

In [36]:
# Number of queries whose predicted city differs from the expected one.
test_fin.loc[test_fin['name'] != test_fin['name_city'], 'query'].count()

22

### Обученные языковые модели

In [38]:
# Pretrained multilingual sentence-transformers checkpoints to compare.
MLM = [
    'distiluse-base-multilingual-cased-v1',
    'distiluse-base-multilingual-cased-v2',
    'paraphrase-multilingual-MiniLM-L12-v2',
    'paraphrase-multilingual-mpnet-base-v2',
    'LaBSE',
]

In [39]:
# Compare the pretrained models: embed all alternate names and all test
# queries, take the nearest alternate name for every query and count how many
# predicted cities differ from the expected ones.
# NOTE: despite its name `unic_city` is not de-duplicated; its positions line
# up with the rows of cities_alternatenames, which the iloc lookup relies on.
unic_city = cities_alternatenames.alternate_name.values
test_city_lm = test['query'].values
for lm in MLM:
    model = SentenceTransformer('sentence-transformers/'+lm)
    embeddings = model.encode(unic_city)
    embeddings_test = model.encode(test_city_lm)
    # Search with the already computed query embeddings in one call instead of
    # re-encoding every query separately.
    hits = util.semantic_search(embeddings_test, embeddings, top_k = 1)
    best_hits = [item for result in hits for item in result]
    # One positional lookup for all hits instead of three lookups per hit.
    matched = cities_alternatenames.iloc[[item['corpus_id'] for item in best_hits]]
    city_lm = matched['name_city'].tolist()
    gerion_lm = matched['name_region'].tolist()
    country_lm = matched['name_country'].tolist()
    score_lm = [item['score'] for item in best_hits]
    test_df_lm = pd.DataFrame(list(zip(city_lm, gerion_lm, country_lm, score_lm)), columns = ['city_lm', 'gerion_lm','country_lm','score_lm'])
    # Predictions are in test order, so a positional join lines them up.
    test_fin_lm = test.join(test_df_lm)
    print('Кол-во некорректных для модели '+lm+':', test_fin_lm[test_fin_lm['name'] != test_fin_lm['city_lm']]['query'].count())

Кол-во некорректных для модели distiluse-base-multilingual-cased-v1: 24
Кол-во некорректных для модели distiluse-base-multilingual-cased-v2: 28
Кол-во некорректных для модели paraphrase-multilingual-MiniLM-L12-v2: 28
Кол-во некорректных для модели paraphrase-multilingual-mpnet-base-v2: 27
Кол-во некорректных для модели LaBSE: 22


Нельзя сказать, что результаты моделей сильно отличаются. Посмотрим, на каких городах ошиблась последняя модель из предыдущего блока.

In [41]:
# Show the queries on which the last evaluated model predicted the wrong city.
test_fin_lm.loc[test_fin_lm['name'] != test_fin_lm['city_lm']]

Unnamed: 0,query,name,region,country,city_lm,gerion_lm,country_lm,score_lm
10,Минск,Minsk City,Minsk City,Belarus,Minsk,Minsk City,Belarus,1.0
15,Екб,Yekaterinburg,Sverdlovsk Oblast,Russia,Ekibastuz,Pavlodar Region,Kazakhstan,0.770992
17,Н.Новгород,Nizhniy Novgorod,Nizhny Novgorod Oblast,Russia,Velikiy Novgorod,Novgorod Oblast,Russia,0.91368
31,Остана,Astana,Astana,Kazakhstan,Tunceli,Tunceli,Turkey,0.787425
44,Островцы,Ostrovtsy,Moscow Oblast,Russia,Ostrov,Pskov Oblast,Russia,0.827741
75,Аксай,Aksay,Rostov,Russia,Aqsay,Batys Qazaqstan,Kazakhstan,1.0
91,Сербия,Serbia,Serbia,Serbia,Sivas,Sivas,Turkey,0.721775
98,Армения,Armenia,Armenia,Armenia,Ararat,Ararat,Armenia,0.76678
101,Атырау,Atyraū,Atyraū,Kazakhstan,Atyrau,Atyraū,Kazakhstan,1.0
121,Джанкой,Zhanibek,Batys Qazaqstan,Kazakhstan,Samsun,Samsun,Turkey,0.81576


Модель плохо отработала для похожих названий и для сокращений. Попробуем дообучить её на нашем корпусе текста. Но т.к. модель LaBSE тяжёлая, 
дообучать будем distiluse-base-multilingual-cased-v1: на тесте разница между этими моделями всего 2 записи (24 ошибки против 22).

In [43]:
# Training pairs start from (canonical city name, alternate name).
for_fit = cities_alternatenames.loc[:, ['name_city', 'alternate_name']]

In [44]:
# Drop pairs where both names are identical, then remove repeated pairs.
for_fit = for_fit.loc[for_fit['name_city'] != for_fit['alternate_name']].drop_duplicates()

In [45]:
# Wrap every (city name, alternate name) pair in an InputExample for training.
for_fit['imput_example'] = [
    InputExample(texts=[city_name, alt_name])
    for city_name, alt_name in zip(for_fit['name_city'], for_fit['alternate_name'])
]

In [47]:
# Plain list of training examples for the DataLoader.
train = list(for_fit['imput_example'])

In [48]:
# Model to fine-tune, placed on the device selected at the top of the notebook.
dbmcv1 = SentenceTransformer(
    'sentence-transformers/distiluse-base-multilingual-cased-v1',
    device=device,
)

In [49]:
# Shuffled batches of 128 training pairs.
train_dataloader = DataLoader(dataset=train, batch_size=128, shuffle=True)
# Ranking loss bound to the model being fine-tuned.
train_loss = losses.MultipleNegativesRankingLoss(model=dbmcv1)

In [50]:
class LoggingMultipleNegativesRankingLoss(losses.MultipleNegativesRankingLoss):
    """MultipleNegativesRankingLoss that logs every loss value it computes."""

    def forward(self, sentance_features, dbmcv1):
        """Compute the parent loss, log it at INFO level and return it.

        Both arguments are passed unchanged to the parent ``forward``.
        NOTE(review): the second parameter shadows the global model name
        ``dbmcv1``; it presumably receives the batch labels — confirm against
        the parent class signature.
        """
        batch_loss = super().forward(sentance_features, dbmcv1)
        logging.info(f'Loss:{batch_loss.item()}')
        return batch_loss


# Use the logging variant as the training objective.
train_loss = LoggingMultipleNegativesRankingLoss(model=dbmcv1)
# Lower the root logger threshold so the INFO records above are emitted.
logging.root.setLevel(logging.DEBUG)

In [51]:
#%%time
#dbmcv1.fit(train_objectives = [(train_dataloader, train_loss)], epochs = 5)

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/137 [00:00<?, ?it/s]

INFO:root:Loss:3.3587677478790283
INFO:root:Loss:3.123884677886963
INFO:root:Loss:2.9713032245635986
INFO:root:Loss:3.2057483196258545
INFO:root:Loss:2.979665517807007
INFO:root:Loss:2.9528937339782715
INFO:root:Loss:2.8606672286987305
INFO:root:Loss:3.1198947429656982
INFO:root:Loss:3.106536388397217
INFO:root:Loss:3.1402971744537354
INFO:root:Loss:3.1045992374420166
INFO:root:Loss:3.219496965408325
INFO:root:Loss:3.201340675354004
INFO:root:Loss:3.385812520980835
INFO:root:Loss:3.281479835510254
INFO:root:Loss:2.9452290534973145
INFO:root:Loss:3.4556639194488525
INFO:root:Loss:3.0074026584625244
INFO:root:Loss:2.7682931423187256
INFO:root:Loss:3.0583527088165283
INFO:root:Loss:2.812424898147583
INFO:root:Loss:3.3902008533477783
INFO:root:Loss:3.247126579284668
INFO:root:Loss:3.329486846923828
INFO:root:Loss:3.166686534881592
INFO:root:Loss:2.869112014770508
INFO:root:Loss:2.8095643520355225
INFO:root:Loss:3.059065580368042
INFO:root:Loss:3.147698163986206
INFO:root:Loss:3.36612296104

Iteration:   0%|          | 0/137 [00:00<?, ?it/s]

INFO:root:Loss:2.7662734985351562
INFO:root:Loss:2.945146322250366
INFO:root:Loss:2.720707893371582
INFO:root:Loss:2.7673630714416504
INFO:root:Loss:3.319511890411377
INFO:root:Loss:2.7543892860412598
INFO:root:Loss:3.1382980346679688
INFO:root:Loss:3.1200268268585205
INFO:root:Loss:2.3953335285186768
INFO:root:Loss:2.6049270629882812
INFO:root:Loss:3.029414176940918
INFO:root:Loss:2.7048823833465576
INFO:root:Loss:2.8806142807006836
INFO:root:Loss:2.886289596557617
INFO:root:Loss:2.4131455421447754
INFO:root:Loss:2.9301252365112305
INFO:root:Loss:2.9289398193359375
INFO:root:Loss:3.0030977725982666
INFO:root:Loss:2.8303956985473633
INFO:root:Loss:2.3962485790252686
INFO:root:Loss:2.887176513671875
INFO:root:Loss:2.526881217956543
INFO:root:Loss:2.8627355098724365
INFO:root:Loss:2.6724934577941895
INFO:root:Loss:2.447204351425171
INFO:root:Loss:3.1942691802978516
INFO:root:Loss:2.760777711868286
INFO:root:Loss:2.3147428035736084
INFO:root:Loss:2.885915517807007
INFO:root:Loss:2.7632153

Iteration:   0%|          | 0/137 [00:00<?, ?it/s]

INFO:root:Loss:2.6522583961486816
INFO:root:Loss:2.514833927154541
INFO:root:Loss:2.377100706100464
INFO:root:Loss:2.147505044937134
INFO:root:Loss:2.3313369750976562
INFO:root:Loss:2.5079643726348877
INFO:root:Loss:2.1418275833129883
INFO:root:Loss:2.2970197200775146
INFO:root:Loss:2.3193471431732178
INFO:root:Loss:2.2799324989318848
INFO:root:Loss:2.4530839920043945
INFO:root:Loss:2.3189330101013184
INFO:root:Loss:2.1468541622161865
INFO:root:Loss:2.5629312992095947
INFO:root:Loss:2.2392489910125732
INFO:root:Loss:2.3485727310180664
INFO:root:Loss:2.5401864051818848
INFO:root:Loss:2.1327786445617676
INFO:root:Loss:2.433690309524536
INFO:root:Loss:2.0632143020629883
INFO:root:Loss:2.307697296142578
INFO:root:Loss:2.045684814453125
INFO:root:Loss:2.30684232711792
INFO:root:Loss:2.1761245727539062
INFO:root:Loss:2.550102710723877
INFO:root:Loss:2.4701380729675293
INFO:root:Loss:2.006337881088257
INFO:root:Loss:2.301922559738159
INFO:root:Loss:2.452070474624634
INFO:root:Loss:2.102092027

Iteration:   0%|          | 0/137 [00:00<?, ?it/s]

INFO:root:Loss:1.7866289615631104
INFO:root:Loss:2.30769419670105
INFO:root:Loss:2.215010643005371
INFO:root:Loss:1.896520972251892
INFO:root:Loss:2.095465660095215
INFO:root:Loss:1.9995390176773071
INFO:root:Loss:2.1231892108917236
INFO:root:Loss:2.232724905014038
INFO:root:Loss:2.0207631587982178
INFO:root:Loss:2.1437489986419678
INFO:root:Loss:2.056041955947876
INFO:root:Loss:2.1404452323913574
INFO:root:Loss:1.672279715538025
INFO:root:Loss:1.7776718139648438
INFO:root:Loss:2.349381923675537
INFO:root:Loss:1.9971895217895508
INFO:root:Loss:2.1331818103790283
INFO:root:Loss:1.592689037322998
INFO:root:Loss:2.369029998779297
INFO:root:Loss:2.2947702407836914
INFO:root:Loss:1.9243981838226318
INFO:root:Loss:2.5452795028686523
INFO:root:Loss:1.8626474142074585
INFO:root:Loss:2.2063419818878174
INFO:root:Loss:2.0733466148376465
INFO:root:Loss:1.8657270669937134
INFO:root:Loss:2.3200342655181885
INFO:root:Loss:1.7744215726852417
INFO:root:Loss:1.896880865097046
INFO:root:Loss:1.865357756

Iteration:   0%|          | 0/137 [00:00<?, ?it/s]

INFO:root:Loss:1.6288796663284302
INFO:root:Loss:1.7715264558792114
INFO:root:Loss:1.9320471286773682
INFO:root:Loss:2.22460675239563
INFO:root:Loss:1.658329963684082
INFO:root:Loss:1.904997706413269
INFO:root:Loss:1.831337571144104
INFO:root:Loss:2.0221238136291504
INFO:root:Loss:1.7270933389663696
INFO:root:Loss:1.908408522605896
INFO:root:Loss:1.968637228012085
INFO:root:Loss:1.971882700920105
INFO:root:Loss:1.789986491203308
INFO:root:Loss:1.892063021659851
INFO:root:Loss:1.7969310283660889
INFO:root:Loss:1.4392249584197998
INFO:root:Loss:2.2388014793395996
INFO:root:Loss:2.3018529415130615
INFO:root:Loss:2.0140092372894287
INFO:root:Loss:1.9589383602142334
INFO:root:Loss:1.8004043102264404
INFO:root:Loss:1.9262186288833618
INFO:root:Loss:1.9418553113937378
INFO:root:Loss:2.212174654006958
INFO:root:Loss:1.956243872642517
INFO:root:Loss:1.8347277641296387
INFO:root:Loss:2.0628695487976074
INFO:root:Loss:1.3823537826538086
INFO:root:Loss:1.683578610420227
INFO:root:Loss:2.0972530841

CPU times: total: 4h 50min 42s
Wall time: 50min 20s


Теперь посмотрим на качество для тестовых записей:

In [57]:
# Evaluate the fine-tuned model the same way as the pretrained ones: embed the
# alternate names and the test queries, take the nearest alternate name for
# every query and count the wrongly predicted cities.
test_city_lm = test['query'].values
embeddings = dbmcv1.encode(unic_city)
embeddings_test = dbmcv1.encode(test_city_lm)
# Search with the already computed query embeddings in one call; re-encoding
# each query separately repeated the work and printed one progress bar per query.
hits = util.semantic_search(embeddings_test, embeddings, top_k = 1)
best_hits = [item for result in hits for item in result]
# One positional lookup for all hits instead of three lookups per hit.
matched = cities_alternatenames.iloc[[item['corpus_id'] for item in best_hits]]
city_lm = matched['name_city'].tolist()
gerion_lm = matched['name_region'].tolist()
country_lm = matched['name_country'].tolist()
score_lm = [item['score'] for item in best_hits]
test_df_lm = pd.DataFrame(list(zip(city_lm, gerion_lm, country_lm, score_lm)), columns = ['city_lm', 'gerion_lm','country_lm','score_lm'])
# Predictions are in test order, so a positional join lines them up.
test_fin_lm = test.join(test_df_lm)
print('Кол-во некорректных для модели '+'dbmcv1'+':', test_fin_lm[test_fin_lm['name'] != test_fin_lm['city_lm']]['query'].count())

Batches:   0%|          | 0/871 [00:00<?, ?it/s]

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Кол-во некорректных для модели dbmcv1: 24


<b>Выводы:</b> 
- TF-IDF с переводом и LaBSE показали одинаковый результат при выборе top 1.
- Дообучение модели не дало прироста качества.
- Далее для реализации класса будем использовать LaBSE «из коробки».

### Реализация задачи с использованием модуля

Подготавливаем данные

In [37]:
from module_geo_names import geo_name

In [38]:
%load_ext autoreload
%autoreload 2

In [42]:
# Two-letter country codes of the countries covered by the matcher
# (compared against cities15000.country_code in the query below).
countries = [
    'RU', 'BY', 'AM', 'KZ',
    'KG', 'TR', 'RS', 'GE',
]
# The same codes rendered as a SQL tuple literal, ready to follow an IN keyword.
countries_str = "('" + "','".join(countries) + "')"

In [51]:
# Fetch every city of the selected countries together with its region name,
# country name and alternate names: the result holds one row per
# (city, alternate name) pair.
# The LEFT JOINs keep cities that have no matching region or no alternate-name
# rows; alternate-name rows whose isolanguage is 'link' are filtered out.
# NOTE: countries_str is concatenated straight into the SQL text, which is
# acceptable only because it is a constant defined in this notebook.
query = '''
select 
ci.geonameid,
ci.name,
ci.feature_code,
ci.population,
ci.country_code,
a.name as region_name,
co.country,
al.alternate_name
from cities15000 as ci join countryinfo as co on ci.country_code = co.iso
left join admin1codesascii as a on ci.country_code||'.'||ci.admin1_code = a.code
left join alternatenamesv2 as al on al.geonameid = ci.geonameid 
where (al.isolanguage <> 'link' or al.isolanguage is null)
and ci.country_code in '''+countries_str
# Run the query through a fresh engine and load the result into a DataFrame.
cities_alternatenames = pd.read_sql_query(query, con=geo_name.bd_connect())

In [52]:
cities_alternatenames.head()

Unnamed: 0,geonameid,name,feature_code,population,country_code,region_name,country,alternate_name
0,174875,Kapan,PPLA,33160,AM,Syunik,Armenia,Qafan
1,174875,Kapan,PPLA,33160,AM,Syunik,Armenia,Kapan
2,174875,Kapan,PPLA,33160,AM,Syunik,Armenia,Kapan
3,174875,Kapan,PPLA,33160,AM,Syunik,Armenia,Kapan
4,174875,Kapan,PPLA,33160,AM,Syunik,Armenia,کاپان


In [53]:
# Cities without an alternate-name row come back from the LEFT JOIN with a null
# alternate_name; use the canonical name from the `name` column for those rows.
cities_alternatenames['alternate_name'] = (
    cities_alternatenames['alternate_name']
    .combine_first(cities_alternatenames['name'])
)

In [54]:
# The joined result contains fully identical rows (the same alternate name can be
# listed several times for one city); keep a single copy of each row.
cities_alternatenames = cities_alternatenames.drop_duplicates()

----------------------------------------------

Подбираем самое близкое по косинусному расстоянию значение

In [55]:
# Load the pre-trained LaBSE sentence encoder; it is used as-is, without fine-tuning.
model = SentenceTransformer('sentence-transformers/LaBSE')

В методе можем указать, что делать с эмбеддингами: перезаписать, добавить к текущей таблице или не записывать

In [56]:
# Encode every alternate name; the 'nan' mode only returns the vectors and
# writes nothing to the database.
embeddings = geo_name.vectorization(
    model=model,
    cities_list=cities_alternatenames['alternate_name'].values,
    replace_or_append_or_nan='nan',
)

Для дальнейшего использования можем взять эмбеддинги из базы

In [57]:
# Read a small sample of the stored embeddings back from the database.
# Embeddings are matched to cities purely by row position, so the rows are
# requested in the order of the "index" column written by DataFrame.to_sql;
# without ORDER BY the database is free to return them in any order.
query = """
SELECT * FROM embeddings ORDER BY "index" LIMIT 10
"""
# The "index" column is bookkeeping only; drop it to get the raw vector matrix.
emb = pd.read_sql_query(query, con=geo_name.bd_connect()).drop(columns=['index']).values

In [58]:
emb.shape

(10, 768)

Функция для вывода результата:

In [65]:
# GeoNames feature codes of administrative centres, from the most to the least
# significant (capital, then seats of 1st..5th-order divisions). Any other code
# ranks after all of these. Used only to break ties between equal scores.
_ADMIN_FEATURE_CODES = ('PPLC', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLA5')


def list_of_dict(input_sity, embeddings, model, N = 5, cities = cities_alternatenames, top_k = 200):
    """Return the N cities whose names are closest to ``input_sity``.

    Args:
        input_sity: Free-form place name typed by the user.
        embeddings: Embeddings of the alternate names; row ``i`` must describe
            row ``i`` (by position) of ``cities``.
        model: Encoder exposing ``encode``; used to embed ``input_sity``.
        N: Maximum number of cities to return.
        cities: DataFrame with columns ``geonameid``, ``name``, ``region_name``,
            ``country``, ``feature_code`` and ``population``.
        top_k: Number of nearest alternate names to retrieve before collapsing
            them to distinct cities. One city owns many alternate names, so
            this has to be noticeably larger than ``N``.

    Returns:
        A list of at most ``N`` dicts with the keys ``geonameid``, ``name``,
        ``region``, ``country`` and ``cosine_similarity``, best match first.
    """
    hits = util.semantic_search(model.encode(input_sity), embeddings, top_k = top_k)[0]

    # Translate embedding positions back into city records with one positional lookup.
    wanted = ['geonameid', 'name', 'region_name', 'country', 'feature_code', 'population']
    candidates = (
        cities.iloc[[hit['corpus_id'] for hit in hits]][wanted]
        .reset_index(drop=True)
        .assign(score_lm=[hit['score'] for hit in hits])
    )

    # Order by similarity first. On equal similarity an administrative centre
    # wins, and after that the more populous city. Sorting feature_code
    # alphabetically would put plain 'PPL' ahead of 'PPLA'/'PPLC', so an
    # explicit rank is used instead.
    rank_of = {code: rank for rank, code in enumerate(_ADMIN_FEATURE_CODES)}
    candidates['feature_rank'] = candidates['feature_code'].map(rank_of).fillna(len(rank_of))
    candidates = candidates.sort_values(
        by=['score_lm', 'feature_rank', 'population'],
        ascending=[False, True, False],
    )

    # A city appears once per matching alternate name; keep its best-ranked row.
    candidates = candidates.drop_duplicates(subset='geonameid', keep='first')

    return [
        {
            'geonameid': row.geonameid,
            'name': row.name,
            'region': row.region_name,
            'country': row.country,
            'cosine_similarity': row.score_lm,
        }
        for row in candidates.head(N).itertuples(index=False)
    ]

In [70]:
# Ask for a free-form place name and print its ten best GeoNames matches.
# The input is kept in its own variable (rebinding the builtin `str` would break
# every later `str(...)` call in the session) and surrounding whitespace is dropped.
print('Введите город:')
city_query = input().strip()
print(list_of_dict(city_query, embeddings, model, N = 10, cities = cities_alternatenames))

Введите город:


 питер


[{'geonameid': 498817, 'name': 'Saint Petersburg', 'region': 'St.-Petersburg', 'country': 'Russia', 'cosine_similarity': 0.8008365631103516}, {'geonameid': 787050, 'name': 'Pirot', 'region': 'Central Serbia', 'country': 'Serbia', 'cosine_similarity': 0.7622926235198975}, {'geonameid': 511794, 'name': 'Pechora', 'region': 'Komi', 'country': 'Russia', 'cosine_similarity': 0.7288624048233032}, {'geonameid': 750938, 'name': 'Bayburt', 'region': 'Bayburt Province', 'country': 'Turkey', 'cosine_similarity': 0.7097207307815552}, {'geonameid': 511196, 'name': 'Perm', 'region': 'Perm Krai', 'country': 'Russia', 'cosine_similarity': 0.6512341499328613}, {'geonameid': 314967, 'name': 'Fethiye', 'region': 'Muğla', 'country': 'Turkey', 'cosine_similarity': 0.6293563842773438}, {'geonameid': 750637, 'name': 'Beypazarı', 'region': 'Ankara', 'country': 'Turkey', 'cosine_similarity': 0.6277965307235718}, {'geonameid': 510291, 'name': 'Peterhof', 'region': 'St.-Petersburg', 'country': 'Russia', 'cosine_

<b>Общие выводы:</b> 
- В рамках реализации проекта был сформирован инструмент подбора top N городов для запрошенного названия.
- Топ формируется по убыванию косинусной близости --> по административной роли --> по численности населения
- Для реализации задачи были опробованы разные варианты решения: TFIDF с переводом на английский, предобученные языковые модели и дообучение языковой модели
- Дообучение не дало прироста качества на тестовых записях городов при выборе top 1
- Итоговый вариант построен на основе эмбеддингов модели sentence-transformers/LaBSE

-------------------------------------------------------

<b>Дополнительные данные:</b> 
Класс с методами подключения к БД и векторизации названий городов

In [270]:
class geo_name:
    """Helpers for connecting to the database and embedding city names.

    Both methods are static; the class only serves as a namespace.
    """

    @staticmethod
    def bd_connect(host = '127.0.0.1', username = 'postgres', password = '123', database = 'mybd', port = 5432, drivername = 'postgresql'):
        """Create a SQLAlchemy engine for the given connection settings.

        Args:
            host: Database server address.
            username: Database user.
            password: Password of ``username``.
            database: Name of the database to open.
            port: Server port.
            drivername: SQLAlchemy dialect/driver name.

        Returns:
            The engine produced by ``sqlalchemy.create_engine``.
        """
        from sqlalchemy import create_engine
        from sqlalchemy.engine.url import URL
        DATABASE = {
            'drivername': drivername,
            'username': username,
            'password': password,
            'host': host,
            'port': port,
            'database': database,
            'query': {}
        }
        # Newer SQLAlchemy builds URLs through URL.create(); fall back to the
        # plain constructor on releases that do not have it yet.
        build_url = getattr(URL, 'create', URL)
        return create_engine(build_url(**DATABASE))

    @staticmethod
    def vectorization(model, cities_list, replace_or_append_or_nan = 'nan'):
        """Encode city names and optionally store the vectors in the database.

        Args:
            model: Object exposing ``encode(cities_list)``.
            cities_list: Names to encode.
            replace_or_append_or_nan: ``'replace'`` overwrites the ``embeddings``
                table, ``'append'`` adds rows to it, any other value (``'nan'``
                by default) writes nothing.

        Returns:
            Whatever ``model.encode`` returned for ``cities_list``.
        """
        import pandas as pd
        embeddings = model.encode(cities_list)
        # Both writing modes map one-to-one onto DataFrame.to_sql's `if_exists`.
        if replace_or_append_or_nan in ('replace', 'append'):
            pd.DataFrame(embeddings).to_sql(
                'embeddings',
                con = geo_name.bd_connect(),
                if_exists = replace_or_append_or_nan,
            )
        return embeddings
