In [3]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from wiktionaryparser import WiktionaryParser
import numpy as np
from collections import Counter
from math import isnan

# from code_to_langs import wiki_code_to_lang

### 1. Etym-db attempt

In [102]:
data = pd.read_csv('northeuralex_etymologies.tsv', sep='\t')
data['Value'] = data['Value'].str.lower()
data.head(3)

data.count()

Language_ID     10402
Value           10402
Source_ID         769
Source_Value      769
dtype: int64

In [92]:
codes_w = pd.read_csv('wiki_codes.csv')

codes = pd.read_csv('languoid.csv')
codes.dropna(subset=['iso639P3code'])
codes = codes[codes['level']=='language']
codes_2 = codes[['name', 'iso639P3code']].set_index('iso639P3code')

codes = codes[['name', 'iso639P3code']].set_index('name')
codes_w = codes_w[['Canonical name', 'Code']].set_index('Canonical name')
codes_match = codes.join(codes_w).dropna()
codes_match = codes_match.reset_index().drop(['index'], axis=1).set_index('Code')['iso639P3code'].to_dict()

In [93]:
codes_match_2 = codes_2.to_dict('dict').get('name')
codes_match_2['bua'] = 'Buriat'
codes_match_2['hrv'] = 'Croatian Standard'
codes_match_2['sqi'] = 'Albanian'

data['Language'] = data['Language_ID'].map(codes_match_2)
len(data[data['Language'].isna()]['Language_ID'].unique())

0

Get data:

In [5]:
path_values = "split_etymdb/etymdb_values.csv"
path_link = "split_etymdb/etymdb_links_info.csv"

df_values = pd.read_csv(path_values,
                        sep='\t',
                        names=["id", "lang", "field", "lexeme", "meaning"],
                        dtype={"id": int, "lang": str, "field": int, "meaning": str}).set_index("id")

df_link = pd.read_csv(path_link,
                      sep='\t',
                      names=["relation_type", "child", "parent"],
                      dtype={"relation_type": str, "child": int, "parent": int})

df_inher = df_link.loc[df_link['relation_type'].isin(["inh"])]
df_bor = df_link.loc[df_link['relation_type'].isin(["bor"])]
df_cog = df_link.loc[df_link['relation_type'].isin(["cog"])]

df_values['iso_code'] = df_values['lang'].map(codes_match)
df_values['language_name'] = df_values['lang'].map(wiki_code_to_lang)
# df_values = df_values.dropna()
df_values = df_values.reset_index()
df_values.head(5)

Unnamed: 0,id,lang,field,lexeme,meaning,iso_code,language_name
0,1,en,0,dictionary,dictionary,eng,English
1,10,enm,0,free,free,enm,Middle English
2,100,ru,0,kot,,rus,Russian
3,1000,fy,0,diele,"to divide, to separate",,West Frisian
4,10000,la,0,proprius,ownership,lat,Latin


In [37]:
df_link['relation_type'].unique()

array(['der', 'inh', 'cog', 'bor', 'der(s)', 'cmpd+bor', 'der(p)'],
      dtype=object)

Dictionary of negative indices:

In [6]:
# indexes = pd.read_csv('split_etymdb/etymdb_links_index.csv', names=range(20), delimiter='\t').set_index(0)
# dict_indexes = indexes.T.to_dict('list')
# for k, v in dict_indexes.items():
#     dict_indexes[k] = [x for x in v if not isnan(x)]

# dict_indexes

In [7]:
codes_bg = set(df_values['lexeme'].str.lower().unique())
codes_dt = set(data['Value'].unique())

print('Initial: {}, intersection: {}'.format(len(codes_dt), len(codes_dt.intersection(codes_bg))))

Initial: 9855, intersection: 2899


In [8]:
lg = set(df_values['language_name'].unique())
lg_2 = set(data[data['Value'].isin(codes_bg)]['Language'].unique())

print('Initial: {}, intersection: {}'.format(len(lg_2), len(lg.intersection(lg_2))))

Initial: 64, intersection: 46


In [94]:
def get_index_wf(lang, lexeme):
#     index =  df_values.query('iso_code == "{}" & lexeme == "{}"'.format (lang, lexeme)).index[0]
    try:
        index =  df_values.query('language_name == "{}" & lexeme == "{}"'.format(lang, lexeme))['id'].values[0]
        indexes = 0
        if not df_inher.query('child == "{}"'.format(index)).empty:
            ch_idx = df_inher.query('child == "{}"'.format(index))['parent'].values[0]
            return df_values.query('id == {}'.format(int(ch_idx)))[['lexeme', 
                                                                        'iso_code', 'language_name']].values.tolist()[0]
        else:
            pass
    except:
        pass

In [49]:
dd = []

for row in data[data['Value'].isin(codes_bg)].iterrows():
    value = get_index_wf(row[1]['Language'], row[1]['Value'])
    if value is not None: 
        dd.append((row[0], value))

In [95]:
len(dd)

1220

In [97]:
data['Source_Language'] = 0

In [98]:
data.head(2)

Unnamed: 0,Language_ID,Value,Source_ID,Source_Value,Language,Source_Language
0,abk,ааира,,,Abkhaz,0
1,abk,аара,,,Abkhaz,0


In [99]:
for element in dd:
    if pd.isnull(data.loc[element[0], 'Source_ID']):
#         print(data.loc[element[0], 'Source_ID'])
        data.loc[element[0], 'Source_Value']  = element[1][0]
        data.loc[element[0], 'Source_ID']  = element[1][1]
        data.loc[element[0], 'Source_Language']  = element[1][2]

In [100]:
data.count()

Language_ID        10402
Value              10402
Source_ID            907
Source_Value        1472
Language           10402
Source_Language    10393
dtype: int64

In [101]:
data.to_csv('northeuralex_etymologies_upd.tsv', sep='\t')

### 2. Parsing

In [127]:
from wiktionaryparser import WiktionaryParser

In [113]:
data_n = pd.read_csv('northeuralex_etymologies_upd.tsv', sep='\t')

In [106]:
parser = WiktionaryParser()

In [128]:
parser.fetch('ff', 'ukrainian')

[]

In [146]:
for element in 'persian'.split():
    print(element)

persian


In [187]:
def parce_etym(lexeme, language):
    result = parser.fetch(lexeme, language)
    if result:
        return result[0].get('etymology')
    elif len(language.split()) >= 2:
        for element in language.split():
#             print('{} {}'.format(lexeme, element))
            res =  parce_etym(lexeme, element)
        return res
    else:
        pass

In [189]:
parce_etym('dd', 'old russian')

In [195]:
et = []

for index, row in data_n.iterrows():
    if pd.isnull(row['Source_Value']):
        try:
            etym = parce_etym(row['Value'], row['Language'])
        except:
            pass
        if etym:
            et.append((index, etym))

### 3. Correcting

In [4]:
data_n = pd.read_csv('northeuralex_etymologies_upd.tsv', sep='\t')

In [6]:
data_n = data_n.drop('Unnamed: 0', axis=1)

In [15]:
etymologies = data_n['Etymology_raw'][data_n['Etymology_raw'].notna()].to_list()

In [16]:
etymologies

['Cognate with Abaza адгьы́л (ādg̍ə́l); compare Georgian ადგილი (adgili).\n',
 'Compare Old Armenian պինչ (pinčʿ).\n',
 'Cognate with Adyghe лъэ (łă).\n',
 'From Proto-Circassian *gʷǝ (“heart”).\n',
 'From Proto-Circassian *ʐə.\n',
 'From Proto-Circassian zə.\n',
 'From Proto-Circassian *kʲʼa.\n',
 'тхьамыкӏ (tḥāməč̣̍) +\u200e -гъу (-ġ°)',
 'From Proto-Circassian *ɕʷa',
 'From Proto-Circassian *t͡ʂʰə (“horse”).\n',
 '(This etymology is missing or incomplete. Please add to it, or discuss it at the Etymology scriptorium.)',
 'From the root ء ث ر\u200e (ʾ-ṯ-r).\n',
 'From the root ح ب ب\u200e (ḥ-b-b).\n',
 'From the root د و ر\u200e (d-w-r).\n',
 'From Proto-Semitic *ʾakal- (root ء ك ل\u200e (ʾ-k-l)).\n',
 'From the root و ق د\u200e (w-q-d).\n',
 'From root ن ظ ر\u200e (n-ẓ-r)',
 'From the root ق ط ع\u200e (q-ṭ-ʿ).\n',
 'From Proto-Semitic *bint-. Has the Semitic feminine suffix *-at added to the root *bin- of اِبْن\u200e (ibn, “son”).\n',
 'From root ب ي ت\u200e (b-y-t). From Proto-Semit

In [32]:
import re

for element in etymologies:
    print(element)
    print(re.findall(r'from ([A-Z]\w+|[A-Z]\w+\-[A-Z]\w+) \w+', element))

Cognate with Abaza адгьы́л (ādg̍ə́l); compare Georgian ადგილი (adgili).

[]
Compare Old Armenian պինչ (pinčʿ).

[]
Cognate with Adyghe лъэ (łă).

[]
From Proto-Circassian *gʷǝ (“heart”).

[]
From Proto-Circassian *ʐə.

[]
From Proto-Circassian zə.

[]
From Proto-Circassian *kʲʼa.

[]
тхьамыкӏ (tḥāməč̣̍) +‎ -гъу (-ġ°)
[]
From Proto-Circassian *ɕʷa
[]
From Proto-Circassian *t͡ʂʰə (“horse”).

[]
(This etymology is missing or incomplete. Please add to it, or discuss it at the Etymology scriptorium.)
[]
From the root ء ث ر‎ (ʾ-ṯ-r).

[]
From the root ح ب ب‎ (ḥ-b-b).

[]
From the root د و ر‎ (d-w-r).

[]
From Proto-Semitic *ʾakal- (root ء ك ل‎ (ʾ-k-l)).

[]
From the root و ق د‎ (w-q-d).

[]
From root ن ظ ر‎ (n-ẓ-r)
[]
From the root ق ط ع‎ (q-ṭ-ʿ).

[]
From Proto-Semitic *bint-. Has the Semitic feminine suffix *-at added to the root *bin- of اِبْن‎ (ibn, “son”).

[]
From root ب ي ت‎ (b-y-t). From Proto-Semitic *bayt-.

[]
Originally a case form of a defective noun تَحْت‎ (taḥt, “location that


[]
From Old Armenian մարտ (mart).

[]
From Old Armenian մտանեմ (mtanem).

[]
From Old Armenian նստել (nstel), infinitive of նստիմ (nstim); see it for more.

[]
From Old Armenian շաբաթ (šabatʿ).

[]
From Old Armenian պաշտպանեմ (paštpanem).

[]
From Old Armenian պատկեր (patker).

[]
From Old Armenian սպասեմ (spasem), from սպաս (spas).

[]
From Old Armenian վառեմ (vaṙem).

[]
From Old Armenian վճարեմ (včarem).

[]
From Middle Armenian տոպրակ (toprak), a Turkic borrowing. Compare Tatar тубра (tubra), Turkish torba. Russian то́рба (tórba) is from the same source.

[]
From Old Armenian տուն (tun).

[]
From Old Armenian ցամաք (cʿamakʿ).

[]
From Old Armenian փայտ (pʿayt); see it for more.

[]
From Old Armenian փոկ (pʿok); see it for more.

[]
From Old Armenian քանակ (kʿanak); see it for more.

[]
From Old Armenian քաշել (kʿašel).

[]
From Old Norse hóll.

[]
From Old Norse horn, from Proto-Norse ᚺᛟᚱᚾᚨ (horna), from Proto-Germanic *hurną, from Proto-Indo-European *ḱer-.

['Proto-Norse']
From 

From Proto-Uralic *künče.

[]
A Permic word (*gud).

[]
From Proto-Permic [Term?]. Cognates include Komi-Zyrian кар (kar).

[]
From Proto-Uralic *käćetä, like Komi-Permyak кӧдзыд (ködzyd).

[]
From a Permic *kud.

[]
From Proto-Uralic *käle. Cognates include Komi-Permyak кыв (kyv), Finnish kieli and Northern Sami giella.

[]
From Proto-Uralic *kule-.

[]
From Proto-Uralic *lewle.

[]
From Proto-Uralic *mene-.

[]
From Proto-Permic *nu-, cognate to Komi-Permyak нуны (nuny).

[]
From Proto-Permic [Term?], from Proto-Uralic *nejde.

[]
From Proto-Permic *pośe, cognate to Komi-Permyak пӧсь (pösʹ).

[]
From Proto-Uralic *puwe.

[]
From Proto-Uralic *päŋe.

[]
From Proto-Permic [Term?], cognate to Komi-Permyak пырны (pyrny).

[]
From Proto-Permic [Term?], cognate to Komi-Permyak сьöкыд (sʹökyd).

[]
From Proto-Permic [Term?], cognate to Komi-Permyak сетны (setny).

[]
From Proto-Uralic *tumte-.

[]
Related to Komi-Zyrian тӧлысь (tölysʹ) and Eastern Mari тылзе (tylze).

[]
From Proto-Permic *