In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import collections
from collections import Counter
import os

# 3. Слоговая структура и попарное сравнение языков  
В этой части мы посмотрим на слоговую структуру с опорой на первоначальную базу данных (df) - тем самым сделаем выводы о слогах в отдельных языках. Но что важнее, мы введем попарное сравнение языков и будем смотреть, какие слоги в какие чаще переходит - это отражает фонетические процессы в родственных языках.  

### 3.1. От фонем к CV-представлениям  
Для работы с LexStat, мы убрали дефисное деление звуков в транскрипции, поэтому чтобы привести звуки к записи вида CVCVCV, вернемся к фонемному составу языков.

In [2]:
# все звуки поделим на гласные и согласные

vwls = ['ẽ', 'ũ', 'ĩ', 'õ', 'ã', 'u', 'o', 'i', 'e', 'a']

cons = ['χ', 'ʕ', 'ʔ', 'ʒ', 'ʃ', 'ʁ', 'ɬ', 'ɢ', 'ɡ', 'ħ', 'z', 'x', 'w', 'tʃ', 'tɬ', 
        'ts', 't', 's', 'r', 'qχ', 'q', 'p', 'n', 'm', 'l', 'k', 'j', 'h', 'f', 'dʒ', 'dz', 'd', 'b']

# чтобы отдельно рассмотреть процессы, происходящие с r,l,m,n,w,j,b, не будем их заменять на C

cons_sans_rlmnwjb = ['χ', 'ʕ', 'ʔ', 'ʒ', 'ʃ', 'ʁ', 'ɬ', 'ɢ', 'ɡ', 'ħ', 'z', 'x', 'tʃ', 'tɬ', 
        'ts', 't', 's', 'qχ', 'q', 'p','k', 'h', 'f', 'dʒ', 'dz', 'd']

In [3]:
# фонемный состав языков отличается 
# (например, где-то есть звук "dʒ", а где-то это сочетание звуков "d" и "ʒ")

langs = {'akhv1239': ['ẽ', 'χ', 'ʕ', 'ʔ', 'ʒ', 'ʃ', 'ʁ', 'ɬ', 'ɡ', 'ũ', 'ĩ', 'ħ', 'õ', 'ã', 'z', 'x', 
                      'w', 'u', 'tʃ', 'tɬ', 'ts', 't', 's', 'r', 'q', 'p', 'o', 'n', 'm', 'l', 'k', 
                      'j', 'i', 'h', 'e', 'dʒ', 'd', 'b', 'a'], 
         
         'andi1255': ['χ', 'ʕ', 'ʔ', 'ʒ', 'ʃ', 'ʁ', 'ɬ', 'ɢ', 'ɡ', 'z', 'x', 'w', 'u', 'tʃ', 
                      'tɬ', 'ts', 't', 's', 'r', 'q', 'p', 'o', 'n', 'm', 'l', 'k', 'j', 'i', 
                      'h', 'e', 'dʒ', 'd', 'b', 'a'],
         
         'bagv1239': ['ẽ', 'χ', 'ʕ', 'ʔ', 'ʒ', 'ʃ', 'ʁ', 'ɬ', 'ɡ', 'ũ', 'ĩ', 'ħ', 'õ', 'ã', 
                      'z', 'x', 'w', 'u', 'tʃ', 'tɬ', 'ts', 't', 's', 'r', 'q', 'p', 'o', 
                      'n', 'm', 'l', 'k', 'j', 'i', 'h', 'e', 'dʒ', 'd', 'b', 'a'],
         
         'botl1242': ['ẽ', 'χ', 'ʕ', 'ʔ', 'ʒ', 'ʃ', 'ʁ', 'ɬ', 'ɡ', 'ũ', 'ĩ', 'ħ', 
                      'õ', 'ã', 'z', 'x', 'w', 'u', 'tʃ', 'tɬ', 'ts', 't', 's', 'r', 'q', 
                      'p', 'o', 'n', 'm', 'l', 'k', 'j', 'i', 'h', 'f', 'e', 'dʒ', 'd', 'b', 'a'],
         
         'cham1309': ['ẽ', 'χ', 'ʕ', 'ʔ', 'ʒ', 'ʃ', 'ʁ', 'ɬ', 'ɡ', 'ũ', 'ĩ', 'ħ', 'õ', 'ã', 'z', 
                      'x', 'w', 'u', 'tʃ', 'tɬ', 'ts', 't', 's', 'r', 'qχ', 'q', 'p', 'o', 'n', 
                      'm', 'l', 'k', 'j', 'i', 'h', 'e', 'dʒ', 'dz', 'd', 'b', 'a'],
         
         'ghod1238': ['χ', 'ʕ', 'ʔ', 'ʒ', 'ʃ', 'ʁ', 'ɬ', 'ɡ', 'ũ', 'ĩ', 'ħ', 'ã', 'z', 'x', 
                      'w', 'u', 'tʃ', 'tɬ', 'ts', 't', 's', 'r', 'q', 'p', 'o', 'n', 'm', 
                      'l', 'k', 'j', 'i', 'h', 'e', 'dʒ', 'd', 'b', 'a'],
         
         'kara1474': ['ẽ', 'χ', 'ʕ', 'ʔ', 'ʒ', 'ʃ', 'ʁ', 'ɬ', 'ɡ', 'ũ', 'ĩ', 'ħ', 'õ', 
                      'ã', 'z', 'x', 'w', 'u', 'tʃ', 'tɬ', 'ts', 't', 's', 'r', 'q', 'p', 
                      'o', 'n', 'm', 'l', 'k', 'j', 'i', 'h', 'e', 'd', 'b', 'a'],
         
         'tind1238': ['ẽ', 'χ', 'ʕ', 'ʔ', 'ʒ', 'ʃ', 'ʁ', 'ɬ', 'ɡ', 'ũ', 'ĩ', 'ħ', 
                      'õ', 'ã', 'z', 'x', 'w', 'u', 'tʃ', 'tɬ', 'ts', 't', 's', 'r', 
                      'q', 'p', 'o', 'n', 'm', 'l', 'k', 'j', 'i', 'h', 'e', 'dʒ', 'd', 'b', 'a'],
         
         'toki1238': ['ẽ', 'χ', 'ʕ', 'ʔ', 'ʒ', 'ʃ', 'ʁ', 'ɬ', 'ɢ', 'ɡ', 'ũ', 'ĩ', 'ħ', 
                      'ã', 'z', 'x', 'w', 'u', 'tʃ', 'tɬ', 'ts', 't', 's', 'r', 'q', 'p', 
                      'o', 'n', 'm', 'l', 'k', 'j', 'i', 'h', 'e', 'd', 'b', 'a']
        }


In [4]:
# для каждого языка разделим звуки на согласные и гласные

v = {'akhv1239': [], 'andi1255': [], 'bagv1239': [], 'botl1242': [], 'cham1309': [], 
     'ghod1238': [], 'kara1474': [], 'tind1238': [], 'toki1238': []}
c = {'akhv1239': [], 'andi1255': [], 'bagv1239': [], 'botl1242': [], 'cham1309': [], 
     'ghod1238': [], 'kara1474': [], 'tind1238': [], 'toki1238': []}
c_sans_rlmnwjb = {'akhv1239': [], 'andi1255': [], 'bagv1239': [], 'botl1242': [], 
                  'cham1309': [], 'ghod1238': [], 'kara1474': [], 'tind1238': [], 'toki1238': []}

for key in langs:
    for el in vwls:
        if el in langs[key]:
            v[key].append(el)
    for el in cons:
        if el in langs[key]:
            c[key].append(el)
    for el in cons_sans_rlmnwjb:
        if el in langs[key]:
            c_sans_rlmnwjb[key].append(el)

In [5]:
# ф-я, на вход которой подается строка s, словарь гласных V, словарь согласных C, язык l
# преобразует фонемную запись в представление CVCV

def letters_to_CV(s, V, C, l):
    s = s.replace("'", '')
    s = s.replace("ʷ", '')
    s = s.replace('ʲ','')
    s = s.replace('̃', '')
    s = s.replace('ː','')
    s = s.replace('ˌ','')

    for i in range(len(s)):
        flag = True
        for v in V[l]:
            if v in s and flag:
                s = s.replace(v, "V")
                flag = False
                break
        if flag:
            for c in C[l]:
                if c in s:
                    s = s.replace(c, "C")
                    break                
    return s

In [6]:
# ф-я, получающая на вход строку word вида CVCVCV
# делит ее на слоги

def s_to_syllables(word):
    vowels = []
    for i in range(len(word)):
        if word[i] == 'V':
            vowels.append(i)

    hyphens = collections.deque()
    for i in range(1, len(vowels)):
        a, b = vowels[i-1], vowels[i]
        if b - a == 1:
            hyphens.append(b)
        elif b - a == 2:
            hyphens.append(b - 1)
        else:
            hyphens.append(a + 2)

    res = ''
    for i in range(len(word)):
        if len(hyphens) and hyphens[0] == i:
            res += '-'
            hyphens.popleft()
        res += word[i]
    return res

### 3.2. Общая информация  
Обратимся к первоначальной базе данных, чтобы вне контекста когнатов посмотреть на слоговые структуры в языках. 

In [7]:
df = pd.read_csv('edited_andic_dicts.csv', sep=',')
df['meaning_ru'] = df.apply(lambda row: row['new_meaning_ru'] if pd.notnull(row['new_meaning_ru']) else row['meaning_ru'], axis=1)
df = df.drop_duplicates(subset=['ipa', 'meaning_ru', 'glottocode', 'bor'], keep='first')
df = df.dropna(subset=['ipa', 'meaning_ru'], how='any')

df['ipa'] = df['ipa'].apply(lambda x: x.replace('-', '') if isinstance(x, str) else x)
df['ipa'] = df['ipa'].apply(lambda x: x.replace(' ', '') if isinstance(x, str) else x)

In [8]:
# исследуем слоговую структуру

df['cv'] = df.apply(lambda row: s_to_syllables(letters_to_CV(row['ipa'], v, c, row['glottocode'])), axis=1)
df[['glottocode', 'ipa', 'cv']]

Unnamed: 0,glottocode,ipa,cv
0,akhv1239,ab'adatɬːi,V-CV-CVC-CV
1,akhv1239,ab'aʒʷe,V-CV-CV
2,akhv1239,abaʒ'urutɬa,V-CV-CV-CVC-CV
3,akhv1239,abaʒ'urutɬa,V-CV-CV-CVC-CV
4,akhv1239,abaʒ'urutɬa,V-CV-CV-CVC-CV
...,...,...,...
81656,toki1238,baʃl'a,CVC-CV
81657,toki1238,besad'iril,CV-CV-CV-CVC
81658,toki1238,itʃ'ib,VC-CVC
81659,toki1238,itʃ'ik'al,VC-CV-CVC


In [9]:
# используем ф-ии для выделения в CV-представлении df['cv'] всех слогов / первых слогов / последних слогов

def count_syllable_types(cv):
    syllables = cv.split('-')
    types = [s.split('-')[-1] for s in syllables]
    return types

def count_first_syllable_types(cv):
    first_syllable = cv.split('-')[0]
    first_type = first_syllable.split('-')[-1] if '-' in first_syllable else first_syllable
    return first_type

def count_last_syllable_types(cv):
    first_syllable = cv.split('-')[-1]
    first_type = first_syllable.split('-')[-1] if '-' in first_syllable else first_syllable
    return first_type

syllable_types = ['CV', 'CVC', 'CCV', 'V', 'VC', 'CCVC', 'CVCC', 'CCCV', 'CVCCC', 'CCVCC']

In [10]:
# посмотрим общую статистику по языкам

general_statistics = df.groupby('glottocode')['cv'].apply(lambda x: x.apply(count_syllable_types).explode().value_counts()).unstack(fill_value=0)
general_statistics = general_statistics.reindex(columns=syllable_types, fill_value=0)

general_statistics

Unnamed: 0_level_0,CV,CVC,CCV,V,VC,CCVC,CVCC,CCCV,CVCCC,CCVCC
glottocode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
akhv1239,37113,5505,1442,953,329,231,4,0,0,0
andi1255,13390,7710,718,364,462,567,13,13,3,0
bagv1239,18342,11113,825,781,562,628,205,11,4,17
botl1242,27484,15671,1005,876,969,710,383,18,6,16
cham1309,16048,5865,655,834,553,354,280,24,21,24
ghod1238,13854,4349,391,414,374,212,17,1,0,0
kara1474,12013,4772,458,343,189,206,90,1,7,1
tind1238,21471,7936,606,922,260,366,10,1,0,0
toki1238,10,6,0,1,2,0,0,0,0,0


In [11]:
directory = 'output_html/about_whole_data/'
os.makedirs(directory, exist_ok=True)

general_stat_html = general_statistics.to_html()
with open('output_html/about_whole_data/general_stat_html.html', 'w') as f:
    f.write(general_stat_html)

In [12]:
# посмотрим статистику по последнему слогу

last_syl_statistics = df.groupby('glottocode')['cv'].apply(lambda x: x.apply(count_last_syllable_types).value_counts()).unstack(fill_value=0)
last_syl_statistics = last_syl_statistics.reindex(columns=syllable_types, fill_value=0)

last_syl_statistics

Unnamed: 0_level_0,CV,CVC,CCV,V,VC,CCVC,CVCC,CCCV,CVCCC,CCVCC
glottocode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
akhv1239,12717,22,41,4,0,0,4,0,0,0
andi1255,5937,2263,186,5,24,87,13,1,3,0
bagv1239,4588,5549,62,26,72,282,205,0,4,17
botl1242,7980,7670,213,48,47,132,383,11,6,16
cham1309,5954,2498,198,14,89,130,280,4,21,24
ghod1238,5003,1583,75,31,14,26,17,1,0,0
kara1474,3787,2492,53,11,9,43,90,0,7,1
tind1238,6103,4273,55,6,18,30,10,1,0,0
toki1238,2,4,0,0,0,0,0,0,0,0


In [13]:
last_syl_stat_html = last_syl_statistics.to_html()
with open('output_html/about_whole_data/last_syl_stat_html.html', 'w') as f:
    f.write(last_syl_stat_html)

In [14]:
# посмотрим статистику по первому слогу

first_syl_statistics = df.groupby('glottocode')['cv'].apply(lambda x: x.apply(count_first_syllable_types).value_counts()).unstack(fill_value=0)
first_syl_statistics = first_syl_statistics.reindex(columns=syllable_types, fill_value=0)

first_syl_statistics

Unnamed: 0_level_0,CV,CVC,CCV,V,VC,CCVC,CVCC,CCCV,CVCCC,CCVCC
glottocode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
akhv1239,8630,1262,1424,912,329,231,0,0,0,0
andi1255,3657,3218,412,342,451,406,8,12,3,0
bagv1239,5507,3248,444,601,521,365,119,0,3,10
botl1242,8350,5408,507,702,891,563,88,1,3,9
cham1309,4616,2528,311,809,530,285,112,0,16,16
ghod1238,3843,1883,218,291,319,189,7,1,0,0
kara1474,3876,1620,342,298,175,159,21,1,1,0
tind1238,6555,2074,465,823,236,335,8,1,0,0
toki1238,2,1,0,1,2,0,0,0,0,0


In [15]:
first_syl_stat_html = first_syl_statistics.to_html()
with open('output_html/about_whole_data/first_syl_stat_html.html', 'w') as f:
    f.write(first_syl_stat_html)

### 3.3. БД с когнатами

In [16]:
# достанем из первоначальной базы данных значения заимствований

bor_dict = {}
for ind in df.index:
    if df['bor'][ind] == 1:
        if df['glottocode'][ind] not in bor_dict:
            bor_dict[df['glottocode'][ind]] = []
        bor_dict[df['glottocode'][ind]].append(df['ipa'][ind])
bor_dict

{'akhv1239': ["ab'adatɬːi",
  "aw'arak'e",
  "aw'arija",
  "aw'asi",
  "ad'a",
  "ad'a",
  "adaɡʷ'edabe",
  "adaɡʷik'abe",
  "ad'aq'ːedabe",
  "ad'atɬːi",
  "ad'aqːati",
  "ad'aqːati",
  "ad'ija",
  "adikal'o",
  "aʒ'ana",
  "aʒ'a",
  "az'aho",
  "az'azaroda",
  "azar'o",
  "azar'oda",
  "azar'otʃ'e",
  "azerbedʒ'ano",
  "azib'a",
  "azb'a",
  "aj'ati",
  "atɬ'ː'udaʔazaro",
  "atɬ'ːʷ'aːzaroda",
  "al'ati",
  "al'ati",
  "al'atiq'ːe",
  "alː'ahi",
  "alː'a",
  "alː'ahibet'ẽha",
  "al'ipa",
  "al'ipi",
  "alpaw'iti",
  "am'anate",
  "amasː'e",
  'amerikãtso',
  "am'ini",
  "am'ini",
  "amː'a",
  "apar'ak'e",
  "apits'eri",
  "ap'azar'oda",
  "arm'eno",
  "art'eli",
  "as'a",
  "as'i",
  "as'i",
  'asːalamuʕalẽku',
  'asːalaʕaleku',
  'asːetina',
  'asːetino',
  "asːk'a",
  "asːt'awpirulːa",
  "ast'owpirulːa",
  "aχ'i",
  "aχiɬ'e",
  "aχiɬːiɡ'e",
  "aχiɬːiɡel'a",
  "aχir'ati",
  "aχ'iqːa",
  "aħʷ'aħa",
  "aħim'aq'ːedabe",
  "aħm'aq'ːedabe",
  "aħim'aq'ːe",
  "aħmaq'ːe",
  "aħim'aq'ːɬil

In [17]:
lexstat_df = pd.read_csv('res_ls.qlc', sep='\t', skiprows=7)
# удаляем ранее добавленные значения
lexstat_df = lexstat_df[lexstat_df['IPA'] != 'aaa']

In [18]:
grouped_df = lexstat_df.groupby('COGNATES').apply(lambda x: x.assign(CONCEPT=x['CONCEPT'] + '_' + x['COGNATES'].astype(str)))
grouped_df['IPA'] = grouped_df['IPA'].astype(str)

result_df = grouped_df.groupby(['CONCEPT', 'DOCULECT'])['IPA'].agg(lambda x: ', '.join(x)).unstack().reset_index()
result_df

DOCULECT,CONCEPT,akhv1239,andi1255,bagv1239,botl1242,cham1309,ghod1238,kara1474,tind1238,toki1238
0,(в соч. с инф. выражает обязательность действи...,,,,,,,bekːʷ'aɬa,,
1,(возглас (петухом)_11.0,,ɢuɢudu,,,,,,,
2,(возглас вызова)_20.0,,mp'o,,,,,,,
3,(возглас недовольствия и недоумения)_29.0,,wep'ero,,,,,,,
4,(возглас одобрения)_38.0,,hajhuj,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
32126,ящичек_233538.0,,ʁutuqi,,,,,oʔ'ol,,
32127,ящичек_233544.0,,,,,,,,ʁˌoru,
32128,ящур (у животных)_233570.0,,,,,,,t'aːlahu,,
32129,ящур_233555.0,j'aʃːuri,,,j'aʃːur,,j'aʃːur,jaʃː'ur,,


In [19]:
result_df[result_df['toki1238'].notna()]

DOCULECT,CONCEPT,akhv1239,andi1255,bagv1239,botl1242,cham1309,ghod1238,kara1474,tind1238,toki1238
512,"агьаᴴдоб, бокьидоб, къардоб_4194.0",,,,,,,,,itʃ'ib
1539,болезнь копыт_11709.0,,,,,,,,,χĩχiledu
4770,г|акъраб_35082.0,,,,,,,,,itʃ'ik'al
8800,злополучный день_64584.0,,,,,,,,,aχirarb'aʕ
26009,сл/.байбихьар_189801.0,,,,,,,,,baʃl'a
28600,тополь_208417.0,,,,,,,,,besad'iril


In [20]:
# видим, что в токитинском нет когнатов, поэтому удаляем информацию для токитинского

result_df.drop('toki1238', axis=1, inplace=True)
result_df = result_df[result_df.notna().sum(axis=1) > 1]

In [21]:
result_df[37:47]

DOCULECT,CONCEPT,akhv1239,andi1255,bagv1239,botl1242,cham1309,ghod1238,kara1474,tind1238
37,"(возглас, которым отгоняют кошек)_319.0",,,,tʃːit'a,,,,
38,"(возглас, которым отгоняют кур)_325.0",ɡ'ure,,kurʕ'aj,kuʃ,,ɡureɡ'ure,ɡur'aj,kur'aː
39,"(возглас, которым отгоняют овец)_337.0",,,,tsːit'a,tsiraj,ʕats,ʕ'eʃaj,
40,"(возглас, которым отгоняют овец)_341.0",,,,,,,,k'eri
41,"(возглас, которым отгоняют осла)_348.0",,,,,,q''oʃːu,,
42,"(возглас, которым отгоняют собак)_354.0",,,dʒ'ebeɬi,,,,,
43,"(возглас, которым отгоняют теленка)_364.0",,,,huts,,ʕ'arija,,
44,"(возглас, которым отгоняют телят)_376.0",,,,,,,ʕel'aj,
45,"(возглас, которым отгоняют, подгоняют коров)_3...",,,,ʕaj,,,,
46,"(возглас, которым отгоняют, понукают коров)_391.0",,,,hiz'o,,,,


In [22]:
def masking(x):
  if type(x) == str:
    return 1
  else:
    return 0

stat_cognates = result_df.reset_index()
stat_cognates.drop('index', axis=1, inplace=True)
stat_cognates = stat_cognates.applymap(masking)
stat_cognates = stat_cognates.T.dot(stat_cognates)
stat_cognates

  stat_cognates = stat_cognates.applymap(masking)


DOCULECT,CONCEPT,akhv1239,andi1255,bagv1239,botl1242,cham1309,ghod1238,kara1474,tind1238
DOCULECT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
CONCEPT,32125,8391,6231,7763,12367,6016,5422,5192,7527
akhv1239,8391,8391,1328,2858,2691,2259,2152,2468,2828
andi1255,6231,1328,6231,1353,1831,1148,1282,1145,1283
bagv1239,7763,2858,1353,7763,2987,2769,2419,2139,3549
botl1242,12367,2691,1831,2987,12367,2433,3292,2227,2601
cham1309,6016,2259,1148,2769,2433,6016,2325,1933,2723
ghod1238,5422,2152,1282,2419,3292,2325,5422,2004,2113
kara1474,5192,2468,1145,2139,2227,1933,2004,5192,2007
tind1238,7527,2828,1283,3549,2601,2723,2113,2007,7527


In [23]:
directory = 'output_html'
os.makedirs(directory, exist_ok=True)

stat_cognates = stat_cognates.to_html()
with open('output_html/stat_cognates.html', 'w') as f:
    f.write(stat_cognates)

In [24]:
# добавляем отдельные столбцы для записи CV (и CV без согласных r,l,m,n,w,j,b)

result_df = result_df.reindex(columns = result_df.columns.tolist() + 
                                ['kara_cv', 'kara_rlm','botl_cv', 'botl_rlm', 'akhv_cv', 'akhv_rlm', 
                                 'ghod_cv', 'ghod_rlm', 'andi_cv', 'andi_rlm', 'cham_cv', 'cham_rlm', 
                                 'tind_cv', 'tind_rlm', 'bagv_cv', 'bagv_rlm', 'bor'])
result_df = result_df[['CONCEPT', 'kara1474', 'kara_cv', 'kara_rlm', 'botl1242', 'botl_cv', 'botl_rlm', 
                       'akhv1239', 'akhv_cv', 'akhv_rlm', 'ghod1238', 'ghod_cv', 'ghod_rlm', 
                       'andi1255', 'andi_cv', 'andi_rlm', 'cham1309', 'cham_cv', 'cham_rlm', 
                       'tind1238', 'tind_cv', 'tind_rlm', 'bagv1239', 'bagv_cv', 'bagv_rlm', 'bor']]

# заполняем столбцы заимствования и CV

for index, row in result_df.iterrows():
    for i in range(1, len(row)-3, 3):
        if row[-1] != 1 and not(pd.isna(row[i])) and row[i] in bor_dict[list(result_df.columns)[i]]:
            result_df.loc[index, 'bor'] = 1
        if not(pd.isna(row[i])):
            result_df.loc[index, list(result_df.columns)[i+1]] = letters_to_CV(row[i], v, c, list(result_df.columns)[i])
            result_df.loc[index, list(result_df.columns)[i+2]] = letters_to_CV(row[i], v, c_sans_rlmnwjb, list(result_df.columns)[i])
result_df

  if row[-1] != 1 and not(pd.isna(row[i])) and row[i] in bor_dict[list(result_df.columns)[i]]:
  if not(pd.isna(row[i])):
  result_df.loc[index, list(result_df.columns)[i+1]] = letters_to_CV(row[i], v, c, list(result_df.columns)[i])
  result_df.loc[index, list(result_df.columns)[i+1]] = letters_to_CV(row[i], v, c, list(result_df.columns)[i])
  result_df.loc[index, list(result_df.columns)[i+2]] = letters_to_CV(row[i], v, c_sans_rlmnwjb, list(result_df.columns)[i])
  result_df.loc[index, list(result_df.columns)[i+2]] = letters_to_CV(row[i], v, c_sans_rlmnwjb, list(result_df.columns)[i])
  result_df.loc[index, list(result_df.columns)[i+2]] = letters_to_CV(row[i], v, c_sans_rlmnwjb, list(result_df.columns)[i])
  result_df.loc[index, list(result_df.columns)[i+1]] = letters_to_CV(row[i], v, c, list(result_df.columns)[i])
  result_df.loc[index, list(result_df.columns)[i+2]] = letters_to_CV(row[i], v, c_sans_rlmnwjb, list(result_df.columns)[i])
  result_df.loc[index, list(result_df.columns)[i+

DOCULECT,CONCEPT,kara1474,kara_cv,kara_rlm,botl1242,botl_cv,botl_rlm,akhv1239,akhv_cv,akhv_rlm,...,cham1309,cham_cv,cham_rlm,tind1238,tind_cv,tind_rlm,bagv1239,bagv_cv,bagv_rlm,bor
0,(в соч. с инф. выражает обязательность действи...,bekːʷ'aɬa,CVCVCV,bVCVCV,,,,,,,...,,,,,,,,,,
1,(возглас (петухом)_11.0,,,,,,,,,,...,,,,,,,,,,
2,(возглас вызова)_20.0,,,,,,,,,,...,,,,,,,,,,
3,(возглас недовольствия и недоумения)_29.0,,,,,,,,,,...,,,,,,,,,,
4,(возглас одобрения)_38.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32126,ящичек_233538.0,oʔ'ol,VCVC,VCVl,,,,,,,...,,,,,,,,,,
32127,ящичек_233544.0,,,,,,,,,,...,,,,ʁˌoru,CVCV,CVrV,,,,
32128,ящур (у животных)_233570.0,t'aːlahu,CVCVCV,CVlVCV,,,,,,,...,,,,,,,,,,
32129,ящур_233555.0,jaʃː'ur,CVCVC,jVCVr,j'aʃːur,CVCVC,jVCVr,j'aʃːuri,CVCVCV,jVCVrV,...,,,,,,,,,,1.0


In [25]:
result_df.to_csv('grouped_data.csv', index=False)

### 3.4. Попарное сравнение языков

In [26]:
# преобразуем result_df в фрейм данных со структурой язык1-фонемы1-CV1-язык2-фонемы2-CV2-значение-заимствование

def pairwise_comparison(df):
    data_res = {
        'language1': [],
        'value1': [],
        'value1_cv': [],
        'value1_rlm': [],
        'language2': [],
        'value2': [],
        'value2_cv': [],
        'value2_rlm': [],
        'meaning': [],
        'bor': []
    }
    
    for index, row in df.iterrows():
        for i in range(1, len(row)-3, 3):
            for j in range(i+3, len(row)-3, 3):
                if not(pd.isna(row[i]) or pd.isna(row[j])):
                    data_res['meaning'].append(row[0])
                    data_res['language1'].append(df.columns[i])
                    data_res['value1'].append(row[i])
                    data_res['value1_cv'].append(s_to_syllables(row[i+1]))
                    data_res['value1_rlm'].append(row[i+2])
                    data_res['language2'].append(df.columns[j])
                    data_res['value2'].append(row[j])
                    data_res['value2_cv'].append(s_to_syllables(row[j+1]))
                    data_res['value2_rlm'].append(row[j+2])
                    if row[-1] == 1:
                        data_res['bor'].append(1)
                    else:
                        data_res['bor'].append('')
            
    data_res = pd.DataFrame(data_res)
    return data_res

In [27]:
pairwise_comparison_df = pairwise_comparison(result_df)
pairwise_comparison_df

  if not(pd.isna(row[i]) or pd.isna(row[j])):
  data_res['meaning'].append(row[0])
  data_res['value1'].append(row[i])
  data_res['value1_cv'].append(s_to_syllables(row[i+1]))
  data_res['value1_rlm'].append(row[i+2])
  data_res['value2'].append(row[j])
  data_res['value2_cv'].append(s_to_syllables(row[j+1]))
  data_res['value2_rlm'].append(row[j+2])
  if row[-1] == 1:


Unnamed: 0,language1,value1,value1_cv,value1_rlm,language2,value2,value2_cv,value2_rlm,meaning,bor
0,kara1474,ʕaj,CVC,CVj,andi1255,wej,CVC,wVj,(возглас)_74.0,
1,cham1309,susu,CV-CV,CVCV,bagv1239,sːu,CV,CV,"(возглас, которым науськивают собаку)_192.0",
2,ghod1238,hoːh'a,CV-CV,CVCV,andi1255,woha,CV-CV,wVCV,"(возглас, которым останавливают быков)_200.0",
3,kara1474,q'oːʃːikj'a,CV-CVC-CV,CVCVCjV,botl1242,q'oʃː,CVC,CVC,"(возглас, которым останавливают осла)_246.0",
4,kara1474,q'oːʃːikj'a,CV-CVC-CV,CVCVCjV,akhv1239,q'ː'oʃːo,CV-CV,CVCV,"(возглас, которым останавливают осла)_246.0",
...,...,...,...,...,...,...,...,...,...,...
62142,kara1474,jaʃː'ur,CV-CVC,jVCVr,ghod1238,j'aʃːur,CV-CVC,jVCVr,ящур_233555.0,1
62143,botl1242,j'aʃːur,CV-CVC,jVCVr,akhv1239,j'aʃːuri,CV-CV-CV,jVCVrV,ящур_233555.0,1
62144,botl1242,j'aʃːur,CV-CVC,jVCVr,ghod1238,j'aʃːur,CV-CVC,jVCVr,ящур_233555.0,1
62145,akhv1239,j'aʃːuri,CV-CV-CV,jVCVrV,ghod1238,j'aʃːur,CV-CVC,jVCVr,ящур_233555.0,1


In [28]:
pairwise_comparison_df[50004:50010]

Unnamed: 0,language1,value1,value1_cv,value1_rlm,language2,value2,value2_cv,value2_rlm,meaning,bor
50004,cham1309,bˌetitla,CV-CVC-CV,bVCVClV,bagv1239,bet'etila,CV-CV-CV-CV,bVCVCVlV,сохранять_195807.0,
50005,kara1474,bets'ː'aɬa,CV-CV-CV,bVCVCV,botl1242,b'uts'ːats'ːaɬi,CV-CV-CV-CV,bVCVCVCV,сочиться_195859.0,
50006,kara1474,bets'ː'aɬa,CV-CV-CV,bVCVCV,akhv1239,b'et'iloːrutɬa,CV-CV-CV-CVC-CV,bVCVlVrVCCV,сочиться_195859.0,
50007,kara1474,bets'ː'aɬa,CV-CV-CV,bVCVCV,ghod1238,butsː'i,CV-CV,bVCV,сочиться_195859.0,
50008,kara1474,bets'ː'aɬa,CV-CV-CV,bVCVCV,cham1309,bˌus'ːla,CVC-CV,bVClV,сочиться_195859.0,
50009,kara1474,bets'ː'aɬa,CV-CV-CV,bVCVCV,tind1238,bˌutsːiɬʲa,CV-CV-CV,bVCVCV,сочиться_195859.0,


In [29]:
# ф-ия принимает на вход один согласный и базу данных
# сохраняет статистику по звуковым сочетаниям этого гласного в разных языках

def concrete_cons_statistics(consonant, df):
    # выбираем строки, где хотя бы для одного языка есть звук consonant
    cons_df = df[df[['value1_rlm','value2_rlm']].apply(lambda x: x.str.contains(consonant)).any(axis=1)]
    cons_df = cons_df[['language1', 'value1_rlm', 'language2', 'value2_rlm']]
    cons_df['operations'] = ''

    cons_df = cons_df[cons_df.notna().sum(axis=1) > 1]

    # возможные варианты звукосочетаний: последние два подразумевают, что транскрипция оканчивается согласным consonant
    sequence_list = [f'V{consonant}V', f'V{consonant}C', f'C{consonant}V', f'V{consonant}']

    # сохраняем в cons_dict для каждого языка сколько раз каждое звукосочетание в нем встретилось
    for index, row in cons_df.iterrows():
        l1, l2 = row[1], row[3]
        operations = row[4]
        operations_list = []
        for i, seq in enumerate(sequence_list):
            if i < 3:
                if seq in l1 and not(seq in l2):
                    if l1.endswith(f'V{consonant}V'):
                        break
                    if i == 0 and not('VV' in l2):
                        operations_list.append(f'{seq} // V')
                    else:
                        operations_list.append(f'{seq} // {seq[0] + seq[-1]}')
                    l1 = l1[l1.find(consonant):]
                elif seq in l2 and not(seq in l1):
                    if l2.endswith(f'V{consonant}V'):
                        break
                    if i == 0 and not('VV' in l1):
                        operations_list.append(f'V // {seq}')
                    else:
                        operations_list.append(f'{seq[0] + seq[-1]} // {seq}')
                    l2 = l2[l2.find(consonant):]
            else:
                if l1.endswith(seq.format(consonant)) and not(l2.endswith(seq.format(consonant))):
                    if l2.endswith(seq.format(consonant)+'V'):
                        operations_list.append(f'ends with {seq} // {seq}V')
                    else:
                        operations_list.append(f'ends with {seq} // {seq[0]}')
                elif l2.endswith(seq.format(consonant)) and not(l1.endswith(seq.format(consonant))):
                    if l1.endswith(seq.format(consonant)+'V'):
                        operations_list.append(f'ends with {seq}V // {seq}')
                    else:
                        operations_list.append(f'ends with {seq[0]} // {seq}')
        if operations_list:
            cons_df.at[index, 'operations'] = operations_list[0]
            for op in operations_list[1:]:
                new_row = row.copy()
                new_row['operations'] = op
                cons_df = pd.concat([cons_df.loc[:index], new_row.to_frame().T, cons_df.loc[index:]]).reset_index(drop=True)
    return cons_df

In [None]:
directory = 'output_html/sounds/'
os.makedirs(directory, exist_ok=True)

consonants_in_question = ['r', 'l', 'm', 'n', 'w', 'j', 'b']

for consonant in consonants_in_question:
    cons_df = concrete_cons_statistics(consonant, pairwise_comparison_df)
    cons_df = cons_df[cons_df['operations'] != '']
    cons_stats = cons_df.groupby(['language1', 'language2'])['operations'].value_counts().unstack(fill_value=0)
    to_html = cons_stats.to_html()
    with open(f'output_html/sounds/sound_{consonant}.html', 'w') as f:
        f.write(to_html)

  l1, l2 = row[1], row[3]
  operations = row[4]
  l1, l2 = row[1], row[3]
  operations = row[4]
  l1, l2 = row[1], row[3]
  operations = row[4]
  l1, l2 = row[1], row[3]
  operations = row[4]
  l1, l2 = row[1], row[3]
  operations = row[4]
  l1, l2 = row[1], row[3]
  operations = row[4]
  l1, l2 = row[1], row[3]
  operations = row[4]
  l1, l2 = row[1], row[3]
  operations = row[4]
  l1, l2 = row[1], row[3]
  operations = row[4]
  l1, l2 = row[1], row[3]
  operations = row[4]
  l1, l2 = row[1], row[3]
  operations = row[4]
  l1, l2 = row[1], row[3]
  operations = row[4]
  l1, l2 = row[1], row[3]
  operations = row[4]
  l1, l2 = row[1], row[3]
  operations = row[4]
  l1, l2 = row[1], row[3]
  operations = row[4]
  l1, l2 = row[1], row[3]
  operations = row[4]
  l1, l2 = row[1], row[3]
  operations = row[4]
  l1, l2 = row[1], row[3]
  operations = row[4]
  l1, l2 = row[1], row[3]
  operations = row[4]
  l1, l2 = row[1], row[3]
  operations = row[4]
  l1, l2 = row[1], row[3]
  operations =

In [None]:
# далее мы будем искать с помощью расстояния левенштейна характерные изменения для пар языков
# для использования ф-ии определения расстояния левенштейна для двух cv-ccv представлений, заменим слоги на буквы

syl_dict = {}

for index, row in pairwise_comparison_df[['value1_cv', 'value2_cv']].iterrows():
    for i in range(2):
        for el in row[i].split('-'):
            if el in syl_dict:
                syl_dict[el] += 1
            else:
                syl_dict[el] = 1
dict(sorted(syl_dict.items(), key=lambda x: x[1], reverse=True))

In [None]:
s2l = {'CV': 'a',
       'CVC': 'b',
       'V': 'c',
       'CCV': 'd',
       'VC': 'e',
       'CCVC': 'f',
       'CVCC': 'g',
       'CCCVC': 'h',
       'CCVCC': 'i',
       'VCC': 'j',
       'CCCV': 'k',
       'CVCCC': 'l',
       'CCCVCC': 'm',
       'CCCCVCCC': 'n',
       'VCCC': 'o',
       'CCVCCC': 'p',
       'CCCCCC': 'q',
       'CC': 'r'
      }

def syl_to_let(s, s2l):
    res = ''
    for el in s.split('-'):
        res += s2l[el]
    return res

def let_to_syl(s, s2l):
    res = next(key for key, value in s2l.items() if value == s)
    return res

In [None]:
def levenstein(str1, str2, flag):
    len_str1 = len(str1)
    len_str2 = len(str2)

    end_change, beg_change = '', ''
    
    matrix = [[0 for _ in range(len_str2 + 1)] for _ in range(len_str1 + 1)]
    
    for i in range(len_str1 + 1):
        matrix[i][0] = i
    for j in range(len_str2 + 1):
        matrix[0][j] = j
    
    for i in range(1, len_str1 + 1):
        for j in range(1, len_str2 + 1):
            cost = 0 if str1[i - 1] == str2[j - 1] else 1
            matrix[i][j] = min(matrix[i-1][j] + 1, matrix[i][j-1] + 1, matrix[i-1][j-1] + cost)
    
    operations = []
    i, j = len_str1, len_str2
    while i > 0 or j > 0:
        if i > 0 and matrix[i][j] == matrix[i-1][j] + 1:
            # Если флаг true, меняем обратно буквы на слоги
            if flag:
                operations.append(f"Delete {let_to_syl(str1[i-1], s2l)} from position {i-1}")
            else:
                operations.append(f"Delete '{str1[i-1]}' from position {i-1}")
            i -= 1
        elif j > 0 and matrix[i][j] == matrix[i][j-1] + 1:
            if flag:
                operations.append(f"Insert {let_to_syl(str2[j-1], s2l)} at position {i}")
            else:
                operations.append(f"Insert '{str2[j-1]}' at position {i}")
            j -= 1
        else:
            if matrix[i][j] != matrix[i-1][j-1]:
                if flag:
                    operations.append(f"Replace {let_to_syl(str1[i-1], s2l)} with {let_to_syl(str2[j-1], s2l)} at position {i-1}")
                else:
                    operations.append(f"Replace '{str1[i-1]}' with '{str2[j-1]}' at position {i-1}")
            i -= 1
            j -= 1
    
    operations.reverse()
    if operations != [] and 'position 0' in operations[0]:
        beg_change = 1
    if operations != [] and (f'position {len_str1 - 1}' in operations[-1] or f'position {len_str2 - 1}' in operations[-1]):
        end_change = 1

    return matrix[len_str1][len_str2], operations, end_change, beg_change

In [None]:
pairwise_comparison_df['cv_lev'] = pairwise_comparison_df.apply(lambda row: levenstein(syl_to_let(row['value1_cv'], s2l), syl_to_let(row['value2_cv'], s2l), False)[0], axis=1)
pairwise_comparison_df['cv_operations'] = pairwise_comparison_df.apply(lambda row: '\n'.join(levenstein(syl_to_let(row['value1_cv'], s2l), syl_to_let(row['value2_cv'], s2l), True)[1]), axis=1)
pairwise_comparison_df['end_change'] = pairwise_comparison_df.apply(lambda row: levenstein(syl_to_let(row['value1_cv'], s2l), syl_to_let(row['value2_cv'], s2l), True)[2], axis=1)
pairwise_comparison_df['beg_change'] = pairwise_comparison_df.apply(lambda row: levenstein(syl_to_let(row['value1_cv'], s2l), syl_to_let(row['value2_cv'], s2l), True)[3], axis=1)

# pairwise_comparison_df['lemma_lev'] = pairwise_comparison_df.apply(lambda row: levenstein(row['value1'], row['value2'], False)[0], axis=1)
# pairwise_comparison_df['lemma_operations'] = pairwise_comparison_df.apply(lambda row: '\n'.join(levenstein(row['value1'], row['value2'], False)[1]), axis=1)


In [None]:
pairwise_comparison_df

In [None]:
pairwise_comparison_df.to_csv('pairwise_comparison.csv', index=False)

In [None]:
end_change_df = pairwise_comparison_df[(pairwise_comparison_df['cv_lev'] == 1) & (pairwise_comparison_df['bor'] == '') & (pairwise_comparison_df['end_change'] == 1)]
end_change_df['cv_operations'] = end_change_df['cv_operations'].apply(lambda x: x.split('at')[0])
end_change_df['cv_operations'] = end_change_df['cv_operations'].apply(lambda x: x.split('from')[0])

beg_change_df = pairwise_comparison_df[(pairwise_comparison_df['cv_lev'] == 1) &(pairwise_comparison_df['bor'] == '') & (pairwise_comparison_df['beg_change'] == 1)]
beg_change_df['cv_operations'] = beg_change_df['cv_operations'].apply(lambda x: x.split('at')[0])
beg_change_df['cv_operations'] = beg_change_df['cv_operations'].apply(lambda x: x.split('from')[0])


In [None]:
end_stats = end_change_df.groupby(['language1', 'language2'])['cv_operations'].value_counts().unstack(fill_value=0)
end_column_totals = end_stats.sum()
end_stats_sorted = end_stats[end_column_totals.sort_values(ascending=False).index]
end_stats_sorted

In [None]:
directory = 'output_html/pairwise_data/'
os.makedirs(directory, exist_ok=True)

html_end = end_stats_sorted.to_html()
with open('output_html/pairwise_data/end_stats.html', 'w') as f:
    f.write(html_end)

In [None]:
beg_stats = beg_change_df.groupby(['language1', 'language2'])['cv_operations'].value_counts().unstack(fill_value=0)
beg_column_totals = beg_stats.sum()
beg_stats_sorted = beg_stats[beg_column_totals.sort_values(ascending=False).index]
beg_stats_sorted

In [None]:
html_beg = beg_stats_sorted.to_html()
with open('output_html/pairwise_data/beg_stats.html', 'w') as f:
    f.write(html_beg)

In [None]:
end_stats_df = end_change_df.groupby(['language1', 'language2']).apply(lambda x: x.sort_values(by='cv_operations'))[['cv_operations', 'value1', 'value1_cv', 'value2', 'value2_cv']].reset_index()
end_stats_df.to_csv('end_change.csv', index=False)

beg_stats_df = beg_change_df.groupby(['language1', 'language2']).apply(lambda x: x.sort_values(by='cv_operations'))[['cv_operations', 'value1', 'value1_cv', 'value2', 'value2_cv']].reset_index()
beg_stats_df.to_csv('beg_change.csv', index=False)

In [None]:
!jupyter nbconvert --to html data_analyse.ipynb