In [1]:
# set working directory
setwd("C:/Users/stein/Dropbox/Studium/7. Semester/BA-Thesis/BA-Thesis_NorthEuraLex")

# load tidyverse library for better dataframes
library(tidyverse)

-- [1mAttaching packages[22m ------------------------------------------------------------------------------- tidyverse 1.3.1 --

[32mv[39m [34mggplot2[39m 3.3.5     [32mv[39m [34mpurrr  [39m 0.3.4
[32mv[39m [34mtibble [39m 3.1.5     [32mv[39m [34mdplyr  [39m 1.0.7
[32mv[39m [34mtidyr  [39m 1.1.4     [32mv[39m [34mstringr[39m 1.4.0
[32mv[39m [34mreadr  [39m 2.0.2     [32mv[39m [34mforcats[39m 0.5.1

-- [1mConflicts[22m ---------------------------------------------------------------------------------- tidyverse_conflicts() --
[31mx[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31mx[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()



If necessary, download the necessary TSV-files from the NorthEuraLex website. Then read in the data.

In [2]:
conceptdata_loc = "northeuralex-0.9-forms.tsv"
geodata_loc = "northeuralex-0.9-language-data.tsv"

if(!file.exists(conceptdata_loc)) {
  download.file(
    "http://www.sfs.uni-tuebingen.de/~jdellert/northeuralex/0.9/northeuralex-0.9-forms.tsv",
    dest = conceptdata_loc
  )
}

if(!file.exists(geodata_loc)) {
  download.file(
    "http://www.sfs.uni-tuebingen.de/~jdellert/northeuralex/0.9/northeuralex-0.9-language-data.tsv",
    dest = geodata_loc
  )
}

raw_conceptdata = read_tsv("northeuralex-0.9-forms.tsv", show_col_types = FALSE)
raw_geodata = read_tsv("northeuralex-0.9-language-data.tsv", show_col_types = FALSE)

head(raw_conceptdata)
head(raw_geodata)

Language_ID,Glottocode,Concept_ID,Word_Form,rawIPA,IPA,ASJP,List,Dolgo,Next_Step
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
fin,finn1318,Auge::N,silmä,silmæ,s i l m æ,silmE,SILME,SVRMV,validate
fin,finn1318,Ohr::N,korva,k<U+0254>r<U+028B><U+0251>,k <U+0254> r <U+028B> <U+0251>,korwa,KURWA,KVRWV,validate
fin,finn1318,Nase::N,nenä,n<U+025B>næ,n <U+025B> n æ,nEnE,NENE,NVNV,validate
fin,finn1318,Mund::N,suu,su<U+02D0>,s u u,su,SY,SV,validate
fin,finn1318,Zahn::N,hammas,h<U+0251>m<U+02D0><U+0251>s,h <U+0251> m m <U+0251> s,hamas,HAMAS,HVMVS,validate
fin,finn1318,Zunge::N,kieli,ki<U+025B><U+032F>li,k i <U+025B> l i,kiEli,KIELI,KVRV,validate


name,glotto_code,iso_code,family,subfamily,latitude,longitude
<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
Finnish,finn1318,fin,Uralic,Finnic,61.0,24.45
North Karelian,kare1335,krl,Uralic,Finnic,65.1691,30.8655
Olonets Karelian,livv1243,olo,Uralic,Finnic,61.0,33.0
Veps,veps1250,vep,Uralic,Finnic,60.3353,34.7865
Estonian,esto1258,ekk,Uralic,Finnic,59.25,24.75
Livonian,livv1244,liv,Uralic,Finnic,57.566,22.0262


Filter for rows which have "validate" in their _Next_Step_ column, thereby excluding rows that still need to be reviewed.

In [3]:
conceptdata = raw_conceptdata %>% filter(Next_Step == "validate")

In [4]:
conceptdata = conceptdata %>% select(Language_ID, Concept_ID, Word_Form, rawIPA, IPA, ASJP, Next_Step)
head(conceptdata)

Language_ID,Concept_ID,Word_Form,rawIPA,IPA,ASJP,Next_Step
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
fin,Auge::N,silmä,silmæ,s i l m æ,silmE,validate
fin,Ohr::N,korva,k<U+0254>r<U+028B><U+0251>,k <U+0254> r <U+028B> <U+0251>,korwa,validate
fin,Nase::N,nenä,n<U+025B>næ,n <U+025B> n æ,nEnE,validate
fin,Mund::N,suu,su<U+02D0>,s u u,su,validate
fin,Zahn::N,hammas,h<U+0251>m<U+02D0><U+0251>s,h <U+0251> m m <U+0251> s,hamas,validate
fin,Zunge::N,kieli,ki<U+025B><U+032F>li,k i <U+025B> l i,kiEli,validate


In [5]:
# Unicode characters are not correctly displayed in the tibble, but are internally correct
conceptdata$rawIPA[2]
conceptdata$rawIPA[3]
conceptdata$rawIPA[4]
conceptdata$rawIPA[5]
conceptdata$rawIPA[6]

Only select the three relevant columns.

In [7]:
conceptdata = conceptdata %>% select(Language_ID, Concept_ID, ASJP)
head(conceptdata)

Language_ID,Concept_ID,ASJP
<chr>,<chr>,<chr>
fin,Auge::N,silmE
fin,Ohr::N,korwa
fin,Nase::N,nEnE
fin,Mund::N,su
fin,Zahn::N,hamas
fin,Zunge::N,kiEli


Make a vector with the ASJP word list and filter the data to only keep rows with one of the 40 ASJP concepts. 

In [8]:
asjp_concepts = c("Auge::N", "Ohr::N", "Nase::N", "Zahn::N", "Zunge::N",
                  "Busen::N", "Hand::N", "Knie::N", "Haut::N", "Blut::N", 
                  "Knochen::N", "Leber::N", "Sonne::N", "Stern::N", "Wasser::N",
                  "Stein::N", "Feuer::N", "Berg::N", "Baum::N", "Blatt::N",
                  "Horn::N", "Hund::N", "Fisch::N", "Laus::N", "Mensch::N", 
                  "Name::N", "Pfad::N", "Nacht::N", "voll::A", "neu::A", 
                  "ich::PRN", "du::PRN", "wir::PRN", "eins::NUM", "zwei::NUM",
                  "trinken::V", "sterben::V", "kommen::V", "sehen::V", 
                  "hören::V")

conceptdata = filter(conceptdata, Concept_ID %in% asjp_concepts)
head(conceptdata)

Language_ID,Concept_ID,ASJP
<chr>,<chr>,<chr>
fin,Auge::N,silmE
fin,Ohr::N,korwa
fin,Nase::N,nEnE
fin,Zahn::N,hamas
fin,Zunge::N,kiEli
fin,Busen::N,rinta


Get the iso codes of all the languages.

In [11]:
languages = raw_geodata %>% select(iso_code)
head(languages)

iso_code
<chr>
fin
krl
olo
vep
ekk
liv


For each language, compile a list of the concepts and the corresponding word(s) for that concept.

In [12]:
language_params = list()
for (i in 1:nrow(languages)) {
  lang = 
    conceptdata %>%
    filter(Language_ID == languages$iso_code[i]) %>%
    select(Concept_ID, ASJP)
  language_params[[i]] = lang
}

To calculate the affine gap scores in the next step, the PMI scores between the ASJP sound classes are needed. If necessary, download the file with the PMI scores. Then read them in.

In [49]:
PMI_data_loc = "pnas.1500331112.sd04.csv"


if(!file.exists(PMI_data_loc)) {
  download.file(
    "http://www.pnas.org/lookup/suppl/doi:10.1073/pnas.1500331112/-/DCSupplemental/pnas.1500331112.sd04.csv",
    dest = PMI_data_loc
  )
}


PMI_scores = read.table("pnas.1500331112.sd04.csv", sep = ",", check.names=FALSE)
head(PMI_scores)

Unnamed: 0_level_0,!,3,4,5,7,8,C,E,G,L,...,q,r,s,t,u,v,w,x,y,z
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
!,5.122192,-5.366628,-0.6660987,-3.8046271,-4.727376,-2.5387277,-4.173302,-4.8432118,-0.2110367,-3.087661,...,-3.633199,-5.777114,-5.4221885,-6.020425375,-6.4771753,-3.733401,-5.1084969,-4.0021361,-5.133982,-3.478218
3,-5.3666279,2.321213,-4.067345,-4.9032883,-7.435475,-5.2468267,-6.881401,-0.5782361,-3.6122829,-6.488907,...,-5.088535,-8.485213,-7.0316753,-8.728524451,-0.6736969,-6.4415,-8.5097432,-7.4033824,-5.167933,-5.780852
4,-0.6660987,-4.067345,5.2679133,0.8958533,-3.428093,-0.1408324,-2.874019,-3.5439288,1.0882463,-1.788378,...,-2.333916,-4.477831,-4.1229056,-4.721142391,-5.1778923,-2.434118,-3.1160668,-2.7028531,-3.834699,-2.178935
5,-3.8046271,-4.903288,0.8958533,3.9886589,-4.264036,-4.3779731,-3.37349,-6.6824572,-1.3571349,-2.624321,...,-5.472445,-4.182373,-7.261434,-6.761058514,-8.3164207,-5.572647,-6.2545952,-5.8413815,-1.054334,-2.544875
7,-4.7273756,-7.435475,-3.4280926,-4.264036,3.702041,-4.2021093,-4.855854,-5.6592956,-2.9730306,-2.111985,...,2.268349,-2.326502,-1.3543887,-0.653834114,-9.2391692,-5.396783,-0.1478133,-0.3572501,-1.950556,-4.448453
8,-2.5387277,-5.246827,-0.1408324,-4.3779731,-4.202109,4.2922055,-2.348752,-4.7234106,-0.7843826,1.221795,...,-3.513398,-0.78594,0.6456476,-0.002097624,-7.0505213,-3.208135,-5.6818429,-3.8823349,1.012892,1.701008


Next, a function to calculate the affine gap score between two words is needed.

The following R code has been adapted from the _affine_ function in the python **py_stringmatching** package; https://github.com/anhaidgroup/py_stringmatching/blob/master/py_stringmatching/similarity_measure/cython/cython_affine.pyx). 

The _gap_start_ and _gap_continuation_ values are taken from the paper _Phylogenetic Inference from Word Lists Using Weighted Alignment with Empirically Determined Weights_ by Gerhard Jäger (https://brill.com/view/journals/ldc/3/2/article-p245_4.xml?language=en)

In [54]:
affine <- function(word1, word2, gap_start = -2.4930, gap_continuation = -1.7057) {
  
  length1 = nchar(word1)
  length2 = nchar(word2)
  
  m = matrix(0, length1 + 1, length2 + 1)
  x = matrix(0, length1 + 1, length2 + 1)
  y = matrix(0, length1 + 1, length2 + 1)
  
  for (i in 2:(length1 + 1)){
    m[i, 1] = -Inf
    x[i, 1] = gap_start + (i - 1) * gap_continuation
    y[i, 1] = -Inf
  } 
  
  for (j in 2:(length2 + 1)){
    m[1, j] = -Inf
    x[1, j] = -Inf
    y[1, j] = gap_start + (j - 1) * gap_continuation
  }
  
  for (i in 2:(length1 + 1)){
    for (j in 2:(length2 + 1)){
      m[i,j] = PMI_scores[substr(word1, i - 1, i - 1), substr(word2, j - 1, j - 1)] +
        max(m[i - 1, j - 1], x[i - 1, j - 1], y[i - 1, j - 1])
      
      x[i, j] = max(gap_start + m[i - 1, j], gap_continuation + x[i - 1, j])
      
      y[i, j] = max(gap_start + m[i, j - 1], gap_continuation + y[i, j - 1])      
    }
  }
  
  max(m[length1 + 1, length2 + 1], x[length1 + 1, length2 + 1], y[length1 + 1, length2 + 1])
}

The following two examples are taken from the paper _Support for linguistic macrofamilies from weighted sequence alignment_, also by Gerhard Jäger (https://www.pnas.org/doi/epdf/10.1073/pnas.1500331112). The affine gap results are identical to the results in the paper, therefore the implementation of the algorithm seems to be correct.

In [53]:
affine("hEnd", "hant")
affine("mano", "hant")

Before the distances between the languages are calculated, a matrix to store them is created.

In [57]:
language_distances = 
  matrix(0, nrow(languages), nrow(languages))

Now it's time to iterate through all language pairs and store the resulting language distances in the newly created matrix.

In [64]:
for (i in 1:(nrow(languages) - 1)) {
  # print to see the program is actually doing something, because the loop takes a while
  print(i)
  for (j in (i + 1):nrow(languages)) {
    # creates a tibble comparing two languages
    join = inner_join(language_params[[i]], language_params[[j]], 
                      by=c("Concept_ID" = "Concept_ID"))
    lang_distance = 0
    k = 1
    n_concepts = 40
    remove_concept = TRUE
    for (concept in asjp_concepts) {
      n_words = 0
      concept_distance = 0
      while ((join$Concept_ID[k] == concept) && k != nrow(join) + 1) {
        remove_concept = FALSE
        n_words = n_words + 1
        concept_distance = 
          concept_distance + 
          (affine(join$ASJP.x[k], join$ASJP.y[k]) / 
          max(nchar(join$ASJP.x[k]), nchar(join$ASJP.y[k]))) 
        k = k + 1
      }
      if (remove_concept) {n_concepts = n_concepts - 1} 
      else {
        remove_concept = TRUE
        lang_distance = lang_distance + (concept_distance / n_words)
      }
      counter = 0
      concept_distance = 0
    }
    language_distances[i, j] = (lang_distance / n_concepts) 
    language_distances[j, i] = (lang_distance / n_concepts)
  }
}

[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
[1] 11
[1] 12
[1] 13
[1] 14
[1] 15
[1] 16
[1] 17
[1] 18
[1] 19
[1] 20
[1] 21
[1] 22
[1] 23
[1] 24
[1] 25
[1] 26
[1] 27
[1] 28
[1] 29
[1] 30
[1] 31
[1] 32
[1] 33
[1] 34
[1] 35
[1] 36
[1] 37
[1] 38
[1] 39
[1] 40
[1] 41
[1] 42
[1] 43
[1] 44
[1] 45
[1] 46
[1] 47
[1] 48
[1] 49
[1] 50
[1] 51
[1] 52
[1] 53
[1] 54
[1] 55
[1] 56
[1] 57
[1] 58
[1] 59
[1] 60
[1] 61
[1] 62
[1] 63
[1] 64
[1] 65
[1] 66
[1] 67
[1] 68
[1] 69
[1] 70
[1] 71
[1] 72
[1] 73
[1] 74
[1] 75
[1] 76
[1] 77
[1] 78
[1] 79
[1] 80
[1] 81
[1] 82
[1] 83
[1] 84
[1] 85
[1] 86
[1] 87
[1] 88
[1] 89
[1] 90
[1] 91
[1] 92
[1] 93
[1] 94
[1] 95
[1] 96
[1] 97
[1] 98
[1] 99
[1] 100
[1] 101
[1] 102
[1] 103
[1] 104
[1] 105
[1] 106


In [66]:
head(language_distances)

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
0.0,2.099054,1.714937,0.9608125,1.2088556,0.8725801,-0.5628791,-0.8307503,-0.7748954,-0.7105577,...,-1.98263,-1.838136,-1.667545,-1.757684,-1.698878,-1.72336,-1.818263,-1.83791,-1.98541,-1.619638
2.0990544,0.0,1.874872,1.0772252,1.4141897,0.898104,-0.4127351,-0.7501226,-0.7174914,-0.6432128,...,-2.001874,-1.867331,-1.774955,-1.880128,-1.880817,-1.803597,-1.875059,-1.869994,-1.972895,-1.781808
1.7149375,1.874872,0.0,1.3261087,1.1631011,0.862999,-0.5537836,-0.9012411,-0.7598275,-0.7031039,...,-2.090001,-1.795411,-1.707174,-1.820721,-1.749955,-1.768547,-1.881067,-1.781351,-1.988927,-1.652222
0.9608125,1.077225,1.326109,0.0,1.3566684,0.8726903,-0.7409957,-1.0469739,-0.9514742,-0.8599191,...,-2.084979,-1.902554,-1.827839,-2.077658,-1.826783,-2.071801,-1.922537,-1.884865,-1.930468,-1.972818
1.2088556,1.41419,1.163101,1.3566684,0.0,0.7272888,-0.7118341,-0.939919,-0.8817747,-0.8087854,...,-2.152645,-1.945738,-1.927458,-2.059355,-1.794128,-2.151873,-1.869549,-1.83196,-1.910834,-1.927941
0.8725801,0.898104,0.862999,0.8726903,0.7272888,0.0,-0.6704208,-0.8152105,-0.8730572,-0.7543655,...,-2.113595,-1.942571,-1.926469,-1.99941,-1.966429,-2.093235,-1.865565,-1.978354,-2.055234,-2.036301
