In [1]:
import lang2vec.lang2vec as l2v
import pandas as pd

In [2]:
############################################
##### FOR SYNTACTICAL AND PHONOLOGICAL #####
############################################

def get_similarity(kind, langs_list):
    sim_dict = {}
    for L1 in langs_list:
        feat_1 = l2v.get_features(L1, kind)[L1]
        for L2 in langs_list[langs_list.index(L1) : ]:
            feat_2 = l2v.get_features(L2, kind)[L2]
            assert len(feat_1) == len(feat_2)
            U = 0
            I = 0
            for i in range(len(feat_1)):
                if ((feat_1[i] != '--') or (feat_2[i] != '--')):
                    if (feat_1[i] == feat_2[i] == 1):
                        I = I+1
                        U = U+1
                    elif ((feat_1[i] == 1 and feat_2[i] == 0) or (feat_1[i] == 0 and feat_2[i] == 1)):
                        U = U+1
                    else:
                        continue
            score = I / U
            sim_dict[(L1,L2)] = score
    return sim_dict


In [3]:
langs = ['ara', 'ben', 'eng', 'fin', 'ind', 'kor', 'rus', 'swh', 'tel']

syn_sim = get_similarity('syntax_wals', langs)
phon_sim = get_similarity('phonology_wals', langs)

In [4]:
wals = pd.read_csv('language.csv')

langs_morph = ['arb', 'ben', 'eng', 'fin', 'ind', 'kor', 'rus', 'swh', 'tel']
wals_subset = wals.loc[wals['iso_code'].isin(langs_morph)]
wals_subset.reset_index(inplace=True, drop=True)

# 1st row of features is morphology
# 2nd and 3rd row of features is nominal categories
wals_morph = wals_subset.filter(regex='iso_code|^20A|^21A|^21B|^22A|^23A|^24A|^25A|^25B|^26A|^27A|^28A|^29A|\
                   ^30A|31A|^32A|^33A|^34A|^35A|^36A|^37A|^38A|^39A|^39B|^40A|^41A|^42A|\
                   ^43A|^44A|^45A|^46A|^47A|^48A|^49A|^50A|^51A|^52A|^53A|^54A|^55A|^56A|^57A')

wals_morph.fillna('000', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wals_morph.fillna('000', inplace=True)


In [5]:
#############################
##### FOR MORPHOLOGICAL #####
#############################

morph_sim= {}

for L1 in langs_morph:
    feat_1 = wals_morph.loc[wals_morph['iso_code']==L1].values.tolist()[0][1:]
    for L2 in langs_morph[langs_morph.index(L1) : ]:
        feat_2 = wals_morph.loc[wals_morph['iso_code']==L2].values.tolist()[0][1:]
        assert len(feat_1) == len(feat_2)
        I = 0
        U = 0
        for i in range(len(feat_1)):
            if feat_1[i] !='000' or feat_2[i] !='000':
                if feat_1[i] == feat_2[i]:
                    I = I+1
                    U = U+1
                else:
                    U = U+1
        score = I / U
        morph_sim[(L1,L2)] = score

In [6]:
syn_sim

{('ara', 'ara'): 1.0,
 ('ara', 'ben'): 0.2962962962962963,
 ('ara', 'eng'): 0.42105263157894735,
 ('ara', 'fin'): 0.43859649122807015,
 ('ara', 'ind'): 0.47058823529411764,
 ('ara', 'kor'): 0.2711864406779661,
 ('ara', 'rus'): 0.48148148148148145,
 ('ara', 'swh'): 0.5581395348837209,
 ('ara', 'tel'): 0.2682926829268293,
 ('ben', 'ben'): 1.0,
 ('ben', 'eng'): 0.2631578947368421,
 ('ben', 'fin'): 0.3684210526315789,
 ('ben', 'ind'): 0.2631578947368421,
 ('ben', 'kor'): 0.6470588235294118,
 ('ben', 'rus'): 0.35,
 ('ben', 'swh'): 0.25,
 ('ben', 'tel'): 0.46153846153846156,
 ('eng', 'eng'): 1.0,
 ('eng', 'fin'): 0.5952380952380952,
 ('eng', 'ind'): 0.5641025641025641,
 ('eng', 'kor'): 0.4523809523809524,
 ('eng', 'rus'): 0.7027027027027027,
 ('eng', 'swh'): 0.26666666666666666,
 ('eng', 'tel'): 0.3333333333333333,
 ('fin', 'fin'): 1.0,
 ('fin', 'ind'): 0.4878048780487805,
 ('fin', 'kor'): 0.5365853658536586,
 ('fin', 'rus'): 0.7297297297297297,
 ('fin', 'swh'): 0.26666666666666666,
 ('fin',

In [7]:
phon_sim

{('ara', 'ara'): 1.0,
 ('ara', 'ben'): 0.6153846153846154,
 ('ara', 'eng'): 0.6,
 ('ara', 'fin'): 0.6,
 ('ara', 'ind'): 0.6,
 ('ara', 'kor'): 0.4,
 ('ara', 'rus'): 0.6923076923076923,
 ('ara', 'swh'): 0.5,
 ('ara', 'tel'): 0.6153846153846154,
 ('ben', 'ben'): 1.0,
 ('ben', 'eng'): 0.8,
 ('ben', 'fin'): 0.8,
 ('ben', 'ind'): 0.8888888888888888,
 ('ben', 'kor'): 0.6666666666666666,
 ('ben', 'rus'): 0.8888888888888888,
 ('ben', 'swh'): 0.7,
 ('ben', 'tel'): 1.0,
 ('eng', 'eng'): 1.0,
 ('eng', 'fin'): 0.8333333333333334,
 ('eng', 'ind'): 0.8333333333333334,
 ('eng', 'kor'): 0.5833333333333334,
 ('eng', 'rus'): 0.8181818181818182,
 ('eng', 'swh'): 0.8333333333333334,
 ('eng', 'tel'): 0.8,
 ('fin', 'fin'): 1.0,
 ('fin', 'ind'): 0.8333333333333334,
 ('fin', 'kor'): 0.5833333333333334,
 ('fin', 'rus'): 0.8181818181818182,
 ('fin', 'swh'): 0.6923076923076923,
 ('fin', 'tel'): 0.8,
 ('ind', 'ind'): 1.0,
 ('ind', 'kor'): 0.5833333333333334,
 ('ind', 'rus'): 0.8181818181818182,
 ('ind', 'swh'): 0.

In [8]:
morph_sim

{('arb', 'arb'): 1.0,
 ('arb', 'ben'): 0.1111111111111111,
 ('arb', 'eng'): 0.02564102564102564,
 ('arb', 'fin'): 0.07692307692307693,
 ('arb', 'ind'): 0.02564102564102564,
 ('arb', 'kor'): 0.05714285714285714,
 ('arb', 'rus'): 0.05263157894736842,
 ('arb', 'swh'): 0.02702702702702703,
 ('arb', 'tel'): 0.2,
 ('ben', 'ben'): 1.0,
 ('ben', 'eng'): 0.02564102564102564,
 ('ben', 'fin'): 0.05128205128205128,
 ('ben', 'ind'): 0.02564102564102564,
 ('ben', 'kor'): 0.08823529411764706,
 ('ben', 'rus'): 0.02631578947368421,
 ('ben', 'swh'): 0.02702702702702703,
 ('ben', 'tel'): 0.16666666666666666,
 ('eng', 'eng'): 1.0,
 ('eng', 'fin'): 0.5128205128205128,
 ('eng', 'ind'): 0.3333333333333333,
 ('eng', 'kor'): 0.3333333333333333,
 ('eng', 'rus'): 0.5384615384615384,
 ('eng', 'swh'): 0.4358974358974359,
 ('eng', 'tel'): 0.1282051282051282,
 ('fin', 'fin'): 1.0,
 ('fin', 'ind'): 0.3076923076923077,
 ('fin', 'kor'): 0.3333333333333333,
 ('fin', 'rus'): 0.5897435897435898,
 ('fin', 'swh'): 0.3846153