In [4]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas
import numpy

In [5]:
def vocabulary_size(data, column="Cognateset_ID"):
    """Return how many distinct entries one column of `data` contains.

    Parameters
    ----------
    data:
        Any object that can be indexed with `column` and yields an
        iterable of hashable values (for example a pandas DataFrame).
    column:
        Key of the column whose distinct values are counted.

    Returns
    -------
    int
        The number of different values found in ``data[column]``.
    """
    distinct_entries = set()
    distinct_entries.update(data[column])
    return len(distinct_entries)

In [6]:
from simuling.analysis import semantic_width, synonymity, properties, default_properties

In [None]:
# Load the CLICS word list: prefer the local copy and fall back to downloading
# it from the clics-data GitHub repository when the file is not present.
try:
    clics_words = pandas.read_csv("original_data/words.csv")
except FileNotFoundError:
    import urllib.request
    # Use the response as a context manager so the HTTP connection is closed
    # as soon as pandas has finished reading it.
    with urllib.request.urlopen(
            "https://github.com/clics/clics-data/blob/master/data/words.csv?raw=true") as response:
        clics_words = pandas.read_csv(response)

# Give every CLICS entry the same weight.
# NOTE(review): presumably semantic_width reads this "Weight" column, as the
# manual width computation further down the notebook does -- confirm.
clics_words["Weight"] = 1

# One value per CLICS language for each of the three statistics.
clics_vocabulary = []
clics_semantic_width = []
clics_synonymity = []
for language, words in clics_words.groupby("LanguageId"):
    print(language)
    clics_vocabulary.append(vocabulary_size(words, "ClicsValue"))
    clics_semantic_width.append(semantic_width(words, "ClicsValue"))
    # Synonymity is obtained by applying semantic_width to the concept column
    # instead of the form column (NOTE(review): presumably intentional, since
    # `synonymity` is not given a column here -- confirm against simuling).
    clics_synonymity.append(semantic_width(words, "ConcepticonGloss"))

  interactivity=interactivity, compiler=compiler, result=result)


abkh1244
acha1250
ache1246
adyg1241
aghu1253
agua1253
aguf1237
aheu1239
ainu1240
akhv1239
alba1267
aleu1260
aluu1243
amba1266
anci1242
ancu1238
anda1281
andi1255
anut1237
arao1248
arch1244
area1240
arhu1242
aros1241
asum1237
aust1307
avar1256
avas1237
aves1237
awac1239
ayor1240
baba1268
baca1246
bade1248
baeg1237
bael1237
bagv1239
bana1305
bani1254
bara1380
bare1273
bari1297
bash1264
basq1248
bata1314
batl1238
bats1242
baur1252
baur1253
bela1254
bell1243
beng1280
bezh1248
bilu1245
bira1254
bitt1240
blab1237
blan1242
boga1251
bogh1241
boly1239
bora1263
botl1242
bret1244
budu1248
bugh1239
bulg1262
bura1292
buru1296
cabi1241
cacu1241
cams1241
cane1242
cara1272
cari1279
cash1251
cavi1250
cayu1262
cent2004
cent2050
cent2128
cent2142
cent2150
chac1249
chac1251
cham1309
chec1245
chek1238
chew1245
chib1270
chim1309
chip1262
chir1284
chon1284
chuk1273
chur1257
chuv1255
ciba1236
cine1238
cofa1242
cogu1240
colo1256
croa1245
cube1242
cuda1238
cuib1242
cuoi1242
curr1243
czec1258
daga1272
dani1285
d

In [None]:
from pathlib import Path
# Directory that the following cells scan for simulation result files.
path = Path("../runs")

In [None]:
import os
# Vocabulary sizes found in the simulation runs, keyed by the integer
# Language_ID of each group (used as the x position, labelled as time step).
n = {}
for file in path.glob("*.csv"):
    props = properties(file)
    if "," in props["--tree"]:
        # Runs whose --tree setting contains a comma are left out.
        continue
    del props["--tree"], props["--branchlength"]
    # Ignore the seed when comparing a run's settings with the defaults.
    props["--seed"] = default_properties["--seed"]
    if props == default_properties:
        # Pass the path itself rather than an open handle: pandas then opens
        # and closes the file on its own and actually applies `encoding`.
        all_data = pandas.read_csv(
            file,
            sep=",",
            na_values=[""],
            comment="#",
            keep_default_na=False,
            encoding='utf-8')

        for language_id, language_data in all_data.groupby("Language_ID"):
            n.setdefault(int(language_id), []).append(vocabulary_size(language_data))
    else:
        # Report runs that were skipped because their settings are non-default.
        print(props, default_properties)

# `manage_ticks` is the current name of the former `manage_xticks` keyword;
# disabling it keeps boxplot from overriding the axis ticks and limits.
# Box widths grow with the position so they look uniform on the log axis.
plt.boxplot(list(n.values()), positions=list(n.keys()), widths=[i/2 for i in n.keys()], manage_ticks=False)

plt.plot([1, 2*1048576], [1284, 1284], '0.8', label="Root #Words")

plt.xlabel("time steps $t$")
plt.gca().set_xscale('log')
plt.ylabel("Vocabulary size")
print("Clics: {:0.1f}±{:0.1f}".format(
    numpy.mean(clics_vocabulary), numpy.std(clics_vocabulary)))
# Pool the samples of keys 2**23 .. 2**26; keys without data contribute nothing.
stable = numpy.array(sum([n.get(2**i, []) for i in range(23, 27)], []))
print("Simulation: {:0.1f}±{:0.1f}".format(stable.mean(), stable.std()))
plt.gcf().set_size_inches(5,4)
plt.savefig("vocabularysize_timesteps.pdf")


In [None]:
# Display the values collected under key 2**22 (raises KeyError if no run produced that key).
n[2**22]

In [None]:
import os
# Per-key sample lists, keyed by the integer Language_ID of each group:
#   n -- synonymity as returned by `synonymity`
#   c -- correlation between a form's total weight and its semantic width
#   p -- mean semantic width (polysemy) over all forms
n = {}
c = {}
p = {}
for file in os.listdir(path):
    # NOTE(review): `condition` is not defined in any visible cell of this
    # notebook; it has to exist in the kernel for this cell to run -- confirm
    # where it comes from.
    if condition(file):
        all_data = pandas.read_csv(
            os.path.join(path, file),
            sep=",",
            na_values=[""],
            keep_default_na=False,
            encoding='utf-8')

        print(file)
        for language_id, language_data in all_data.groupby("Language_ID"):
            n.setdefault(int(language_id), []).append(synonymity(language_data))

            accs, widths = [], []
            for form, meanings in language_data.groupby("Cognate_Set"):
                weights = meanings["Weight"]
                total_weight = weights.sum()
                accs.append(total_weight)
                # Width of a form: (sum of weights)^2 / sum of squared weights.
                widths.append(total_weight**2/(weights**2).sum())
            c.setdefault(int(language_id), []).append(numpy.corrcoef(accs, widths)[1, 0])
            p.setdefault(int(language_id), []).append(sum(widths)/len(widths))

# `manage_ticks` is the current name of the former `manage_xticks` keyword;
# disabling it keeps boxplot from overriding the axis ticks and limits.
plt.boxplot(list(n.values()), positions=list(n.keys()), widths=[i/2 for i in n.keys()], manage_ticks=False)
plt.boxplot(list(p.values()), positions=list(p.keys()), widths=[i/2 for i in p.keys()], manage_ticks=False,
               boxprops=dict(color='blue'), medianprops=dict(color='red'))

plt.plot([1, 2*1048576], [1, 1], '0.8')

# Degenerate lines that only serve to create the legend entries.
plt.plot([0,0],[1,1],c='red',label="Polysemy")
plt.plot([0,0],[1,1],c='orange',label="Synonymity")
plt.xlabel("time steps $t$")
plt.gca().set_xscale('log')
plt.ylabel("Average Polysemy/Synonymity")
plt.legend()
print("Synonymity")
#print("Clics: {:0.1f}±{:0.1f}".format(
#    numpy.mean(clics_synonymity), numpy.std(clics_synonymity)))
# Pool the samples of keys 2**20 and 2**21; a missing key contributes nothing
# instead of raising KeyError, and unequal run counts no longer give a ragged array.
stable = numpy.array(sum([n.get(2**i, []) for i in range(20, 22)], []))
# print("Simulation: {:0.1f}±{:0.1f}".format(stable.mean(), stable.std()))
print("Polysemy")
# print("Clics: {:0.1f}±{:0.1f}".format(
#    numpy.mean(clics_semantic_width), numpy.std(clics_semantic_width)))
stable = numpy.array(sum([p.get(2**i, []) for i in range(20, 22)], []))
#print("Simulation: {:0.1f}±{:0.1f}".format(stable.mean(), stable.std()))
plt.gcf().set_size_inches(5,4)
plt.savefig("synonymitypolysemy_timesteps.pdf")


In [None]:
# Box plot of the per-key correlation samples collected in `c` by the previous cell.
# `manage_ticks` is the current name of the former `manage_xticks` keyword;
# disabling it keeps boxplot from overriding the axis ticks and limits.
plt.boxplot(list(c.values()), positions=list(c.keys()), widths=[i/2 for i in c.keys()], manage_ticks=False)

plt.xlabel("time steps $t$")
plt.gca().set_xscale('log')
plt.ylabel("Correlation between semantic width and accessibility")
# No legend: nothing in this figure carries a label, so plt.legend() would
# only emit a warning.

plt.gcf().set_size_inches(5,4)
plt.savefig("correlation_timesteps.pdf")
