In [1]:
import sys
import numpy as np
import nltk
import pandas as pd
from balance import Freq

In [2]:
def _read_lines(path):
    """Return every line of the text file at ``path`` (newlines kept).

    The file is opened inside a ``with`` block so the handle is closed as
    soon as the lines have been read, instead of staying open until the
    garbage collector happens to reclaim it.
    """
    # NOTE(review): no explicit encoding is passed, so the platform default
    # is used exactly as before -- confirm the corpora match that encoding.
    with open(path, 'r') as handle:
        return handle.readlines()


childes_data = _read_lines('childes-spanish/childes-spanish.txt')
ud_data = _read_lines('ud-spanish/spanish.conllu')
unimorph = _read_lines('uni-spanish/spa.txt')
overreg = _read_lines('Comp.Morph Participles.tsv.csv')

In [3]:
twitter_data = pd.read_excel('Comp-Morph_Twitter data.xlsx')

In [4]:
# print(ud_data[0:10])
def uniforms(unis, forms, freq_count):
    """Tally, per lemma, the listed word forms that are absent from ``forms``.

    Each entry of ``unis`` is a tab-separated line: the first field is read
    as the lemma and the second as the word form. Lines with fewer than two
    fields are ignored. A word form is counted only when it is not in
    ``forms`` and its lemma is in ``freq_count``.

    Returns a dict mapping lemma -> number of such word forms.
    """
    missing_per_lemma = {}
    for raw in unis:
        fields = raw.strip().split('\t')
        if len(fields) <= 1:
            continue
        lemma, wordform = fields[0], fields[1]
        # Same check order as before: membership in forms first, then lemma.
        if wordform in forms or lemma not in freq_count:
            continue
        missing_per_lemma[lemma] = missing_per_lemma.get(lemma, 0) + 1
    return missing_per_lemma

In [5]:
def tweet_freq(tweets):
    """Count token frequencies over the ``text`` column of ``tweets``.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Frame with a ``text`` column. Each string cell is stripped and
        tokenized with ``nltk.word_tokenize``.

    Returns
    -------
    (dict, int)
        Mapping token -> number of occurrences, and the total number of
        tokens counted.

    Notes
    -----
    Cells that are not strings (for example the NaN pandas produces for an
    empty spreadsheet cell) are skipped. Previously such a cell raised
    ``AttributeError`` on ``.strip()`` and aborted the whole count.
    """
    total = 0
    counts = {}
    for row in tweets[['text']].values.tolist():
        for text in row:
            if not isinstance(text, str):
                # Missing / non-text cell: nothing to tokenize.
                continue
            tokens = nltk.word_tokenize(text.strip())
            for token in tokens:
                counts[token] = counts.get(token, 0) + 1
            total += len(tokens)
    return counts, total
        

In [6]:
tw_freqs, tw_totals = tweet_freq(twitter_data)
#twitter_data[['text']].values.tolist()
# tw_freqs

In [7]:
len(childes_data)

1365343

In [8]:
len(ud_data)

1732199

In [8]:
len(unimorph)

382956

In [9]:
f = Freq()

In [10]:
ch_freqs = f.getFreqs(childes_data)

Counting the frequencies in CHILDES
On 100000 of 1365343
On 200000 of 1365343
On 300000 of 1365343
On 400000 of 1365343
On 500000 of 1365343
On 600000 of 1365343
On 700000 of 1365343
On 800000 of 1365343
On 900000 of 1365343
On 1000000 of 1365343
On 1100000 of 1365343
On 1200000 of 1365343
On 1300000 of 1365343


In [11]:
ud_freqs, ud_lex_freqs, ud_lex2form, ud_form2lex = f.getUDFreqs(ud_data)

Counting the frequencies in UDs
On 100000 of 1732199
On 200000 of 1732199
On 300000 of 1732199
On 400000 of 1732199
On 500000 of 1732199
On 600000 of 1732199
On 700000 of 1732199
On 800000 of 1732199
On 900000 of 1732199
On 1000000 of 1732199
On 1100000 of 1732199
On 1200000 of 1732199
On 1300000 of 1732199
On 1400000 of 1732199
On 1500000 of 1732199
On 1600000 of 1732199
On 1700000 of 1732199


In [12]:
# Total counts per corpus; the later cells divide summed form counts by these.
# NOTE(review): ud_totals is summed from ud_lex_freqs rather than ud_freqs --
# confirm both add up to the same token total.
ch_totals = sum(ch_freqs.values())
ud_totals = sum(ud_lex_freqs.values())
print("ch_totals", ch_totals)
print("ud_totals", ud_totals)
print("tw_totals", tw_totals)
# Sanity expectation (not executed): sum(tw_freqs.values()) equals tw_totals.

ch_totals 1938592
ud_totals 1542720
tw_totals 5712


In [13]:
def get_forms(annotations):
    """Split participle annotations into parallel regular/irregular lists.

    Each annotation is a tab-separated line. Field 0 is read as the lemma,
    field 1 as the form stored in the irregular list, and field 2 as the
    form stored in the regular list; any further fields are ignored.

    Lines with fewer than three fields (blank or truncated lines) are
    skipped instead of raising ``IndexError``.

    Returns
    -------
    (regs, irregs)
        Two lists of ``(form, lemma)`` tuples of equal length, where
        ``regs[i]`` and ``irregs[i]`` come from the same input line.
    """
    # NOTE(review): a header row, if the file has one, is not skipped here --
    # it is treated like any other three-field line. Confirm that is intended.
    regs = []
    irregs = []
    for raw in annotations:
        fields = raw.strip().split('\t')
        if len(fields) < 3:
            continue
        lemma, form, reg = fields[:3]
        regs.append((reg, lemma))
        irregs.append((form, lemma))
    return regs, irregs

In [12]:
overreg[1].strip().split('\t')

['morir', 'muertas', 'moridas', 'V.PTCP;PST;FEM;PL']

In [14]:
reg_forms, irreg_forms = get_forms(overreg)
assert(len(irreg_forms) == len(reg_forms))

In [15]:
def build_counts(irreg_forms, reg_forms, count_dict):
    """Tabulate corpus counts for paired irregular/regular forms.

    ``irreg_forms`` and ``reg_forms`` are walked in parallel; element 0 of
    each pair is the word form and element 1 of the irregular pair is the
    lemma. A form missing from ``count_dict`` counts as 0. Pairs whose
    combined count is not positive are left out.

    Returns a DataFrame with one row per kept pair and the columns
    lemma, irreg_form, irreg_count, irreg_ratio, reg_form, reg_count,
    reg_ratio, where each ratio is that form's count over the pair total.
    """
    columns = ['lemma', 'irreg_form',
               'irreg_count', 'irreg_ratio',
               'reg_form', 'reg_count',
               'reg_ratio']
    rows = []
    for irr_pair, reg_pair in zip(irreg_forms, reg_forms):
        lemma = irr_pair[1]
        irr_form = irr_pair[0]
        reg_form = reg_pair[0]
        irr_freq = count_dict[irr_form] if irr_form in count_dict else 0
        reg_freq = count_dict[reg_form] if reg_form in count_dict else 0
        total = reg_freq + irr_freq
        if not total > 0:
            # Neither form is attested; no ratio can be computed.
            continue
        rows.append([lemma,
                     irr_form, irr_freq, irr_freq / total,
                     reg_form, reg_freq, reg_freq / total])
    return pd.DataFrame(rows, columns=columns)

In [16]:
df_ud = build_counts(irreg_forms, reg_forms, ud_freqs)

In [17]:
df_ch = build_counts(irreg_forms, reg_forms, ch_freqs)

In [18]:
df_ud[['reg_ratio']].mean()

reg_ratio    0.031232
dtype: float64

In [19]:
df_ud[['irreg_ratio']].mean()

irreg_ratio    0.968768
dtype: float64

In [20]:
df_ch[['reg_ratio']].mean()

reg_ratio    0.02904
dtype: float64

In [21]:
df_ch[['irreg_ratio']].mean()

irreg_ratio    0.97096
dtype: float64

In [22]:
# freq = sum tokens / total corpus
df_ud[['reg_count']].sum() / ud_totals

reg_count    0.000016
dtype: float64

In [23]:
df_ud[['irreg_count']].sum() / ud_totals

irreg_count    0.002394
dtype: float64

In [24]:
df_ch[['reg_count']].sum() / ch_totals

reg_count    0.000014
dtype: float64

In [25]:
df_ch[['irreg_count']].sum() / ch_totals

irreg_count    0.002289
dtype: float64

In [26]:
df_ud.loc[df_ud['reg_count'] > 0]

Unnamed: 0,lemma,irreg_form,irreg_count,irreg_ratio,reg_form,reg_count,reg_ratio
17,decir,dicha,86,0.86,decida,14,0.14
29,soltar,suelto,12,0.923077,soltado,1,0.076923
71,proveer,proveída,0,0.0,provista,2,1.0
72,proveer,proveídos,0,0.0,provistos,4,1.0
73,proveer,proveído,0,0.0,provisto,4,1.0


In [27]:
pd.options.display.max_rows = 4000
df_ud.loc[df_ud['irreg_count'] > 0]

Unnamed: 0,lemma,irreg_form,irreg_count,irreg_ratio,reg_form,reg_count,reg_ratio
0,morir,muertas,1,1.0,moridas,0,0.0
1,morir,muerta,24,1.0,morida,0,0.0
2,morir,muerto,81,1.0,moridos,0,0.0
3,morir,muertos,65,1.0,morido,0,0.0
4,abrir,abiertas,50,1.0,abrididas,0,0.0
5,abrir,abierta,91,1.0,abrida,0,0.0
6,abrir,abiertos,15,1.0,abrididos,0,0.0
7,abrir,abierto,132,1.0,abrido,0,0.0
8,componer,compuestas,6,1.0,componidas,0,0.0
9,componer,compuesta,29,1.0,componida,0,0.0


In [30]:
df_ch.loc[df_ch['reg_count'] > 0]

Unnamed: 0,lemma,irreg_form,irreg_count,irreg_ratio,reg_form,reg_count,reg_ratio
3,morir,muertos,14,0.933333,morido,1,0.066667
7,abrir,abierto,70,0.985915,abrido,1,0.014085
13,decir,dicha,1,0.5,decida,1,0.5
14,decir,dicho,873,0.998856,decido,1,0.001144
20,soltar,suelto,12,0.705882,soltado,5,0.294118
22,poner,puesta,39,0.975,ponida,1,0.025
24,poner,puestos,14,0.583333,ponido,10,0.416667
30,romper,rota,82,0.987952,rompida,1,0.012048
32,romper,roto,428,0.988453,rompido,5,0.011547
41,descubrir,descubierto,18,0.947368,descubrido,1,0.052632


In [31]:
df_ch.loc[df_ch['irreg_count'] > 0]

Unnamed: 0,lemma,irreg_form,irreg_count,irreg_ratio,reg_form,reg_count,reg_ratio
0,morir,muertas,3,1.0,moridas,0,0.0
1,morir,muerta,16,1.0,morida,0,0.0
2,morir,muerto,64,1.0,moridos,0,0.0
3,morir,muertos,14,0.933333,morido,1,0.066667
4,abrir,abiertas,7,1.0,abrididas,0,0.0
5,abrir,abierta,78,1.0,abrida,0,0.0
6,abrir,abiertos,5,1.0,abrididos,0,0.0
7,abrir,abierto,70,0.985915,abrido,1,0.014085
8,componer,compuesta,1,1.0,componida,0,0.0
9,componer,compuesto,2,1.0,componido,0,0.0
