In [1]:
%%html
<style>
    table {float:left}
</style>

First step: read the analysis of the model errors into a data frame.

The legend of the column headers is as follows.

Mnemonic|Description
:-----|:-----
ec|Error class {1: "in beam", 2: "not in beam", 3: "no correct solutions"}
lex|Lexeme
ls|Lexical set
sp|Part of speech
pfm|Preformative
vbs|Verbal stem
vbe|Verbal ending
nme|Nominal ending
prs|Pronominal suffix
vt|Verbal tense
ps|Person
nu|Number
gn|Gender
st|State
form|Input form

In [2]:
from pandas import factorize, read_csv

df = read_csv('model_errors.csv')

In [3]:
df

Unnamed: 0,ec,lex,ls,sp,pfm,vbs,vbe,nme,prs,vt,ps,nu,gn,st,form
0,1,R<H[,0,1,1,0,1,6,0,4,0,0,0,0,!!R<(H[/WT
1,1,ZMM/,0,2,-1,-1,-1,1,6,-1,-1,1,2,2,ZMM/+W
2,1,>LJHW=/,0,3,-1,-1,-1,1,-1,-1,-1,1,2,0,>LJHW=/
3,3,KWL[,0,1,0,1,3,0,13,2,2,1,2,-1,K(WL&K&L[T:d+M
4,1,MS</,0,2,-1,-1,-1,5,12,-1,-1,3,2,2,MS</J+HM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22238,2,BLQ[,0,1,7,4,1,2,0,62,0,1,1,2,!M!BLQ[/H:du
22239,1,LQX[,0,1,2,0,1,0,8,1,3,1,2,-1,!J!(LQX[+H
22240,2,DBR===/,0,2,-1,-1,-1,5,4,-1,-1,3,2,2,DBR===/J+K
22241,2,DBR[,-1,1,0,0,1,5,0,6,0,3,2,1,DBR[/J


In [4]:
# What are the sizes of the error classes?
df['ec'].value_counts()

1    10899
2    10255
3     1089
Name: ec, dtype: int64

Our goal is to discover whether there is any relation between the error classes and the input forms. The numbers in the analysis of the input forms are not quantities, but codes. So we are dealing with categorical data. Let us therefore factorize the data, but keep the original ordering of the values, because there is some logic to it.

In [5]:
# Create the correlation matrix (cm)
cm = df.apply(lambda x : factorize(x, sort=True)[0]).corr()

In [6]:
cm

Unnamed: 0,ec,lex,ls,sp,pfm,vbs,vbe,nme,prs,vt,ps,nu,gn,st,form
ec,1.0,0.018982,0.001724,0.028676,0.011926,0.062948,-0.005075,0.059388,0.011801,0.03643,-0.041921,0.010359,-0.047717,0.022935,0.004403
lex,0.018982,1.0,0.062335,-0.021713,0.012756,0.030054,0.009608,-0.009279,-0.023887,0.009015,0.029922,0.018031,0.027051,-0.018093,0.395463
ls,0.001724,0.062335,1.0,-0.173208,0.092505,0.104931,0.071526,-0.066146,-0.023035,0.09949,0.116522,-0.017551,-0.037861,-0.088154,-0.029865
sp,0.028676,-0.021713,-0.173208,1.0,-0.296331,-0.318431,-0.238741,0.15872,-0.090798,-0.321818,-0.373064,0.016536,0.056211,0.206867,0.206633
pfm,0.011926,0.012756,0.092505,-0.296331,1.0,0.554387,0.332915,-0.287681,-0.01323,0.378911,0.524977,-0.018999,-0.094689,-0.359174,-0.541687
vbs,0.062948,0.030054,0.104931,-0.318431,0.554387,1.0,0.355796,-0.317255,-0.039969,0.462039,0.591973,-0.028301,-0.057784,-0.38076,-0.277775
vbe,-0.005075,0.009608,0.071526,-0.238741,0.332915,0.355796,1.0,-0.343667,-0.057712,0.166522,0.544935,0.290918,-0.151842,-0.468163,-0.243076
nme,0.059388,-0.009279,-0.066146,0.15872,-0.287681,-0.317255,-0.343667,1.0,0.109041,-0.065739,-0.540582,0.454711,0.024891,0.581602,0.224179
prs,0.011801,-0.023887,-0.023035,-0.090798,-0.01323,-0.039969,-0.057712,0.109041,1.0,-0.043472,0.013718,0.025768,0.002858,0.196354,-0.034695
vt,0.03643,0.009015,0.09949,-0.321818,0.378911,0.462039,0.166522,-0.065739,-0.043472,1.0,0.233481,-0.090934,-0.070836,0.019242,-0.246181


In [7]:
# Which columns have the strongest correlation?  We need to exclude
# the diagonal, because we are not interested in the correlation of
# a column with itself, which is 1.

from numpy import eye

n = cm.shape[0]
ix = cm.mask(eye(n, dtype=bool)).stack().idxmax()
print('Positive:', cm[ix[0]][ix[1]], "at", ix)

Positive: 0.5919734187073089 at ('vbs', 'ps')


In [8]:
# Which columns have the strongest negative correlation?
ix = df.corr().stack().idxmin()
print('Negative:', cm[ix[0]][ix[1]], "at", ix)

Negative: -0.7339812597765805 at ('ps', 'st')


These values make sense; so this rather crude approach may work for our purpose. Person is greatly determined by verbs and a nominal with state cannot have a value for person. As were are trying to spot noticeable effects
regardless of whether they are positive of negative, let us make a table of the absolute value of the correspondences.

In [9]:
cm.apply(lambda x: abs(x)).style.background_gradient(cmap='coolwarm')

Unnamed: 0,ec,lex,ls,sp,pfm,vbs,vbe,nme,prs,vt,ps,nu,gn,st,form
ec,1.0,0.018982,0.001724,0.028676,0.011926,0.062948,0.005075,0.059388,0.011801,0.03643,0.041921,0.010359,0.047717,0.022935,0.004403
lex,0.018982,1.0,0.062335,0.021713,0.012756,0.030054,0.009608,0.009279,0.023887,0.009015,0.029922,0.018031,0.027051,0.018093,0.395463
ls,0.001724,0.062335,1.0,0.173208,0.092505,0.104931,0.071526,0.066146,0.023035,0.09949,0.116522,0.017551,0.037861,0.088154,0.029865
sp,0.028676,0.021713,0.173208,1.0,0.296331,0.318431,0.238741,0.15872,0.090798,0.321818,0.373064,0.016536,0.056211,0.206867,0.206633
pfm,0.011926,0.012756,0.092505,0.296331,1.0,0.554387,0.332915,0.287681,0.01323,0.378911,0.524977,0.018999,0.094689,0.359174,0.541687
vbs,0.062948,0.030054,0.104931,0.318431,0.554387,1.0,0.355796,0.317255,0.039969,0.462039,0.591973,0.028301,0.057784,0.38076,0.277775
vbe,0.005075,0.009608,0.071526,0.238741,0.332915,0.355796,1.0,0.343667,0.057712,0.166522,0.544935,0.290918,0.151842,0.468163,0.243076
nme,0.059388,0.009279,0.066146,0.15872,0.287681,0.317255,0.343667,1.0,0.109041,0.065739,0.540582,0.454711,0.024891,0.581602,0.224179
prs,0.011801,0.023887,0.023035,0.090798,0.01323,0.039969,0.057712,0.109041,1.0,0.043472,0.013718,0.025768,0.002858,0.196354,0.034695
vt,0.03643,0.009015,0.09949,0.321818,0.378911,0.462039,0.166522,0.065739,0.043472,1.0,0.233481,0.090934,0.070836,0.019242,0.246181


Unfortunately I do not see clear signs of a relationship between the error classes and the other parameters of the analysis. The correlation with verbal stem is slightly higher. That could be because forms with verbal stem are relatively rare (17% of the analytical dictionary) and therefore less trained on.

The slightly elevated value for nominal ending I cannot completely explain. The incidence of the nominal endings in the model errors (62%) is comparable to that in the analytical dictionary (59%). There is, however, some fluctuation in the distribution over the values of nominal ending, as is shown below.

In [10]:
# Create a data frame with the relative frequencies of the values
# for nominal ending in the list of model errors (rf_me).
rf_me = df.nme.value_counts(normalize=True)
rf_me = rf_me.rename_axis('nme').reset_index(name='relfreq')

In [11]:
# Create a data frame with the relative frequencies of the values
# for nominal ending in the analytical dictionary (rf_ad).
from pandas import DataFrame

nme_anwb = DataFrame([[0, 13074], [1, 10772], [5, 2006], [4, 1732], [3, 1602], [6, 1577], 
[2, 1018], [12, 615], [-1, 343], [13, 147], [7, 142], [14, 45], [16, 20], [9, 20
], [17, 18], [15, 12], [10, 3]], columns=['nme', 'freq'])

rf_ad = nme_anwb['relfreq'] = nme_anwb.freq/nme_anwb.freq.sum()
rf_ad = rf_ad.rename_axis('nme').reset_index(name='relfreq')

# Merge the two on column 'nme'
rfs = rf_me.merge(rf_ad, on='nme')
rfs.columns = ['nme', 'me', 'ad']
rfs['diff'] = (rfs.me - rfs.ad)/(rfs.me + rfs.ad)
rfs

Unnamed: 0,nme,me,ad,diff
0,0,0.366048,0.394437,-0.03733
1,1,0.350672,0.324986,0.038016
2,6,0.056647,0.030713,0.29687
3,4,0.056602,0.048332,0.078816
4,5,0.055208,0.047577,0.074242
5,3,0.043654,0.052254,-0.089664
6,2,0.034483,0.06052,-0.274069
7,12,0.013487,0.000603,0.914357
8,7,0.0049,0.018554,-0.582138
9,13,0.004226,0.000603,0.75012
