# Importing some utilities

In [None]:
%cd ../..
import glob
import sys
from dmg.realism.mle import whichFitsBetter
from scripts.modelSet import datasets_supported
# Dataset descriptor for 'rds-genmymodel', looked up in the project's table of
# supported datasets; it is used below to load each training graph.
msetObject = datasets_supported['rds-genmymodel']
# Directory with the training models, relative to the working directory set by
# the `%cd ../..` magic above (presumably the repository root -- confirm).
train_path = 'data/rds-genmymodel/train'
# Backend argument forwarded to msetObject.getGraphReal when loading graphs.
backend = 'python'

# Loading training set

In [None]:
# Collect every entry found directly under the training directory and load
# each one as a graph through the dataset descriptor and configured backend.
training_files = glob.glob(train_path + "/*")
Gs = [msetObject.getGraphReal(path, backend) for path in training_files]

# Random EMF

For each rule in RandomEMF, we estimate its parameters according to the type of rule. More concretely, for shapes we use the function `whichFitsBetter`, which selects the best distribution by maximum likelihood. For priorities in alternative rules, we follow the procedure described in the paper, which is based on counting the occurrences of each alternative in the set $R_{II}$.

## Number of tables per database

For the rule:

```
Root : Database ->
		elements += Tables(self)#Distribution(parameters);
	;
```

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# One entry per 'Database' node across all training graphs: the number of
# its neighbours (the keys of graph[node]) whose node type is 'Table'.
numberSubvertex = [
    sum(1 for child in graph[node] if graph.nodes[child]['type'] == 'Table')
    for graph in Gs
    for node in graph
    if graph.nodes[node]['type'] == 'Database'
]

# Normalised histogram of the counts, with unit-width bins on edges 0..99.
bins = np.arange(100)
plt.hist(numberSubvertex, bins=bins, alpha=0.5, density=True)
# Presumably here so the notebook does not echo plt.hist's return value.
print()

In [None]:
# Fit the tables-per-database counts: per the notebook text, whichFitsBetter
# selects the best distribution by maximum likelihood. Print what it returns.
best = whichFitsBetter(numberSubvertex)
print(best)

## Columns per table

For the rule:
```
Tables (Database d) : Table -> 
		columns += Columns(d)#Distribution(parameters);
	;
```

In [None]:
# One entry per 'Table' node across all training graphs: the number of its
# neighbours (the keys of graph[node]) whose node type is 'Column'.
numberSubvertex = [
    sum(1 for child in graph[node] if graph.nodes[child]['type'] == 'Column')
    for graph in Gs
    for node in graph
    if graph.nodes[node]['type'] == 'Table'
]

# Normalised histogram of the counts, with unit-width bins on edges 0..99.
bins = np.arange(100)
plt.hist(numberSubvertex, bins=bins, alpha=0.5, density=True)

In [None]:
# Fit the columns-per-table counts: per the notebook text, whichFitsBetter
# selects the best distribution by maximum likelihood. Print what it returns.
best = whichFitsBetter(numberSubvertex)
print(best)

## Indexes per table

For the rule:
```
Tables (Database d) : Table -> 
		indexes += Indexx(self)#Distribution(parameters);
	;
```

In [None]:
# One entry per 'Table' node across all training graphs: the number of its
# neighbours (the keys of graph[node]) whose node type is 'Index'.
numberSubvertex = [
    sum(1 for child in graph[node] if graph.nodes[child]['type'] == 'Index')
    for graph in Gs
    for node in graph
    if graph.nodes[node]['type'] == 'Table'
]

# Normalised histogram of the counts, with unit-width bins on edges 0..9.
bins = np.arange(10)
plt.hist(numberSubvertex, bins=bins, alpha=0.5, density=True)
# Presumably here so the notebook does not echo plt.hist's return value.
print()

In [None]:
# Fit the indexes-per-table counts: per the notebook text, whichFitsBetter
# selects the best distribution by maximum likelihood. Print what it returns.
best = whichFitsBetter(numberSubvertex)
print(best)

In [None]:
# Scratch calculation: squares a hard-coded value.
# NOTE(review): presumably a parameter reported by the fit above (e.g. a
# standard deviation turned into a variance) -- confirm where it comes from.
0.15545412999776023**2

## Frefs per Column

For the rule:
```
Columns (Database d): Column -> 
		foreignReferences += if (d.elements.filter[it instanceof Table].
			map[it as Table].flatMap[it.columns].size >= 1
		) 
		ReferenceF(d, self)#Distribution(parameters)
	;
```

In [None]:
# One entry per 'Column' node across all training graphs: the number of
# edges leaving it (over every neighbour and every edge key between the two
# nodes) whose edge type is 'foreignReferences'.
number = [
    sum(
        1
        for neighbour in graph[node]
        for edge_key in graph[node][neighbour]
        if graph[node][neighbour][edge_key]['type'] == 'foreignReferences'
    )
    for graph in Gs
    for node in graph
    if graph.nodes[node]['type'] == 'Column'
]

# Normalised histogram of the counts, with unit-width bins on edges 0..9.
bins = np.arange(10)
plt.hist(number, bins=bins, alpha=0.5, density=True)
# Presumably here so the notebook does not echo plt.hist's return value.
print()

In [None]:
# Fit the foreign-references-per-column counts: per the notebook text,
# whichFitsBetter selects the best distribution by maximum likelihood.
# Print what it returns.
best = whichFitsBetter(number)
print(best)

## Prefs per column

For the rule:
```
Columns (Database d): Column -> 
		primaryReferences += if (d.elements.filter[it instanceof Table].
			map[it as Table].flatMap[it.columns].size >= 1
		) ReferenceP(d, self)#Distribution(parameters)
	;
```

In [None]:
# One entry per 'Column' node across all training graphs: the number of
# edges leaving it (over every neighbour and every edge key between the two
# nodes) whose edge type is 'primaryReferences'.
number = [
    sum(
        1
        for neighbour in graph[node]
        for edge_key in graph[node][neighbour]
        if graph[node][neighbour][edge_key]['type'] == 'primaryReferences'
    )
    for graph in Gs
    for node in graph
    if graph.nodes[node]['type'] == 'Column'
]

# Normalised histogram of the counts, with unit-width bins on edges 0..9.
bins = np.arange(10)
plt.hist(number, bins=bins, alpha=0.5, density=True)
# Presumably here so the notebook does not echo plt.hist's return value.
print()

In [None]:
# Fit the primary-references-per-column counts: per the notebook text,
# whichFitsBetter selects the best distribution by maximum likelihood.
# Print what it returns.
best = whichFitsBetter(number)
print(best)

## IndexColumns

For the rule:
```
Indexx (Table t) : Index->
		indexColumns += IndexColumnss(t)#Distribution(parameters)
	;
```

In [None]:
# One entry per 'Index' node across all training graphs: the number of its
# neighbours (the keys of graph[node]) whose node type is 'IndexColumn'.
numberSubvertex = [
    sum(
        1
        for child in graph[node]
        if graph.nodes[child]['type'] == 'IndexColumn'
    )
    for graph in Gs
    for node in graph
    if graph.nodes[node]['type'] == 'Index'
]

# Normalised histogram of the counts, with unit-width bins on edges 0..9.
bins = np.arange(10)
plt.hist(numberSubvertex, bins=bins, alpha=0.5, density=True)
# Presumably here so the notebook does not echo plt.hist's return value.
print()

In [None]:
# Fit the index-columns-per-index counts: per the notebook text,
# whichFitsBetter selects the best distribution by maximum likelihood.
# Print what it returns.
best = whichFitsBetter(numberSubvertex)
print(best)