# Importing some utilities

In [None]:
import glob
import sys
sys.path.append('../python/')
from json2graph import jsonFile2graph
from graphUtils import plot_graph
from statsUtils import whichFitsBetter

# Loading $R_{II}$

In [None]:
import networkx as nx
import glob
import json
import os

# Collect every JSON model of the R_II set.
# glob returns its matches in arbitrary, filesystem-dependent order; sorting
# gives Gs (and every list derived from it) a reproducible order.
files = sorted(glob.glob("../realGraphs/RDS/R2/*.json"))

# One graph per model file, in the same order as `files`.
Gs = [jsonFile2graph(file) for file in files]


# Random EMF

For each rule in RandomEMF, we estimate its parameters according to the type of rule. More concretely, for shapes we use the function `whichFitsBetter`, which selects the best-fitting distribution by maximum likelihood. For priorities in alternative rules, we follow the procedure described in the paper, which is based on counting the occurrences of each alternative in the set $R_{II}$.

## Number of tables per database

For the rule:

```
Root : Database ->
		elements += Tables(self)#Distribution(parameters);
	;
```

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# For every node typed 'Database', count how many of the nodes adjacent to it
# (as exposed by graph[node]) are typed 'Table'.
numberSubvertex = []
for graph in Gs:
    node_attrs = graph.nodes
    for node in graph:
        if node_attrs[node]['type'] != 'Database':
            continue
        numberSubvertex.append(
            sum(1 for child in graph[node] if node_attrs[child]['type'] == 'Table')
        )

# Unit-width bin edges 0, 1, ..., 99.
bins = np.arange(100)
plt.hist(numberSubvertex, density=True, alpha=0.5, bins=bins)
# Presumably here so the notebook does not echo the return value of plt.hist.
print()

In [None]:
# Fit the counts gathered in the preceding cell (tables per database) and show
# the result; per the notebook text, whichFitsBetter selects the best
# distribution by maximum likelihood.
best = whichFitsBetter(numberSubvertex)
print(best)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.special import factorial
from scipy.stats import nbinom
from scipy.stats import poisson


# Overlay a negative binomial pmf (n=4.0, p=0.435681, loc=0), evaluated on
# 0..49, on the normalised histogram of the observed counts.
t = np.arange(50)
d = nbinom.pmf(t, n=4.0, p=0.435681, loc=0)
plt.hist(numberSubvertex, density=True, alpha=0.5, bins=bins)
plt.plot(t, d, '-')
plt.show()

## Columns per table

For the rule:
```
Tables (Database d) : Table -> 
		columns += Columns(d)#Distribution(parameters);
	;
```

In [None]:
# For every node typed 'Table', count how many of the nodes adjacent to it
# (as exposed by graph[node]) are typed 'Column'.
numberSubvertex = []
for graph in Gs:
    node_attrs = graph.nodes
    for node in graph:
        if node_attrs[node]['type'] != 'Table':
            continue
        numberSubvertex.append(
            sum(1 for child in graph[node] if node_attrs[child]['type'] == 'Column')
        )

# Unit-width bin edges 0, 1, ..., 99.
bins = np.arange(100)
plt.hist(numberSubvertex, density=True, alpha=0.5, bins=bins)

In [None]:
# Fit the counts gathered in the preceding cell (columns per table) and show
# the result; per the notebook text, whichFitsBetter selects the best
# distribution by maximum likelihood.
best = whichFitsBetter(numberSubvertex)
print(best)

In [None]:
# Overlay a negative binomial pmf (n=3, p=0.3630, loc=0), evaluated on 0..49,
# on the normalised histogram of the observed counts.
t = np.arange(50)
d = nbinom.pmf(t, n=3, p=0.3630, loc=0)
plt.hist(numberSubvertex, density=True, alpha=0.5, bins=bins)
plt.plot(t, d, '-')
plt.show()

## Indexes per table

For the rule:
```
Tables (Database d) : Table -> 
		indexes += Indexx(self)#Distribution(parameters);
	;
```

In [None]:
# For every node typed 'Table', count how many of the nodes adjacent to it
# (as exposed by graph[node]) are typed 'Index'.
numberSubvertex = []
for graph in Gs:
    node_attrs = graph.nodes
    for node in graph:
        if node_attrs[node]['type'] != 'Table':
            continue
        numberSubvertex.append(
            sum(1 for child in graph[node] if node_attrs[child]['type'] == 'Index')
        )

# Unit-width bin edges 0, 1, ..., 9.
bins = np.arange(10)
plt.hist(numberSubvertex, density=True, alpha=0.5, bins=bins)
# Presumably here so the notebook does not echo the return value of plt.hist.
print()

In [None]:
# Fit the counts gathered in the preceding cell (indexes per table) and show
# the result; per the notebook text, whichFitsBetter selects the best
# distribution by maximum likelihood.
best = whichFitsBetter(numberSubvertex)
print(best)

## Frefs per Column

For the rule:
```
Columns (Database d): Column -> 
		foreignReferences += if (d.elements.filter[it instanceof Table].
			map[it as Table].flatMap[it.columns].size >= 1
		) 
		ReferenceF(d, self)#Distribution(parameters)
	;
```

In [None]:
# For every node typed 'Column', count the edges exposed by graph[node] whose
# 'type' attribute is 'foreignReferences' (parallel edges counted one by one).
number = []
for graph in Gs:
    for node in graph:
        if graph.nodes[node]['type'] != 'Column':
            continue
        number.append(sum(
            1
            for parallel_edges in graph[node].values()
            for attrs in parallel_edges.values()
            if attrs['type'] == 'foreignReferences'
        ))

# Unit-width bin edges 0, 1, ..., 9.
bins = np.arange(10)
plt.hist(number, density=True, alpha=0.5, bins=bins)
# Presumably here so the notebook does not echo the return value of plt.hist.
print()

In [None]:
# Fit the counts gathered in the preceding cell (foreign references per
# column) and show the result; per the notebook text, whichFitsBetter selects
# the best distribution by maximum likelihood.
best = whichFitsBetter(number)
print(best)

## Prefs per column

For the rule:
```
Columns (Database d): Column -> 
		primaryReferences += if (d.elements.filter[it instanceof Table].
			map[it as Table].flatMap[it.columns].size >= 1
		) ReferenceP(d, self)#Distribution(parameters)
	;
```

In [None]:
# For every node typed 'Column', count the edges exposed by graph[node] whose
# 'type' attribute is 'primaryReferences' (parallel edges counted one by one).
number = []
for graph in Gs:
    for node in graph:
        if graph.nodes[node]['type'] != 'Column':
            continue
        number.append(sum(
            1
            for parallel_edges in graph[node].values()
            for attrs in parallel_edges.values()
            if attrs['type'] == 'primaryReferences'
        ))

# Unit-width bin edges 0, 1, ..., 9.
bins = np.arange(10)
plt.hist(number, density=True, alpha=0.5, bins=bins)
# Presumably here so the notebook does not echo the return value of plt.hist.
print()

In [None]:
# Fit the counts gathered in the preceding cell (primary references per
# column) and show the result; per the notebook text, whichFitsBetter selects
# the best distribution by maximum likelihood.
best = whichFitsBetter(number)
print(best)

## IndexColumns

For the rule:
```
Indexx (Table t) : Index->
		indexColumns += IndexColumnss(t)#Distribution(parameters)
	;
```

In [None]:
# For every node typed 'Index', count how many of the nodes adjacent to it
# (as exposed by graph[node]) are typed 'IndexColumn'.
numberSubvertex = []
for graph in Gs:
    node_attrs = graph.nodes
    for node in graph:
        if node_attrs[node]['type'] != 'Index':
            continue
        numberSubvertex.append(
            sum(1 for child in graph[node] if node_attrs[child]['type'] == 'IndexColumn')
        )

# Unit-width bin edges 0, 1, ..., 9.
bins = np.arange(10)
plt.hist(numberSubvertex, density=True, alpha=0.5, bins=bins)
# Presumably here so the notebook does not echo the return value of plt.hist.
print()

In [None]:
# Fit the counts gathered in the preceding cell (index columns per index) and
# show the result; per the notebook text, whichFitsBetter selects the best
# distribution by maximum likelihood.
best = whichFitsBetter(numberSubvertex)
print(best)

# VIATRA and ALLOY, estimating the scope

For VIATRA and ALLOY, the distribution over the objects (i.e., $P(o)$) needs to be approximated. First, we calculate $\{o_1,\dots,o_n\}$ by counting the number of objects of each model in $R_{II}$.

We consider the KDE function:

$$\hat{f}_{h,K}(o)=\frac{1}{nh}\sum_{i=1}^nK\left(\frac{o-o_i}{h}\right).$$

where $K \in \{\text{gaussian, tophat}\}$ and $h\in \texttt{np.logspace(-2, -1, 20)}$. Both $K$ and $h$ are selected by cross-validation.

In [None]:
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV
import numpy as np

# One single-feature row per model: the number of nodes in its graph.
numberObjects = [[len(G)] for G in Gs]

# Candidate bandwidths and kernels explored by the cross-validated search.
params = dict(
    bandwidth=np.logspace(-2, -1, 20),
    kernel=['gaussian', 'tophat'],
)
grid = GridSearchCV(KernelDensity(), params, n_jobs=8)
grid.fit(np.array(numberObjects))

# Report the hyper-parameters of the winning estimator.
print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))
print("best kernel: {0}".format(grid.best_estimator_.kernel))

We check that the histogram of the samples drawn from the KDE is close to the original one:

In [None]:
# Draw 1000 samples from the selected KDE (fixed seed) and flatten them to 1-D.
kde = grid.best_estimator_
new_data = kde.sample(1000, random_state=0).reshape(-1)

In [None]:
import matplotlib.pyplot as plt
# Shared bin edges from -50 up to (but excluding) 200, in steps of size_bin.
size_bin = 2
bins = np.arange(-50, 200, size_bin)

# Flat list with the number of nodes of each model.
numberObjects = [len(G) for G in Gs]

# Observed sizes vs. KDE samples, drawn on the same axes with the same bins.
hist = plt.hist(numberObjects, density=True, alpha=0.5, bins=bins)
plt.hist(new_data, density=True, alpha=0.5, bins=bins)

# hist[0] holds the bar heights and hist[1] the bin edges; the heights are
# rescaled so that they sum to 1.
objs = hist[1]
probs = hist[0] / np.sum(hist[0])

Finally, we generate the config files for VIATRA and ALLOY. These files are already provided together with the final and generated models. Therefore, you should not execute these code snippets.

**Note**: For Alloy we generate more samples (1000), since the generator fails more often when it looks for a solution to the logic problem. We also discard samples that are greater than 20, since ALLOY does not scale with this meta-model.


```
import numpy as np
import random

# The template is the same for every sample, so read it once up front instead
# of re-opening it on each iteration.
with open('../configurationFiles/RDS/model.vsconfig', 'r') as file:
    data = file.read()

new_data = kde.sample(300, random_state=0)
new_data = new_data.reshape(-1)
# One config file per sampled size; i numbers both the config file and the
# output folders it points to. int() truncates the sampled size.
for i, s in enumerate(new_data):
    x = data.replace("#node += 20..21", "#node += "+str(int(s)))
    x = x.replace("debug =\t\t\t\"outputs/debug\"","debug =\t\t\t\"outputs"+str(i)+"/debug\"")
    x = x.replace("log =\t\t\t\"outputs/log.txt\"","log =\t\t\t\"outputs"+str(i)+"/log.txt\"")
    x = x.replace("output =\t\t\"outputs/models\"","output =\t\t\"outputs"+str(i)+"/models\"")
    x = x.replace("runs = 10","runs = 1")
    with open("../configurationFiles/RDS/VIATRA/rdsGen"+str(i)+".vsconfig", "w") as text_file:
        text_file.write(x)
```

```
import numpy as np
import random

# The template is the same for every sample, so read it once up front instead
# of re-opening it on each iteration.
with open('../configurationFiles/RDS/modelAlloy.vsconfig', 'r') as file:
    data = file.read()

new_data = kde.sample(1000, random_state=0)
new_data = new_data.reshape(-1)
# Samples whose truncated size exceeds 20 are dropped before numbering, so the
# generated files are numbered consecutively from 0.
small_samples = (s for s in new_data if int(s) <= 20)
for i, s in enumerate(small_samples):
    x = data.replace("#node += 20..21", "#node += "+str(int(s)))
    x = x.replace("debug =\t\t\t\"outputs/debug\"","debug =\t\t\t\"outputs"+str(i)+"/debug\"")
    x = x.replace("log =\t\t\t\"outputs/log.txt\"","log =\t\t\t\"outputs"+str(i)+"/log.txt\"")
    x = x.replace("output =\t\t\"outputs/models\"","output =\t\t\"outputs"+str(i)+"/models\"")
    x = x.replace("ViatraSolver", "AlloySolver")
    with open("../configurationFiles/RDS/ALLOY/rdsGen"+str(i)+".vsconfig", "w") as text_file:
        text_file.write(x)
```

# RANDOM generator

We do the same as in the previous section, but considering pairs $(o,d)$ where $o$ is the number of objects and $d$ is the average out-degree.

In [None]:
import numpy as np
# Average out-degree of each model, taken over all of its nodes.
deg_objects = [
    np.mean([degree for _, degree in G.out_degree()])
    for G in Gs
]
# Pair each model's entry in numberObjects with its average out-degree,
# one row per model.
objects_deg = np.array(list(zip(numberObjects, deg_objects)))
objects_deg

In [None]:
# Candidate bandwidths and kernels explored by the cross-validated search,
# this time fitted on the rows of objects_deg.
params = dict(
    bandwidth=np.logspace(-2, -1, 20),
    kernel=['gaussian', 'tophat'],
)
grid2 = GridSearchCV(KernelDensity(), params, n_jobs=10)
grid2.fit(objects_deg)

# Report the hyper-parameters of the winning estimator.
print("best bandwidth: {0}".format(grid2.best_estimator_.bandwidth))
print("best kernel: {0}".format(grid2.best_estimator_.kernel))

In [None]:
# Draw 500 samples from the selected KDE with a fixed seed.
kde2 = grid2.best_estimator_
new_data2 = kde2.sample(n_samples=500, random_state=0)

Now, using the newly generated data (i.e., new pairs $(o,d)$), we call the RANDOM generator in order to generate the models, for example as follows:

```
import numpy as np
import random
import subprocess

# Invoke the generator jar once per sampled row: the row's first value goes to
# -s, its second to -d, and the running index to -e.
i = 0
for s in new_data2:
    command = [
        'java', '-jar', '../randJar/instantiate.jar',
        '-m', 'path to metamodel',
        '-f', '-n', '1',
        '-s', str(s[0]),
        '-d', str(s[1]),
        '-o', 'path to output folder',
        '-e', str(i),
    ]
    subprocess.call(command)
    i += 1
```

The generated models used to report the results in the paper are already provided.