# Importing some utilities

In [None]:
import glob
import sys
sys.path.append('../python/')
from json2graph import jsonFile2graph
from graphUtils import plot_graph
from statsUtils import whichFitsBetter

# Loading $R_{II}$

In [None]:
import networkx as nx
import glob
import json
import os

# Collect the JSON files of the R_II dataset. glob returns matches in an
# arbitrary, filesystem-dependent order, so the paths are sorted to make the
# order of `Gs` (and of every per-graph sample list derived from it, e.g. the
# arrays later passed to cross-validation) reproducible across machines.
files = sorted(glob.glob("../realGraphs/Ecore/R2/*.json"))

# One graph per JSON file, in the same order as `files`.
Gs = [jsonFile2graph(path) for path in files]

# RandomEMF

For each rule in RandomEMF, depending on the type of rule, we estimate its parameters. More concretely, for shapes we use the function `whichFitsBetter`, which selects the best distribution by maximum likelihood. For priorities in alternative rules, we follow the procedure described in the paper, which is based on counting how often each alternative occurs in the set $R_{II}$.

## Number of classifiers

For the rule:

``` 
Package: EPackage ->
    eClassifiers += Classifier#Distribution(parameters);
```

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def isClassifier(tp):
    """Return True when `tp` names one of the classifier node types.

    The recognised type names are 'EClass', 'EDataType' and 'EEnum';
    any other value yields False.
    """
    return tp in ('EClass', 'EDataType', 'EEnum')

# Bin edges 0, 5, ..., 195 for the classifier-count histogram.
bins = np.arange(0, 200, 5)
# For every graph, count the nodes whose 'type' attribute is a classifier type.
numberClassifiers = [
    sum(1 for n in G if isClassifier(G.nodes[n]['type']))
    for G in Gs
]
# Normalised (density) histogram of the counts.
plt.hist(numberClassifiers, bins=bins, alpha=0.5, density=True)

In [None]:
# Ask the statsUtils helper which distribution fits the per-model classifier
# counts best; the cell displays whatever the helper returns.
whichFitsBetter(numberClassifiers)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.special import factorial
from scipy.stats import nbinom
from scipy.stats import norm


# Evaluate a negative-binomial pmf (n=1.0, p=0.02159, no shift) on 0..199.
t = np.arange(200)
d = nbinom.pmf(t, n=1.0, p=0.02159, loc=0)

# Poisson pmf with rate = sample mean, left disabled as an alternative:
#np.exp(-np.mean(numberClassifiers))*np.power(np.mean(numberClassifiers), t)/factorial(t)
f = plt.figure()
# `bins` is not defined in this cell; it is taken from an earlier cell.
plt.hist(numberClassifiers, bins=bins, alpha=0.5, density=True)
# Overlay the pmf on top of the empirical density.
plt.plot(t, d, '-')
plt.show()

## Proportions of classifiers that a package has

For the rule:

``` 
alter Classifier : EClassifier ->
  	Enum#a | DataType#b |Class#c
  ;
```

In [None]:
import numpy as np
# Per-model proportions of each classifier kind among the neighbours of node 0.
# NOTE(review): presumably node 0 is the root EPackage of every graph - confirm.
ps = []
for G in Gs:
    # Counts are stored as [EClass, EDataType, EEnum]. The rule above lists the
    # alternatives as Enum#a | DataType#b | Class#c, so the printed vector is
    # in the order (c, b, a).
    p = [0, 0, 0]
    for n in G[0]:
        if (G.nodes[n]['type'] == 'EClass'):
            p[0] = p[0] + 1
        if (G.nodes[n]['type'] == 'EDataType'):
            p[1] = p[1] + 1
        if (G.nodes[n]['type'] == 'EEnum'):
            p[2] = p[2] + 1
    p = np.array(p)
    total = np.sum(p)
    # A model without any classifier neighbour would otherwise produce 0/0 = NaN
    # and turn the whole mean into NaN; keep the all-zero vector instead, as
    # the EAttribute/EReference estimation cell does.
    if total != 0:
        ps.append(p / total)
    else:
        ps.append(p)
ps = np.array(ps)
# Mean proportion per kind, rescaled by a hard-coded constant.
# NOTE(review): 0.03663906 is presumably the smallest mean proportion from an
# earlier run (so that the smallest priority becomes 1) - confirm.
print(np.mean(ps, axis = 0)/0.03663906)

## Number of eLiterals

For the rule:

``` 
Enum : EEnum ->
  	eLiterals += Literal#Distribution(parameters);
  ;
```

In [None]:
# One entry per EEnum node (over all graphs): the number of edges of type
# 'eLiterals' found in that node's adjacency.
numberEliterals = []
for G in Gs:
    for n in G:
        if G.nodes[n]['type'] != 'EEnum':
            continue
        # G[n][target] is indexed once more per edge key, hence the double loop.
        literal_edges = sum(
            1
            for target in G[n]
            for key in G[n][target]
            if G[n][target][key]['type'] == 'eLiterals'
        )
        numberEliterals.append(literal_edges)

In [None]:
# Unit-width bins with edges 0, 1, ..., 49 for the literal counts.
bins = np.arange(0, 50, 1)
# Normalised (density) histogram of the number of literals per EEnum.
plt.hist(numberEliterals, bins = bins, alpha=0.5, density = True)

# Sample mean and variance of the counts (np.var with its default ddof=0).
print('mean',np.mean(numberEliterals))
print('var',np.var(numberEliterals))

In [None]:
# Ask the statsUtils helper which distribution fits the per-EEnum literal
# counts best; the cell displays whatever the helper returns.
whichFitsBetter(numberEliterals)

In [None]:
# Evaluate a negative-binomial pmf (n=1, p=0.1349, no shift) on 0..49.
t = np.arange(50)
d = nbinom.pmf(t, n=1, p=0.1349, loc=0)

# Disabled Poisson alternative; note it still refers to numberClassifiers:
#np.exp(-np.mean(numberClassifiers))*np.power(np.mean(numberClassifiers), t)/factorial(t)
# `bins` and `nbinom` are not defined in this cell; they come from earlier cells.
plt.hist(numberEliterals, bins=bins, alpha=0.5, density=True)
# Overlay the pmf on top of the empirical density.
plt.plot(t, d, '-')
plt.show()

## Distribution Structural Features

For the rule:

``` 
Class: EClass ->
    eStructuralFeatures += Feature(self)#Distribution(parameters);
  ;
```

In [None]:
# One entry per EClass node (over all graphs): how many of the edges in that
# node's adjacency have type 'eStructuralFeatures'.
numberStrctFeat = []
for G in Gs:
    for n in G:
        if G.nodes[n]['type'] != 'EClass':
            continue
        # Types of all edges stored under G[n], one item per (neighbour, key).
        edge_types = [G[n][e][e2]['type'] for e in G[n] for e2 in G[n][e]]
        numberStrctFeat.append(edge_types.count('eStructuralFeatures'))

In [None]:
# Ask the statsUtils helper which distribution fits the per-EClass structural
# feature counts best; the cell displays whatever the helper returns.
whichFitsBetter(numberStrctFeat)

In [None]:
# Unit-width bins with edges 0, 1, ..., 49 for the structural-feature counts.
bins = np.arange(0, 50, 1)
# Normalised (density) histogram of the number of structural features per EClass.
plt.hist(numberStrctFeat, bins = bins, alpha=0.5, density = True)

# Sample mean and variance of the counts (np.var with its default ddof=0).
print('mean',np.mean(numberStrctFeat))
print('var',np.var(numberStrctFeat))

## SuperTypes

For the rule:

``` 
Class: EClass ->
    eSuperTypes +=  Uniform(model.EClassifiers.filter[
      it instanceof org.eclipse.emf.ecore.EClass
    ].filter[!this.self.EAllSuperTypes.contains(it)].map[it as org.eclipse.emf.ecore.EClass])#Distribution(parameters);
  ;
```

In [None]:
# One entry per EClass node (over all graphs): the number of edges of type
# 'eSuperTypes' found in that node's adjacency.
superTypes = []
for G in Gs:
    for n in G:
        if G.nodes[n]['type'] == 'EClass':
            cont = 0
            # G[n][e] is indexed once more per edge key, hence the double loop.
            for e in G[n]:
                for e2 in G[n][e]:
                    if G[n][e][e2]['type'] == 'eSuperTypes':
                        cont = cont + 1
            # Record the total exactly once per class, after every neighbour
            # has been inspected. Appending inside the neighbour loop would
            # store a running count per neighbour and would skip classes that
            # have no neighbours at all.
            superTypes.append(cont)

In [None]:
# Ask the statsUtils helper which distribution fits the super-type counts
# best; the cell displays whatever the helper returns.
whichFitsBetter(superTypes)

## EAttributes vs EReferences

For the rule:

``` 
alter Feature (EClass c): EStructuralFeature ->  
     if (model.EClassifiers.filter[it instanceof EDataType].size > 0) Attribute#a |if (model.EClassifiers.filter[it instanceof org.eclipse.emf.ecore.EClass].size > 0)
     Reference(c)#b
  ;
```

Estimating `a` and `b`.

In [None]:
# Per-model share of EAttribute vs. EReference nodes, stored as
# [EAttribute, EReference].
ps = []
for G in Gs:
    node_types = [G.nodes[n]['type'] for n in G]
    p = np.array([node_types.count('EAttribute'),
                  node_types.count('EReference')])
    total = np.sum(p)
    # Models with neither kind keep the all-zero vector instead of dividing by 0.
    ps.append(p / total if total != 0 else p)
ps = np.array(ps)
# Average over models, then rescale so the smaller of the two means becomes 1.
mean_ps = np.mean(ps, axis=0)
print(mean_ps / np.min(mean_ps))

## EOpposite

For the rule:

``` 
Reference(EClass c):EReference ->
    eOpposite := if (UniformBool(a)) ReferenceOpp(self.EType as EClass,self,c)
  ; 
```

We estimate `a` as the proportion of references that have an opposite reference.

In [None]:
# One entry per EReference node (over all graphs): the number of edges of type
# 'eOpposite' found in that node's adjacency.
opposite = []
for G in Gs:
    for n in G:
        if G.nodes[n]['type'] == 'EReference':
            cont = 0
            # G[n][e] is indexed once more per edge key, hence the double loop.
            for e in G[n]:
                for e2 in G[n][e]:
                    if G[n][e][e2]['type'] == 'eOpposite':
                        cont = cont + 1
            # Record the total exactly once per reference, after every
            # neighbour has been inspected. Appending inside the neighbour loop
            # would add one running count per neighbour, which distorts the
            # proportion computed from this list.
            opposite.append(cont)

In [None]:
# Fraction of the entries in `opposite` that are exactly 1; this is the value
# used as the estimate of `a`.
len([n for n in opposite if n == 1])/len(opposite)

# VIATRA and ALLOY, estimating the scope

For VIATRA and ALLOY, the distribution over the objects (i.e., $P(o)$) needs to be approximated. First, we calculate $\{o_1,\dots,o_n\}$ by counting the number of objects of each model in $R_{II}$.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Bin edges 0, 5, ..., 495 for the model-size histogram.
bins = np.arange(0, 500, 5)
# Model size = number of nodes obtained by iterating over the graph.
numberObjects = [len(list(G)) for G in Gs]
# Raw-count histogram; the return value is kept in `hist`.
hist = plt.hist(numberObjects, bins=bins, alpha=0.5)

We consider the KDE function:

$$\hat{f}_{h,K}(o)=\frac{1}{nh}\sum_{i=1}^nK\left(\frac{o-o_i}{h}\right).$$

where $K \in \{\text{gaussian, tophat}\}$ and $h\in \texttt{np.logspace(-2, -1, 20)}$. $K$ and $h$ are selected using cross-validation.

In [None]:
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV

# One single-feature row per graph: [number of nodes].
numberObjects = [[len(list(G))] for G in Gs]

# Candidate bandwidths (20 log-spaced values in [1e-2, 1e-1]) and kernels.
params = dict(
    bandwidth=np.logspace(-2, -1, 20),
    kernel=['gaussian', 'tophat'],
)
# Grid search over the candidates with the estimator's default settings.
grid = GridSearchCV(estimator=KernelDensity(), param_grid=params)
grid.fit(np.array(numberObjects))
print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))
print("best kernel: {0}".format(grid.best_estimator_.kernel))

For VIATRA we draw 300 samples from $\hat{f}_{h,K}$.

In [None]:
# Best estimator found by the grid search.
kde = grid.best_estimator_
# Draw 300 samples with a fixed seed and flatten them into a 1-D array.
new_data = kde.sample(300, random_state=0).reshape(-1)
new_data

We check that the histogram of the samples is close to the original one:

In [None]:
# Compare the empirical model sizes with the KDE samples on a common binning.
size_bin=5
# Bin edges start at -100.
# NOTE(review): presumably because KDE samples can fall below 0 - confirm.
bins = np.arange(-100, 500, size_bin)
# Recompute the flat list of model sizes (number of nodes per graph).
numberObjects = [len([n for n in G]) for G in Gs]
# Density histogram of the real model sizes; the return value is kept in `hist`.
hist = plt.hist(numberObjects, bins = bins, alpha=0.5,density=True)
# Density histogram of the sampled sizes, drawn on the same axes.
plt.hist(new_data, bins = bins, alpha=0.5,density=True)
# hist[0] holds the per-bin values; renormalise them so they sum to 1.
probs = hist[0]
probs = (probs/np.sum(probs))
# hist[1] holds the bin edges.
objs = hist[1]

Finally, we generate the config files for VIATRA and ALLOY. These files are already provided together with the final and generated models. Therefore, you should not execute these code snippets. 

**Note**: For Alloy we generate more samples (400) since the generator fails more when it looks for a solution to the logic problem.

```
import numpy as np
import random
i = 0
for s in new_data:
    with open('../configurationFiles/Ecore/model.vsconfig', 'r') as file:
        data = file.read()
    x = data.replace("#node += 12..12", "#node += "+str(int(s)))
    x = x.replace("debug =\t\t\t\"outputs/debug\"","debug =\t\t\t\"outputs"+str(i)+"/debug\"")
    x = x.replace("log =\t\t\t\"outputs/log.txt\"","log =\t\t\t\"outputs"+str(i)+"/log.txt\"")
    x = x.replace("output =\t\t\"outputs/models\"","output =\t\t\"outputs"+str(i)+"/models\"")
    x = x.replace("runs = 400","runs = 1")
    with open("../configurationFiles/Ecore/VIATRA/smallEcoreGen"+str(i)+".vsconfig", "w") as text_file:
        text_file.write(x)
        i = i + 1
```

```
new_data = kde.sample(400, random_state=0)
new_data = new_data.reshape(-1)
new_data
i = 0
for s in new_data:
    if int(s)>30:
        print(s)
        continue
    with open('../configurationFiles/Ecore/modelAlloy.vsconfig', 'r') as file:
        data = file.read()
    x = data.replace("#node += 12..12", "#node += "+str(int(s)))
    x = x.replace("debug =\t\t\t\"outputs/debug\"","debug =\t\t\t\"outputs"+str(i)+"/debug\"")
    x = x.replace("log =\t\t\t\"outputs/log.txt\"","log =\t\t\t\"outputs"+str(i)+"/log.txt\"")
    x = x.replace("output =\t\t\"outputs/models\"","output =\t\t\"outputs"+str(i)+"/models\"")
    x = x.replace("ViatraSolver", "AlloySolver")
    with open("../configurationFiles/Ecore/ALLOY/smallEcoreGen"+str(i)+".vsconfig", "w") as text_file:
        text_file.write(x)
        i = i + 1
```

# RANDOM generator

We do the same as in the previous section, but considering pairs $(o,d)$ where $o$ is the number of objects and $d$ is the average out-degree.

In [None]:
# Average out-degree of every graph.
deg_objects = [np.mean([G.out_degree(n) for n in G]) for G in Gs]
# Build the (number of objects, average out-degree) pairs directly from the
# graphs. The global `numberObjects` is rebound by several cells, sometimes as
# a flat list and sometimes as a list of one-element lists, so reading it here
# would make the result depend on which cell happened to run last.
objects_deg = np.array(
    [[len(list(G)), mean_deg] for G, mean_deg in zip(Gs, deg_objects)]
)
objects_deg

In [None]:
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV
# Candidate bandwidths (20 log-spaced values in [1e-2, 1e-1]) and kernels.
params = dict(
    bandwidth=np.logspace(-2, -1, 20),
    kernel=['gaussian', 'tophat'],
)
# Grid search for the two-dimensional (objects, degree) density.
grid2 = GridSearchCV(estimator=KernelDensity(), param_grid=params)
grid2.fit(objects_deg)
print("best bandwidth: {0}".format(grid2.best_estimator_.bandwidth))
print("best kernel: {0}".format(grid2.best_estimator_.kernel))

In [None]:
# Best estimator found by the second grid search.
kde2 = grid2.best_estimator_
# Draw 300 samples with a fixed seed; rows are indexed as s[0], s[1] by the
# generator snippet that follows.
new_data2 = kde2.sample(300, random_state=0)

Now, using the newly generated data (i.e., new pairs $(o,d)$), we call the RANDOM generator to generate the models, with a script along the following lines:

```
import numpy as np
import random
import subprocess
i = 0
for s in new_data2:
    subprocess.call(['java', '-jar', 'path to jar of the generator', 
                     '-m','path to metamodel',
                    '-f','-n','1','-s',str(s[0]),'-d',str(s[1]),'-o',
                     'path to output folder',
                    '-e',str(i)])
    i = i + 1
```

The generated models used to report the results in the paper are already provided.