# Importing some utilities

In [None]:
%cd ../..
import glob
import sys
from dmg.realism.mle import whichFitsBetter
from scripts.modelSet import datasets_supported
# Entry of `datasets_supported` registered under the 'ecore-github' key; its
# getGraphReal method is used below to turn each model file into a graph.
msetObject = datasets_supported['ecore-github']
# Directory whose files make up the training set (globbed when loading Gs).
train_path = 'data/ecore-github/train'
# Backend identifier passed as second argument to getGraphReal.
backend = 'java'

# Loading Training set

In [None]:
# Build one graph per file found in the training directory.
Gs = []
for model_file in glob.glob(train_path + "/*"):
    Gs.append(msetObject.getGraphReal(model_file, backend))

# RandomEMF

For each rule in RandomEMF, we estimate its parameters according to the type of rule. More concretely, for shapes we use the function `whichFitsBetter`, which selects the best distribution by maximum likelihood. For priorities in alternative rules, we follow the procedure described in the paper, which is based on counting the occurrences of each alternative in the set $R_{II}$.

## Number of classifiers

For the rule:

``` 
Package: EPackage ->
    eClassifiers += Classifier#Distribution(parameters);
```

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def isClassifier(tp):
    """Tell whether a node type name denotes a classifier.

    Args:
        tp: the value stored under a node's 'type' attribute.

    Returns:
        True if `tp` equals 'EClass', 'EDataType' or 'EEnum'; False otherwise.
    """
    return tp in ('EClass', 'EDataType', 'EEnum')

# One sample per training graph: how many of its nodes are classifiers.
numberClassifiers = []
for G in Gs:
    numberClassifiers.append(
        sum(1 for n in G if isClassifier(G.nodes[n]['type']))
    )

# Normalised histogram over the bin edges 0, 5, ..., 195.
bins = np.arange(0, 200, 5)
plt.hist(numberClassifiers, bins=bins, alpha=0.5, density=True)

In [None]:
# Select the distribution that best fits the per-model classifier counts
# (maximum-likelihood selection, as described in the RandomEMF introduction).
whichFitsBetter(numberClassifiers)

## Proportions of classifiers that a package has

For the rule:

``` 
alter Classifier : EClassifier ->
  	Enum#a | DataType#b |Class#c
  ;
```

In [None]:
import numpy as np
# Per-model proportions of [EClass, EDataType, EEnum] among the neighbours of
# node 0, averaged over all models and normalised so the smallest weight is 1.
# NOTE: the printed order is [Class, DataType, Enum], i.e. (c, b, a) with
# respect to the rule `Enum#a | DataType#b | Class#c`.
# NOTE(review): only G[0] (the adjacency of node 0) is scanned, presumably
# because node 0 is the root EPackage; confirm this is intended rather than
# iterating over every node of G.
ps = []
for G in Gs:
    p = [0, 0, 0]
    for n in G[0]:
        node_type = G.nodes[n]['type']
        if node_type == 'EClass':
            p[0] += 1
        elif node_type == 'EDataType':
            p[1] += 1
        elif node_type == 'EEnum':
            p[2] += 1
    p = np.array(p)
    total = np.sum(p)
    # Guard against models without any classifier: 0/0 would yield NaN and
    # turn the whole averaged result into NaN. Such models contribute a zero
    # vector, mirroring the EAttribute/EReference estimation in this notebook.
    if total != 0:
        ps.append(p / total)
    else:
        ps.append(p)
ps = np.array(ps)
mean_ps = np.mean(ps, axis=0)
print(mean_ps / np.min(mean_ps))

## Number of eliterals

For the rule:

``` 
Enum : EEnum ->
  	eLiterals += Literal#Distribution(parameters);
  ;
```

In [None]:
# One sample per EEnum node: the number of entries in its adjacency whose
# 'type' attribute is 'eLiterals'.
numberEliterals = []
for G in Gs:
    for n in G:
        if G.nodes[n]['type'] != 'EEnum':
            continue
        literal_count = sum(
            1
            for e in G[n]
            for e2 in G[n][e]
            if G[n][e][e2]['type'] == 'eLiterals'
        )
        numberEliterals.append(literal_count)

In [None]:
# Normalised histogram of the literal counts over unit-width bins 0..49.
bins = np.arange(0, 50, 1)
plt.hist(numberEliterals, bins=bins, alpha=0.5, density=True)

# Summary statistics of the same sample (np.mean and np.var with defaults).
for label, statistic in (('mean', np.mean), ('var', np.var)):
    print(label, statistic(numberEliterals))

In [None]:
# Select the distribution that best fits the number of literals per EEnum
# (maximum-likelihood selection, as described in the RandomEMF introduction).
whichFitsBetter(numberEliterals)

## Distribution Structural Features

For the rule:

``` 
Class: EClass ->
    eStructuralFeatures += Feature(self)#Distribution(parameters);
  ;
```

In [None]:
# One sample per EClass node: the number of entries in its adjacency whose
# 'type' attribute is 'eStructuralFeatures'.
numberStrctFeat = []
for G in Gs:
    for n in G:
        if G.nodes[n]['type'] != 'EClass':
            continue
        feature_count = sum(
            1
            for e in G[n]
            for e2 in G[n][e]
            if G[n][e][e2]['type'] == 'eStructuralFeatures'
        )
        numberStrctFeat.append(feature_count)

In [None]:
# Select the distribution that best fits the number of structural features
# per EClass (maximum-likelihood selection, as described in the introduction).
whichFitsBetter(numberStrctFeat)

## SuperTypes

For the rule:

``` 
Class: EClass ->
    eSuperTypes +=  Uniform(model.EClassifiers.filter[
      it instanceof org.eclipse.emf.ecore.EClass
    ].filter[!this.self.EAllSuperTypes.contains(it)].map[it as org.eclipse.emf.ecore.EClass])#Distribution(parameters);
  ;
```

In [None]:
# One sample per EClass node: the number of entries in its adjacency whose
# 'type' attribute is 'eSuperTypes'.
superTypes = []
for G in Gs:
    for n in G:
        if G.nodes[n]['type'] != 'EClass':
            continue
        cont = 0
        for e in G[n]:
            for e2 in G[n][e]:
                if G[n][e][e2]['type'] == 'eSuperTypes':
                    cont = cont + 1
        # Record the count once per class, after all of its edges were
        # scanned. Appending inside the neighbour loop would add a running
        # partial count per neighbour and skip classes without neighbours,
        # unlike the eLiterals / eStructuralFeatures estimations.
        superTypes.append(cont)

In [None]:
# Select the distribution that best fits the number of supertypes per EClass
# (maximum-likelihood selection, as described in the RandomEMF introduction).
whichFitsBetter(superTypes)

## EAttributes vs EReferences

For the rule:

``` 
alter Feature (EClass c): EStructuralFeature ->  
     if (model.EClassifiers.filter[it instanceof EDataType].size > 0) Attribute#a |if (model.EClassifiers.filter[it instanceof org.eclipse.emf.ecore.EClass].size > 0)
     Reference(c)#b
  ;
```

Estimating `a` and `b`.

In [None]:
# Per-model share of EAttribute vs EReference nodes, averaged over all models
# and normalised so that the smallest weight equals 1 (order: [a, b]).
ps = []
for G in Gs:
    node_types = [G.nodes[n]['type'] for n in G]
    p = np.array([
        node_types.count('EAttribute'),
        node_types.count('EReference'),
    ])
    total = np.sum(p)
    # Models with neither kind of feature contribute the zero vector as is.
    ps.append(p / total if total != 0 else p)
ps = np.array(ps)
mean_ps = np.mean(ps, axis=0)
print(mean_ps / np.min(mean_ps))

## EOpposite

For the rule:

``` 
Reference(EClass c):EReference ->
    eOpposite := if (UniformBool(a)) ReferenceOpp(self.EType as EClass,self,c)
  ; 
```

Estimating `a` by calculating the proportion of references that have an opposite reference.

In [None]:
# One sample per EReference node: the number of entries in its adjacency whose
# 'type' attribute is 'eOpposite'.
opposite = []
for G in Gs:
    for n in G:
        if G.nodes[n]['type'] != 'EReference':
            continue
        cont = 0
        for e in G[n]:
            for e2 in G[n][e]:
                if G[n][e][e2]['type'] == 'eOpposite':
                    cont = cont + 1
        # Record the count once per reference, after all of its edges were
        # scanned. Appending inside the neighbour loop would add one entry per
        # neighbour (with a running partial count) and distort the proportion
        # of references that have an opposite.
        opposite.append(cont)

In [None]:
# Fraction of collected samples whose eOpposite count is exactly 1.
sum(1 for n in opposite if n == 1) / len(opposite)