In [1]:
from IPython.display import HTML
import random

def hide_toggle(for_next=False):
    this_cell = """$('div.cell.code_cell.rendered.selected')"""
    next_cell = this_cell + '.next()'

    toggle_text = 'Toggle show/hide'  # text shown on toggle link
    target_cell = this_cell  # target cell to control with toggle
    js_hide_current = ''  # bit of JS to permanently hide code in current cell (only when toggling next cell)

    if for_next:
        target_cell = next_cell
        toggle_text += ' next cell'
        js_hide_current = this_cell + '.find("div.input").hide();'

    js_f_name = 'code_toggle_{}'.format(str(random.randint(1,2**64)))

    html = """
        <script>
            function {f_name}() {{
                {cell_selector}.find('div.input').toggle();
            }}

            {js_hide_current}
        </script>

        <a href="javascript:{f_name}()">{toggle_text}</a>
    """.format(
        f_name=js_f_name,
        cell_selector=target_cell,
        js_hide_current=js_hide_current, 
        toggle_text=toggle_text
    )

    return HTML(html)

def all_cells():    
    return HTML('''<script>
    code_show=true; 
    function code_toggle() {
     if (code_show){
     $('div.input').hide();
     } else {
     $('div.input').show();
     }
     code_show = !code_show
    } 
    $( document ).ready(code_toggle);
    </script>
    The raw code for this IPython notebook is by default hidden for easier reading.
    To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')

# the functions below grabbed from http://www.astroml.org/book_figures/appendix/fig_neural_network.html
# our standard imports: matplotlib and numpy
import matplotlib.pyplot as plt
import numpy as np

# just to overwrite default colab style
plt.style.use('default')
plt.style.use('seaborn-talk')

radius = 0.3

arrow_kwargs = dict(head_width=0.05, fc='black')

def draw_connecting_arrow(ax, circ1, rad1, circ2, rad2):
    theta = np.arctan2(circ2[1] - circ1[1],
                       circ2[0] - circ1[0])

    starting_point = (circ1[0] + rad1 * np.cos(theta),
                      circ1[1] + rad1 * np.sin(theta))

    length = (circ2[0] - circ1[0] - (rad1 + 1.4 * rad2) * np.cos(theta),
              circ2[1] - circ1[1] - (rad1 + 1.4 * rad2) * np.sin(theta))

    ax.arrow(starting_point[0], starting_point[1],
             length[0], length[1], **arrow_kwargs)


def draw_circle(ax, center, radius):
    circ = plt.Circle(center, radius, fill=False, lw=2)
    ax.add_patch(circ)
    
# based on borrowed function we can create a new one to draw NN

def draw_net(input_size, output_size, hidden_layers=[], w=6, h=4):
  """Draw a network"""
  x = 0  # initial layer position
  
  ax = plt.subplot()
  ax.set_aspect('equal')
  ax.axis('off')

  ax.set_xlim([-2, -2 + w])
  ax.set_ylim([-h / 2 , h / 2 + 1])
  
  # set y position  
  y_input = np.arange(-(input_size - 1) / 2, (input_size + 1) / 2, 1)
  y_output = np.arange(-(output_size - 1) / 2, (output_size + 1) / 2, 1)
  y_hidden = [np.arange(-(n - 1) / 2, (n + 1) / 2, 1) for n in hidden_layers]
  
  # draw input layer
  plt.text(x, h / 2 + 0.5, "Capa\nEntrada", ha='center', va='top', fontsize=16)
  
  for i, y in enumerate(y_input):
    draw_circle(ax, (x, y), radius)
    ax.text(x - 0.9, y, '$x_%i$' % (input_size - 1 - i),
            ha='right', va='center', fontsize=16)
    draw_connecting_arrow(ax, (x - 0.9, y), 0.1, (x, y), radius)
  
  last_layer = y_input  # last layer y positions
  
  # draw hidden layers
  for ys in y_hidden:
    # shift x
    x += 2
    plt.text(x, h / 2 + 0.5, "Capa\nOculta", ha='center', va='top', fontsize=16)
    
    # draw neurons for each hidden layer
    for i, y1 in enumerate(ys):
      draw_circle(ax, (x, y1), radius)
      
      # connect a neuron with all neurons from previous layer
      if i != len(ys) - 1: # skip bias
        for y2 in last_layer:
          draw_connecting_arrow(ax, (x - 2, y2), radius, (x, y1), radius)
        
    # update last layer
    last_layer = ys
  
  x += 2  # update position for output layer
    
  # draw output layer
  plt.text(x, h / 2 + 0.5, "Capa\nSalida", ha='center', va='top', fontsize=16)

  for i, y1 in enumerate(y_output):
    draw_circle(ax, (x, y1), radius)
    ax.text(x + 0.8, y1, 'Salida', ha='left', va='center', fontsize=16)
    draw_connecting_arrow(ax, (x, y1), radius, (x + 0.8, y1), 0.1)

    # connect each output neuron with all neurons from previous layer
    for y2 in last_layer:
      draw_connecting_arrow(ax, (x - 2, y2), radius, (x, y1), radius)

all_cells()

# Proyecto 5: Procesamiento de Lenguaje Natural (PLN)

## Integrantes: 

* Daniel Julián Rojas Cruz
* Johan Tanta Villanueva

Para este proyecto se trabajó con sentencias de papers completos que contienen interacciones entre proteínas. [Dataset](https://biotext.berkeley.edu/data/prot-prot-interactions/sentences_from_full_papers_that_contain_both_proteins_for_all_interactions)

En primer lugar, se comienza con leer el dataset.

In [2]:
hide_toggle(1)

In [3]:
from pprint import pprint as pp
f = open('corpus.txt', 'r', encoding= 'latin-1')
corpus = f.read()
f.close()

Luego, identificamos la variable **clase** con sus sentencias respectivamente, por lo que observamos que hay

In [4]:
hide_toggle(1)

In [5]:
# separamos por clase
nuevo_corpus = []
for text in corpus.split('\n'):
    if(text != ''):
        nuevo_corpus.append(text)
print("%d variables clases"%(len(nuevo_corpus)))

24 variables clases


Observamos la cantidad de sentencias por cada tipo de interacción (variable clase).

In [6]:
hide_toggle(1)

In [7]:
import pandas as pd
from IPython.display import display
info = []
data = []
for item in nuevo_corpus:
    clase, textos = item.split("=====")
    sentencias = textos.split("||")
    info.append((clase, len(sentencias)))
    for paper in sentencias:
        paper_id , text = paper.split("==>")
        registro = [text,clase]
        data.append(registro)

info = sorted(info, key=lambda item : item[1], reverse=True)
tabla = pd.DataFrame()
tabla["Interaccion"] = [item[0] for item in info]
tabla["NroSentencias"] = [item[1] for item in info]

display(tabla)

Unnamed: 0,Interaccion,NroSentencias
0,upregulates,119
1,regulates,117
2,downregulates,111
3,stimulates,103
4,binds,98
5,requires,96
6,activates,89
7,synergizes with,86
8,inhibits,78
9,inactivates,68


Antes de identificar los tokens de cada sentencia, descargamos el stopwords en formato inglés

In [8]:
hide_toggle(1)

In [9]:
import nltk
import seaborn as sns
sns.set()
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/johantv/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/johantv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Procedemos a obtener los tokens por cada sentencia, por lo que definimos la función $get\_tokens(text)$ de la siguiente manera:
- Obtenemos los tokens incluídos los stopwords y convertimos cada caracter del token en minúscula.
```
def get_tokens(text):
    tokens = word_tokenize(text, 'english')
    tokens = [w.lower() for w in tokens if w[0].isalpha()]
```
- Procedemos a eliminar los stopwords
```
    tokens_sinSW = tokens[:]
    #delete stopwords
    for token in tokens_sinSW:
        if token in stopwords.words('english'):
            tokens_sinSW.remove(token)
```
- Por último, extraemos la raíz de cada token
```
    #lemmatization
    tokens_root = []
    for word in tokens_sinSW:
       
        root = porter.stem(word)
        #print(lemma)
        if root not in tokens_root:
            tokens_root.append(root)
        
    return tokens_root
```

In [10]:
hide_toggle(1)

In [11]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()
porter = PorterStemmer()

def get_tokens(text):
    tokens = word_tokenize(text, 'english')
    tokens = [w.lower() for w in tokens if w[0].isalpha()]
    
    tokens_sinSW = tokens[:]
    
    #delete stopwords
    for token in tokens_sinSW:
        if token in stopwords.words('english'):
            tokens_sinSW.remove(token)
    
    #lemmatization
    tokens_root = []
    for word in tokens_sinSW:
       
        root = porter.stem(word)
        if root not in tokens_root:
            tokens_root.append(root)
        
    return tokens_root

Procedemos a aplicar a cada sentencia la función definida anteriormente.

In [12]:
hide_toggle(1)

In [13]:
data_procesada = []

for registro in data:
    text, clase = registro
    tokens = get_tokens(text)
    text = " ".join(tokens)
    data_procesada.append([text, clase])

corpus = []
clase = []

for registro in  data_procesada:
    t, c = registro
    corpus.append(t)
    clase.append(c)

Definimos el bag of words

In [14]:
hide_toggle(1)

In [15]:
# Bag of words
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn import preprocessing
vec  = CountVectorizer()
X = vec.fit_transform(corpus)
df = pd.DataFrame(X.toarray(), columns=vec.get_feature_names())
display(df)

Unnamed: 0,01,a1,a14t,a204,a3,aa,abbrevi,aberr,abil,abl,...,zhu,zinc,zocchi,zolotukhin,zone,µf,µg,µl,µm,µmol
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1565,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1566,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Decidimos eliminar las columnas donde la cantidad de 1's es menor al 1% del total de registros = 15,70

In [16]:
hide_toggle(1)

In [17]:
import pandas as pd
import numpy as np

def count(dataframe, index_list, values_list):
    resultado = dataframe
    for i in range(len(index_list)):
        #Se obtiene el nombre de la columna y el valor 
        index = index_list[i]
        value = values_list[i]

        #Se filtra las filas que cumplen con la condicion de esa columna 
        resultado = resultado[resultado[index] == value]
    #Devuelve el numero de instancias
    c =resultado.shape[0]
    return c

columns_to_eliminate = [] 
for column in df:
    c = count(df,[column], [1])
    if (c <= 8):
        columns_to_eliminate.append(column)
        
for column in columns_to_eliminate:
    df = df.drop([column], axis=1)

display(df)

Unnamed: 0,abil,abl,abolish,abrog,absenc,accumul,acetyl,acetyltransferas,acid,act,...,without,work,would,wt,xxfg,y15f,yang,yeast,yy1,µg
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1565,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1566,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Añadimos la columna clase

In [18]:
hide_toggle(1)

In [19]:
# añadimos la columna clase
encoder = preprocessing.LabelEncoder()
encode_clase = encoder.fit_transform(clase)
df['clase'] = encode_clase
df

Unnamed: 0,abil,abl,abolish,abrog,absenc,accumul,acetyl,acetyltransferas,acid,act,...,work,would,wt,xxfg,y15f,yang,yeast,yy1,µg,clase
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1565,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,23
1566,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,23
1567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,23
1568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,23


Procedemos a separar el dataset de entrenamiento y de prueba.

In [20]:
hide_toggle(1)

In [21]:
## separar training data y test data
X = df.iloc[:,0:677]
Y = df.iloc[:,677]

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.25)

Los modelos que elegimos para realizar las pruebas fueron: **Support Vector Machines (SVM)**, **Gaussian Naive Bayes**, **Nearest centroid classifier**, **Multi-layer perceptron (MLP)**

In [22]:
hide_toggle(1)

In [23]:
# Probando modelos 

# SVM
from sklearn import svm
# Naive Bayes
from sklearn.naive_bayes import GaussianNB
# Nearest Centroid
from sklearn.neighbors import NearestCentroid
#Neural Network
from sklearn.neural_network import MLPClassifier

## 1. Inicialización: SVM

In [24]:
hide_toggle(1)

In [25]:
# Suport Vector Machine
clf = svm.SVC(decision_function_shape='ovo')

## Fase de Entrenamiento: SVM

In [26]:
hide_toggle(1)

In [27]:
clf.fit(X_train, Y_train)

SVC(decision_function_shape='ovo')

## Fase de Test: SVM

In [28]:
hide_toggle(1)

In [29]:
resultSVM = clf.predict(X_test)

## 2. Inicialización: Gaussian Naive Bayes

In [30]:
hide_toggle(1)

In [31]:
gnb = GaussianNB()

## Fase de Entrenamiento: Gaussian Naive Bayes

In [32]:
hide_toggle(1)

In [33]:
gnb.fit(X_train, Y_train)

GaussianNB()

## Fase de Test: Gaussian Naive Bayes

In [34]:
hide_toggle(1)

In [35]:
resultGNB = gnb.predict(X_test)

## 3. Inicialización: Nearest centroid classifier

In [36]:
hide_toggle(1)

In [37]:
nc = NearestCentroid()

## Fase de Entrenamiento: Nearest centroid classifier

In [38]:
hide_toggle(1)

In [39]:
nc.fit(X_train, Y_train)

NearestCentroid()

## Fase de Test: Nearest centroid classifier

In [40]:
hide_toggle(1)

In [41]:
resultNC = nc.predict(X_test)

## 4. Inicialización: Red Neuronal

In [42]:
hide_toggle(1)

In [43]:
mlp =  MLPClassifier (hidden_layer_sizes=(24,), max_iter= 100, learning_rate_init= 0.3, activation = 'logistic')

## Fase de entrenamiento: Red Neuronal

In [44]:
hide_toggle(1)

In [45]:
mlp.fit(X_train, Y_train)

MLPClassifier(activation='logistic', hidden_layer_sizes=(24,),
              learning_rate_init=0.3, max_iter=100)

## Fase de Test: Red Neuronal

In [46]:
hide_toggle(1)

In [47]:
resultMLP = mlp.predict(X_test)

# Métricas

Construimos la matriz de confusion De cada modelo

In [48]:
hide_toggle(1)

In [49]:
def construir_matriz_confusion( y_real, y_modelo, n_clases ):
    matriz = [ [0]*n_clases for _ in range(n_clases) ]
    M = len(y_real)
    for m in range(M):
        clase_real = y_real[m]
        clase_predecida = y_modelo[m]
        matriz[ clase_predecida ][ clase_real ] += 1
    return matriz
y_real = Y_test.tolist()
mc1 = construir_matriz_confusion(y_real, resultSVM, 24)
mc2 = construir_matriz_confusion(y_real, resultGNB, 24)
mc3 = construir_matriz_confusion(y_real, resultNC, 24)
mc4 = construir_matriz_confusion(y_real, resultMLP, 24)

In [50]:
hide_toggle(1)

In [51]:
def accuracy_n(matriz_confusion):
  # nro de clases 
  n_clases = len(matriz_confusion)
  true_cases  = 0
  for i in range(n_clases):
    true_cases += matriz_confusion[i][i]
  total_cases = 0
  for fila in matriz_confusion:
    total_cases += sum(fila)
  return true_cases/total_cases

def precision_n(matriz_confusion):
  n_clases = len(matriz_confusion)
  precision_list = []
  for i in range(n_clases):
    tp = matriz_confusion[i][i]
    suma = sum(matriz_confusion[i])
    if(suma == 0):
      precision_i = 0
    else:
      precision_i = tp/suma
    precision_list.append(precision_i)
  return precision_list

def recall_n(matriz_confusion):
  n_clases = len(matriz_confusion)
  recall_list = []
  for i in range(n_clases):
    tp = matriz_confusion[i][i]
    total = 0
    for j in range(n_clases):
      total += matriz_confusion[j][i]
    if(total == 0):
      recall_i = 0
    else:
      recall_i = tp/total
    recall_list.append(recall_i)
  return recall_list
  

def f1_n(matriz_confusion):
  prec_n = precision_n(matriz_confusion)
  rec_n = recall_n(matriz_confusion)
  list_f1 = []
  assert len(prec_n) == len(rec_n), "invalido"
  n_clases = len(prec_n)
  for i in range(n_clases):
    if(prec_n[i] + rec_n[i] == 0):
      f1_i = 0
    else:
      f1_i = 2 *prec_n[i]*rec_n[i]/(prec_n[i]+rec_n[i])
    list_f1.append(f1_i)
  return list_f1

def macro_precision(matriz_confusion):
  n_clases  = len(matriz_confusion)
  prec_n = precision_n(matriz_confusion)
  
  suma = 0
  for i in range(n_clases):
    suma += prec_n[i]
  return suma/n_clases

def macro_recall(matriz_confusion):
  n_clases  = len(matriz_confusion)
  rec_n = recall_n(matriz_confusion)
  
  suma = 0
  for i in range(n_clases):
    suma += rec_n[i]
  return suma/n_clases

def macro_f1(matriz_confusion):
  n_clases = len(matriz_confusion)
  f_n = f1_n(matriz_confusion)
  suma = 0
  for i in range(n_clases):
    suma += f_n[i]
  return suma/n_clases

In [52]:
hide_toggle(1)

In [53]:
print("Metrica accuracy para SVM: ", accuracy_n(mc1)*100)
print("Metrica accuracy para Gaussian NB: ", accuracy_n(mc2)*100)
print("Metrica accuracy para Nearest centroid: ", accuracy_n(mc3)*100)
print("Metrica accuracy para Red Neuronal: ", accuracy_n(mc4)*100)

Metrica accuracy para SVM:  82.69720101781171
Metrica accuracy para Gaussian NB:  75.06361323155217
Metrica accuracy para Nearest centroid:  76.33587786259542
Metrica accuracy para Red Neuronal:  81.1704834605598


In [54]:
hide_toggle(1)

In [55]:
print("Macro Precision para SVM: ", macro_precision(mc1)*100)
print("Macro Recall para SVM: ", macro_recall(mc1)*100)
print("Macro F1 para SVM: ", macro_f1(mc1)*100)

Macro Precision para SVM:  80.46379260386513
Macro Recall para SVM:  78.72361906727757
Macro F1 para SVM:  78.41488479697544


In [56]:
hide_toggle(1)

In [57]:
print("Macro Precision para Gaussian NB: ", macro_precision(mc2)*100)
print("Macro Recall para Gaussian NB: ", macro_recall(mc2)*100)
print("Macro F1 para Gaussian NB: ", macro_f1(mc2)*100)

Macro Precision para Gaussian NB:  78.2809103324892
Macro Recall para Gaussian NB:  72.16443669430184
Macro F1 para Gaussian NB:  73.50720880066885


In [58]:
hide_toggle(1)

In [59]:
print("Macro Precision para Nearest centroid: ", macro_precision(mc3)*100)
print("Macro Recall para Nearest centroid: ", macro_recall(mc3)*100)
print("Macro F1 para Nearest centroid: ", macro_f1(mc3)*100)

Macro Precision para Nearest centroid:  77.49181424056913
Macro Recall para Nearest centroid:  76.00862787585555
Macro F1 para Nearest centroid:  74.65308595477573


In [60]:
hide_toggle(1)

In [61]:
print("Macro Precision para Red Neuronal: ", macro_precision(mc4)*100)
print("Macro Recall para Red Neuronal: ", macro_recall(mc4)*100)
print("Macro F1 para Red Neuronal: ", macro_f1(mc4)*100)

Macro Precision para Red Neuronal:  77.94256477694691
Macro Recall para Red Neuronal:  77.96020077855542
Macro F1 para Red Neuronal:  77.21719441587834


## Algorimo A priori

In [62]:
hide_toggle(1)

In [63]:
from itertools import combinations
def algoritmo_a_priori(df , treshhold):
    result = []

    k = 1
    lista = [column for column in df]
    #print(lista)
    while (True):
        #  combinar en k
  
        
        # filtrar por count
        l_prima = []
        for r in combinations(lista, k):
            query = [i for i in r]
            values = [1 for i in range(k)]
            c = count(df, query , values)
            
            
            if (c >= treshhold):
                l_prima.append(query)
        
        #print("lprima ",l_prima)
        # numeros sin repeticion
        
        if (len(l_prima) == 0):
            break
        
        for r in combinations(lista, k):
     
            query = [i for i in r]
            result.append(query)


        l_next = []

        for c in l_prima:
            for i in c:
                if i not in l_next:
                    l_next.append(i)

        #print("l_next ",l_next)
        lista = l_next
        k = k +1
    
    return result

Ponemos a prueba el algoritmo a priori

In [64]:
hide_toggle(1)

In [65]:
df = df.drop(['clase'], axis=1)
df

Unnamed: 0,abil,abl,abolish,abrog,absenc,accumul,acetyl,acetyltransferas,acid,act,...,without,work,would,wt,xxfg,y15f,yang,yeast,yy1,µg
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1565,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1566,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [66]:
r1 = algoritmo_a_priori(df,treshhold=300)
print(r1)

[['abil'], ['abl'], ['abolish'], ['abrog'], ['absenc'], ['accumul'], ['acetyl'], ['acetyltransferas'], ['acid'], ['act'], ['action'], ['activ'], ['addit'], ['adhes'], ['affect'], ['affin'], ['agreement'], ['al'], ['all'], ['allow'], ['alon'], ['also'], ['alter'], ['although'], ['amino'], ['amount'], ['an'], ['analys'], ['analysi'], ['analyz'], ['and'], ['angiogen'], ['anti'], ['antibodi'], ['antigen'], ['apobec'], ['apobec3g'], ['apoptosi'], ['appear'], ['are'], ['arrest'], ['as'], ['assay'], ['assembl'], ['associ'], ['astrocyt'], ['astrogli'], ['at'], ['atf4'], ['basal'], ['base'], ['basic'], ['be'], ['been'], ['bfgf'], ['bind'], ['biolog'], ['block'], ['blood'], ['blot'], ['bodi'], ['both'], ['bound'], ['by'], ['ca'], ['caf'], ['camp'], ['can'], ['carboxyl'], ['casein'], ['cat'], ['catalyt'], ['caus'], ['cbp'], ['ccr5'], ['cd4'], ['cdc2'], ['cdc25'], ['cdk7'], ['cdk9'], ['cdna'], ['cell'], ['cellular'], ['cem15'], ['cerevisia'], ['chang'], ['chemokin'], ['chen'], ['chromatin'], ['cle

In [67]:
r2 = algoritmo_a_priori(df,treshhold=500)
print(r2)

[['abil'], ['abl'], ['abolish'], ['abrog'], ['absenc'], ['accumul'], ['acetyl'], ['acetyltransferas'], ['acid'], ['act'], ['action'], ['activ'], ['addit'], ['adhes'], ['affect'], ['affin'], ['agreement'], ['al'], ['all'], ['allow'], ['alon'], ['also'], ['alter'], ['although'], ['amino'], ['amount'], ['an'], ['analys'], ['analysi'], ['analyz'], ['and'], ['angiogen'], ['anti'], ['antibodi'], ['antigen'], ['apobec'], ['apobec3g'], ['apoptosi'], ['appear'], ['are'], ['arrest'], ['as'], ['assay'], ['assembl'], ['associ'], ['astrocyt'], ['astrogli'], ['at'], ['atf4'], ['basal'], ['base'], ['basic'], ['be'], ['been'], ['bfgf'], ['bind'], ['biolog'], ['block'], ['blood'], ['blot'], ['bodi'], ['both'], ['bound'], ['by'], ['ca'], ['caf'], ['camp'], ['can'], ['carboxyl'], ['casein'], ['cat'], ['catalyt'], ['caus'], ['cbp'], ['ccr5'], ['cd4'], ['cdc2'], ['cdc25'], ['cdk7'], ['cdk9'], ['cdna'], ['cell'], ['cellular'], ['cem15'], ['cerevisia'], ['chang'], ['chemokin'], ['chen'], ['chromatin'], ['cle

In [68]:
r3 = algoritmo_a_priori(df,treshhold=700)
print(r3)

[['abil'], ['abl'], ['abolish'], ['abrog'], ['absenc'], ['accumul'], ['acetyl'], ['acetyltransferas'], ['acid'], ['act'], ['action'], ['activ'], ['addit'], ['adhes'], ['affect'], ['affin'], ['agreement'], ['al'], ['all'], ['allow'], ['alon'], ['also'], ['alter'], ['although'], ['amino'], ['amount'], ['an'], ['analys'], ['analysi'], ['analyz'], ['and'], ['angiogen'], ['anti'], ['antibodi'], ['antigen'], ['apobec'], ['apobec3g'], ['apoptosi'], ['appear'], ['are'], ['arrest'], ['as'], ['assay'], ['assembl'], ['associ'], ['astrocyt'], ['astrogli'], ['at'], ['atf4'], ['basal'], ['base'], ['basic'], ['be'], ['been'], ['bfgf'], ['bind'], ['biolog'], ['block'], ['blood'], ['blot'], ['bodi'], ['both'], ['bound'], ['by'], ['ca'], ['caf'], ['camp'], ['can'], ['carboxyl'], ['casein'], ['cat'], ['catalyt'], ['caus'], ['cbp'], ['ccr5'], ['cd4'], ['cdc2'], ['cdc25'], ['cdk7'], ['cdk9'], ['cdna'], ['cell'], ['cellular'], ['cem15'], ['cerevisia'], ['chang'], ['chemokin'], ['chen'], ['chromatin'], ['cle