In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout, Bidirectional
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam

# Cargar los archivos con el separador adecuado (tabulación)
entities_df = pd.read_csv('/kaggle/input/litcoinrnn/entities_train.csv', sep='\t')
relations_df = pd.read_csv('/kaggle/input/litcoinrnn/relations_train.csv', sep='\t')
abstracts_df = pd.read_csv('/kaggle/input/litcoinrnn/abstracts_train.csv', sep='\t')

# Asegurarse de que todos los identificadores de 'abstract_id' sean strings
entities_df['abstract_id'] = entities_df['abstract_id'].astype(str)
abstracts_df['abstract_id'] = abstracts_df['abstract_id'].astype(str)
relations_df['abstract_id'] = relations_df['abstract_id'].astype(str)

# Unir los resúmenes con las relaciones usando 'abstract_id'
merged_df = pd.merge(relations_df, abstracts_df[['abstract_id', 'abstract', 'title']], on='abstract_id', how='inner')

# Unir la información de las entidades, primero para 'entity_1' y luego para 'entity_2'
merged_df = pd.merge(merged_df, entities_df[['entity_ids', 'mention', 'type', 'abstract_id']],
                     left_on=['abstract_id', 'entity_1_id'],
                     right_on=['abstract_id', 'entity_ids'],
                     how='inner')
merged_df.rename(columns={'mention': 'entity_1_mention', 'type': 'entity_1_type'}, inplace=True)

merged_df = pd.merge(merged_df, entities_df[['entity_ids', 'mention', 'type', 'abstract_id']],
                     left_on=['abstract_id', 'entity_2_id'],
                     right_on=['abstract_id', 'entity_ids'],
                     how='inner')
merged_df.rename(columns={'mention': 'entity_2_mention', 'type': 'entity_2_type'}, inplace=True)

# Crear una nueva columna combinando el abstract con las menciones de las dos entidades
merged_df['combined_input'] = (
    merged_df['abstract'] + " [ENT1] " + merged_df['entity_1_mention'] + " [ENT2] " + merged_df['entity_2_mention']
)

# Convertir la entrada combinada a minúsculas
merged_df['combined_input'] = merged_df['combined_input'].str.lower()

# Tokenización y Padding
max_words = 10000  # Número máximo de palabras en el vocabulario
max_len = 100  # Longitud máxima de las secuencias
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(merged_df['combined_input'])
sequences = tokenizer.texts_to_sequences(merged_df['combined_input'])
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

# Convertir las etiquetas de relaciones (type_x) a números
label_encoder = LabelEncoder()
merged_df['type_x'] = label_encoder.fit_transform(merged_df['type_x'])

# Parámetros del modelo
vocab_size = 10000  # Tamaño del vocabulario
embedding_dim = 128  # Dimensión del embedding
dropout_rate = 0.5  # Tasa de Dropout
num_classes = len(label_encoder.classes_)  # Número de clases (relaciones)
learning_rate = 1e-4  # Tasa de aprendizaje

# Construcción del modelo
model_rnn = Sequential()
model_rnn.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len))
model_rnn.add(Bidirectional(SimpleRNN(128, return_sequences=True)))
model_rnn.add(SimpleRNN(64, return_sequences=False))
model_rnn.add(Dropout(dropout_rate))
model_rnn.add(Dense(num_classes, activation='softmax', kernel_regularizer=l2(0.01)))

# Compilación del modelo
optimizer = Adam(learning_rate=learning_rate)
model_rnn.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Resumen del modelo
model_rnn.summary()

# Entrenamiento del modelo
history_rnn = model_rnn.fit(
    padded_sequences, 
    merged_df['type_x'], 
    epochs=10, 
    batch_size=32, 
    validation_split=0.2
)




Epoch 1/10
[1m1848/1848[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 79ms/step - accuracy: 0.6337 - loss: 1.1045 - val_accuracy: 0.4678 - val_loss: 1.9477
Epoch 2/10
[1m1848/1848[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 77ms/step - accuracy: 0.9644 - loss: 0.2481 - val_accuracy: 0.4519 - val_loss: 2.3226
Epoch 3/10
[1m1848/1848[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 77ms/step - accuracy: 0.9862 - loss: 0.1444 - val_accuracy: 0.4611 - val_loss: 2.5514
Epoch 4/10
[1m1848/1848[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 77ms/step - accuracy: 0.9924 - loss: 0.1054 - val_accuracy: 0.4290 - val_loss: 2.6815
Epoch 5/10
[1m1848/1848[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 77ms/step - accuracy: 0.9932 - loss: 0.0917 - val_accuracy: 0.4818 - val_loss: 2.6170
Epoch 6/10
[1m1848/1848[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 77ms/step - accuracy: 0.9961 - loss: 0.0756 - val_accuracy: 0.4533 - val_loss: 2.651

In [3]:
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Supongamos que ya has cargado el modelo
model = model_rnn

# Ejemplo de un nuevo abstract y dos entidades
new_abstract = "The protein kinase interacts with the receptor in a significant way."
entity_1_mention = "protein kinase"
entity_2_mention = "receptor"

# Preprocesar el texto: concatenar abstract y menciones
input_text = new_abstract.lower() + " [ENT1] " + entity_1_mention.lower() + " [ENT2] " + entity_2_mention.lower()

# Tokenizar el texto usando el tokenizer entrenado
sequence = tokenizer.texts_to_sequences([input_text])  # Convertir a secuencia numérica
padded_sequence = pad_sequences(sequence, maxlen=100, padding='post')  # Aplicar padding

# Realizar la predicción
prediction = model.predict(padded_sequence)

# Obtener la clase predicha
predicted_class = np.argmax(prediction, axis=1)[0]  # Índice de la clase con mayor probabilidad

# Convertir el índice numérico de vuelta al nombre de la relación
predicted_relation = label_encoder.inverse_transform([predicted_class])

print("Relación predicha:", predicted_relation)
print("Entidad 1:", entity_1_mention)
print("Entidad 2:", entity_2_mention)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
Relación predicha: ['Positive_Correlation']
Entidad 1: protein kinase
Entidad 2: receptor
