In [5]:
!pip install gensim tensorflow scikit-learn matplotlib seaborn



In [95]:
import numpy as np
import pandas as pd
import gensim.downloader as api
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.optimizers import Adam

In [96]:
df = pd.read_csv('merged_dataset_new.csv')

In [1]:
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [3]:
!pip install gensim tensorflow scikit-learn matplotlib seaborn



In [5]:
import gensim
pretrained_model_path = '/content/drive/My Drive/BioWordVec/bio_embedding_extrinsic'
pretrained_model = gensim.models.KeyedVectors.load_word2vec_format(pretrained_model_path, binary=True)

vocab_size = len(pretrained_model)
print(f"Pre-trained model vocabulary size: {vocab_size}")

Pre-trained model vocabulary size: 2324849


In [97]:
vector_size = pretrained_model.vector_size
print(f"Vector size of the pre-trained model: {vector_size}")


Vector size of the pre-trained model: 200


In [98]:
def preprocess_text(text):
    tokens = text.split()
    tokens = [word for word in tokens if word in pretrained_model.key_to_index]
    return tokens

def text_to_vector(text):
    tokens = preprocess_text(text)
    if not tokens:
        return np.zeros(200)
    return np.mean([pretrained_model[word] for word in tokens], axis=0)


In [99]:
df['Type_vector'] = df['Type'].apply(lambda x: text_to_vector(x))
df['Statement_vector'] = df['Statement'].apply(lambda x: text_to_vector(x))
df['Reference_vector'] = df['Reference_data'].apply(lambda x: text_to_vector(x))

In [100]:
df.head()

Unnamed: 0,test_ID,Type,Section_id,Primary_id,Secondary_id,Statement,Label,Primary_evidence_index,Secondary_evidence_index,Reference_data,Type_vector,Statement_vector,Reference_vector
0,5bc844fc-e852-4270-bfaf-36ea9eface3d,Comparison,Intervention,NCT01928186,NCT00684983,All the primary trial participants do not rece...,Contradiction,INTERVENTION 1: Diagnostic (FLT PET) Pati...,INTERVENTION 1: Arm A Patients receive or...,INTERVENTION 1: Diagnostic (FLT PET) Pati...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.20870005, 0.010578745, -0.14042668, -0.1325...","[0.27742657, -0.0015234367, -0.14305797, -0.01..."
1,86b7cb3d-6186-4a04-9aa6-b174ab764eed,Single,Eligibility,NCT00662129,,"Patients with Platelet count over 100,000/mm¬¨...",Contradiction,"PATIENT CHARACTERISTICS: ANC 1,500/mm³ ...",,DISEASE CHARACTERISTICS: Histologically or c...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.27734584, 0.14244205, -0.083380654, 0.02588...","[0.24579634, 0.06359309, -0.19657847, -0.04272..."
2,dbed5471-c2fc-45b5-b26f-430c9fa37a37,Comparison,Adverse Events,NCT00093145,NCT00703326,Heart-related adverse events were recorded in ...,Entailment,Adverse Events 1: Supraventricular tachycard...,Adverse Events 1: Atrial fibrillation 1/752 ...,Adverse Events 1: Total: 5/32 (15.63%) Feb...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.26496604, 0.15919074, -0.11285321, -0.18515...","[0.15163456, 0.20475844, -0.24133596, 0.050696..."
3,20c35c89-8d23-4be3-b603-ac0ee0f3b4de,Single,Eligibility,NCT01097642,,Adult Patients with histologic confirmation of...,Contradiction,Inclusion Criteria: Patients with histologic...,,Inclusion Criteria: Patients with histologic...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.13775751, 0.020728718, -0.15459715, -0.1009...","[0.21726197, 0.1062329, -0.1627092, -0.0487415..."
4,f17cb242-419d-4f5d-bfa4-41494ed5ac0e,Comparison,Intervention,NCT00852930,NCT02308020,Laser Therapy is in each cohort of the primary...,Contradiction,INTERVENTION 1: Laser Therapy Alone thera...,"INTERVENTION 1: Part A Abemaciclib: HR+, HE...",INTERVENTION 1: Laser Therapy Alone thera...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.2073716, 0.14273366, -0.12802854, -0.110927...","[0.21484275, -0.03353516, -0.19706348, -0.0449..."


In [101]:
X = np.hstack([df['Type_vector'].values.tolist(),
               df['Statement_vector'].values.tolist(),
               df['Reference_vector'].values.tolist()])

In [102]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Label'])

In [103]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [104]:
kf = StratifiedKFold(n_splits=5)
accuracies = []
precisions = []
recalls = []
f1s = []

for train_index, val_index in kf.split(X, y):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    model = Sequential()
    model.add(Dense(512, input_dim=X_train_scaled.shape[1], activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(256, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(128, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    optimizer = Adam(learning_rate=0.0001)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    model_checkpoint = ModelCheckpoint('Label_Prediction.h5', save_best_only=True, monitor='val_loss', mode='min')

    history = model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_data=(X_val_scaled, y_val), callbacks=[early_stopping, model_checkpoint])

    model.load_weights('Label_Prediction.h5')
    y_val_pred = (model.predict(X_val_scaled) > 0.5).astype("int32")

    accuracy = accuracy_score(y_val, y_val_pred)
    precision = precision_score(y_val, y_val_pred)
    recall = recall_score(y_val, y_val_pred)
    f1 = f1_score(y_val, y_val_pred)

    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)

print(f'Cross-validated accuracy: {np.mean(accuracies)}')
print(f'Cross-validated precision: {np.mean(precisions)}')
print(f'Cross-validated recall: {np.mean(recalls)}')
print(f'Cross-validated F1 score: {np.mean(f1s)}')

Epoch 1/100
Epoch 2/100

  saving_api.save_model(


Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 1/100
Epoch 2/100

  saving_api.save_model(


Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 1/100
Epoch 2/100

  saving_api.save_model(


Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 1/100
Epoch 2/100

  saving_api.save_model(


Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 1/100
Epoch 2/100

  saving_api.save_model(


Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Cross-validated accuracy: 0.4847058823529412
Cross-validated precision: 0.4766746773262375
Cross-validated recall: 0.4576470588235294
Cross-validated F1 score: 0.4304724529340581


In [106]:
model.load_weights('Label_Prediction.h5')

y_pred_proba = model.predict(X_test)
y_pred = (y_pred_proba > 0.5).astype("int32")

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Confusion matrix:\n {conf_matrix}')
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Confusion matrix:
 [[  8 164]
 [ 23 145]]
Accuracy: 0.45
Precision: 0.4692556634304207
Recall: 0.8630952380952381
F1 Score: 0.6079664570230607


In [None]:
coling

In [None]:
aaai