In [1]:
import re
import time
import nltk
import ssl
import math
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE
from datetime import datetime
from collections import Counter, defaultdict
from scipy.sparse import hstack
from nltk.corpus import stopwords
from mlxtend.classifier import StackingClassifier

from sklearn import model_selection
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold 
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import normalized_mutual_info_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# try:
#     _create_unverified_https_context = ssl._create_unverified_context
# except AttributeError:
#     pass
# else:
#     ssl._create_default_https_context = _create_unverified_https_context

# warnings.filterwarnings("ignore");
# nltk.download()


data = pd.read_csv('./training_variants')
print('Number of data points:', data.shape[0])
print('Number of features:', data.shape[1])
print('Features:', data.columns.values)
data.head()

Number of data points: 3321
Number of features: 4
Features: ['ID' 'Gene' 'Variation' 'Class']


Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4


In [2]:
import pandas as pd

data_text =pd.read_csv("./training_text",sep="\|\|",engine="python",names=["ID","TEXT"],skiprows=1)
print('Number of data points : ', data_text.shape[0])
print('Number of features : ', data_text.shape[1])
print('Features : ', data_text.columns.values)
data_text.head()

Number of data points :  3321
Number of features :  2
Features :  ['ID' 'TEXT']


Unnamed: 0,ID,TEXT
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...


In [3]:
stop_words = set(stopwords.words('english'))

def nlp_preprocessing(total_text, index, column):
    if type(total_text) is not int:
        string = ""
        # Replace every special char with space
        total_text = re.sub('[^a-zA-Z0-9\n]', ' ', total_text)
        # Replace multiple spaces with single space
        total_text = re.sub('\s+',' ', total_text)
        # Converting all the chars into lower-case
        total_text = total_text.lower()
        
        for word in total_text.split():
        # If the word is not a stop word then retain that word from the data
            if not word in stop_words:
                string += word + " "
        
        data_text[column][index] = string

In [4]:
start_time = time.perf_counter()
for index, row in data_text.iterrows():
    if type(row['TEXT']) is str:
        nlp_preprocessing(row['TEXT'], index, 'TEXT')
    else:
        print("There is no text description for id:", index)
print('Time took for preprocessing the text:', time.perf_counter() - start_time, "seconds")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_text[column][index] = string


There is no text description for id: 1109
There is no text description for id: 1277
There is no text description for id: 1407
There is no text description for id: 1639
There is no text description for id: 2755
Time took for preprocessing the text: 25.929852092000147 seconds


In [5]:
# Merging both gene_variations and text data based on ID
result = pd.merge(data, data_text, on='ID', how='left')
result.head()

Unnamed: 0,ID,Gene,Variation,Class,TEXT
0,0,FAM58A,Truncating Mutations,1,cyclin dependent kinases cdks regulate variety...
1,1,CBL,W802*,2,abstract background non small cell lung cancer...
2,2,CBL,Q249E,2,abstract background non small cell lung cancer...
3,3,CBL,N454D,3,recent evidence demonstrated acquired uniparen...
4,4,CBL,L399V,4,oncogenic mutations monomeric casitas b lineag...


In [6]:
result[result.isnull().any(axis=1)]

Unnamed: 0,ID,Gene,Variation,Class,TEXT
1109,1109,FANCA,S1088F,1,
1277,1277,ARID5B,Truncating Mutations,1,
1407,1407,FGFR3,K508M,6,
1639,1639,FLT1,Amplification,6,
2755,2755,BRAF,G596C,7,


In [7]:
result.loc[result['TEXT'].isnull(),'TEXT'] = result['Gene'] + ' ' + result['Variation']

result.head()

Unnamed: 0,ID,Gene,Variation,Class,TEXT
0,0,FAM58A,Truncating Mutations,1,cyclin dependent kinases cdks regulate variety...
1,1,CBL,W802*,2,abstract background non small cell lung cancer...
2,2,CBL,Q249E,2,abstract background non small cell lung cancer...
3,3,CBL,N454D,3,recent evidence demonstrated acquired uniparen...
4,4,CBL,L399V,4,oncogenic mutations monomeric casitas b lineag...


In [8]:
y_true = result['Class'].values

result.Gene = result.Gene.str.replace('\s+', '_')
result.Variation = result.Variation.str.replace('\s+', '_')

# Split the data into test and train by maintaining same distribution of output varaible 'y_true' [stratify=y_true]
X_train, test_df, y_train, y_test = train_test_split(result, y_true, stratify=y_true, test_size=0.2)
# Split the train data into train and cross validation by maintaining same distribution of output varaible 'y_train' [stratify=y_train]
train_df, cv_df, y_train, y_cv = train_test_split(X_train, y_train, stratify=y_train, test_size=0.2)



  result.Gene = result.Gene.str.replace('\s+', '_')
  result.Variation = result.Variation.str.replace('\s+', '_')


In [9]:
train_df

Unnamed: 0,ID,Gene,Variation,Class,TEXT
2548,2548,BRCA1,Y1853*,4,abstract germline mutations inactivate tumor s...
966,966,ESR1,S463P,2,seventy percent breast cancers express estroge...
924,924,PDGFRA,Y849C,7,gastrointestinal stromal tumor gist common mes...
568,568,SMAD3,W406A,1,hub proteins connected binding interactions ma...
2742,2742,BRAF,L485_Q494del,2,identifi ed previously undiscovered braf frame...
...,...,...,...,...,...
2203,2203,PTEN,A126V,4,pten phosphatase tensin homolog phosphatase un...
2308,2308,JAK1,A723D,7,summary hepatocellular adenomas hca benign liv...
3210,3210,RB1,S567L,1,hereditary predisposition retinoblastoma cause...
699,699,CDKN2B,D86N,4,molecular pathogenesis sporadic parathyroid ad...


In [10]:
import torch
import pandas as pd
from transformers import XLNetTokenizer, XLNetModel, XLNetForSequenceClassification, BertTokenizer

X_TRAIN = train_df[['Variation', 'Gene', 'TEXT']]
X_TEST = test_df[['Variation', 'Gene', 'TEXT']]

X_TRAIN['TEXT'] = X_TRAIN[X_TRAIN.columns[0:]].apply(lambda x: '| '.join(x.dropna().astype(str)), axis=1)
X_TEST['TEXT'] = X_TEST[X_TEST.columns[0:]].apply(lambda x: '| '.join(x.dropna().astype(str)), axis=1)


#SEPARATE INTO SENTENCES
X_TRAIN_SENTENCES = X_TRAIN.TEXT.values
X_TEST_SENTENCES = X_TEST.TEXT.values


X_TRAIN_SENTENCES = [sentence + " [SEP] [CLS]" for sentence in X_TRAIN_SENTENCES]
X_TEST_SENTENCES = [sentence + " [SEP] [CLS]" for sentence in X_TEST_SENTENCES]

  from .autonotebook import tqdm as notebook_tqdm
2023-01-04 19:30:16.861591: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_TRAIN['TEXT'] = X_TRAIN[X_TRAIN.columns[0:]].apply(lambda x: '| '.join(x.dropna().astype(str)), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#retur

We need to add special tokens (“[SEP]” and “[CLS]”) at the beginning and end of each sentence for XLNet to work properly.
With XLNet the token pattern looks like this:
## Sentence_A + [SEP] + Sentence_B + [SEP] + [CLS]


In [11]:
#Load Xlnet tokenizer
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)

#ENCODER
def xlnet_encode(sentences):
    input_ids = []
    attention_masks = []

    for sentence in sentences:
        encoded = tokenizer.encode_plus(
            
            sentence,
            add_special_tokens=True,
            max_length=4096,
            pad_to_max_length=True,
            return_attention_mask=True
            
        )

        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    return np.array(input_ids), np.array(attention_masks)

    


In [12]:
train_input_ids,train_attention_masks = xlnet_encode(X_TRAIN_SENTENCES)
test_input_ids,test_attention_masks = xlnet_encode(X_TEST_SENTENCES)


print(train_input_ids[0], train_attention_masks[0])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[  17  117 1010 ...  142    4    3] [1 1 1 ... 1 1 1]


In [13]:
from keras.utils.np_utils import to_categorical

Y_TRAIN = to_categorical(y_train, num_classes=10, dtype='int32')
Y_TRAIN_TENSOR = torch.tensor(Y_TRAIN)
Y_TEST = to_categorical(y_test, num_classes=10, dtype='int32')
Y_TEST_TENSOR = torch.tensor(Y_TEST)

train_input_ids_tensor = torch.tensor(train_input_ids)
test_input_ids_tensor = torch.tensor(test_input_ids)
train_attention_masks_tensor = torch.tensor(train_attention_masks)
test_attention_masks_tensor = torch.tensor(test_attention_masks)


print(Y_TEST_TENSOR)
print(Y_TRAIN_TENSOR)



print(train_input_ids_tensor[0], len(train_input_ids_tensor[0]))
print(test_input_ids_tensor[0], len(test_input_ids_tensor[0]))


tensor([[0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        ...,
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0]], dtype=torch.int32)
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 1,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0],
        ...,
        [0, 1, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 0, 0]], dtype=torch.int32)
tensor([  17,  117, 1010,  ...,  142,    4,    3]) 4096
tensor([ 17,  93, 233,  ..., 316,   4,   3]) 4096


In [14]:
print(train_attention_masks_tensor.shape)
print(train_input_ids_tensor.shape)
print(Y_TRAIN_TENSOR.shape)

print(test_attention_masks_tensor.shape)
print(test_input_ids_tensor.shape)
print(Y_TEST_TENSOR.shape[0])




torch.Size([2124, 4096])
torch.Size([2124, 4096])
torch.Size([2124, 10])
torch.Size([665, 4096])
torch.Size([665, 4096])
665


In [15]:
import tensorflow as tf
from transformers import TFXLNetModel, XLNetTokenizer, XLNetModel

xlnet_model = 'xlnet-base-cased'

def create_model_xlnet(xlnet_model, num_labels):
    word_inputs = tf.keras.Input(shape=(4096,), name='word_inputs', dtype='int32')

    
    xlnet = TFXLNetModel.from_pretrained(xlnet_model)

    xlnet.classifier = torch.nn.Linear(768, num_labels)


    xlnet_encodings = xlnet(word_inputs)[0]

    # Collect last step from last hidden state (CLS)
    doc_encoding = tf.squeeze(xlnet_encodings[:, -1:, :], axis=1)
    
    doc_encoding = tf.keras.layers.Dropout(.1)(doc_encoding)
     
    outputs = tf.keras.layers.Dense(10, activation='softmax', name='outputs')(doc_encoding)

    model = tf.keras.Model(inputs=[word_inputs], outputs=[outputs], trainable=True)

    for layer in xlnet.layers:
        layer.trainable = True
        
    model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

    return model

def create_xlnet_sequence_model(xlnet_model, num_labels, device):

    model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels = num_labels)

    model = model.to(device)

    return model

In [16]:
import torch
# xlnet = create_model_xlnet(xlnet_model,9)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
xlnet = create_xlnet_sequence_model(xlnet_model=xlnet_model, num_labels=10, device=device)


Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [17]:
#history_xl = xlnet.fit(train_input_ids,Y_TRAIN,validation_data=(test_input_ids,Y_TEST), epochs=4,batch_size=1)

from transformers import AdamW , get_linear_schedule_with_warmup

EPOCHS = 3
BATCH_SIZE = 1

param_optimizer = list(xlnet.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
                                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
                                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay':0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5)

total_steps = Y_TRAIN_TENSOR.shape[0] * EPOCHS

scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)



In [18]:
##

In [19]:
from sklearn import metrics
from torch import nn


def train_epoch(model, optimizer, device, scheduler, n_examples):
    model = model.train()
    losses = []
    acc = 0
    counter = 0
    blocks = Y_TRAIN_TENSOR.shape[0] // BATCH_SIZE
    
    for i in range(0,blocks):
        begin = i*BATCH_SIZE
        end = begin+BATCH_SIZE
        input_ids = train_input_ids_tensor[begin:end].reshape(BATCH_SIZE,4096).to(device)
        attention_mask = train_attention_masks_tensor[begin:end].reshape(BATCH_SIZE,4096).to(device)
        targets = torch.tensor(y_train)[begin:end].to(device)


        print(
            f"STEP # {i+1} \n", 
            "INPUT_SHAPE => ", input_ids.shape, "\n", 
            "ATTENTION_MASK_SHAPE =>", attention_mask.shape, "\n", 
            "TARGETS_SHAPE =>", targets.shape, "\n")
        outputs = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels = targets)
        loss = outputs[0]
        logits = outputs[1]

        # preds = preds.cpu().detach().numpy()
        _, prediction = torch.max(outputs[1], dim=1)
        targets = targets.cpu().detach().numpy()
        prediction = prediction.cpu().detach().numpy()
        accuracy = metrics.accuracy_score(targets, prediction)

        acc += accuracy
        losses.append(loss.item())

        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        counter = counter + 1

    return acc / counter, np.mean(losses)

In [20]:
%%time
history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        xlnet,     
        optimizer, 
        device, 
        scheduler, 
        Y_TEST_TENSOR.shape[0]
    )

    print(f'Train loss {train_loss} Train accuracy {train_acc}')

Epoch 1/3
----------
STEP # 1 
 INPUT_SHAPE =>  torch.Size([1, 4096]) 
 ATTENTION_MASK_SHAPE => torch.Size([1, 4096]) 
 TARGETS_SHAPE => torch.Size([1]) 



KeyboardInterrupt: 

In [None]:
torch.save(xlnet.state_dict(), './model.pth')

In [21]:
model = xlnet.load_state_dict(torch.load('./model.pth', map_location=torch.device('cpu')))

In [22]:
def eval_model(model, device, n_examples):
    model = model.eval()
    losses = []
    acc = 0
    counter = 0
  
    with torch.no_grad():
        
        blocks = Y_TEST_TENSOR.shape[0] // BATCH_SIZE
    
        for i in range(0,blocks):
            begin = i*BATCH_SIZE
            end = begin+BATCH_SIZE
            input_ids = test_input_ids_tensor[begin:end].reshape(1,4096).to(device)
            attention_mask = test_attention_masks_tensor[begin:end].reshape(1,4096).to(device)
            targets = torch.tensor(y_test)[begin:end].to(device)
            
            outputs = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels = targets)
            loss = outputs[0]
            logits = outputs[1]

            _, prediction = torch.max(outputs[1], dim=1)
            targets = targets.cpu().detach().numpy()
            prediction = prediction.cpu().detach().numpy()
            accuracy = metrics.accuracy_score(targets, prediction)

            acc += accuracy
            print(f"Accuracy => {acc}")
            losses.append(loss.item())
            counter += 1

    return acc / counter, np.mean(losses)

In [23]:
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)


    val_acc, val_loss = eval_model(
        xlnet,
        device, 
        Y_TEST_TENSOR.shape[0]
    )

    print(f'Val loss {val_loss} Val accuracy {val_acc}')
    print()

Epoch 1/3
----------
Accuracy => 1.0
Accuracy => 2.0
Accuracy => 3.0
Accuracy => 4.0
Accuracy => 4.0
Accuracy => 4.0
Accuracy => 5.0
Accuracy => 6.0
Accuracy => 6.0
Accuracy => 6.0
Accuracy => 6.0
Accuracy => 7.0
Accuracy => 7.0
Accuracy => 8.0
Accuracy => 8.0
Accuracy => 8.0
Accuracy => 8.0
Accuracy => 9.0
Accuracy => 10.0
Accuracy => 10.0
Accuracy => 10.0
Accuracy => 10.0
Accuracy => 10.0
Accuracy => 10.0
Accuracy => 10.0
Accuracy => 10.0
Accuracy => 10.0
Accuracy => 11.0
Accuracy => 11.0
Accuracy => 11.0
Accuracy => 11.0
Accuracy => 11.0
Accuracy => 11.0
Accuracy => 11.0
Accuracy => 11.0
Accuracy => 11.0
Accuracy => 12.0
Accuracy => 12.0
Accuracy => 13.0
Accuracy => 13.0
Accuracy => 14.0
Accuracy => 15.0
Accuracy => 16.0
Accuracy => 16.0
Accuracy => 16.0
Accuracy => 16.0
Accuracy => 17.0
Accuracy => 17.0
Accuracy => 17.0
Accuracy => 17.0
Accuracy => 17.0
Accuracy => 18.0
Accuracy => 19.0
Accuracy => 19.0
Accuracy => 20.0
Accuracy => 20.0
Accuracy => 20.0
Accuracy => 21.0
Accuracy =>

In [None]:
data = pd.read_csv('./test_variants')
print('Number of data points:', data.shape[0])
print('Number of features:', data.shape[1])
print('Features:', data.columns.values)
data.head()

Number of data points: 5668
Number of features: 3
Features: ['ID' 'Gene' 'Variation']


Unnamed: 0,ID,Gene,Variation
0,0,ACSL4,R570S
1,1,NAGLU,P521L
2,2,PAH,L333F
3,3,ING1,A148D
4,4,TMEM216,G77A


In [None]:
data_text =pd.read_csv("./test_text",sep="\|\|",engine="python",names=["ID","TEXT"],skiprows=1)
print('Number of data points : ', data_text.shape[0])
print('Number of features : ', data_text.shape[1])
print('Features : ', data_text.columns.values)
data_text.head()

Number of data points :  5668
Number of features :  2
Features :  ['ID' 'TEXT']


Unnamed: 0,ID,TEXT
0,0,2. This mutation resulted in a myeloproliferat...
1,1,Abstract The Large Tumor Suppressor 1 (LATS1)...
2,2,Vascular endothelial growth factor receptor (V...
3,3,Inflammatory myofibroblastic tumor (IMT) is a ...
4,4,Abstract Retinoblastoma is a pediatric retina...


In [None]:
start_time = time.perf_counter()
for index, row in data_text.iterrows():
    if type(row['TEXT']) is str:
        nlp_preprocessing(row['TEXT'], index, 'TEXT')
    else:
        print("There is no text description for id:", index)
print('Time took for preprocessing the text:', time.perf_counter() - start_time, "seconds")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_text[column][index] = string


There is no text description for id: 1623
Time took for preprocessing the text: 39.970932785989135 seconds


In [None]:
# Merging both gene_variations and text data based on ID
result = pd.merge(data, data_text, on='ID', how='left')
result.head(5)

Unnamed: 0,ID,Gene,Variation,TEXT
0,0,ACSL4,R570S,2 mutation resulted myeloproliferative phenoty...
1,1,NAGLU,P521L,abstract large tumor suppressor 1 lats1 serine...
2,2,PAH,L333F,vascular endothelial growth factor receptor ve...
3,3,ING1,A148D,inflammatory myofibroblastic tumor imt neoplas...
4,4,TMEM216,G77A,abstract retinoblastoma pediatric retinal tumo...


In [None]:

#X_TEST_SENTENCES = [sentence + " [SEP] [CLS]" for sentence in X_TEST_SENTENCES]
print(result.Variation.values[0], result['Class'].values)
print(result.Gene.values[0])
print(result.TEXT.values[0])

KeyError: 'Class'