In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader

In [None]:
#!pip install transformers

In [None]:
#!pip install torch

In [2]:
df_new = pd.read_csv('pathology.tsv', sep='\t')

In [34]:
df_new.head()

Unnamed: 0,Gene,Gene name,Cancer,High,Medium,Low,Not detected,prognostic - favorable,unprognostic - favorable,prognostic - unfavorable,unprognostic - unfavorable
0,ENSG00000000003,TSPAN6,breast cancer,1.0,7.0,2.0,2.0,0.0,0.07712,0.0,0.0
1,ENSG00000000003,TSPAN6,carcinoid,0.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0
2,ENSG00000000003,TSPAN6,cervical cancer,11.0,1.0,0.0,0.0,0.0,0.08967,0.0,0.0
3,ENSG00000000003,TSPAN6,colorectal cancer,0.0,6.0,2.0,2.0,0.0,0.03562,0.0,0.0
4,ENSG00000000003,TSPAN6,endometrial cancer,10.0,2.0,0.0,0.0,0.0,0.0,0.0,0.2567


In [4]:
# Checking for missing values
print("Missing values in the dataset:")
print(df_new.isnull().sum())

Missing values in the dataset:
Gene                               0
Gene name                          0
Cancer                             0
High                           97183
Medium                         97183
Low                            97183
Not detected                   97183
prognostic - favorable        394849
unprognostic - favorable      267122
prognostic - unfavorable      393054
unprognostic - unfavorable    260635
dtype: int64


In [5]:
df_new.fillna(0, inplace=True)

In [6]:
df = df_new.head(10000)

In [7]:
len(df)

10000

In [8]:
# Label encode the target variable (Cancer types)
le = LabelEncoder()
df['Cancer'] = le.fit_transform(df['Cancer'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Cancer'] = le.fit_transform(df['Cancer'])


In [9]:
# Check distinct values in a specific column, e.g., 'Cancer'
unique_values = df['Cancer'].unique()

print("Distinct values in the 'Cancer' column:")
print(unique_values)

Distinct values in the 'Cancer' column:
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]


In [10]:
# Step 2: Create a Custom Dataset Class
# -----------------------------------------
class CancerDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.texts = dataframe['Gene name']  # Using 'Gene name' as the text feature
        self.labels = dataframe['Cancer']
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = str(self.texts[index])
        label = self.labels[index]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=False,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [11]:
# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [12]:
# Tokenization and dataset preparation
MAX_LEN = 128  # Max length of BERT input sequence
dataset = CancerDataset(df, tokenizer, MAX_LEN)

In [13]:
# Split the dataset into training and test sets
train_size = 0.8
train_dataset, test_dataset = train_test_split(dataset, test_size=1-train_size, random_state=42)

In [14]:
# Create DataLoader for both training and test sets
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [15]:
# Step 3: Fine-tuning BERT for Classification
# -----------------------------------------
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(df['Cancer'].unique()))

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"      # Evaluate after each epoch
)



In [17]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

In [19]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,3.0043,3.009039
2,3.0073,3.000426
3,2.9969,2.998971


TrainOutput(global_step=1500, training_loss=2.9997474110921223, metrics={'train_runtime': 575.4942, 'train_samples_per_second': 41.703, 'train_steps_per_second': 2.606, 'total_flos': 1578921467904000.0, 'train_loss': 2.9997474110921223, 'epoch': 3.0})

In [None]:
train_dataset

In [21]:
len(test_dataset)

2000

In [20]:
# Step 4: Model Evaluation
# -----------------------------------------
# Evaluate the model using the test dataset
eval_results = trainer.evaluate(eval_dataset=test_dataset)
print(f"Evaluation results: {eval_results}")

Evaluation results: {'eval_loss': 2.9989709854125977, 'eval_runtime': 13.52, 'eval_samples_per_second': 147.929, 'eval_steps_per_second': 9.246, 'epoch': 3.0}


In [21]:
import torch

# Step 5: Function to Make Predictions on New Data
# ------------------------------------------------------
def predict_cancer(model, tokenizer, text, max_len=128):
    """
    Given a trained model, tokenizer, and input text (gene name), this function will predict the cancer type.

    Args:
    model: Trained BERT model for classification.
    tokenizer: BERT tokenizer.
    text: Input gene name to predict cancer type for.
    max_len: Maximum length of the input sequence.

    Returns:
    Predicted cancer type.
    """
    # Tokenize and encode the text
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        return_token_type_ids=False,
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'  # Return tensors as PyTorch tensors
    )

    # Move model and inputs to the same device (GPU if available, otherwise CPU)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Make the prediction (no gradient calculation needed)
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # Get the predicted label (the index with the highest score in logits)
    logits = outputs.logits
    predicted_label = torch.argmax(logits, dim=1).item()

    # Convert label index back to the original cancer label
    predicted_cancer = le.inverse_transform([predicted_label])[0]

    return predicted_cancer

In [22]:
# Step 6: Test the Prediction Function with New Data
# ------------------------------------------------------

# Example gene names for prediction
new_gene_names = ['TSPAN6', 'BRCA1', 'TP53', 'TNMD']

# Loop through and predict the cancer type for each gene
for gene_name in new_gene_names:
    predicted_cancer = predict_cancer(model, tokenizer, gene_name)
    print(f"Gene name: {gene_name} --> Predicted Cancer: {predicted_cancer}")

Gene name: TSPAN6 --> Predicted Cancer: head and neck cancer
Gene name: BRCA1 --> Predicted Cancer: head and neck cancer
Gene name: TP53 --> Predicted Cancer: head and neck cancer
Gene name: TNMD --> Predicted Cancer: head and neck cancer
