<a href="https://colab.research.google.com/github/CYS3013/ChatGPT-Feishu/blob/master/test_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive so the data file stored under MyDrive
# can be read from this Colab runtime.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the input table on the mounted Drive.
tpm_file_path = '/content/drive/MyDrive/idTABtpm_final.txt'

In [None]:
from transformers import BertTokenizer, BertModel, Trainer, TrainingArguments
import torch
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
!pip install accelerate -U
!pip install transformers[torch]  # restart the runtime after installing



In [None]:
# Load the tab-separated table (no header row): one record per gene with its
# identifier, TPM value and cDNA sequence. The path comes from `tpm_file_path`,
# configured earlier in the notebook, so the location is defined in one place.
df = pd.read_csv(tpm_file_path, sep='\t', header=None, names=['gene_id', 'tpm_value', 'cDNA_sequence'])

# Define a custom dataset class
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one cDNA sequence per item.

    Args:
        data: Table indexed positionally via ``.iloc``; each row must expose
            ``'cDNA_sequence'`` and ``'tpm_value'``.
        tokenizer: Object providing ``encode_plus`` that returns
            ``'input_ids'`` and ``'attention_mask'`` tensors with a leading
            batch dimension of one.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of rows available."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return its tensors together with the target.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (batch dimension
            removed) and ``'tpm_value'`` as a float32 scalar tensor.
        """
        row = self.data.iloc[idx]
        encoded = self.tokenizer.encode_plus(
            row['cDNA_sequence'],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # The tokenizer returns (1, max_length) tensors; drop the batch axis so
        # the DataLoader can stack items itself.
        sample = {
            field: encoded[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        sample['tpm_value'] = torch.tensor(row['tpm_value'], dtype=torch.float32)
        return sample

# Pretrained tokenizer used to encode the cDNA sequences.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Wrap each split in a dataset; rows are tokenized when an item is accessed.
train_dataset, val_dataset = (
    CustomDataset(data=split, tokenizer=tokenizer)
    for split in (train_df, val_df)
)

# Define the model architecture for regression
class BertRegressionModel(torch.nn.Module):
    """Encoder followed by dropout and a single-output linear head.

    Args:
        bert_model: Encoder module. It must expose ``config.hidden_size`` and,
            when called with ``input_ids``/``attention_mask``, return a mapping
            that contains ``'pooler_output'``.
    """

    def __init__(self, bert_model):
        super().__init__()
        self.bert = bert_model
        self.dropout = torch.nn.Dropout(0.1)
        # One scalar prediction per sequence.
        self.linear = torch.nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Predict one value per input sequence.

        Returns:
            Tensor with the trailing size-1 dimension of the linear head
            removed.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        head_input = self.dropout(encoded['pooler_output'])
        return self.linear(head_input).squeeze(-1)

# Load the pretrained encoder and wrap it with the regression head.
bert_model = BertModel.from_pretrained('bert-base-uncased')
model = BertRegressionModel(bert_model)


class TpmRegressionTrainer(Trainer):
    """Trainer that optimises mean-squared error on the ``tpm_value`` target.

    ``BertRegressionModel.forward`` returns a bare tensor of predictions and
    never computes a loss. The stock ``Trainer.compute_loss`` then falls back
    to ``outputs[0]`` (the first prediction in the batch) and minimises
    that, which yields a negative, unbounded "loss" and no validation loss.
    This subclass computes the regression loss explicitly instead.
    """

    # Key under which the dataset items carry the regression target.
    target_key = 'tpm_value'

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the MSE between the model's predictions and the batch targets.

        Args:
            model: The (possibly wrapped) regression model being trained.
            inputs: Collated batch holding ``tpm_value`` plus the model's
                forward arguments.
            return_outputs: When true, also return the outputs so that
                evaluation and prediction can recover the predictions.
            **kwargs: Extra arguments some ``Trainer`` versions pass (for
                example ``num_items_in_batch``); not used here.

        Returns:
            The scalar loss, or ``(loss, (loss, predictions))`` when
            ``return_outputs`` is true.
        """
        targets = inputs[self.target_key]
        # Build a filtered copy rather than popping, so the caller's batch is
        # left untouched.
        features = {key: value for key, value in inputs.items() if key != self.target_key}
        predictions = model(**features).reshape(-1)
        loss = torch.nn.functional.mse_loss(
            predictions, targets.reshape(-1).to(predictions.dtype)
        )
        # Trainer.prediction_step discards the first element of a tuple output
        # (it assumes that slot is the loss), so lead with the loss.
        return (loss, (loss, predictions)) if return_outputs else loss


# Setup training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=30,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs',
    logging_steps=100,
    save_steps=500,
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy`; switch if the installed version rejects it.
    evaluation_strategy='steps',
    eval_steps=500,
    save_total_limit=3,
    disable_tqdm=False,
    # Declare `tpm_value` as the label column: it is then kept in each batch
    # (instead of being stripped as an unused column) and evaluation reports
    # a validation loss.
    label_names=['tpm_value'],
)

# Initialize Trainer
trainer = TpmRegressionTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # Use the validation dataset for evaluation
)




In [None]:
# Run the fine-tuning loop configured above.
# NOTE(review): the output saved with this cell shows a negative, steadily
# decreasing training loss and "No log" for validation loss. A squared-error
# objective cannot be negative, so treat those saved numbers as invalid.
trainer.train()

Step,Training Loss,Validation Loss
500,-29.8293,No log
1000,-45.7741,No log
1500,-59.4331,No log
2000,-71.2687,No log
2500,-80.5613,No log


Step,Training Loss,Validation Loss
500,-29.8293,No log
1000,-45.7741,No log
1500,-59.4331,No log
2000,-71.2687,No log
2500,-80.5613,No log
3000,-88.136,No log
3500,-93.7594,No log
4000,-97.5798,No log
4500,-99.1957,No log


TrainOutput(global_step=4620, training_loss=-70.78847968031317, metrics={'train_runtime': 3821.8919, 'train_samples_per_second': 9.616, 'train_steps_per_second': 1.209, 'total_flos': 0.0, 'train_loss': -70.78847968031317, 'epoch': 30.0})

In [None]:
# Evaluate on the validation split and print the metrics dictionary the Trainer returns.
eval_results = trainer.evaluate(eval_dataset=val_dataset)
print(f"Evaluation results: {eval_results}")

Evaluation results: {'eval_runtime': 14.7671, 'eval_samples_per_second': 20.79, 'eval_steps_per_second': 2.641, 'epoch': 30.0}


In [None]:
# Run inference over the validation split.
predictions = trainer.predict(val_dataset)

# Ground truth comes straight from val_df; val_dataset was built from the same
# frame, so the rows are assumed to line up with the predictions.
y_true = val_df['tpm_value'].values
y_pred = predictions.predictions.squeeze()

# Score the regression with the three metrics, in MSE / MAE / R² order.
mse, mae, r2 = (
    metric(y_true, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(
    f"Mean Squared Error (MSE): {mse}",
    f"Mean Absolute Error (MAE): {mae}",
    f"R² Score: {r2}",
    sep="\n",
)

Mean Squared Error (MSE): 11692.113013214137
Mean Absolute Error (MAE): 104.71413195600783
R² Score: -15.081279965138538
