In [12]:
import torch  # Import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

Training

In [15]:
# --- Training Code ---

# Import necessary libraries
import pandas as pd  # For data manipulation
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer  # For using the GPT-2 model and training
import torch  # For deep learning operations
import evaluate  # For evaluating the model
from sklearn.model_selection import train_test_split  # For splitting data

# Set the device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the dataset from a CSV file
df = pd.read_csv("youtube_data.csv")

# Convert non-string values in 'video_title' column to strings
df['video_title'] = df['video_title'].fillna('').astype(str)

# Split data into training and evaluation sets (80% train, 20% eval)
train_df, eval_df = train_test_split(df, test_size=0.2) #random_state=42 

# Extract video titles for training and evaluation
train_titles = train_df['video_title'].tolist()
eval_titles = eval_df['video_title'].tolist()

# Initialize tokenizer and model
# model_name = "gpt2"  # Jika mau menggunakan gpt2 (model dasar)
model_name = "EdwardFang09/QuestionableYouTubeVideoTitleV1"  # Path to your initial fine-tuned model
tokenizer = AutoTokenizer.from_pretrained(model_name)  # Load the tokenizer

# Add a padding token (often set to EOS token for language models)
tokenizer.pad_token = tokenizer.eos_token 

# Load the pre-trained GPT-2 model
model = AutoModelForCausalLM.from_pretrained(model_name)
model.to(device)  # Move the model to the appropriate device

# Prepare the dataset for training
class TitlesDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings  # Store the tokenized encodings

    def __getitem__(self, idx):
        # Retrieve encoded data and create labels (targets)
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = item["input_ids"].clone()  # Labels are the same as input IDs for language modeling
        return item

    def __len__(self):
        return len(self.encodings.input_ids)  # Return the length of the dataset

# Tokenize the titles for training and evaluation
train_encodings = tokenizer(train_titles, truncation=True, padding=True)  # Truncate long titles and pad shorter ones
eval_encodings = tokenizer(eval_titles, truncation=True, padding=True) 

# Create dataset objects
train_dataset = TitlesDataset(train_encodings)
eval_dataset = TitlesDataset(eval_encodings)

# Define a metric (BLEU in this case)
metric = evaluate.load('bleu')  # Load the BLEU metric for evaluation

# Define a function to compute the BLEU score
def compute_metrics(pred):
    labels_ids = pred.label_ids  # Get the true labels
    pred_ids = pred.predictions  # Get the predicted labels

    # Decode the predicted IDs to text
    pred_str = [
        tokenizer.decode(ids, skip_special_tokens=True) 
        for ids in pred_ids 
        if all(isinstance(x, int) and x >= 0 for x in ids)  # Check for valid integer IDs
    ]

    # Replace -100 (ignore index) with EOS token ID in true labels
    labels_ids[labels_ids == -100] = tokenizer.eos_token_id 
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)  # Decode true labels

    # Check if predictions and labels are valid and have the same length
    if pred_str and label_str and len(pred_str) == len(label_str):
        # Compute the BLEU score
        result = metric.compute(predictions=pred_str, references=[[l] for l in label_str])
        return {
            "bleu": result["bleu"],
        }
    else:
        # Handle cases with empty or mismatched predictions and labels
        print("Warning: Empty predictions or labels encountered, or mismatched lengths.")
        return {"bleu": 0}  # Or another appropriate default value

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",  # Directory to save training outputs
    num_train_epochs=30,  # Number of training epochs
    per_device_train_batch_size=4,  # Batch size per device
    save_steps=1000,  # Save checkpoints every 1000 steps
    logging_dir="./logs",  # Directory for training logs
    fp16=True,  # Enable mixed precision training (if supported)
    evaluation_strategy="epoch",  # Evaluate the model at the end of each epoch
    save_strategy="epoch",  # Save the model at the end of each epoch
    load_best_model_at_end=True,  # Load the best performing model at the end of training
    metric_for_best_model="bleu",  # Use the BLEU score to select the best model
)

# Define the Trainer
trainer = Trainer(
    model=model,  # The GPT-2 model to be fine-tuned
    args=training_args,  # Training arguments
    train_dataset=train_dataset,  # Training dataset
    eval_dataset=eval_dataset,  # Evaluation dataset
    compute_metrics=compute_metrics,  # Function to compute the evaluation metric
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
trainer.save_model("./QuestionableYouTubeVideoTitleV1_finetuned")  # Save the model
tokenizer.save_pretrained("./QuestionableYouTubeVideoTitleV1_finetuned")  # Save the tokenizer


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/840 [00:00<?, ?it/s]



  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 2.1021509170532227, 'eval_bleu': 0, 'eval_runtime': 0.9795, 'eval_samples_per_second': 56.152, 'eval_steps_per_second': 4.084, 'epoch': 1.0}


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.7044501304626465, 'eval_bleu': 0, 'eval_runtime': 0.9954, 'eval_samples_per_second': 55.257, 'eval_steps_per_second': 4.019, 'epoch': 2.0}


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.5867444276809692, 'eval_bleu': 0, 'eval_runtime': 1.0106, 'eval_samples_per_second': 54.421, 'eval_steps_per_second': 3.958, 'epoch': 3.0}


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.6240445375442505, 'eval_bleu': 0, 'eval_runtime': 1.0231, 'eval_samples_per_second': 53.757, 'eval_steps_per_second': 3.91, 'epoch': 4.0}


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.6623347997665405, 'eval_bleu': 0, 'eval_runtime': 0.999, 'eval_samples_per_second': 55.055, 'eval_steps_per_second': 4.004, 'epoch': 5.0}


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.7036473751068115, 'eval_bleu': 0, 'eval_runtime': 1.0151, 'eval_samples_per_second': 54.182, 'eval_steps_per_second': 3.94, 'epoch': 6.0}


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.7288987636566162, 'eval_bleu': 0, 'eval_runtime': 1.0121, 'eval_samples_per_second': 54.345, 'eval_steps_per_second': 3.952, 'epoch': 7.0}


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.7540889978408813, 'eval_bleu': 0, 'eval_runtime': 1.0433, 'eval_samples_per_second': 52.717, 'eval_steps_per_second': 3.834, 'epoch': 8.0}


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.763527750968933, 'eval_bleu': 0, 'eval_runtime': 0.9985, 'eval_samples_per_second': 55.082, 'eval_steps_per_second': 4.006, 'epoch': 9.0}


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.7874102592468262, 'eval_bleu': 0, 'eval_runtime': 1.0243, 'eval_samples_per_second': 53.695, 'eval_steps_per_second': 3.905, 'epoch': 10.0}


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.7930964231491089, 'eval_bleu': 0, 'eval_runtime': 1.0411, 'eval_samples_per_second': 52.83, 'eval_steps_per_second': 3.842, 'epoch': 11.0}


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.787025809288025, 'eval_bleu': 0, 'eval_runtime': 1.0376, 'eval_samples_per_second': 53.009, 'eval_steps_per_second': 3.855, 'epoch': 12.0}


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.7926087379455566, 'eval_bleu': 0, 'eval_runtime': 1.0093, 'eval_samples_per_second': 54.495, 'eval_steps_per_second': 3.963, 'epoch': 13.0}


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.818159580230713, 'eval_bleu': 0, 'eval_runtime': 1.019, 'eval_samples_per_second': 53.974, 'eval_steps_per_second': 3.925, 'epoch': 14.0}


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.8094326257705688, 'eval_bleu': 0, 'eval_runtime': 1.0238, 'eval_samples_per_second': 53.721, 'eval_steps_per_second': 3.907, 'epoch': 15.0}


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.8457216024398804, 'eval_bleu': 0, 'eval_runtime': 0.971, 'eval_samples_per_second': 56.64, 'eval_steps_per_second': 4.119, 'epoch': 16.0}


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.8403187990188599, 'eval_bleu': 0, 'eval_runtime': 0.9744, 'eval_samples_per_second': 56.444, 'eval_steps_per_second': 4.105, 'epoch': 17.0}




{'loss': 0.3644, 'grad_norm': 264291.78125, 'learning_rate': 2.023809523809524e-05, 'epoch': 17.86}


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.8545863628387451, 'eval_bleu': 0, 'eval_runtime': 1.0011, 'eval_samples_per_second': 54.939, 'eval_steps_per_second': 3.996, 'epoch': 18.0}


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.8562724590301514, 'eval_bleu': 0, 'eval_runtime': 0.961, 'eval_samples_per_second': 57.235, 'eval_steps_per_second': 4.163, 'epoch': 19.0}


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.8523170948028564, 'eval_bleu': 0, 'eval_runtime': 0.9837, 'eval_samples_per_second': 55.912, 'eval_steps_per_second': 4.066, 'epoch': 20.0}


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.8588027954101562, 'eval_bleu': 0, 'eval_runtime': 0.9876, 'eval_samples_per_second': 55.691, 'eval_steps_per_second': 4.05, 'epoch': 21.0}


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.8791452646255493, 'eval_bleu': 0, 'eval_runtime': 1.0143, 'eval_samples_per_second': 54.226, 'eval_steps_per_second': 3.944, 'epoch': 22.0}


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.876784086227417, 'eval_bleu': 0, 'eval_runtime': 0.9942, 'eval_samples_per_second': 55.318, 'eval_steps_per_second': 4.023, 'epoch': 23.0}


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.8675496578216553, 'eval_bleu': 0, 'eval_runtime': 0.9896, 'eval_samples_per_second': 55.579, 'eval_steps_per_second': 4.042, 'epoch': 24.0}


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.8780399560928345, 'eval_bleu': 0, 'eval_runtime': 0.9695, 'eval_samples_per_second': 56.728, 'eval_steps_per_second': 4.126, 'epoch': 25.0}


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.8772058486938477, 'eval_bleu': 0, 'eval_runtime': 1.0419, 'eval_samples_per_second': 52.788, 'eval_steps_per_second': 3.839, 'epoch': 26.0}


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.888100504875183, 'eval_bleu': 0, 'eval_runtime': 1.0297, 'eval_samples_per_second': 53.414, 'eval_steps_per_second': 3.885, 'epoch': 27.0}


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.8937504291534424, 'eval_bleu': 0, 'eval_runtime': 1.0541, 'eval_samples_per_second': 52.179, 'eval_steps_per_second': 3.795, 'epoch': 28.0}


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.8952534198760986, 'eval_bleu': 0, 'eval_runtime': 1.0411, 'eval_samples_per_second': 52.83, 'eval_steps_per_second': 3.842, 'epoch': 29.0}


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.8969427347183228, 'eval_bleu': 0, 'eval_runtime': 1.2903, 'eval_samples_per_second': 42.626, 'eval_steps_per_second': 3.1, 'epoch': 30.0}


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


{'train_runtime': 495.8701, 'train_samples_per_second': 13.189, 'train_steps_per_second': 1.694, 'train_loss': 0.29044576372419084, 'epoch': 30.0}


('./fine_tuned_gpt2_ver1.17\\tokenizer_config.json',
 './fine_tuned_gpt2_ver1.17\\special_tokens_map.json',
 './fine_tuned_gpt2_ver1.17\\vocab.json',
 './fine_tuned_gpt2_ver1.17\\merges.txt',
 './fine_tuned_gpt2_ver1.17\\added_tokens.json',
 './fine_tuned_gpt2_ver1.17\\tokenizer.json')

Prompt here