<a href="https://colab.research.google.com/github/EdwardFang09/questionable-youtube-video-title/blob/main/Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import torch  # Import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [5]:
!nvidia-smi

Wed Dec 11 16:29:46 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P8               9W /  70W |      3MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

Training

In [1]:
# Import necessary libraries
import pandas as pd  # For data manipulation
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer  # For using the GPT-2 model and training
import torch  # For deep learning operations
from sklearn.model_selection import train_test_split  # For splitting data

# Set the device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the dataset from a CSV file
df = pd.read_csv("youtube_data.csv")

# Convert non-string values in 'video_title' column to strings
df['video_title'] = df['video_title'].fillna('').astype(str)

# Split data into training and evaluation sets (80% train, 20% eval)
train_df, eval_df = train_test_split(df, test_size=0.2) #random_state=42

# Extract video titles for training and evaluation
train_titles = train_df['video_title'].tolist()
eval_titles = eval_df['video_title'].tolist()

# Initialize tokenizer and model
# model_name = "gpt2"  # Jika mau menggunakan gpt2 (model dasar)
model_name = "EdwardFang09/QuestionableYouTubeVideoTitleV1"  # Path to your initial fine-tuned model
# model_name = "QuestionableYouTubeVideoTitleV1_finetuned"
tokenizer = AutoTokenizer.from_pretrained(model_name)  # Load the tokenizer

# Add a padding token (often set to EOS token for language models)
tokenizer.pad_token = tokenizer.eos_token

# Load the pre-trained GPT-2 model
model = AutoModelForCausalLM.from_pretrained(model_name)
model.to(device)  # Move the model to the appropriate device

# Prepare the dataset for training
class TitlesDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings  # Store the tokenized encodings

    def __getitem__(self, idx):
        # Retrieve encoded data and create labels (targets)
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = item["input_ids"].clone()  # Labels are the same as input IDs for language modeling
        return item

    def __len__(self):
        return len(self.encodings.input_ids)  # Return the length of the dataset

# Tokenize the titles for training and evaluation
train_encodings = tokenizer(train_titles, truncation=True, padding=True)  # Truncate long titles and pad shorter ones
eval_encodings = tokenizer(eval_titles, truncation=True, padding=True)

# Create dataset objects
train_dataset = TitlesDataset(train_encodings)
eval_dataset = TitlesDataset(eval_encodings)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",  # Directory to save training outputs
    num_train_epochs=30,  # Number of training epochs
    per_device_train_batch_size=256,  # Batch size per device
    save_steps=100000,  # Save checkpoints every 1000 steps
    logging_dir="./logs",  # Directory for training logs
    logging_steps=100000,  # Log every 100 steps  # Added logging_steps
    fp16=True,  # Enable mixed precision training (if supported)
    evaluation_strategy="epoch",  # Evaluate the model at the end of each epoch
    save_strategy="epoch",  # Save the model at the end of each epoch
    load_best_model_at_end=True,  # Load the best performing model at the end of training
    # Since we're removing BLEU, we'll use loss to determine the best model
    metric_for_best_model="loss",
)

# Define the Trainer
trainer = Trainer(
    model=model,  # The GPT-2 model to be fine-tuned
    args=training_args,  # Training arguments
    train_dataset=train_dataset,  # Training dataset
    eval_dataset=eval_dataset,  # Evaluation dataset
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
trainer.save_model("./QuestionableYouTubeVideoTitleV1_finetuned2")  # Save the model
tokenizer.save_pretrained("./QuestionableYouTubeVideoTitleV1_finetuned2")  # Save the tokenizer

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33medward-zzz1137[0m ([33medward-zzz1137-calvin-institute-of-technology[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,0.694808
2,No log,0.69625
3,No log,0.695625
4,No log,0.695628
5,No log,0.696108
6,No log,0.695516
7,No log,0.695655
8,No log,0.697467
9,No log,0.699754
10,No log,0.702439


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


('./QuestionableYouTubeVideoTitleV1_finetuned2/tokenizer_config.json',
 './QuestionableYouTubeVideoTitleV1_finetuned2/special_tokens_map.json',
 './QuestionableYouTubeVideoTitleV1_finetuned2/vocab.json',
 './QuestionableYouTubeVideoTitleV1_finetuned2/merges.txt',
 './QuestionableYouTubeVideoTitleV1_finetuned2/added_tokens.json',
 './QuestionableYouTubeVideoTitleV1_finetuned2/tokenizer.json')

In [2]:
import os
import zipfile
from google.colab import files # Import the 'files' object from google.colab

def download_folder(folder_path):
    """Downloads a folder as a zip file from Google Colab."""
    zip_filename = f"{folder_path}.zip"
    with zipfile.ZipFile(zip_filename, 'w') as zipf:
        for root, _, filenames in os.walk(folder_path): # Rename 'files' to 'filenames'
            for filename in filenames:
                zipf.write(os.path.join(root, filename))
    files.download(zip_filename) # Now 'files' refers to the object from google.colab

In [3]:
download_folder('QuestionableYouTubeVideoTitleV1_finetuned2')  # Replace 'my_folder' with the actual folder name

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>