<a href="https://colab.research.google.com/github/Faye912/samples/blob/main/fine_tuning_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Connected to base (Python 3.12.2)

In [1]:
# !pip install transformers datasets
import pandas as pd

In [2]:
# Translation table that deletes every character in string.punctuation
# (ASCII punctuation only). Applied to transcripts to derive the
# unpunctuated side of each training pair.
import string
translator = str.maketrans(dict.fromkeys(string.punctuation))

In [None]:
# Load the punctuated hashtag transcripts that serve as the training source
hashtag_df = pd.read_csv(filepath_or_buffer="punctuated_hashtag_data.csv")

In [None]:
# Preview the raw (punctuated) transcript column
hashtag_df['transcript']

In [None]:
# Drop rows with a missing transcript before casting: astype(str) would turn
# NaN into the literal string "nan" and leak bogus "nan" -> "nan" pairs into
# the training data. The index is reset so the frame keeps a plain 0..n-1
# range after rows are removed.
hashtag_df = hashtag_df.dropna(subset=['transcript']).reset_index(drop=True)
hashtag_df['transcript'] = hashtag_df['transcript'].astype(str)
# Unpunctuated model input: the transcript with every character covered by
# `translator` removed.
hashtag_df['transcript_no_punct'] = hashtag_df['transcript'].apply(lambda x: x.translate(translator))

In [None]:
# Assemble the two-column training frame:
#   "unpunctuated" -> model input, "punctuated" -> target text
hashtag_train_df = hashtag_df[['transcript_no_punct', 'transcript']].rename(
    columns={'transcript_no_punct': "unpunctuated", 'transcript': "punctuated"}
)

In [None]:
from datasets import Dataset
# Wrap the training pairs in a Hugging Face Dataset so they can be tokenized with .map()
dataset = Dataset.from_pandas(df=hashtag_train_df)

In [None]:
# Preview the (unpunctuated, punctuated) training pairs
hashtag_train_df

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Start from the pretrained "t5-small" checkpoint; the model and its
# tokenizer are both loaded from the same checkpoint name.
model_name = "t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def tokenize(batch, max_length=128):
    """Tokenize a batch of (unpunctuated, punctuated) pairs for seq2seq training.

    Args:
        batch: mapping with 'unpunctuated' (model inputs) and 'punctuated'
            (target texts), each a list of strings, as passed by
            ``Dataset.map(..., batched=True)``.
        max_length: length that both inputs and targets are padded or
            truncated to.

    Returns:
        The encoded inputs with an added "labels" entry holding the target
        token ids, where padding positions are replaced by -100.
    """
    inputs = tokenizer(batch['unpunctuated'], padding='max_length', truncation=True, max_length=max_length)
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    targets = tokenizer(text_target=batch['punctuated'], padding='max_length', truncation=True, max_length=max_length)
    # Mask padding in the labels with -100 so the loss ignores those positions;
    # otherwise most of the training signal for short texts comes from
    # predicting pad tokens.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in targets["input_ids"]
    ]
    return inputs

# Tokenize in batches and drop the raw text columns, leaving only the
# fields returned by `tokenize`.
dataset = dataset.map(
    function=tokenize,
    batched=True,
    remove_columns=['unpunctuated', 'punctuated'],
)

In [None]:
# Hold out 10% of the examples for evaluation. A fixed seed makes the split
# (and therefore the reported eval metrics) reproducible across kernel restarts.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
from transformers import TrainingArguments
from transformers import Trainer

import inspect

# The `evaluation_strategy` keyword was renamed to `eval_strategy` in newer
# transformers releases and the old name is no longer accepted there. Pick
# whichever keyword the installed TrainingArguments exposes so this cell runs
# on both older and newer versions.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./punctuate_tiktok_finetuned",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,  # keep at most two checkpoints on disk
    **{_eval_strategy_key: "epoch"},  # evaluate at the end of every epoch
)

# Trainer fine-tunes `model` on the train split and evaluates on the held-out split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"]
)

In [None]:
# Run fine-tuning with the arguments and dataset splits configured on `trainer`
trainer.train()

In [None]:
# Save the fine-tuned model and the tokenizer side by side in one directory,
# so both can later be reloaded from that single path.
trainer.save_model("./punctuate_tiktok_finetuned")
tokenizer.save_pretrained("./punctuate_tiktok_finetuned")

In [None]:
# testing
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Reload the fine-tuned artifacts from disk; this rebinds the global
# `model` and `tokenizer` used by the inference helper.
model_dir = "./punctuate_tiktok_finetuned"

model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

In [None]:
def punctuate_text_tiktok(text, max_length=128, max_new_tokens=128, num_beams=4):
    """Restore punctuation in `text` using the fine-tuned seq2seq model.

    Uses the module-level `tokenizer` and `model`.

    Args:
        text: unpunctuated input string.
        max_length: maximum number of input tokens; longer inputs are
            truncated. Defaults to 128 to match the length used when
            tokenizing the training data.
        max_new_tokens: maximum number of tokens to generate. Defaults to 128
            so outputs are not cut off earlier than the training targets were.
        num_beams: beam width for beam search.

    Returns:
        The decoded model output with special tokens removed.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        repetition_penalty=1.2,  # initial samples repeated
        no_repeat_ngram_size=3,
        num_beams=num_beams
    )
    # generate() returns a batch; a single string was passed in, so take the first sequence.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Hand-written unpunctuated sentences used as a quick qualitative check of
# the fine-tuned model.
samples = [
    "i dont know what to do today maybe ill go to the gym later",
    "hey guys welcome back to my channel today were gonna talk about skincare routines",
    "what do you think about this dress should i buy it or not",
    "cant believe its already monday again",
    "this is so funny i cant stop laughing"
]

# Print each input next to the model's punctuated version for side-by-side inspection.
for s in samples:
    print("Original:", s)
    print("Punctuated:", punctuate_text_tiktok(s))

In [None]:
# Persist the training pairs. index=False keeps the DataFrame index out of the
# file so re-reading the CSV does not produce a spurious unnamed column.
hashtag_train_df.to_csv("hashtag_training_set.csv", index=False)