In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the source CSV file into a DataFrame
csv_path = "/content/data.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)

In [None]:
# Build the conditioning prompt as "<Channel>:<Category>:<Title>". astype(str)
# lets non-string cells take part in the string concatenation.
df['Prompt'] = df['Channel'].astype(str) + ":" + df['Category'].astype(str) + ":" + df['Title'].astype(str)

# Keep only the Prompt and Transcript columns. Rows whose Transcript is missing
# are dropped: the dataset class formats each row as f"{prompt}: {transcript}",
# so a missing value would be trained on as the literal text "nan".
# .copy() makes df_final an independent frame rather than a selection of df.
df_final = df.dropna(subset=['Transcript'])[['Prompt', 'Transcript']].copy()

# Save the prepared dataset (without the index column)
df_final.to_csv("prepared_prompt_dataset.csv", index=False)

print("Dataset prepared and saved as prepared_prompt_dataset.csv")

Dataset prepared and saved as prepared_prompt_dataset.csv


In [None]:
# Print ten randomly drawn rows as a quick sanity check of the prepared data
print(df_final.sample(n=10))

                                                 Prompt  \
1391  Hell's Kitchen:Food:Gordon's Politest Customer...   
1674  SEA:Science:UY Scuti - The Largest Star Ever D...   
1180  Insider News:News:Video Captures Massive Volca...   
895   Lex Clips:Blog,Science:Georges St-Pierre on hi...   
1946  Epicurious:Food:4 Levels of Omelets: Amateur t...   
462   Key & Peele:Comedy:Key & Peele - Pegasus Sighting   
2009  Key & Peele:Comedy:Key & Peele - Fraternity Br...   
467   NBC News:News:The Third Presidential Debate: H...   
702   BBC News:News:Chaos in Washington as Trump sup...   
2368  Kurzgesagt – In a Nutshell:Science:Quantum Com...   

                                             Transcript  
1391  walk and shift to lamb chef service please coo...  
1674  there are hundreds of six still ian's of stars...  
1180  the ash fall has been significant and the tsun...  
895   a lot of people ask me about khabib and that f...  
1946  hi I'm Emily and I'm a level one chef hi I'm L...  
46

In [None]:
# `ds` is the name the training cells below use for the prepared frame
ds = df_final

In [None]:
# Show five random rows of the training frame
ds.sample(n=5)

Unnamed: 0,Prompt,Transcript
350,"First We Feast:Food,Entertainment:Ken Jeong Pe...",you warm me up I say you're a nice guy I reall...
715,Austin Evans:Tech:iPhone 6 vs Samsung Galaxy S...,hey guys this is Austin so my buddy John and I...
80,"penguinz0:Blog,Comedy:This Youtuber Is Slowly ...",how far would you go for some updos on the int...
1668,"Lex Clips:Blog,Science:Number go up: Bitcoin i...",so there's going to be a moment if bitcoin con...
298,BBC News:News:Yazidi survivor: 'I was raped ev...,[Laughter] [Music] such a person even starting...


In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import os
import pickle

In [None]:
# Define dataset class
class TranscriptDataset(Dataset):
    """Map-style dataset yielding one tokenized "<prompt>: <transcript>" example per row.

    Args:
        dataframe: Table with 'Prompt' and 'Transcript' columns; rows are
            addressed by position (``iloc``).
        tokenizer: Callable invoked with ``truncation``, ``max_length``,
            ``padding`` and ``return_tensors`` keyword arguments. Its result
            must expose ``input_ids`` and ``attention_mask`` tensors with a
            leading batch axis of size one.
        max_length: Length every example is truncated / padded to.
    """

    # Label value ignored by PyTorch's cross-entropy loss (its default ignore_index).
    IGNORE_INDEX = -100

    def __init__(self, dataframe, tokenizer, max_length=1024):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (examples)."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return its tensors.

        Returns:
            dict with ``input_ids``, ``attention_mask`` and ``labels``.
            ``labels`` is an independent copy of ``input_ids`` in which every
            position with ``attention_mask == 0`` is set to ``IGNORE_INDEX``.
        """
        # Fetch the row once instead of one positional lookup per column.
        row = self.dataframe.iloc[idx]

        # Concatenate prompt and transcript as a single text input
        text = f"{row['Prompt']}: {row['Transcript']}"

        # Encode
        encodings = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt"
        )

        # squeeze(0) removes only the batch axis; a bare squeeze() would also
        # collapse the sequence axis when max_length == 1.
        input_ids = encodings.input_ids.squeeze(0)
        attention_mask = encodings.attention_mask.squeeze(0)

        # Labels must not share storage with input_ids (an in-place edit of one
        # would silently change the other), and padded positions are masked so
        # they cannot contribute to the loss.
        labels = input_ids.clone()
        labels[attention_mask == 0] = self.IGNORE_INDEX

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

In [None]:
# Load the pretrained tokenizer and language-model weights from one checkpoint
base_checkpoint = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(base_checkpoint)
# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(base_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# Wrap the prepared frame and the tokenizer in the training dataset
dataset = TranscriptDataset(dataframe=ds, tokenizer=tokenizer)

In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./gpt2_finetuned_transcript",  # checkpoints are written here
    num_train_epochs=5,
    per_device_train_batch_size=2,
    save_steps=500,           # checkpoint every 500 optimizer steps...
    save_total_limit=2,       # ...keeping only the two most recent
    prediction_loss_only=True,
    # Mixed-precision training needs a CUDA device; enabling it unconditionally
    # makes this cell fail on a CPU-only runtime, so follow the hardware.
    fp16=torch.cuda.is_available(),
    logging_steps=100,
)

In [None]:
# Batch collator for language modelling; mlm=False turns off masked-LM masking
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [None]:
# Assemble the Trainer from the arguments, model, collator and dataset
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [None]:
# Run fine-tuning with the Trainer assembled above. As the cell's last
# expression, the value returned by train() is shown as the cell output.
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mrishi2003das[0m ([33mrishi2003das-National Institute of Technology Rourkela[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,3.8734
200,3.825
300,3.8202
400,3.7795
500,3.7753
600,3.7528
700,3.7009
800,3.7827
900,3.6893
1000,3.6635


TrainOutput(global_step=6290, training_loss=3.492461605102345, metrics={'train_runtime': 2095.2086, 'train_samples_per_second': 6.002, 'train_steps_per_second': 3.002, 'total_flos': 6571494604800000.0, 'train_loss': 3.492461605102345, 'epoch': 5.0})

In [None]:
# Write the fine-tuned weights and the tokenizer files into the same directory
model.save_pretrained(save_directory="./gpt2_finetuned_transcript")
tokenizer.save_pretrained(save_directory="./gpt2_finetuned_transcript")

('./gpt2_finetuned_transcript/tokenizer_config.json',
 './gpt2_finetuned_transcript/special_tokens_map.json',
 './gpt2_finetuned_transcript/vocab.json',
 './gpt2_finetuned_transcript/merges.txt',
 './gpt2_finetuned_transcript/added_tokens.json')

In [None]:
# Extra pickle export of the whole model object for production (use with
# caution: unpickling can execute arbitrary code, so only load trusted files)
with open("gpt2_finetuned_transcripts.pkl", mode="wb") as pickle_file:
    pickle.dump(model, pickle_file)

print("Model fine-tuning completed and saved.")

Model fine-tuning completed and saved.


In [None]:
from flask import Flask, request, jsonify
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

In [None]:
# Directory holding the fine-tuned model and tokenizer files
model_path = "./gpt2_finetuned_transcript"

# Flask application that exposes the API endpoints
app = Flask(__name__)

# Load the fine-tuned tokenizer and model, then switch the model to eval mode
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)
model.eval()

In [None]:
@app.route('/generate_transcript', methods=['POST'])
def generate_transcript():
    """Generate a transcript continuation for the prompt in a JSON POST body.

    Request body: a JSON object ``{"prompt": "<text>"}``.

    Returns:
        200 with ``{"transcript": "<generated text>"}`` on success, or
        400 with ``{"error": "<message>"}`` when the body is not a JSON
        object, has no ``prompt`` key, the prompt is not a non-empty string,
        or the prompt leaves no room for generated tokens.
    """
    # Upper bound on prompt + generated tokens, passed to generate() as max_length.
    max_total_tokens = 1024

    # silent=True yields None for a missing or malformed JSON body, so the
    # client receives the JSON error below.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or 'prompt' not in data:
        return jsonify({'error': 'Missing prompt in request'}), 400

    prompt_text = data['prompt']
    # An empty prompt encodes to zero tokens, which generate() cannot continue.
    if not isinstance(prompt_text, str) or not prompt_text:
        return jsonify({'error': 'Prompt must be a non-empty string'}), 400

    # Encode prompt; keep the attention mask and place both tensors on the
    # model's device.
    encoded = tokenizer(prompt_text, return_tensors='pt')
    input_ids = encoded['input_ids'].to(model.device)
    attention_mask = encoded['attention_mask'].to(model.device)

    prompt_length = input_ids.shape[-1]
    if prompt_length >= max_total_tokens:
        return jsonify({
            'error': f'Prompt is too long ({prompt_length} tokens); '
                     f'the limit is {max_total_tokens - 1} tokens'
        }), 400

    # Generate transcript
    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_total_tokens,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
            # Pad with the EOS token explicitly rather than relying on a fallback.
            pad_token_id=tokenizer.eos_token_id
        )

    # Drop the prompt by token count before decoding. Slicing the decoded
    # string by len(prompt_text) is unreliable because decoding is not
    # guaranteed to reproduce the prompt character-for-character.
    new_token_ids = output_ids[0][prompt_length:]
    transcript_only = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()

    return jsonify({'transcript': transcript_only})

In [None]:
if __name__ == '__main__':
    # debug stays off: with host='0.0.0.0' the server listens on every
    # interface, and Flask's debug mode would expose the interactive debugger
    # (which can run arbitrary code) to anyone who can reach the port.
    app.run(host='0.0.0.0', port=5000, debug=False)