In [1]:
import pandas as pd
import numpy as np
import transformers
import torch
from tqdm import tqdm
import os
import datasets
import json

In [2]:
json_list = [a for a in os.listdir() if ".json" in a]

In [1]:
# json_list = [a for a in json_list if "fetched_videos_foundational_mathematics_youtube_videos_with_transcripts.json" in a]

In [5]:
file_name = []
data = []

for file in json_list:
    file_name.append(file)
    output_file = '/projects/anra7539/projects/big_data/'+file
    data.append(pd.read_json(output_file))

shape = sum([d.shape[0] for d in data])

print(f"Total {shape} transcripts.")

Total 136 transcripts.


## Summarization

In [6]:
name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
device = "cuda"

model = transformers.AutoModelForCausalLM.from_pretrained(name,
                                                          load_in_8bit = True,
                                                          trust_remote_code = True,
                                             device_map = device,
                                             cache_dir='/scratch/alpine/anra7539')

tokenizer = transformers.AutoTokenizer.from_pretrained(name, truncation_side = "left")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [7]:
def summarization(transcript, prompt):
    with torch.no_grad():
        input_text = f'''{prompt}\n\nTranscript: {transcript}\nSummary:'''
        input_tokens = tokenizer(input_text, return_tensors="pt", truncation=True, 
                                 max_length=128000).to(device)
        outputs = model.generate(**input_tokens, 
                                 max_new_tokens=500, 
                                 temperature=0.1, 
                                 top_p=0.9, 
                                 repetition_penalty=1.1, 
                                 pad_token_id=tokenizer.eos_token_id, do_sample=True)        
        
        full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        summary = full_response.split("Summary:")[1].strip()
        
        last_period_idx = summary.rfind(".")
        if last_period_idx != -1:
            summary = summary[:last_period_idx + 1].strip()
    return summary

In [9]:
prompt = f'''You are an expert technical writer.
Read the transcript of a technical talk and write a detailed summary in first-person voice, as if you are explaining the ideas.
Do NOT mention or refer to the speaker, presenter, or author.
Avoid all phrases like "the speaker says," "in this talk," or "Hannes explains."
Your goal is to internalize the knowledge from the transcript and re-express it clearly and concisely, in your own words, as though you were explaining it to a peer.
Write 25–35 sentences.
'''

In [10]:
for i in range(len(file_name)):
    output_file = f'/projects/anra7539/projects/big_data/transcript_summaries/summary_{file_name[i]}'
    
    if os.path.exists(output_file):
        with open(output_file, 'r') as f:
            try:
                existing_data = [json.loads(line) for line in f]
            except json.JSONDecodeError:
                existing_data = []
    else:
        existing_data = []

    processed_indices = {item['index'] for item in existing_data}

    with open(output_file, 'a') as f:
        for j in tqdm(range(len(data[i]))):
            if j in processed_indices:
                continue 
    
            summary = summarization(data[i].Transcript[j], prompt)
            
            result = {
                "index": j,
                "Domain": data[i].Domain[j],	
                "Sub Domain": data[i]['Sub Domain'][j],
                "Topic": data[i].Topic[j],
                "Video Title": data[i]['Video Title'][j],
                "URL": data[i].URL[j],
                "Thumbnail": data[i].Thumbnail[j],
                "ID": data[i].ID[j],
                "Publish Time": data[i]['Publish Time'][j],
                "Channel": data[i].Channel[j],
                "Channel ID": data[i]['Channel ID'][j],
                "Transcript": data[i].Transcript[j],
                "Summary": summary
            }
    
            f.write(json.dumps(result) + "\n")
            f.flush()

  0%|          | 0/136 [00:00<?, ?it/s]Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
100%|██████████| 136/136 [53:14<00:00, 23.49s/it]
