# Step 1: Extract and transcribe sentences from video

In [None]:
import os
import sys
import pandas as pd
import assemblyai as aai
import os
from pytubefix import YouTube

import pandas as pd
import torch
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM


# Suppress warnings
# `warnings` is imported here because the import cell above does not bring it
# into scope; without it the filterwarnings call raises NameError.
import warnings

warnings.filterwarnings("ignore")
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Suppress TensorFlow logging
os.environ['PYTHONWARNINGS'] = 'ignore'  # Suppress Python warnings


In [None]:
# --- Pipeline configuration ---
video_url = "https://www.youtube.com/watch?v=dN3mv5QiZLY&t=952s"  # video to transcribe
output_path = "extracted_sentences.csv"  # CSV written by the transcription step
model_path = "./translation_model"  # path handed to from_pretrained below

# SECURITY: an API key is committed in this file. Supply the key through the
# ASSEMBLYAI_API_KEY environment variable instead; the literal below is kept
# only as a fallback so existing runs keep working. Rotate this key and remove
# the fallback as soon as possible.
aai.settings.api_key = os.environ.get(
    "ASSEMBLYAI_API_KEY", '4ba97f247dd44f86b2c51a29f14caa26'
)

# Load the tokenizer and the TensorFlow seq2seq model from model_path.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_path)


In [None]:
# aai.settings.api_key = ASSEMBLYAI_API_KEY


In [None]:

def _format_timestamp(ms):
    """Render a millisecond offset as ``MM:SS.mmm`` (minutes are not capped at 59)."""
    minutes, remainder = divmod(int(ms), 60_000)
    seconds, millis = divmod(remainder, 1_000)
    return f"{minutes:02d}:{seconds:02d}.{millis:03d}"


def video_transcription(video_url, output_path):
    """Download a YouTube video's audio, transcribe it, and save the sentences.

    The audio-only stream is downloaded into the current working directory,
    transcribed by AssemblyAI with ``language_code="es"``, and split into
    sentences with start/end timestamps.

    Args:
        video_url: URL of the YouTube video to process.
        output_path: Path of the CSV file the sentence table is written to.

    Returns:
        A DataFrame with the columns ``'Start Time'``, ``'End Time'`` (both
        formatted as ``MM:SS.mmm``) and ``'Sentence'``.

    Raises:
        RuntimeError: If the video exposes no audio-only stream, or if the
            transcription finishes with an error status.
    """
    yt = YouTube(video_url)

    # Only the audio track is needed for transcription.
    audio_stream = yt.streams.filter(only_audio=True).first()
    if audio_stream is None:
        raise RuntimeError(f"No audio-only stream available for {video_url}")

    downloaded_file = audio_stream.download()

    # Only the extension is changed; the audio data itself is not re-encoded.
    # os.replace overwrites a file left behind by a previous run, whereas
    # os.rename raises on Windows when the destination already exists.
    base, _ = os.path.splitext(downloaded_file)
    audio_file = base + '.mp3'
    os.replace(downloaded_file, audio_file)

    # Configure transcription with Spanish language
    config = aai.TranscriptionConfig(language_code="es")
    transcript = aai.Transcriber(config=config).transcribe(audio_file)

    # Raise instead of calling exit(): terminating the interpreter/kernel from
    # inside a function prevents callers from handling the failure.
    if transcript.status == aai.TranscriptStatus.error:
        raise RuntimeError(f"Transcription failed: {transcript.error}")

    # One row per sentence; start/end are treated as millisecond offsets.
    rows = [
        (
            _format_timestamp(sentence.start),
            _format_timestamp(sentence.end),
            sentence.text,
        )
        for sentence in transcript.get_sentences()
    ]
    transcript_df = pd.DataFrame(
        rows, columns=['Start Time', 'End Time', 'Sentence']
    )

    # Save to CSV for the next step in the pipeline
    transcript_df.to_csv(output_path, index=False)

    return transcript_df

# Run the download + transcription step for the configured video.
video_df = video_transcription(video_url=video_url, output_path=output_path)

# Step 2: Translate the sentences to English

In [None]:
# === Translate a column of sentences ===
def translate(sentence):
    """Translate a single sentence using the module-level tokenizer and model.

    The sentence is encoded to TensorFlow tensors (truncation enabled,
    ``max_length=256``), passed to ``model.generate`` with the same maximum
    length, and the first generated sequence is decoded with special tokens
    skipped.
    """
    encode_options = {
        "return_tensors": "tf",
        "padding": True,
        "truncation": True,
        "max_length": 256,
    }
    token_ids = tokenizer.encode(sentence, **encode_options)

    generated = model.generate(token_ids, max_length=256)

    # Only the first generated sequence is used.
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Translate every entry of the 'Sentence' column and store the result next to it
source_sentences = video_df['Sentence']
video_df['Translation'] = source_sentences.map(translate)

# Write the transcript together with its translations to a new CSV file
video_df.to_csv("translated_output.csv", index=False)