In [None]:
import os
from pytubefix import YouTube
from pydub import AudioSegment
import assemblyai as aai
import pandas as pd
import numpy as np
import re


# Set your API key
aai.settings.api_key = "<API_KEY>"  # Replace with your AssemblyAI API key

# === CONFIGURATION ===
SAVE_DIR = "video_data"
FILENAME = "audio.mp4"
CLIPPED_FILENAME_TEMPLATE = "{}_minutes_clipped.mp3"


# === DOWNLOAD FUNCTIONS ===
def download_youtube_audio(url: str, save_dir: str = SAVE_DIR, filename: str = FILENAME) -> str:
    yt = YouTube(url)
    audio_stream = yt.streams.filter(only_audio=True).first()
    out_path = audio_stream.download(output_path=save_dir, filename=filename)
    return out_path


# === AUDIO PROCESSING ===
def clip_audio(filepath: str, duration_minutes: int = None, save_dir: str = SAVE_DIR) -> str:
    audio = AudioSegment.from_file(filepath, format="mp4")
    
    if duration_minutes is not None:
        max_duration_ms = len(audio)
        clip_duration_ms = duration_minutes * 60 * 1000
        if clip_duration_ms > max_duration_ms:
            print(f"Requested {duration_minutes} minutes exceeds video duration. Clipping full audio.")
            clip_duration_ms = max_duration_ms
        audio = audio[:clip_duration_ms]
        clipped_filename = CLIPPED_FILENAME_TEMPLATE.format(duration_minutes)
    else:
        clipped_filename = "full_audio.mp3"

    clipped_path = os.path.join(save_dir, clipped_filename)
    audio.export(clipped_path, format="mp3")
    return clipped_path


# === TRANSCRIPTION ===
def transcribe_audio(filepath: str, language_code: str = "ru"):
    config = aai.TranscriptionConfig(language_code=language_code)
    transcriber = aai.Transcriber(config=config)
    transcript = transcriber.transcribe(filepath)
    return transcript.get_sentences()


# === CONTROLLER FUNCTION ===
def process_youtube_transcription(
    url: str,
    duration_minutes: int = None,
    full_video: bool = False,
    language_code: str = "ru"
):
    print("Downloading audio...")
    audio_path = download_youtube_audio(url)

    print("Clipping audio...")
    clip_path = clip_audio(audio_path, None if full_video else duration_minutes)

    print("Transcribing audio...")
    sentences = transcribe_audio(clip_path, language_code=language_code)

    print("Transcription completed.")
    return sentences


In [9]:
sentences = process_youtube_transcription(
    url="https://www.youtube.com/watch?v=jcHAZLXrUaA&ab_channel=%D0%97%D0%B2%D0%B0%D0%BD%D1%8B%D0%B9%D1%83%D0%B6%D0%B8%D0%BD",
    duration_minutes=1,
    full_video=False,
    language_code="ru"
)

Downloading audio...
Clipping audio...
Transcribing audio...
Transcription completed.


In [10]:
# Assuming 'transcript.segments' returns a list of segment objects/dicts with 'start', 'end', and 'text'
data = []
for segment in sentences:
    # Extract start and end timestamps and the sentence text
    data.append([segment.start, segment.end, segment.text])

# Create a DataFrame with columns for start time, end time, and sentence
df = pd.DataFrame(data, columns=["Start Time", "End Time", "Sentence"])

# Convert start and end times from miliseconds to hh:mm:ss,ms format
def format_time(ms):
    ms = int(ms)
    hours, remainder = divmod(ms, 3600000)
    minutes, seconds = divmod(remainder, 60000)
    seconds, milliseconds = divmod(seconds, 1000)
    return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"

df["Start Time"] = df["Start Time"].apply(format_time)
df["End Time"] = df["End Time"].apply(format_time)

# Save the DataFrame to CSV using semicolon as separator
df.to_csv("video_data/transcribed_data_assemblyAI.csv", sep=";", index=False)

In [11]:
df.head()

Unnamed: 0,Start Time,End Time,Sentence
0,"00:00:02,347","00:00:07,712",С понедельника по пятницу пять совершенно разн...
1,"00:00:07,772","00:00:14,898",Каждый из них имеет одинаковые условия при пок...
2,"00:00:14,958","00:00:19,081",Задача каждого продемонстрировать гостям все с...
3,"00:00:19,141","00:00:23,325",Что подать к столу и как развлекать гостей реш...
4,"00:00:23,365","00:00:31,571",Каждый вечер гости выставляют хозяину оценки и...


In [None]:
import torch
from transformers import (
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
)

# Load trained model and tokenizer
model_path = "mbart_translation_full"
tokenizer = MBart50TokenizerFast.from_pretrained(model_path)
model = MBartForConditionalGeneration.from_pretrained(model_path)

# Set source and target language codes
tokenizer.src_lang = "ru_RU"
tokenizer.tgt_lang = "en_XX"

# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Define translation function (batched)
def batch_translate(sentences, model, tokenizer, num_beams=5):
    inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    translated_tokens = model.generate(
        **inputs,
        num_beams=num_beams,
        max_length=128,
        early_stopping=True
    )
    return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)

# Translate sentences in batches
batch_size = 16
sentences = df["Sentence"].astype(str).tolist()
translations = [
    translation
    for i in range(0, len(sentences), batch_size)
    for translation in batch_translate(sentences[i:i + batch_size], model, tokenizer)
]

# Remove unwanted characters and extra spaces
translations = [
    re.sub(r"[^А-Яа-яa-zA-Z0-9?!¿]+", " ", translation).strip()
    for translation in translations
]

# Add translations to DataFrame and save
df["Translation"] = translations

In [11]:
df

Unnamed: 0,Start Time,End Time,Sentence,Translation
0,"00:00:02,347","00:00:07,712",С понедельника по пятницу пять совершенно разн...,From Monday through Friday five completely dif...
1,"00:00:07,772","00:00:14,898",Каждый из них имеет одинаковые условия при пок...,Each of them has the same conditions for buyin...
2,"00:00:14,958","00:00:19,081",Задача каждого продемонстрировать гостям все с...,It s everyone s job to show their talents to t...
3,"00:00:19,141","00:00:23,325",Что подать к столу и как развлекать гостей реш...,It s up to the host to decide what to put on t...
4,"00:00:23,365","00:00:31,571",Каждый вечер гости выставляют хозяину оценки и...,Every evening guests give the host a score and...
5,"00:00:32,311","00:00:37,777",Победитель недели станет участником суперфинал...,The winner of the week will go on to win the S...
6,"00:00:37,837","00:00:45,184",Смотрите на канале РЕН-ТВ программу «Званый уж...,You can watch the RENT TV program A Sound Dinn...


In [12]:
# Sklearn for classic models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tqdm import tqdm
import tensorflow as tf

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

In [13]:
from transformers import Trainer, TrainingArguments, RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig
from datasets import Dataset, DatasetDict
from transformers import EarlyStoppingCallback

test_df = df[['Translation']]

print("Test shape:", test_df.shape)

# Convert DataFrame to Hugging Face Dataset
test_dataset = Dataset.from_pandas(test_df)

# Combine into DatasetDict
dataset = DatasetDict({
    'test': test_dataset
})

# Ensure label column is correctly named for Hugging Face Trainer
dataset = dataset.rename_column("Translation", "text")

# Load tokenizer
tokenizer = RobertaTokenizer.from_pretrained("distilbert/distilroberta-base")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Tokenize dataset with progress bar (using tqdm)
tokenized_datasets = dataset.map(tokenize_function, batched=True, desc="Tokenizing")

# Convert dataset format
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask"])

Test shape: (7, 1)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Tokenizing:   0%|          | 0/7 [00:00<?, ? examples/s]

In [14]:
from transformers import RobertaForSequenceClassification, Trainer
import torch

# Load your fine-tuned model
model = RobertaForSequenceClassification.from_pretrained("./distilroberta_finetuned_v2")

# Use the Trainer for prediction
trainer = Trainer(model=model)
trainer.args.report_to = "none"  # Disable logging to avoid unnecessary output

# Predict
predictions = trainer.predict(tokenized_datasets["test"])

# Get predicted class (argmax of logits)
predicted_classes = torch.argmax(torch.tensor(predictions.predictions), dim=1)

# Add predictions back to DataFrame
test_df["prediction"] = predicted_classes.numpy()

print(test_df)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Current

                                         Translation  prediction
0  From Monday through Friday five completely dif...           4
1  Each of them has the same conditions for buyin...           4
2  It s everyone s job to show their talents to t...           3
3  It s up to the host to decide what to put on t...           4
4  Every evening guests give the host a score and...           3
5  The winner of the week will go on to win the S...           3
6  You can watch the RENT TV program A Sound Dinn...           4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["prediction"] = predicted_classes.numpy()


In [15]:
# Load label encoder
import pickle

with open("label_encoder.pkl", "rb") as f:
    label_encoder = pickle.load(f)

# Decode the predicted labels
decoded_predictions = label_encoder.inverse_transform(test_df["prediction"])
df["Emotion"] = decoded_predictions

In [None]:
df

In [None]:
df.to_csv("results.csv", sep=";", index=False)