# Extract Audio from Video

In [1]:
from moviepy.editor import VideoFileClip

video_file_path = 'sample1.mp4'
audio_file_path = 'audioFile1.mp3'

video = VideoFileClip(video_file_path)
video.audio.write_audiofile(audio_file_path)

MoviePy - Writing audio in audioFile1.mp3


                                                                      

MoviePy - Done.




# Transcribe Audio with Whisper

In [2]:
import whisper
whisper_model = whisper.load_model("base")
result = whisper_model.transcribe("audioFile1.mp3")
videoText = result["text"]
print(f' The text in video: \n {result["text"]}')

 The text in video: 
  Let's assume your monthly income were to somehow double. Congratulations! You can now, for the most part, afford to buy two times more stuff. But what if the monthly income of everyone else were to also double? Well, in that case, you'd no longer be able to buy two times more stuff, because since everyone else also earns two times more, there would be two times more money chasing roughly the same number of goods. Let's take things one step further. What if your income doubles, but the income of everyone else triples? In that case, you'd actually become poorer. Why? Because making two times more money is not enough to keep up with everyone else, who now makes three times more. If there's an overall increase in the price of goods and services of X percent this year, you have to earn at least X percent more than last year to avoid becoming poorer in real terms. So, the salary makes the effects of inflation crystal clear. Today, each dollar buys you approximately 39 

# Q&A with Bert

In [4]:
from transformers import BertForQuestionAnswering, BertTokenizer
import torch

# Load pre-trained model and tokenizer
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# Example text and question
question = "What does it say about inflation?"

# Tokenize input
inputs = tokenizer(question, videoText, return_tensors="pt")

In [6]:
# Generate predictions
outputs = model(**inputs)

# Get the start and end scores
start_scores = outputs.start_logits
end_scores = outputs.end_logits

In [7]:

# Get the most likely start and end scores
start_index = torch.argmax(start_scores)
end_index = torch.argmax(end_scores) +  1

# Convert token indices to actual text
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][start_index:end_index]))

print(answer)

the salary makes the effects of inflation crystal clear


# Using the SQUAD data set for FineTuning.

In [9]:
from datasets import load_dataset

# Load SQuAD version 2
dataset = load_dataset("squad_v2")
print(dataset)

Downloading readme: 100%|██████████| 8.92k/8.92k [00:00<00:00, 29.9MB/s]
Downloading data: 100%|██████████| 16.4M/16.4M [00:05<00:00, 2.77MB/s]
Downloading data: 100%|██████████| 1.35M/1.35M [00:02<00:00, 516kB/s]
Generating train split: 100%|██████████| 130319/130319 [00:00<00:00, 1411674.40 examples/s]
Generating validation split: 100%|██████████| 11873/11873 [00:00<00:00, 1107850.13 examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})





# Preprocessing

In [18]:

import numpy as np
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad', use_fast=True)

def find_answer_positions(context, answer):
    # Check if the answer is not empty
    if answer["text"]:
        start_idx = context.find(answer["text"][0])
        end_idx = start_idx + len(answer["text"][0]) - 1

        # Tokenize context to find token positions, ensuring not to add special tokens
        tokenized_context = tokenizer(context, return_offsets_mapping=True, add_special_tokens=False)
        offsets = tokenized_context["offset_mapping"]

        start_token_pos = None
        end_token_pos = None

        for i, offset in enumerate(offsets):
            if start_idx >= offset[0] and start_idx <= offset[1]:
                start_token_pos = i
            if end_idx >= offset[0] and end_idx <= offset[1]:
                end_token_pos = i
                break  # Stop once the end position is found

        return start_token_pos, end_token_pos
    else:
        # Return default positions if no answer is provided
        return 0, 0


def preprocess_data(data):
    # Tokenize questions and contexts
    inputs = tokenizer(data["question"], data["context"], truncation=True, padding="max_length", max_length=384, return_tensors="pt")

    # Initialize lists to hold start and end positions
    start_positions = []
    end_positions = []

    for context, answer in zip(data["context"], data["answers"]):
        # Use the manual approach to find start and end token positions
        start_pos, end_pos = find_answer_positions(context, answer)
        
        # Append the positions to the lists
        start_positions.append(start_pos if start_pos is not None else 0)
        end_positions.append(end_pos if end_pos is not None else 0)

    # Convert lists to tensors and add them to the inputs
    inputs["start_positions"] = torch.tensor(start_positions, dtype=torch.long)
    inputs["end_positions"] = torch.tensor(end_positions, dtype=torch.long)

    return inputs


# Apply the preprocessing function to the training and validation dataset
train_dataset = dataset['train'].map(preprocess_data, batched=True)
validation_dataset = dataset['validation'].map(preprocess_data, batched=True)


Map:  25%|██▌       | 3000/11873 [00:00<00:02, 3593.98 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (575 > 512). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 11873/11873 [00:03<00:00, 3302.95 examples/s]


In [11]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    # Initialize containers for our batched data
    batched_output = {
        'input_ids': [],
        'attention_mask': [],
        'start_positions': [],
        'end_positions': []
    }
    
    # Process each item in our batch
    for item in batch:
        for key in batched_output.keys():
            # Ensure every item is a tensor. If it's not, convert it.
            element = item[key]
            if not isinstance(element, torch.Tensor):
                element = torch.tensor(element, dtype=torch.long)
            batched_output[key].append(element)
    
    # Now concatenate each list of tensors along a new batch dimension
    for key in batched_output.keys():
        batched_output[key] = torch.stack(batched_output[key], dim=0)
    
    return batched_output

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
validation_dataloader = DataLoader(validation_dataset, batch_size=16, collate_fn=collate_fn)


In [12]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), ep

# Training


In [13]:
from transformers import AdamW
from tqdm.notebook import tqdm  # Use tqdm.auto if you're running in a non-notebook environment

optimizer = AdamW(model.parameters(), lr=5e-5)

model.train()
for epoch in tqdm(range(1), desc="Epoch"):  # Wrapping the epoch loop with tqdm for progress tracking
    # Initialize a progress bar for the batches within the current epoch
    progress_bar = tqdm(train_dataloader, desc="Batch", leave=False)
    for batch in progress_bar:
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}
        
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        
        optimizer.step()
        optimizer.zero_grad()
        
        # Optional: Update the progress bar with the current loss
        progress_bar.set_postfix({'loss': loss.item()})
    
    # Save model and tokenizer checkpoints
    model.save_pretrained(f"./model_checkpoint_epoch_{epoch}")
    tokenizer.save_pretrained(f"./model_checkpoint_epoch_{epoch}")

print("Training completed.")


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Batch:   0%|          | 0/8145 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [20]:
fine_tuned_model = BertForQuestionAnswering.from_pretrained('./model_checkpoint_epoch_0')
fine_tuned_tokenizer = BertTokenizer.from_pretrained('./model_checkpoint_epoch_0')

# Example text and question
question = "What does it say about inflation?"

# Tokenize input
inputs = tokenizer(question, videoText, return_tensors="pt")

# Generate predictions
outputs = fine_tuned_model(**inputs)

# Get the start and end scores
start_scores = outputs.start_logits
end_scores = outputs.end_logits
# Get the most likely start and end scores
start_index = torch.argmax(start_scores)
end_index = torch.argmax(end_scores) +  1

# Convert token indices to actual text
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][start_index:end_index]))

print(answer)

[CLS]


# Testing the preprocessing


In [21]:
data = dataset['train'][:5]  # Adjust index for sampling
preprocessed_samples = preprocess_data(data)
for i, sample in enumerate(sample_data['context']):
    print(f"Original context: {sample}")
    print(f"Start position: {preprocessed_samples['start_positions'][i]}, End position: {preprocessed_samples['end_positions'][i]}\n")


Original context: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
Start position: 66, End position: 69

Original context: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing comp

In [20]:
from datasets import load_dataset

dataset = load_dataset("squad_v2")

# Attempting to create a sample slice of the data
data = dataset['train'].select(range(5))  # This should ensure data is a proper subset of the dataset

for i in range(len(data)):
    example = data[i]  # Access each example by index
    context = example['context']
    answer_texts = example['answers']['text']
    answer = answer_texts[0] if answer_texts else 'No answer found'
    
    # Assuming your find_answer_positions and the rest of the code is correct
    start_pos, end_pos = find_answer_positions(context, {'text': [answer]})
    tokenized_context = tokenizer(context, return_offsets_mapping=True, add_special_tokens=False)
    answer_tokens = tokenized_context['input_ids'][start_pos:end_pos+1] if start_pos is not None and end_pos is not None else []
    print(f"Expected answer: {answer}")
    print(f"Recovered answer: {tokenizer.decode(answer_tokens)}\n")


Expected answer: in the late 1990s
Recovered answer: in the late 1990s

Expected answer: singing and dancing
Recovered answer: singing and dancing

Expected answer: 2003
Recovered answer: 2003

Expected answer: Houston, Texas
Recovered answer: houston, texas

Expected answer: late 1990s
Recovered answer: late 1990s



# Training again

In [37]:
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm.auto import tqdm
import torch
import numpy as np
from datasets import load_metric

# Assuming dataset, model, tokenizer, preprocess_data are already defined
dataset = load_dataset("squad_v2")

# Apply preprocessing
train_dataset = dataset['train'].map(preprocess_data, batched=True)
validation_dataset = dataset['validation'].map(preprocess_data, batched=True)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
validation_dataloader = DataLoader(validation_dataset, batch_size=16, collate_fn=collate_fn)

# Setup device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Setup optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Setup scheduler
total_steps = len(train_dataloader) * 3  # for 3 epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

# Setup metric
metric = load_metric("squad_v2")

# Function to compute metrics
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Training
model.train()
best_validation_score = float('-inf')
for epoch in tqdm(range(3), desc="Epoch"):
    progress_bar = tqdm(train_dataloader, desc="Batch", leave=False)
    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        progress_bar.set_postfix({'loss': loss.item()})
    
    # Validation
    model.eval()
    for batch in tqdm(validation_dataloader, desc="Validating", leave=False):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        
        # Assume outputs are logits and you have a way to calculate your validation score
        logits = outputs.logits.detach().cpu().numpy()
        labels = batch['labels'].detach().cpu().numpy()  # adjust this based on your actual label key
        validation_score = compute_metrics((logits, labels))
        
        # Check if this is the best model based on validation score
        if validation_score > best_validation_score:
            best_validation_score = validation_score
            print(f"New best score: {validation_score}. Saving model.")
            model.save_pretrained("./best_model")
            tokenizer.save_pretrained("./best_model")
    
    model.train()

print("Training completed.")


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Batch:   0%|          | 0/8145 [00:00<?, ?it/s]