In [None]:

!pip install torch transformers soundfile moviepy numpy pandas nltk faiss-cpu

In [None]:
# Sets the HUGGINGFACE_HUB_CACHE environment variable for this kernel.
# NOTE(review): presumably this redirects Hugging Face downloads into a local
# `models` directory and therefore has to run before any model is loaded — confirm.
%env HUGGINGFACE_HUB_CACHE= models

In [None]:
import numpy as np
import pandas as pd
from moviepy import VideoFileClip, AudioFileClip
import os

import nltk
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, WhisperProcessor, \
    WhisperForConditionalGeneration
# from nemo.collections.asr.models import ClusteringDiarizer
import soundfile as sf
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import WordNetLemmatizer
import nltk
from sentence_transformers import SentenceTransformer
import faiss
import re
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [None]:
# Choose the execution device and the matching tensor precision in one place:
# half precision when CUDA is available, full precision on CPU.
if torch.cuda.is_available():
    device, torch_dtype = "cuda", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

In [None]:
def extract_audio(input_file, output_folder="extracted_audio"):
    """Produce an MP3 file holding the audio of ``input_file``.

    ``.mp4`` inputs have their audio track written out with moviepy; ``.mp3``
    inputs are copied. The extension check is case-insensitive. Problems are
    reported with ``print`` instead of being raised.

    Args:
        input_file: Path to an ``.mp4`` or ``.mp3`` file.
        output_folder: Directory that receives the MP3; created if missing.

    Returns:
        The path of the MP3 inside ``output_folder`` on success, or ``None``
        when the format is unsupported or processing failed.
    """
    import shutil  # local import: keeps this cell independent of the import cell

    os.makedirs(output_folder, exist_ok=True)

    base_name = os.path.basename(input_file)
    file_name, file_ext = os.path.splitext(base_name)
    normalized_ext = file_ext.lower()

    output_file_path = os.path.join(output_folder, f"{file_name}.mp3")

    if normalized_ext == ".mp4":
        print(f"Detected MP4 file. Extracting audio from '{input_file}'...")
        video_clip = None
        audio_clip = None
        try:
            video_clip = VideoFileClip(input_file)
            audio_clip = video_clip.audio
            if audio_clip is None:
                # A video without an audio track has nothing to extract.
                print(f"An error occurred during MP4 processing: '{input_file}' has no audio track")
                return None
            audio_clip.write_audiofile(output_file_path)
            print(f"Audio extracted successfully and saved to '{output_file_path}'")
            return output_file_path
        except Exception as e:
            print(f"An error occurred during MP4 processing: {e}")
            return None
        finally:
            # Release the clip handles even when the write fails part-way.
            if audio_clip is not None:
                audio_clip.close()
            if video_clip is not None:
                video_clip.close()

    elif normalized_ext == ".mp3":
        print(f"Detected MP3 file. Copying '{input_file}'...")
        try:
            # Opening the destination for writing would truncate the source if
            # both names refer to the same file, so skip the copy in that case.
            already_in_place = os.path.exists(output_file_path) and os.path.samefile(
                input_file, output_file_path
            )
            if not already_in_place:
                shutil.copyfile(input_file, output_file_path)
            return output_file_path
        except Exception as e:
            print(f"An error occurred during MP3 processing: {e}")
            return None

    else:
        print(f"Unsupported file format: {file_ext}. Please provide an MP4 or MP3 file.")
        return None

In [None]:
# Load the Whisper-small processor (tokenizer + feature extractor) and the
# matching generation model, then move the model to the selected device.
# NOTE(review): the name `model` is re-bound to a SentenceTransformer in a later
# cell; `pipe` is handed this Whisper model here, so it presumably keeps using it —
# confirm, and consider a distinct name to avoid confusion on out-of-order runs.
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small", torch_dtype=torch_dtype).to(device)

# Speech-recognition pipeline built from the pieces above.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,

    # Generation is pinned to English transcription.
    generate_kwargs={"language": "en", "task": "transcribe"}
)

In [None]:
def audio_to_text(audio):
    """Transcribe an audio file with the module-level ``pipe``.

    The file is read with soundfile. When the decoded array has more than one
    dimension it is averaged along axis 1 before being handed to the pipeline.

    Args:
        audio: Whatever ``sf.read`` accepts as its first argument (the driver
            cells pass the result of ``extract_audio``).

    Returns:
        The pipeline's result for the audio, requested with
        ``return_timestamps=True`` and returned unchanged.
    """
    samples, sample_rate = sf.read(audio)

    # Collapse multi-dimensional (presumably multi-channel) audio to one signal.
    if samples.ndim > 1:
        samples = np.mean(samples, axis=1)

    return pipe(
        {"array": samples, "sampling_rate": sample_rate},
        return_timestamps=True,
    )

In [None]:
# Download the WordNet corpus, then create the shared lemmatizer used by
# lemmatize_text.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()


In [None]:
def lemmatize_text(text):
    """Lower-case ``text`` and lemmatize each whitespace-separated token as a verb.

    Args:
        text: String to process.

    Returns:
        The lemmatized tokens joined back together with single spaces.
    """
    return " ".join(
        lemmatizer.lemmatize(token, pos='v') for token in text.lower().split()
    )

In [None]:
# Sentence-embedding model used by text_to_vector.
# NOTE(review): this re-binds `model`, which an earlier cell used for the Whisper
# model; re-running that earlier cell afterwards would make text_to_vector pick up
# the wrong object. A distinct name would be safer.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
def text_to_vector(text):
    """Split ``text`` into sentences and embed them with the module-level ``model``.

    Sentences are separated wherever ``.``, ``?`` or ``!`` is followed by
    whitespace; surrounding whitespace is stripped and empty pieces are dropped.

    Args:
        text: String to split and embed.

    Returns:
        The result of ``model.encode`` on the list of sentences.
    """
    stripped_pieces = (piece.strip() for piece in re.split(r'[.?!]\s+', text))
    sentences = [piece for piece in stripped_pieces if piece]
    return model.encode(sentences)

In [None]:
def optimal_n_clusters(vectarray, min_clusters=5, max_clusters=14):
    """Pick the KMeans cluster count with the best silhouette score.

    Candidate counts run from ``min_clusters`` to ``max_clusters`` inclusive,
    clamped to what the data allows: the silhouette score is only defined for
    between 2 and ``n_samples - 1`` clusters. The score of every evaluated
    candidate is printed.

    Args:
        vectarray: 2-D array-like with one row per sample.
        min_clusters: Smallest cluster count to try (default 5).
        max_clusters: Largest cluster count to try (default 14).

    Returns:
        The candidate count with the highest silhouette score (the smallest one
        on ties). With only one or two samples no score can be computed and the
        number of samples is returned, i.e. every sample is its own cluster.

    Raises:
        ValueError: If ``vectarray`` contains no samples.
    """
    vectors = np.asarray(vectarray)
    n_samples = vectors.shape[0] if vectors.ndim else 0
    if n_samples == 0:
        raise ValueError("optimal_n_clusters needs at least one vector.")

    upper = min(max_clusters, n_samples - 1)
    if upper < 2:
        # Too few samples for any valid silhouette score.
        return n_samples
    lower = max(2, min(min_clusters, upper))

    best_n_clusters = lower
    best_score = -np.inf
    for n_clusters in range(lower, upper + 1):
        clustering_model = KMeans(n_clusters=n_clusters, random_state=0, n_init=15)
        cluster_labels = clustering_model.fit_predict(vectors)

        # Duplicate points can collapse everything into one label, for which
        # the silhouette score is undefined; skip such candidates.
        if len(np.unique(cluster_labels)) < 2:
            continue

        score = silhouette_score(vectors, cluster_labels)
        print(f"Number of clusters: {n_clusters}, Silhouette Score: {score:.4f}")

        # Strict comparison keeps the first (smallest) count on ties.
        if score > best_score:
            best_n_clusters, best_score = n_clusters, score

    return best_n_clusters

In [None]:
def sentence_merger(vectarray, text):
    """Cluster sentence embeddings and keep one representative per cluster.

    The cluster count comes from ``optimal_n_clusters``. For every non-empty
    cluster the centroid is the mean of its embeddings, and the representative
    is the sentence whose embedding lies closest to that centroid.

    Args:
        vectarray: 2-D array-like of embeddings, one row per sentence.
        text: Sequence of sentences; ``text[i]`` must belong to ``vectarray[i]``.

    Returns:
        ``(merged_embeddings, merged_sentences)``: a list of cluster centroids
        and the list of representative sentences, in cluster order.

    Raises:
        ValueError: If ``text`` and ``vectarray`` differ in length.
    """
    # Accept plain lists as well; the index-array lookup below needs an ndarray.
    vectors = np.asarray(vectarray)
    if len(text) != len(vectors):
        raise ValueError(
            f"sentence_merger got {len(vectors)} vectors but {len(text)} sentences; "
            "they must correspond one-to-one."
        )

    n = optimal_n_clusters(vectors)
    final_clustering_model = KMeans(n_clusters=n, random_state=0, n_init=10)
    labels = final_clustering_model.fit_predict(vectors)

    merged_sentences = []
    merged_embeddings = []
    for cluster_id in range(n):
        member_indices = np.flatnonzero(labels == cluster_id)
        if member_indices.size == 0:
            continue

        cluster_embeddings = vectors[member_indices]
        cluster_centroid = np.mean(cluster_embeddings, axis=0)

        # The member nearest to the centroid stands in for the whole cluster.
        distances = np.linalg.norm(cluster_embeddings - cluster_centroid, axis=1)
        representative_idx = member_indices[np.argmin(distances)]

        merged_sentences.append(text[representative_idx])
        merged_embeddings.append(cluster_centroid)

    return merged_embeddings, merged_sentences

In [None]:
def _l2_normalize_rows(matrix):
    """Return ``matrix`` with every row scaled to unit length (zero rows unchanged)."""
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    return np.ascontiguousarray(matrix / norms, dtype=np.float32)


def faiss_scoring(base_vectarray, test_vectarray, normalize=False):
    """Score each base vector by its best inner-product match among the test vectors.

    The test vectors are placed in a flat inner-product FAISS index and each
    base vector is searched for its single nearest neighbour.

    Args:
        base_vectarray: Query vectors; an array or a list of equal-length vectors.
        test_vectarray: Vectors to index; an array or a list of equal-length vectors.
        normalize: When true, rows on both sides are scaled to unit length
            first, which makes the inner product equal to cosine similarity.
            Off by default, so raw inner products are returned.

    Returns:
        1-D array with one score per base vector.

    Raises:
        ValueError: If either input is empty or the vector dimensions differ.
    """
    # Both inputs are converted to contiguous float32 matrices, so lists of
    # vectors are accepted as well as arrays.
    index_vectors = np.ascontiguousarray(np.atleast_2d(test_vectarray), dtype=np.float32)
    query_vectors = np.ascontiguousarray(np.atleast_2d(base_vectarray), dtype=np.float32)

    if index_vectors.size == 0 or query_vectors.size == 0:
        raise ValueError("faiss_scoring needs at least one base vector and one test vector.")
    if index_vectors.shape[1] != query_vectors.shape[1]:
        raise ValueError(
            f"Vector dimensions differ: base has {query_vectors.shape[1]}, "
            f"test has {index_vectors.shape[1]}."
        )

    if normalize:
        index_vectors = _l2_normalize_rows(index_vectors)
        query_vectors = _l2_normalize_rows(query_vectors)

    faiss_index = faiss.IndexFlatIP(index_vectors.shape[1])
    faiss_index.add(index_vectors)

    distances, _ = faiss_index.search(query_vectors, k=1)

    similarity_scores = distances.flatten()

    return similarity_scores

In [None]:
# Extract the audio for the reference ("base") recording.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
base_audio = extract_audio(r"C:\Users\dudec\OneDrive\Studies\Coursera\Google_AI_Essentials\AI and future of work.mp4")

# NOTE(review): this is the same file as the base recording, so the comparison
# further down scores the recording against itself — confirm this is intended.
test_audio = extract_audio(r"C:\Users\dudec\OneDrive\Studies\Coursera\Google_AI_Essentials\AI and future of work.mp4")

In [None]:
# audio_to_text returns the speech-recognition pipeline's result, which is a dict;
# the following cells call string operations on these variables, so keep only the
# transcript stored under "text".
base_text = audio_to_text(base_audio)["text"]
test_text = audio_to_text(test_audio)["text"]

In [None]:
# Lemmatized variants of both transcripts.
# NOTE(review): no later cell visible here reads these two variables.
base_text_lemmat, test_text_lemmat = (
    lemmatize_text(transcript) for transcript in (base_text, test_text)
)

In [None]:
# Sentence embeddings for both transcripts (built from the raw transcripts, not
# from the lemmatized variants).
base_vectors, test_vectors = (
    text_to_vector(transcript) for transcript in (base_text, test_text)
)

In [None]:
# sentence_merger picks one representative *sentence* per cluster, so its second
# argument must be the sentence strings that line up row-for-row with base_vectors.
# Re-split base_text with the same pattern text_to_vector uses to obtain them.
base_sentences = [s.strip() for s in re.split(r'[.?!]\s+', base_text) if s.strip()]
merged_base_vectors, merged_base_sentences = sentence_merger(base_vectors, base_sentences)

In [None]:
# For every merged base vector, the inner-product score of its closest test-sentence
# embedding (one value per merged base vector).
faiss_symscore = faiss_scoring(merged_base_vectors, test_vectors)