In [1]:
# !pip install torch transformers soundfile moviepy numpy pandas nltk
!pip install torch transformers soundfile moviepy numpy pandas nltk faiss-cpu

Collecting soundfile
  Downloading soundfile-0.13.1-py2.py3-none-win_amd64.whl.metadata (16 kB)
Collecting moviepy
  Downloading moviepy-2.2.1-py3-none-any.whl.metadata (6.9 kB)
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp313-cp313-win_amd64.whl.metadata (5.2 kB)
Collecting imageio<3.0,>=2.5 (from moviepy)
  Downloading imageio-2.37.0-py3-none-any.whl.metadata (5.2 kB)
Collecting imageio_ffmpeg>=0.2.0 (from moviepy)
  Downloading imageio_ffmpeg-0.6.0-py3-none-win_amd64.whl.metadata (1.5 kB)
Collecting proglog<=1.0.0 (from moviepy)
  Downloading proglog-0.1.12-py3-none-any.whl.metadata (794 bytes)
Collecting python-dotenv>=0.10 (from moviepy)
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting click (from nltk)
  Downloading click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Downloading soundfile-0.13.1-py2.py3-none-win_amd64.whl (1.0 MB)
   --------------------------------------

In [2]:
%env HUGGINGFACE_HUB_CACHE= models

env: HUGGINGFACE_HUB_CACHE=models


In [43]:
import numpy as np
import pandas as pd
from moviepy import VideoFileClip, AudioFileClip
import os

import nltk
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, WhisperProcessor, \
    WhisperForConditionalGeneration
# from nemo.collections.asr.models import ClusteringDiarizer
import soundfile as sf
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import WordNetLemmatizer
import nltk
from sentence_transformers import SentenceTransformer
import faiss
import re
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


In [4]:
def extract_audio(input_file, output_folder="extracted_audio"):
    os.makedirs(output_folder, exist_ok=True)

    base_name = os.path.basename(input_file)
    file_name, file_ext = os.path.splitext(base_name)

    output_file_path = os.path.join(output_folder, f"{file_name}.mp3")

    if file_ext.lower() == ".mp4":
        print(f"Detected MP4 file. Extracting audio from '{input_file}'...")
        try:
            video_clip = VideoFileClip(input_file)
            audio_clip = video_clip.audio
            audio_clip.write_audiofile(output_file_path)
            audio_clip.close()
            video_clip.close()
            print(f"Audio extracted successfully and saved to '{output_file_path}'")
        except Exception as e:
            print(f"An error occurred during MP4 processing: {e}")

    elif file_ext.lower() == ".mp3":
        print(f"Detected MP3 file. Copying '{input_file}'...")
        try:
            with open(input_file, 'rb') as f_in, open(output_file_path, 'wb') as f_out:
                f_out.write(f_in.read())
            print(f"File copied successfully to '{output_file_path}'")
        except Exception as e:
            print(f"An error occurred during MP3 processing: {e}")

    else:
        print(f"Unsupported file format: {file_ext}. Please provide an MP4 or MP3 file.")

In [None]:
extract_audio(r"C:\Users\dudec\OneDrive\Studies\Coursera\Google_AI_Essentials\AI and future of work.mp4")

In [None]:
extract_audio(r"C:\Users\dudec\OneDrive\Studies\Coursera\Google_AI_Essentials\AI and future of work.mp4")

In [None]:
#pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3")

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float32 if torch.cuda.is_available() else torch.float32

processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small", torch_dtype=torch_dtype).to(device)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
    # chunk_length_s=30,
    # stride_length_s=(4, 2),
    generate_kwargs={"language": "en", "task": "transcribe"}
)

audio_file_path = r"extracted_audio\lR-ip0EZQXS_uczWPpahSQ_b9e17546063c4d59822e7488419afbf1_200826.005_MP4_720.mp3"

data, samplerate = sf.read(audio_file_path)

if len(data.shape) > 1:
    mono_data = np.mean(data, axis=1)
else:
    mono_data = data

result = pipe({"array": mono_data, "sampling_rate": samplerate}, return_timestamps=True)

print(result["text"])

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cuda


 Welcome to this video on the Flutter development environment. After watching this video you'll be able to describe the main components of Flutter development, identify the Dart programming language, identify emulators and physical devices, explain how the main components of Flutter work together. Flutter is an open-source user interface or UI software development toolkit. Developers use it to develop cross-platform applications by just writing code once. Flutter helps developers create native applications for iOS, Android, and the web with consistent UI. What makes this work is Flutter's main components. These include Flutter software development kit or SDK, the Dart programming language, Integrated Development Environment, or IDE, and emulators and physical devices. Let's explore those main components in more detail. The Flutter SDK is a collection of tools that developers need to create applications with Flutter. It includes the Dart SDK, which is essential for running and compiling

In [5]:
lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')


def lemmatize_text(text):
    words = text.lower().split()
    lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in words]
    return " ".join(lemmatized_words)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
processed_text = lemmatize_text(result["text"])

In [7]:
processed_text

"welcome to this video on the flutter development environment. after watch this video you'll be able to describe the main components of flutter development, identify the dart program language, identify emulators and physical devices, explain how the main components of flutter work together. flutter be an open-source user interface or ui software development toolkit. developers use it to develop cross-platform applications by just write code once. flutter help developers create native applications for ios, android, and the web with consistent ui. what make this work be flutter's main components. these include flutter software development kit or sdk, the dart program language, integrate development environment, or ide, and emulators and physical devices. let's explore those main components in more detail. the flutter sdk be a collection of tool that developers need to create applications with flutter. it include the dart sdk, which be essential for run and compile flutter apps. the sdk p

In [54]:
# This is an example of what would need to be added

# Load a pre-trained sentence-transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')
base_sentences = [s.strip() for s in re.split(r'[.?!]\s+', processed_text) if s.strip()]
test_sentences = base_sentences[:]
# Get embeddings for your processed texts
base_embeddings = model.encode(base_sentences)
test_embeddings = model.encode(test_sentences)

In [73]:
range_n_clusters = list(range(5, 15))
silhouette_scores = []

for n_clusters in range_n_clusters:
    clustering_model = KMeans(n_clusters=n_clusters, random_state=0, n_init=15)
    cluster_labels = clustering_model.fit_predict(base_embeddings)

    score = silhouette_score(base_embeddings, cluster_labels)
    silhouette_scores.append(score)
    print(f"Number of clusters: {n_clusters}, Silhouette Score: {score:.4f}")

optimal_n_clusters = range_n_clusters[np.argmax(silhouette_scores)]



Number of clusters: 5, Silhouette Score: 0.1291
Number of clusters: 6, Silhouette Score: 0.1040




Number of clusters: 7, Silhouette Score: 0.1238
Number of clusters: 8, Silhouette Score: 0.1010




Number of clusters: 9, Silhouette Score: 0.0724
Number of clusters: 10, Silhouette Score: 0.1025




Number of clusters: 11, Silhouette Score: 0.0971
Number of clusters: 12, Silhouette Score: 0.1129




Number of clusters: 13, Silhouette Score: 0.1079




Number of clusters: 14, Silhouette Score: 0.1095


In [74]:
final_clustering_model = KMeans(n_clusters=optimal_n_clusters, random_state=0, n_init=10)
final_cluster_labels = final_clustering_model.fit_predict(base_embeddings)



In [75]:
merged_sentences = []
merged_embeddings = []
for i in range(optimal_n_clusters):
    cluster_indices = np.argwhere(final_cluster_labels == i).flatten()

    if len(cluster_indices) > 0:
        cluster_sentences = [base_sentences[j] for j in cluster_indices]
        cluster_embeddings = base_embeddings[cluster_indices]

        cluster_centroid = np.mean(cluster_embeddings, axis=0)
        distances = np.linalg.norm(cluster_embeddings - cluster_centroid, axis=1)
        closest_sentence_idx = np.argmin(distances)

        representative_sentence = cluster_sentences[closest_sentence_idx]
        merged_sentences.append(representative_sentence)
        merged_embeddings.append(cluster_centroid)


In [76]:
merged_sentences

["next, you manage the state of your app use flutter's built-in state management techniques",
 'it also support just-in-time jit compilation, where dart code be compile into intermediate code, which be then interpret and compile into machine code at runtime',
 'the flutter sdk be a collection of tool that developers need to create applications with flutter',
 'it include libraries, tool for compile code into native machine code, and tool for debugging',
 'flutter use emulators and physical devices to test the performance and user experience of apps on different devices']

In [77]:
merged_embeddings = np.array(merged_embeddings)
vector_dimension = merged_embeddings.shape[1]

In [80]:
faiss_index = faiss.IndexFlatIP(vector_dimension)
faiss_index.add(test_embeddings)

# distances, _ = faiss_index.search(base_embeddings, k=1)
distances, _ = faiss_index.search(merged_embeddings, k=1)

similarity_scores = distances.flatten()

In [81]:
similarity_scores

array([0.63831323, 0.80729413, 0.7520376 , 0.4428746 , 0.7568732 ],
      dtype=float32)

In [None]:
vectorizer=CountVectorizer(stop_words='english')
# vectorizer = TfidfVectorizer()


In [None]:
X = vectorizer.fit_transform([processed_text])
features = vectorizer.get_feature_names_out()
# print("Features (words) kept:", vectorizer.get_feature_names_out())
word_counts = X.toarray()

In [None]:
filtered_words = [word for word in processed_text.split() if word in features]
paragraph_with_features = " ".join(filtered_words)
filtered_words

In [None]:
df = pd.DataFrame(data=word_counts, columns=features)
df
