In [None]:

# !pip install torch transformers soundfile moviepy numpy pandas nltk faiss-cpu



In [85]:
%env HUGGINGFACE_HUB_CACHE= models

env: HUGGINGFACE_HUB_CACHE=models


In [1]:
import numpy as np
import pandas as pd
from moviepy import VideoFileClip, AudioFileClip
import os

import nltk
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, WhisperProcessor, \
    WhisperForConditionalGeneration
# from nemo.collections.asr.models import ClusteringDiarizer
import soundfile as sf
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import WordNetLemmatizer
import nltk
from sentence_transformers import SentenceTransformer
import faiss
import re
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'moviepy'

In [86]:
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

In [None]:
def extract_audio(input_file, output_folder="extracted_audio"):
    os.makedirs(output_folder, exist_ok=True)

    base_name = os.path.basename(input_file)
    file_name, file_ext = os.path.splitext(base_name)

    output_file_path = os.path.join(output_folder, f"{file_name}.mp3")

    if file_ext.lower() == ".mp4":
        print(f"Detected MP4 file. Extracting audio from '{input_file}'...")
        try:
            video_clip = VideoFileClip(input_file)
            audio_clip = video_clip.audio
            audio_clip.write_audiofile(output_file_path)
            audio_clip.close()
            video_clip.close()
            print(f"Audio extracted successfully and saved to '{output_file_path}'")
        except Exception as e:
            print(f"An error occurred during MP4 processing: {e}")

    elif file_ext.lower() == ".mp3":
        print(f"Detected MP3 file. Copying '{input_file}'...")
        try:
            with open(input_file, 'rb') as f_in, open(output_file_path, 'wb') as f_out:
                f_out.write(f_in.read())
            return f_out
        except Exception as e:
            print(f"An error occurred during MP3 processing: {e}")

    else:
        print(f"Unsupported file format: {file_ext}. Please provide an MP4 or MP3 file.")

In [88]:
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small", torch_dtype=torch_dtype).to(device)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,

    generate_kwargs={"language": "en", "task": "transcribe"}
)

Device set to use cuda


In [89]:
def audio_to_text(audio):
    data, samplerate = sf.read(audio)

    if len(data.shape) > 1:
        mono_data = np.mean(data, axis=1)
    else:
        mono_data = data

    audtext = pipe({"array": mono_data, "sampling_rate": samplerate}, return_timestamps=True)

    return audtext

In [90]:
lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [91]:
def lemmatize_text(text):
    words = text.lower().split()
    lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in words]
    return " ".join(lemmatized_words)

In [92]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [93]:
def text_to_vector(text):
    text_sentences = [s.strip() for s in re.split(r'[.?!]\s+', text) if s.strip()]
    text_embeddings = model.encode(text_sentences)
    return text_embeddings

In [101]:
def optimal_n_clusters(vectarray):
    range_n_clusters = list(range(5, 30))
    silhouette_scores = []

    for n_clusters in range_n_clusters:
        clustering_model = KMeans(n_clusters=n_clusters, random_state=0, n_init=15)
        cluster_labels = clustering_model.fit_predict(vectarray)

        score = silhouette_score(vectarray, cluster_labels)
        silhouette_scores.append(score)
        print(f"Number of clusters: {n_clusters}, Silhouette Score: {score:.4f}")

    optimal_n_clusters = range_n_clusters[np.argmax(silhouette_scores)]

    return optimal_n_clusters

In [102]:
def sentence_merger(vectarray, text):
    n = optimal_n_clusters(vectarray)
    final_clustering_model = KMeans(n_clusters=n, random_state=0, n_init=10)
    merger_model = final_clustering_model.fit_predict(vectarray)
    merged_sentences = []
    merged_embeddings = []
    for i in range(n):
        cluster_indices = np.argwhere(merger_model == i).flatten()

        if len(cluster_indices) > 0:
            cluster_sentences = [text[j] for j in cluster_indices]
            cluster_embeddings = vectarray[cluster_indices]

            cluster_centroid = np.mean(cluster_embeddings, axis=0)
            distances = np.linalg.norm(cluster_embeddings - cluster_centroid, axis=1)
            closest_sentence_idx = np.argmin(distances)

            representative_sentence = cluster_sentences[closest_sentence_idx]
            merged_sentences.append(representative_sentence)
            merged_embeddings.append(cluster_centroid)

    return merged_embeddings, merged_sentences

In [113]:
def faiss_scoring(base_vectarray, test_vectarray):
    test_vectarray = np.array(test_vectarray)
    base_vectarray = np.array(base_vectarray)

    faiss_index = faiss.IndexFlatIP(test_vectarray.shape[1])
    faiss_index.add(test_vectarray)

    distances, _ = faiss_index.search(base_vectarray, k=1)

    similarity_scores = distances.flatten()

    return similarity_scores

In [None]:
base_audio = extract_audio(r"C:\Users\dudec\OneDrive\Studies\Coursera\Google_AI_Essentials\AI and future of work.mp4")

test_audio = extract_audio(r"C:\Users\dudec\OneDrive\Studies\Coursera\Google_AI_Essentials\AI and future of work.mp4")

In [117]:
base_text = """To teach B.Tech students about word embeddings, start with a conceptual overview before diving into the technical details. Begin by explaining the fundamental problem: computers don't understand words as humans do. They need a numerical representation of text to perform any task. Traditional methods like one-hot encoding are simple but flawed. Illustrate one-hot encoding by showing that each word gets a unique vector of zeros with a single '1'. For example, if a vocabulary has 10,000 words, each word is a vector of 10,000 dimensions, which is extremely sparse and computationally inefficient.

Highlight the biggest limitation of one-hot encoding: it treats every word as a completely independent entity. The words "king" and "queen" are as different to the computer as "king" and "apple". There is no way to capture the semantic relationships between words. This is where word embeddings come in. Define a word embedding as a way to represent words as dense, real-valued vectors in a lower-dimensional space. These vectors are designed to capture the meaning and relationships of words. Use an analogy: imagine a map where the location of a city corresponds to its vector. Cities that are geographically close, like Mumbai and Pune, are also close on the map. In the same way, words with similar meanings, like "king" and "queen", will have vectors that are numerically "close" in the embedding space.

Introduce the core idea behind learning these embeddings: the distributional hypothesis. Simply put, words that appear in similar contexts have similar meanings. Use an example: "The cat sat on the mat" and "The dog sat on the rug." The words "cat" and "dog" appear in similar contexts, suggesting they are related. This is the principle that many embedding models leverage.

Next, introduce two of the most popular models: Word2Vec and GloVe. Explain Word2Vec as a predictive, neural network-based model. It has two main architectures: Continuous Bag of Words (CBOW) and Skip-Gram. Explain CBOW as predicting the current word from its surrounding context. For instance, given "the cat sat on," the model predicts "mat." Contrast this with Skip-Gram, which does the opposite: given a word like "mat," it predicts the surrounding context words like "the," "cat," and "on." Emphasize that training these models involves a shallow neural network and a large corpus of text. The final word vectors are the trained weights from the hidden layer.

Then, discuss the GloVe (Global Vectors for Word Representation) model. Contrast it with Word2Vec by explaining that GloVe is a count-based model. It doesn't use a neural network to predict words. Instead, it leverages global co-occurrence statistics from the entire corpus. Explain that GloVe creates a co-occurrence matrix that counts how often words appear together, and then uses matrix factorization to generate the word vectors. The core idea is that ratios of co-occurrence probabilities can encode meaning.

After explaining the models, show some practical applications. A classic example is vector arithmetic. Show that vector("king") - vector("man") + vector("woman") ≈ vector("queen"). This is a powerful demonstration of how embeddings capture complex semantic relationships. Another example is using cosine similarity to find the most similar words to a given word, like finding the closest words to "computer" would be "laptop," "PC," and "server." Finally, touch upon the use of pre-trained embeddings, which are widely available and can save time and computational resources for many tasks."""

test_text = """To teach B.Tech students about word embeddings, start with a conceptual overview before diving into the technical details. Begin by explaining the fundamental problem: computers don't understand words as humans do. They need a numerical representation of text to perform any task. Traditional methods like one-hot encoding are simple but flawed. Illustrate one-hot encoding by showing that each word gets a unique vector of zeros with a single '1'. For example, if a vocabulary has 10,000 words, each word is a vector of 10,000 dimensions, which is extremely sparse and computationally inefficient.

Highlight the biggest limitation of one-hot encoding: it treats every word as a completely independent entity. The words "king" and "queen" are as different to the computer as "king" and "apple". There is no way to capture the semantic relationships between words. This is where word embeddings come in. Define a word embedding as a way to represent words as dense, real-valued vectors in a lower-dimensional space. These vectors are designed to capture the meaning and relationships of words. Use an analogy: imagine a map where the location of a city corresponds to its vector. Cities that are geographically close, like Mumbai and Pune, are also close on the map. In the same way, words with similar meanings, like "king" and "queen", will have vectors that are numerically "close" in the embedding space.

Introduce the core idea behind learning these embeddings: the distributional hypothesis. Simply put, words that appear in similar contexts have similar meanings. Use an example: "The cat sat on the mat" and "The dog sat on the rug." The words "cat" and "dog" appear in similar contexts, suggesting they are related. This is the principle that many embedding models leverage.

After explaining the models, show some practical applications. A classic example is vector arithmetic. Show that vector("king") - vector("man") + vector("woman") ≈ vector("queen"). This is a powerful demonstration of how embeddings capture complex semantic relationships. Another example is using cosine similarity to find the most similar words to a given word, like finding the closest words to "computer" would be "laptop," "PC," and "server." Finally, touch upon the use of pre-trained embeddings, which are widely available and can save time and computational resources for many tasks."""

# base_text = audio_to_text(base_audio)
# test_text = audio_to_text(test_audio)

In [118]:
base_text_lemmat = lemmatize_text(base_text)
test_text_lemmat = lemmatize_text(test_text)

In [119]:
base_vectors = text_to_vector(base_text)
test_vectors = text_to_vector(test_text)

In [120]:
merged_base_vectors, merged_base_sentences = sentence_merger(base_vectors, base_text)

Number of clusters: 5, Silhouette Score: 0.0707
Number of clusters: 6, Silhouette Score: 0.0753
Number of clusters: 7, Silhouette Score: 0.0915
Number of clusters: 8, Silhouette Score: 0.0937
Number of clusters: 9, Silhouette Score: 0.0936
Number of clusters: 10, Silhouette Score: 0.1017
Number of clusters: 11, Silhouette Score: 0.0958
Number of clusters: 12, Silhouette Score: 0.0928
Number of clusters: 13, Silhouette Score: 0.0925
Number of clusters: 14, Silhouette Score: 0.1063
Number of clusters: 15, Silhouette Score: 0.0875
Number of clusters: 16, Silhouette Score: 0.0938
Number of clusters: 17, Silhouette Score: 0.0990
Number of clusters: 18, Silhouette Score: 0.0888
Number of clusters: 19, Silhouette Score: 0.0968
Number of clusters: 20, Silhouette Score: 0.0934
Number of clusters: 21, Silhouette Score: 0.0750
Number of clusters: 22, Silhouette Score: 0.0866
Number of clusters: 23, Silhouette Score: 0.0817
Number of clusters: 24, Silhouette Score: 0.0715
Number of clusters: 25, S

In [121]:
faiss_symscore = faiss_scoring(merged_base_vectors, test_vectors)

In [122]:
faiss_symscore

array([0.37516057, 0.66274476, 0.66860807, 0.41999638, 0.46056998,
       1.        , 0.7807846 , 0.64560837, 1.        , 0.99999994,
       0.9999998 , 1.        , 1.        , 0.99999994], dtype=float32)

In [None]:

import textstat
import warnings
warnings.filterwarnings('ignore')

base_duration_sec = 180
teacher_duration_sec = 120


similarity_scores = faiss_symscore
avg_semantic_score = np.mean(similarity_scores)
percent_strong_matches = (len([s for s in similarity_scores if s > 0.7]) / len(similarity_scores)) * 100

teacher_vectors_normalized = teacher_vectors / np.linalg.norm(teacher_vectors, axis=1, keepdims=True)
merged_base_vectors_normalized = merged_base_vectors / np.linalg.norm(merged_base_vectors, axis=1, keepdims=True)

topic_index = faiss.IndexFlatIP(merged_base_vectors_normalized.shape[1])
topic_index.add(merged_base_vectors_normalized)
_, topic_assignments = topic_index.search(teacher_vectors_normalized, k=1)

num_topics_covered = len(np.unique(topic_assignments))
total_num_topics = len(merged_base_vectors)
thematic_coverage_score = (num_topics_covered / total_num_topics) * 100

base_word_count = len(re.findall(r'\b[a-z]+\b', base_text.lower()))
teacher_word_count = len(re.findall(r'\b[a-z]+\b', test_text.lower()))
base_fk_grade = textstat.flesch_kincaid_grade(base_text)
teacher_fk_grade = textstat.flesch_kincaid_grade(test_text)
base_wpm = (base_word_count / base_duration_sec) * 60 if base_duration_sec > 0 else 0
teacher_wpm = (teacher_word_count / teacher_duration_sec) * 60 if teacher_duration_sec > 0 else 0

report_data = {
    "Feature": ["Semantic Score (Avg)", "% Sentences > 70% Match", "Thematic Coverage", "Readability (F-K Grade)", "Pace (WPM)"],
    "Base Lecture": ["---", "---", "---", f"{base_fk_grade:.2f}", f"{base_wpm:.2f}"],
    "Teacher Lecture": ["---", "---", "---", f"{teacher_fk_grade:.2f}", f"{teacher_wpm:.2f}"],
    "Comparison Score": [f"{avg_semantic_score:.4f}", f"{percent_strong_matches:.2f}%", f"{thematic_coverage_score:.2f}%", "---", "---"]
}
report_df = pd.DataFrame(report_data)
print("Report assembled.")

weights = {
    "Semantic": 0.40, "Coverage": 0.25, "Thematic": 0.15,
    "Clarity": 0.10, "Pacing": 0.10
}

clarity_score = min(base_fk_grade / teacher_fk_grade, 1.0) if teacher_fk_grade > 0 else 0
pacing_score = min(teacher_wpm / base_wpm, 1.0) if base_wpm > 0 else 0

final_score = (avg_semantic_score * weights["Semantic"] + 
               (percent_strong_matches / 100) * weights["Coverage"] +
               (thematic_coverage_score / 100) * weights["Thematic"] +
               clarity_score * weights["Clarity"] +
               pacing_score * weights["Pacing"])

final_score_out_of_10 = final_score * 10
print("Final score calculated.")

print("\n" + "="*50)
print("="*50)
print(report_df.to_string(index=False))
print("\n" + "="*50)
print(" Final Teacher Rating ")
print(f"The final calculated score for the teacher is: {final_score_out_of_10:.2f} / 10")
print("="*50)


Final Metrics 


NameError: name 'faiss_symscore' is not defined




[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip
