In [None]:
%pip install langchain faiss-cpu tiktoken
%pip install sentence-transformers
%pip install -U langchain-community

In [256]:
import os
import getpass

from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS 


# Location of the plain-text transcript, relative to the notebook's working directory.
file_path = "output.txt"

# Read the file as UTF-8; the loaded result is indexed below as a sequence of documents.
loader = TextLoader(file_path, encoding="utf-8")
documents = loader.load()

# Sanity check: show the first 600 characters of the first loaded document.
print(documents[0].page_content[:600])

She isn't coming yet, Toto. Did she hurt you? She tried to, didn't she? Come on - we'll go tell Uncle Henry and Auntie Em. Come on, Toto.
Aunt Em!
Dorothy!
AUNT EM
Fifty-seven, fifty-eight --
Dorothy, please! We're trying to count!
Fifty-eight--
Oh, but Aunt Em, she hit him over the --  

Don't bother us now, honey -- this old incubator's gone bad, and we're likely to lose a lot of our chicks.

Oh -- oh, the poor little things. Oh, but Aunt Em, Miss Gulch hit Toto right over the back with a rake just because she says he gets in her garden and chases her nasty old cat every day.

Seventy --    


In [257]:
# Full transcript text of the first loaded document.
script = documents[0].page_content
add = False

# Map each zero-based line number to its whitespace-trimmed text followed by
# exactly one trailing space (blank lines therefore become a single space).
scriptDict = {
    line_no: raw_line.strip() + " "
    for line_no, raw_line in enumerate(script.splitlines())
}
print(len(scriptDict))
    

3059


In [None]:
# Echo the whole line-number -> text mapping for manual inspection.
scriptDict

In [258]:
# Sentence-embedding model used for every similarity computation in this notebook.
modelPath = "sentence-transformers/all-mpnet-base-v2"

# Run the model on the CPU and leave the returned vectors un-normalised
# (the cells below divide by the vector norms themselves).
model_kwargs = dict(device="cpu")
encode_kwargs = dict(normalize_embeddings=False)

embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

In [259]:
import json

# Load the scene records; each record is later read through its 'subtitle' key.
# The encoding is pinned to UTF-8 (the same encoding used for the transcript)
# so the result does not depend on the platform's default locale encoding.
with open('combined_scenes.json', 'r', encoding='utf-8') as file:
    scenes = json.load(file)
    

In [None]:
# Echo the whole line-number -> text mapping for manual inspection.
scriptDict

In [260]:
# Transcript lines in their original order (scriptDict is keyed 0..len-1).
transcript_text = [scriptDict[line_no] for line_no in range(len(scriptDict))]

# Embed all lines in one batched call instead of one model invocation per line.
# The empty-input guard keeps the "no lines -> no vectors" behaviour without
# calling the model at all.
transcript_vectors = (
    embeddings.embed_documents(transcript_text) if transcript_text else []
)

# One vector per scene, computed from the scene's subtitle text, same order as `scenes`.
scene_subtitles = [scene['subtitle'] for scene in scenes]
scene_vectors = (
    embeddings.embed_documents(scene_subtitles) if scene_subtitles else []
)
    


In [None]:
# Subtitle text of every scene, in scene order.
scene_arr = [scene['subtitle'] for scene in scenes]

# Flatten both sources into single strings: join with spaces, trim the ends,
# then drop every backslash character.
scene_text = " ".join(scene_arr).strip().replace("\\", "")
trans_text = " ".join(transcript_text).strip().replace("\\", "")


In [None]:
# Print the flattened transcript string in full for manual inspection.
print(trans_text)

In [None]:
# Keep an independent snapshot of the transcript lines so the list can be
# restored after a later cell has modified `transcript_text`.
original_transcript_text = list(transcript_text)

Kaushik's Thoughts
Old Approach

In [None]:
from numpy import dot
from numpy.linalg import norm

# Greedy matching of scenes to consecutive transcript chunks.
#
# For every scene (in order) the chunk starts at the first not-yet-consumed
# transcript line and grows one line at a time. Growth continues while the
# cosine similarity between the scene vector and the embedding of the joined
# chunk does not fall more than a tolerance below the last accepted similarity.
# The accepted chunk is then removed from the front of the working list and the
# next scene continues from there.
#
# Results are stored in `res[i]`, a list with at most one dict per scene.

context_length = 10  # number of extra lines kept on each side of a chunk as context
res = {}
cache = {}

# Work on a private copy so the notebook-level `transcript_text` and
# `transcript_vectors` are left untouched. This keeps the cell re-runnable and
# stops later cells from seeing a truncated transcript. Only the text is needed
# here: the per-line vectors were only ever used for their length.
remaining_text = list(transcript_text)
# Lines already consumed by earlier scenes. The reported chunk indices are
# relative to the remaining list, so `offset + index` is the original line number.
offset = 0

for i, scene_vector in enumerate(scene_vectors):
    print("i: " + str(i))
    print("Scene: " + scenes[i]['subtitle'])
    scene_norm = norm(scene_vector)
    best_similarity = 0.0
    start = None
    end = None
    res[i] = []

    j = 0
    while j < len(remaining_text):
        print("j: " + str(j))
        if start is None:
            # First iteration: the chunk always begins at the first remaining line.
            start = j
            end = j
        lines = remaining_text[start:j + 1]
        key = (start, j + 1)
        # NOTE(review): within one scene `start` is fixed and `j` only grows, and
        # the cache is cleared after every matched scene, so this lookup does not
        # appear to ever hit. Kept as-is.
        if key in cache:
            possible_vector = cache[key]
        else:
            possible_text = " ".join(lines).strip()
            print("possible text:" + possible_text)
            possible_vector = embeddings.embed_query(possible_text)
            cache[key] = possible_vector

        possible_norm = norm(possible_vector)
        if scene_norm == 0.0 or possible_norm == 0.0:
            # Cosine similarity is undefined for a zero vector; treat it as no similarity.
            current_similarity = 0.0
        else:
            current_similarity = dot(scene_vector, possible_vector) / (scene_norm * possible_norm)
            print("Current similarity: " + str(current_similarity))

        # Allow a smaller drop once the match is already very good.
        tol = 0.01 if best_similarity > 0.9 else 0.05

        if current_similarity + tol >= best_similarity:
            # Accept the longer chunk. Note that the stored value follows the
            # current similarity even when it is slightly lower than before.
            best_similarity = current_similarity
            print("Best similarity: " + str(best_similarity))
            end = j
            j += 1
        else:
            break

    if start is not None and end is not None:
        final_text_list = remaining_text[start:end + 1]
        final_text = " ".join(final_text_list).strip()

        contextual_start = max(0, start - context_length)
        contextual_end = min(len(remaining_text) - 1, end + context_length)

        context_list = remaining_text[contextual_start:contextual_end + 1]
        context_text = " ".join(context_list).strip()

        result = {
            "chunk start": start,
            "chunk end": end,
            "best similarity": best_similarity,
            "subtitle": scenes[i]['subtitle'],
            "text": final_text,
            "context": context_text,
            "range": (contextual_start, contextual_end),
            # Add this to "chunk start" / "chunk end" / "range" to obtain
            # line numbers in the full transcript.
            "offset": offset,
        }

        res[i].append(result)
        # Drop the consumed lines so the next scene starts right after this chunk.
        offset += end + 1
        remaining_text = remaining_text[end + 1:]
        cache = {}


In [None]:
%pip install fuzzywuzzy

Kaushik's Thoughts

Dynamic Derivative Similarity Approach

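$T(s) = k \cdot (1 - s)^{\alpha}$, where $s$ is the best similarity found so far and $T(s)$ is the largest drop in similarity between two consecutive steps that is tolerated before the search for the end of the chunk stops.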

In [None]:
# Reset the transcript lines from the saved snapshot before the next experiment.
# Only the text list is reset here; `transcript_vectors` is left as it is.
transcript_text = list(original_transcript_text)

In [None]:
# Echo the current list of transcript lines for manual inspection.
transcript_text

In [None]:
from numpy import dot
from numpy.linalg import norm
from fuzzywuzzy import fuzz
import statistics

# Scene-to-transcript matching with a similarity-dependent tolerance.
#
# For every scene:
#   1. Fuzzy-match the first ten words of the subtitle against sliding windows
#      of transcript lines to pick a starting line (fall back to line 0).
#   2. Grow the chunk one line at a time, tracking the cosine similarity between
#      the scene vector and the embedding of the joined chunk. Stop when the
#      similarity drops by more than `tolerance`, or when it has failed to
#      improve for more than `patience` consecutive steps.
#   3. Try to improve the match by dropping lines from the front of the chunk.
# Results are stored in `res[i]`; the best similarity of every matched scene is
# appended to `subtitle_scores`.

context_length = 10  # number of extra lines kept on each side of a chunk as context
res = {}
cache = {}

# Tolerance curve: tolerance = k * (1 - best_similarity) ** alpha
k = 0.9
alpha = 1.7
min_allowed_increment = 0.0005  # a smaller step-to-step gain counts as "no improvement"
patience = 20                   # allowed consecutive non-improving steps

window_length = 6      # number of transcript lines per fuzzy-matching window
fuzzy_threshold = 75   # minimum partial_ratio score to accept a start line

# Work on a private copy so the notebook-level transcript is left untouched and
# the cell can be re-run. The text list also drives every loop bound, so the
# search no longer depends on the length of `transcript_vectors`.
remaining_text = list(transcript_text)
# Lines dropped from the front so far. The reported chunk indices are relative
# to the remaining list, so `offset + index` is the original line number.
offset = 0

subtitle_scores = []
for i, scene_vector in enumerate(scene_vectors):
    print("i: " + str(i))
    print("Scene: " + scenes[i]['subtitle'])
    scene_norm = norm(scene_vector)
    best_similarity = 0.0
    start = None
    end = None
    res[i] = []
    prev_similarity = 0.0
    current_similarity = 0.0
    no_improvement = 0

    # Step 1: locate a starting line with fuzzy matching on the subtitle's first words.
    subtitle_start = " ".join(scenes[i]['subtitle'].split()[:10])
    found_start = None
    for j in range(0, len(remaining_text) - window_length + 1):
        window_text = " ".join(remaining_text[j:j+window_length])
        ratio = fuzz.partial_ratio(subtitle_start.lower(), window_text.lower())
        if ratio >= fuzzy_threshold:
            found_start = j
            break
    if found_start is not None:
        start = found_start
        j = start
        print("fuzzy")
    else:
        start = 0
        j = 0
        print("No fuzzy")

    # Step 2: grow the chunk line by line.
    while j < len(remaining_text):
        print("j: " + str(j))
        lines = remaining_text[start:j + 1]
        key = (start, j + 1)
        if key in cache:
            possible_vector = cache[key]
        else:
            possible_text = " ".join(lines).strip()
            print("possible text: " + possible_text)
            possible_vector = embeddings.embed_query(possible_text)
            cache[key] = possible_vector

        possible_norm = norm(possible_vector)
        if scene_norm == 0.0 or possible_norm == 0.0:
            # Cosine similarity is undefined for a zero vector; treat it as no similarity.
            prev_similarity = current_similarity
            current_similarity = 0.0
        else:
            prev_similarity = current_similarity
            current_similarity = dot(scene_vector, possible_vector) / (scene_norm * possible_norm)
            print("Current similarity: " + str(current_similarity))

        # The better the best match so far, the smaller the drop that is tolerated.
        tolerance = k * (1 - best_similarity) ** alpha
        delta = current_similarity - prev_similarity

        if delta < 0 and abs(delta) > tolerance:
            break
        else:
            # NOTE(review): if transcript lines always carry a trailing space,
            # the `!= ""` test is always true and blank lines are counted too.
            # Confirm whether blank lines were meant to be treated differently.
            if delta < min_allowed_increment and remaining_text[j] != "":
                no_improvement += 1
            else:
                no_improvement = 0
            if no_improvement > patience:
                break

            if current_similarity > best_similarity:
                best_similarity = current_similarity
                end = j
            j += 1

    # Step 3: drop lines from the front of the chunk while that improves the
    # similarity; stop once it gets worse by more than the last tolerance.
    # The loop variable must not be called `k`: that name holds the tolerance
    # hyperparameter, and overwriting it would corrupt every later scene.
    # Starting at `start + 1` skips re-embedding the untrimmed chunk, whose
    # similarity is already `best_similarity` and so cannot change the outcome.
    if end is not None:
        for trim_start in range(start + 1, end):
            lines = remaining_text[trim_start:end + 1]
            lines_vector = embeddings.embed_query(" ".join(lines).strip())
            lines_norm = norm(lines_vector)
            if scene_norm == 0.0 or lines_norm == 0.0:
                # Same zero-vector guard as in the growth loop above.
                trimmed_similarity = 0.0
            else:
                trimmed_similarity = dot(scene_vector, lines_vector) / (scene_norm * lines_norm)
            if trimmed_similarity > best_similarity:
                best_similarity = trimmed_similarity
                start = trim_start
            elif best_similarity - trimmed_similarity > tolerance:
                break

    if start is not None and end is not None:
        final_text_list = remaining_text[start:end + 1]
        final_text = " ".join(final_text_list).strip()

        contextual_start = max(0, start - context_length)
        contextual_end = min(len(remaining_text) - 1, end + context_length)

        context_list = remaining_text[contextual_start:contextual_end + 1]
        context_text = " ".join(context_list).strip()

        result = {
            "chunk start": start,
            "chunk end": end,
            "best similarity": best_similarity,
            "subtitle": scenes[i]['subtitle'],
            "text": final_text,
            "context": context_text,
            "range": (contextual_start, contextual_end),
            # Add this to "chunk start" / "chunk end" / "range" to obtain
            # line numbers in the full transcript.
            "offset": offset,
        }

        res[i].append(result)
        # Keep the last 30 lines before the chunk end so the next scene may
        # overlap with the tail of this one.
        dropped = max(end - 30, 0)
        offset += dropped
        remaining_text = remaining_text[dropped:]
        cache = {}
        subtitle_scores.append(best_similarity)


In [None]:
# Echo the best similarity recorded for each matched scene.
subtitle_scores

In [None]:
# Summarise the per-scene best similarities: central tendency, spread, and the
# share of scenes reaching each similarity threshold. Nothing is printed when
# no scene produced a score.
if subtitle_scores:
    avg_score = statistics.mean(subtitle_scores)
    median_score = statistics.median(subtitle_scores)
    # Use pstdev for population standard deviation (all scenes considered), or stdev for sample std dev
    stdev_score = statistics.pstdev(subtitle_scores) if len(subtitle_scores) > 1 else 0.0

    # Calculate how many scenes reach the given thresholds
    total_scenes = len(subtitle_scores)
    above_0_7 = sum(1 for s in subtitle_scores if s >= 0.7)
    above_0_8 = sum(1 for s in subtitle_scores if s >= 0.8)
    perc_above_0_7 = (above_0_7 / total_scenes) * 100
    perc_above_0_8 = (above_0_8 / total_scenes) * 100

    above_0_5 = sum(1 for s in subtitle_scores if s >= 0.5)
    # The threshold must match the ">= 0.6" label printed below.
    above_0_6 = sum(1 for s in subtitle_scores if s >= 0.6)
    perc_above_0_5 = (above_0_5 / total_scenes) * 100
    perc_above_0_6 = (above_0_6 / total_scenes) * 100

    # Print a clear performance summary
    print("\n=== Performance Summary ===")
    print(f"Total scenes processed: {total_scenes}")
    print(f"Average similarity score: {avg_score:.3f}")
    print(f"Median similarity score: {median_score:.3f}")
    print(f"Standard deviation of similarity scores: {stdev_score:.3f}")
    print(f"Scenes with similarity >= 0.5: {above_0_5}/{total_scenes} "
          f"({perc_above_0_5:.1f}%)")
    print(f"Scenes with similarity >= 0.6: {above_0_6}/{total_scenes} "
          f"({perc_above_0_6:.1f}%)")
    print(f"Scenes with similarity >= 0.7: {above_0_7}/{total_scenes} "
          f"({perc_above_0_7:.1f}%)")
    print(f"Scenes with similarity >= 0.8: {above_0_8}/{total_scenes} "
          f"({perc_above_0_8:.1f}%)")
    print("===========================\n")

In [None]:
from numpy import dot
from numpy.linalg import norm

s1 = ''' 
        Someplace where there isn't any trouble. Do you suppose there is such a place, Toto? There must be. It's not a place you can get to by a boat or a train. It's far, far away. Behind the moon... ...beyond the rain.... [SINGlNG] Somewhere over the rainbow Way up high There's a land that l heard of Once in a lullaby Somewhere over the rainbow Skies are blue And the dreams that you dare to dream Really do come true
'''
s2 = '''
DOROTHY  Some place where there isn't any trouble.        (CONTINUED)    8.  CONTINUED: (7)      MS -- Dorothy and Toto -- she tosses him a piece of the  cruller -- Toto eats it -- Dorothy speaks as she walks  forward -- she sings -- leans against haystack -- then walks  over near rake -- CAMERA PANS right --    DOROTHY (CONT'D)  Do you suppose there is such a place,  Toto? There must be. It's not a place  you can get to by a boat or a train.  It's far, far away -- behind the moon --  beyond the rain --    DOROTHY (CONT'D)  (sings)  Somewhere, over the rainbow, way up high,  There's a land that I heard of once in a  lullaby.    Somewhere, over the rainbow, skies are blue, And the dreams  that you dare to dream really do.... CS -- Toto by wheel of  rake -- listening to song -- DOROTHY o.s. (sings) ...come  true.... MCS -- Dorothy singing -- swings on wheel of rake --  then walks forward around wheel -- Toto jumps up onto seat of  rake -- Dorothy pets him -- sits on front of rake -- CAMERA  PULLS back -- Dorothy finishes song --    DOROTHY (CONT'D)  (sings)  ...Someday I'll wish upon a star And wake  up where the clouds are far behind me.    Where troubles melt like lemon drops, Away above the chimney  tops, That's where you'll find me. Somewhere, over the  rainbow, bluebirds fly. Birds fly over the rainbow, Why then -  - oh, why can't I? If happy little bluebirds fly Beyond the  rainbow Why, oh, why can't I? LS -- Miss Gulch rides along  country road on bicycle -- CAMERA PANS to right with her --  LS -- Miss Gulch rides forward to front of Gale's home --  stops and gets off her bicycle as Uncle Henry comes forward
'''


# Embed the two sample passages and print their cosine similarity.
one, two = (embeddings.embed_query(passage) for passage in (s1, s2))

dp = dot(one, two)             # dot product of the two embeddings
nm = norm(one) * norm(two)     # product of their Euclidean norms
ratio = dp / nm

print(ratio)

Unbalanced Optimal Transport

In [None]:
def tokenize(text):
    """Return the word tokens of ``text`` in order of appearance.

    A token is a maximal run of ASCII letters, digits and apostrophes, so a
    contraction such as "isn't" stays in one piece while every other character
    (punctuation, whitespace, ...) acts as a separator.
    """
    import re
    word_pattern = re.compile(r"[A-Za-z0-9']+")
    return [match.group(0) for match in word_pattern.finditer(text)]

def align_subtitle_transcript(subtitle_text, transcript_text):
    """Align subtitle words with transcript words by minimum edit cost.

    Both texts are split with ``tokenize`` and aligned with a dynamic program
    in which a case-insensitive match costs 0 and a substitution, a skipped
    subtitle word, or a skipped transcript word each cost 1. When several moves
    tie, a match/substitution is preferred, then skipping the subtitle word,
    then skipping the transcript word.

    Args:
        subtitle_text: Subtitle text as a single string.
        transcript_text: Transcript text as a single string.

    Returns:
        A list of ``(subtitle_word, transcript_word)`` pairs in reading order.
        ``None`` on one side means the word on the other side was aligned to a
        gap. Words are returned with their original casing.
    """
    s_words = tokenize(subtitle_text)
    t_words = tokenize(transcript_text)
    n, m = len(s_words), len(t_words)

    # Lower-case each word once instead of on every cell of the n*m table.
    s_keys = [word.lower() for word in s_words]
    t_keys = [word.lower() for word in t_words]

    # Move codes stored per cell for the back-tracking pass. One byte per cell
    # replaces a tuple per cell; the predecessor coordinates follow from the code.
    DIAG, UP, LEFT = 0, 1, 2
    moves = [bytearray(m + 1) for _ in range(n + 1)]
    for j in range(1, m + 1):
        moves[0][j] = LEFT  # empty subtitle prefix: every transcript word is skipped

    # Only the previous and the current cost row are needed to fill the table.
    # prev_row[j] is the cost of aligning s_words[:i-1] with t_words[:j].
    prev_row = list(range(m + 1))
    for i in range(1, n + 1):
        row_moves = moves[i]
        row_moves[0] = UP  # empty transcript prefix: every subtitle word is skipped
        cur_row = [i] + [0] * m
        s_key = s_keys[i - 1]
        for j in range(1, m + 1):
            # 1. Match/substitute s_words[i-1] with t_words[j-1]
            best_cost = prev_row[j - 1] + (0 if s_key == t_keys[j - 1] else 1)
            best_move = DIAG
            # 2. Skip s_words[i-1] (strictly cheaper only, to keep the tie order)
            cost_up = prev_row[j] + 1
            if cost_up < best_cost:
                best_cost = cost_up
                best_move = UP
            # 3. Skip t_words[j-1] (strictly cheaper only)
            cost_left = cur_row[j - 1] + 1
            if cost_left < best_cost:
                best_cost = cost_left
                best_move = LEFT
            cur_row[j] = best_cost
            row_moves[j] = best_move
        prev_row = cur_row

    # Walk back from the bottom-right cell to the origin, collecting pairs.
    alignment = []
    i, j = n, m
    while i > 0 or j > 0:
        move = moves[i][j]
        if move == DIAG:
            # Subtitle word aligned to transcript word (match or substitution)
            alignment.append((s_words[i - 1], t_words[j - 1]))
            i -= 1
            j -= 1
        elif move == UP:
            # Subtitle word aligned to nothing
            alignment.append((s_words[i - 1], None))
            i -= 1
        else:
            # Transcript word aligned to nothing
            alignment.append((None, t_words[j - 1]))
            j -= 1
    alignment.reverse()  # collected end-to-start; return start-to-end
    return alignment

# Example usage: align the flattened subtitle text with the flattened
# transcript, print every aligned pair, then print how many pairs have a gap
# on either side.
subtitle = scene_text
transcript = trans_text
alignment = align_subtitle_transcript(subtitle, transcript)

for pair in alignment:
    print(pair)

count = sum(
    1
    for subtitle_word, transcript_word in alignment
    if subtitle_word is None or transcript_word is None
)
print(count)


In [None]:
# Report the total number of aligned pairs and how many of them contain a gap.
# A bare expression is only echoed when it is the last statement of a cell, so
# the length is printed explicitly; otherwise it would never be shown.
print(len(alignment))
count = sum(1 for pair in alignment if pair[0] is None or pair[1] is None)
print(count)