In [2]:
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

In [11]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('corpora')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Error loading corpora: Package 'corpora' not found in
[nltk_data]     index
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [13]:
class TranscriptSearch:
    def __init__(self, transcript_path):
        self.transcript_path = transcript_path
        self.chunks = self._preprocess_transcript()
        self.tfidf_vectorizer = TfidfVectorizer(stop_words='english')
        self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(self.chunks)
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.embeddings = self.embedding_model.encode(self.chunks)

    def _preprocess_transcript(self):
        """Parse and clean the transcript into chunks"""
        with open(self.transcript_path, 'r') as f:
            content = f.read()

        # Split into timestamped segments
        segments = re.findall(r'\[.*?\].*?(?=\[|$)', content, re.DOTALL)

        # Clean each segment
        cleaned_chunks = []
        for seg in segments:
            # Remove timestamps
            text = re.sub(r'\[.*?\]', '', seg).strip()
            # Remove extra whitespace
            text = re.sub(r'\s+', ' ', text)
            if text:
                cleaned_chunks.append(text)

        return cleaned_chunks

    def keyword_search(self, question, top_k=3):
        """Basic keyword matching search"""
        question = question.lower()
        question_words = set(word_tokenize(question))

        scores = []
        for chunk in self.chunks:
            chunk_words = set(word_tokenize(chunk.lower()))
            # Count matching words (simple intersection)
            score = len(question_words.intersection(chunk_words))
            scores.append(score)

        top_indices = np.argsort(scores)[-top_k:][::-1]
        return [(self.chunks[i], scores[i]) for i in top_indices if scores[i] > 0]

    def tfidf_search(self, question, top_k=3):
        """TF-IDF with cosine similarity search"""
        question_vec = self.tfidf_vectorizer.transform([question])
        similarities = cosine_similarity(question_vec, self.tfidf_matrix)
        top_indices = np.argsort(similarities[0])[-top_k:][::-1]
        return [(self.chunks[i], similarities[0][i]) for i in top_indices if similarities[0][i] > 0]

    def semantic_search(self, question, top_k=3):
        """Embedding-based semantic search"""
        question_embedding = self.embedding_model.encode([question])
        similarities = cosine_similarity(question_embedding, self.embeddings)
        top_indices = np.argsort(similarities[0])[-top_k:][::-1]
        return [(self.chunks[i], similarities[0][i]) for i in top_indices if similarities[0][i] > 0.3]

    def hybrid_search(self, question, top_k=3):
        """Combine results from all methods"""
        keyword_results = self.keyword_search(question, top_k)
        tfidf_results = self.tfidf_search(question, top_k)
        semantic_results = self.semantic_search(question, top_k)
          # Combine and deduplicate results
        all_results = {}
        for method, results in [('keyword', keyword_results),
                              ('tfidf', tfidf_results),
                              ('semantic', semantic_results)]:
            for text, score in results:
                if text not in all_results or score > all_results[text][1]:
                    all_results[text] = (method, score)

        # Sort by score
        sorted_results = sorted(all_results.items(), key=lambda x: -x[1][1])
        return [(text, f"{method} (score: {score:.2f})") for text, (method, score) in sorted_results[:top_k]]

def main():
    # Initialize with the transcript file
    search = TranscriptSearch('New_Employee_Induction_transcript.txt')

    print("Transcript Search System")
    print("Enter your question about employee induction (or 'quit' to exit):")

    while True:
        question = input("\nQuestion: ").strip()
        if question.lower() in ['quit', 'exit']:
            break

        if not question:
            print("Please enter a question.")
            continue

        print("\nSearch Results:")

        # Get hybrid results
        results = search.hybrid_search(question)

        if not results:
            print("No relevant results found.")
            continue

        for i, (text, method) in enumerate(results, 1):
            print(f"\nResult {i} ({method}):")
            print(text)

if __name__ == "__main__":
    main()





Transcript Search System
Enter your question about employee induction (or 'quit' to exit):

Question: what is employee induction?

Search Results:

Result 1 (keyword (score: 4.00)):
Beginning with the first topic, what is induction and what are the types of induction program?

Result 2 (keyword (score: 3.00)):
Employee induction is the program where the new employees introduce to the organization

Result 3 (keyword (score: 3.00)):
Don't forget to take the induction feedback for each new employee because this is helpful

Question: 1

Search Results:

Result 1 (semantic (score: 0.37)):
members.

Question: what are the types of induction progrms?

Search Results:

Result 1 (keyword (score: 7.00)):
Beginning with the first topic, what is induction and what are the types of induction program?

Result 2 (keyword (score: 4.00)):
Now let's see the types of induction process.

Result 3 (keyword (score: 4.00)):
Now moving on to the next part of the video that is what all topics are covered in th