In [None]:
!pip install nltk

## Metadata Extraction - Case Study 01


### Importing the required packages 

In [None]:
import os
import torch
import time
import json
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from sklearn.metrics.pairwise import cosine_similarity
# import pandas as pd
import re
nltk.download('punkt')
nltk.download('punkt_tab')

### Structure of Metadata 
The schema was originally provided as a CSV file. It has been converted to JSON and extended with a detailed description of each key, so that the LLM can parse it more reliably when it is passed as input.

In [None]:
import json

# JSON file that maps every metadata key to its description.
schema_file = "metadata_schema.json"

# Read the whole file and parse it into a dictionary.
with open(schema_file, mode="r", encoding="utf-8") as f:
    metadata_schema = json.loads(f.read())

In [None]:
metadata_schema

### Structure Of Transcripts

In [None]:
# Load every CSV transcript once to check that it parses; the last one loaded
# is displayed below as an example of the transcript structure.
folder_path = "Transcripts"
input_data = None  # stays None (nothing displayed) when the folder holds no CSV
# Sorted so that the previewed file is the same on every run.
for filename in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, filename)
    # Logical `and` (not bitwise `&`) so the suffix test is skipped for non-files.
    if os.path.isfile(file_path) and filename.endswith(".csv"):
        print(f"\nProcessing file: {filename}")
        # sep=None lets the python engine sniff the delimiter of each file.
        input_data = pd.read_csv(file_path, sep=None, engine='python')
        print("\nFile Loaded!")
input_data

# Loading the Model

This cell describes how the "Llama-3.3-70B-Instruct" model is loaded.


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

# Local directory holding the model weights and tokenizer files.
model_path = "models/meta-llama/Llama-3.3-70B-Instruct"

# 4-bit weight quantization. The compute dtype is set explicitly so that the
# de-quantized matmuls run in bfloat16, matching `torch_dtype` below, instead
# of the float32 default of BitsAndBytesConfig.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# `trust_remote_code` is intentionally not enabled: the Llama architecture is
# implemented in transformers itself, so no code shipped with the checkpoint
# needs to be executed.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",  # let accelerate place the layers on the available devices
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,
)

tokenizer = AutoTokenizer.from_pretrained(model_path)

# Text-generation pipeline used by the metadata extraction cells below.
metadata_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)


# Transcript Chunking
1. **Split Text into Paragraphs**
    - We first break the transcript into paragraphs (or small chunks) so that each paragraph focuses on a relatively coherent piece of text.
2. **Compute Paragraph Similarities**
    - We convert each paragraph into a TF-IDF vector.
    - We then calculate the cosine similarity between each consecutive pair of paragraphs.
3. **Detect Dips (Topic Boundaries)**
    - Whenever the similarity between paragraph *i* and paragraph *i+1* is a **local minimum**, i.e. strictly lower than both the preceding and the following similarity score, we mark that as a boundary.
    - No threshold has to be tuned. The first and the last similarity score have only one neighbour and are therefore never treated as boundaries.
4. **Create Final Segments**
    - Merge paragraphs from one boundary to the next into a single segment.
    - Each segment should then represent a coherent portion of the text before a significant topic shift occurs.

In [None]:
def split_into_paragraphs(text, sentences_per_paragraph=10):
    """
    Group the sentences of `text` into paragraphs of a fixed size.

    The text is sentence-tokenized and every run of `sentences_per_paragraph`
    consecutive sentences is joined with single spaces into one paragraph.
    The last paragraph may hold fewer sentences. Returns a list of strings.
    """
    all_sentences = sent_tokenize(text)
    chunk_starts = range(0, len(all_sentences), sentences_per_paragraph)
    return [
        " ".join(all_sentences[start:start + sentences_per_paragraph])
        for start in chunk_starts
    ]

In [None]:
def segment_text_by_local_minima(text, sentences_per_paragraph=10):
    """
    Cut a transcript into topical segments.

    The text is grouped into fixed-size paragraphs, each paragraph is turned
    into a TF-IDF vector, and a new segment starts wherever the cosine
    similarity of two neighbouring paragraphs is strictly lower than the
    similarity scores immediately before and after it. The result is a
    DataFrame with a single 'Segment' column holding each segment's text.
    """
    paragraphs = split_into_paragraphs(text, sentences_per_paragraph)
    paragraph_count = len(paragraphs)

    # Nothing to compare: hand the untouched text back as the only segment.
    if paragraph_count <= 1:
        return pd.DataFrame({"Segment": [text]})

    tfidf_matrix = TfidfVectorizer().fit_transform(paragraphs)

    # similarities[k] compares paragraph k with paragraph k + 1.
    similarities = [
        cosine_similarity(tfidf_matrix[k], tfidf_matrix[k + 1])[0][0]
        for k in range(tfidf_matrix.shape[0] - 1)
    ]

    # A dip at position k opens a new segment at paragraph k + 1. The first and
    # last scores have only one neighbour and are never considered.
    cut_points = [0]
    cut_points.extend(
        k + 1
        for k in range(1, len(similarities) - 1)
        if similarities[k - 1] > similarities[k] < similarities[k + 1]
    )
    cut_points.append(paragraph_count)

    # Glue the paragraphs between consecutive cut points back together.
    segments = [
        " ".join(paragraphs[begin:finish])
        for begin, finish in zip(cut_points, cut_points[1:])
    ]
    return pd.DataFrame({"Segment": segments})


# Metadata Extraction with Context

1. **Schema Loading and Parsing**  
   - The function accepts a metadata schema either as a dictionary or as a file path.  
   - If a file path is provided, it loads and parses the JSON schema, setting the structure for the metadata to be extracted.

2. **Segment Processing**  
   - Iterates through each row of a DataFrame containing the transcript segments (column `Segment`).  
   - For every segment, a detailed prompt is constructed that includes a description of the extraction task, previously aggregated context, the JSON-formatted metadata schema, and the transcript text.

3. **Context-Aware Metadata Extraction**  
   - The prompt is fed to a text-generation pipeline that returns a response expected to contain a JSON object with the extracted metadata.  
   - A regular expression is used to isolate the JSON from the response, which is then parsed into a Python dictionary.

4. **Aggregating Context**  
   - Valid metadata values (those not marked as "???") are appended to an aggregated context string.  
   - This cumulative context is injected into subsequent prompts to guide the extraction process by leveraging information from previous segments.

5. **Final Output Construction**  
   - The metadata from each segment is collected into a list and then converted into a pandas DataFrame.  
   - This DataFrame provides a structured and consolidated view of all extracted metadata across the transcript segments.


In [None]:
def _first_json_object(response_text):
    """Return the first decodable JSON object embedded in `response_text`.

    Every '{' in the text is tried as a possible start of the object, so
    explanatory text (even text containing braces) before or after the JSON
    does not prevent parsing.

    Returns:
        The decoded dict, or None when the text contains no '{' at all.

    Raises:
        json.JSONDecodeError: when braces are present but no position decodes;
            the error of the first attempt is re-raised.
    """
    decoder = json.JSONDecoder()
    first_error = None
    for brace in re.finditer(r"\{", response_text):
        try:
            # raw_decode stops at the end of the object and ignores what follows.
            candidate, _ = decoder.raw_decode(response_text, brace.start())
        except json.JSONDecodeError as err:
            if first_error is None:
                first_error = err
            continue
        return candidate
    if first_error is not None:
        raise first_error
    return None


def extract_metadata_with_context(metadata_pipeline, segments_df: pd.DataFrame, metadata_schema: dict) -> pd.DataFrame:
    """Extract metadata from each transcript segment, feeding earlier results forward.

    Args:
        metadata_pipeline: Callable invoked as
            ``metadata_pipeline(prompt, max_new_tokens=..., return_full_text=False,
            temperature=...)``; its result is read as ``[0]["generated_text"]``.
        segments_df: DataFrame with a 'Segment' column holding the segment texts.
        metadata_schema: Mapping of metadata key -> description, or the path of a
            JSON file containing that mapping.

    Returns:
        A DataFrame with one row per segment. Values are converted to strings
        with double quotes removed, and '???' placeholders become empty strings.
        A segment whose response could not be generated or parsed yields an
        empty row. An empty DataFrame is returned when a schema path cannot be
        loaded.
    """
    all_metadata = []

    if isinstance(metadata_schema, str):
        try:
            with open(metadata_schema, "r", encoding="utf-8") as f:
                metadata_schema = json.load(f)
        except Exception as e:
            print(f"❌ Error loading metadata schema: {e}")
            return pd.DataFrame()

    schema_json = json.dumps(metadata_schema, indent=2)

    # Values found so far, one "key: value" line each; injected into every
    # following prompt.
    aggregated_context = ""

    # Number segments by position so that any DataFrame index (filtered,
    # non-integer, ...) works in the log messages.
    for segment_number, segment_text in enumerate(segments_df["Segment"], start=1):
        prompt = f"""
        You are an AI model specialized in extracting structured metadata from German interview transcripts.
        Below you have an interview transcript. Please analyze the interview transcript from the interviewee’s perspective.
        The provided transcript covers the interviewee's life experiences, including positions held, work-related events, and family interactions.
        Extract all relevant details from the text.
        For any detected date with a two-digit year, convert it to a four-digit year by assuming it falls in the 1900s (e.g., "67" becomes "1967").
        You will receive a JSON metadata object with predefined keys as input and values as the description of the keys.
        Carefully analyze the transcript and extract detailed information to populate as many keys as possible.
        Only assign '???' to a key if there is no explicit evidence in the transcript for that field.
        Return only a valid JSON object containing the keys with their corresponding extracted values.
        Do not include any additional text, markdown formatting, or explanations.
                        
        Aggregated Context from previous segments:
        {aggregated_context}

        Metadata Schema:
        {schema_json}

        Transcript:
        {segment_text}

        JSON-Antwort:
        """

        try:
            response_text = metadata_pipeline(prompt, max_new_tokens=1800, return_full_text=False, temperature=0.2)[0]["generated_text"]
        except Exception as e:
            # Best effort: a failed generation yields an empty row, not a crash.
            print(f"❌ Error generating response for segment {segment_number}: {e}")
            response_text = "[]"  # Contains no JSON object

        print(f"\n🔍 DEBUG: Raw AI Response for segment {segment_number}:\n{response_text}\n")

        # 1. Isolate and parse the JSON object contained in the response
        extracted_metadata = {}
        try:
            parsed = _first_json_object(response_text)
        except json.JSONDecodeError as e:
            print(f"JSON parsing error: {e}")
        else:
            if parsed is None:
                print("No JSON object found in the response.")
            else:
                extracted_metadata.update(parsed)

        # 2. Update aggregator with every schema key that received a real value
        for key in metadata_schema.keys():
            val = extracted_metadata.get(key, "???")
            if val != "???":
                aggregated_context += f"{key}: {val}\n"

        # 3. Normalise the values for the final DataFrame: stringify, drop
        #    double quotes, and blank out '???' placeholders.
        extracted_metadata = {
            k: (str(v).replace('"', "") if str(v).replace('"', "").replace(",", "") != "???" else "")
            for k, v in extracted_metadata.items()
        }
        all_metadata.append(extracted_metadata)
    return pd.DataFrame(all_metadata)


# Main Function for Metadata Extraction from Transcripts

This main function consolidates the entire workflow for processing transcript files, extracting structured metadata using an AI model, and aggregating the results into a final DataFrame. The process is as follows:

1. **Schema Loading:**  
   The metadata schema is loaded from a JSON file. This schema defines the expected keys and structure for the metadata to be extracted from the transcripts.

2. **File Iteration and Data Loading:**  
   The function iterates through all CSV files in the "Transcripts" folder. For each file, it reads the transcript data and joins the non-empty entries of the "Transkript" column into a single text.

3. **Transcript Segmentation:**  
   The transcript is segmented into smaller chunks using the `segment_text_by_local_minima` function, which identifies topic boundaries based on text similarity. A subset of these segments is processed to demonstrate metadata extraction.

4. **Context-Aware Metadata Extraction:**  
   The `extract_metadata_with_context` function is invoked for each segment subset. This function builds a detailed prompt incorporating both the current transcript segment and previously aggregated context, enabling the AI model to extract detailed metadata.

5. **Aggregation of Metadata:**  
   For each processed file, only the metadata row of the last processed segment is collected, because that segment was extracted with the aggregated context of all earlier segments in its prompt. These rows are then combined into a final DataFrame, providing a structured overview of the extracted information.

This approach ensures that metadata is not only extracted accurately from each segment but also benefits from contextual continuity across transcript segments.


In [None]:
import os
import time
import json
import pandas as pd
import re

# Set pandas display options
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


def main(max_segments=3):
    """
    Main pipeline for processing transcript CSV files:

    1. Loads a metadata schema from a JSON file.
    2. Iterates through CSV files in the 'Transcripts' folder (in sorted order).
    3. For each file, loads transcript data and segments it using a local minima method.
    4. Extracts metadata with context from the segmented transcript using an AI pipeline.
    5. Aggregates the metadata from all processed files into a final DataFrame.

    Args:
        max_segments: Number of leading segments per transcript that are sent
            to the model (default 3). Pass None to process every segment.

    Returns:
        final_metadata_df (pd.DataFrame): One row per transcript file that
        yielded metadata, with the file name in the 'Filename' column. Empty
        when nothing was extracted.
    """
    pd.set_option('display.max_rows', None)  # Show all rows
    pd.set_option('display.max_columns', None)  # Show all columns

    folder_path = "Transcripts"
    schema_file = "metadata_schema.json"

    # Load the metadata schema from JSON file
    with open(schema_file, "r", encoding="utf-8") as f:
        metadata_schema = json.load(f)

    all_metadata = []  # One single-row DataFrame per processed file

    # Sorted so that the row order of the result is reproducible.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        if not (os.path.isfile(file_path) and filename.endswith(".csv")):
            continue

        print(f"\nProcessing file: {filename}")
        # sep=None lets the python engine sniff the delimiter of each file.
        input_data = pd.read_csv(file_path, sep=None, engine='python')
        print("File Loaded!")

        # One malformed file must not abort a long-running batch.
        if "Transkript" not in input_data.columns:
            print(f"No 'Transkript' column in {filename}; skipping")
            continue

        # Join all non-empty transcript entries into one text and segment it
        transcript_text = " ".join(input_data["Transkript"].dropna().astype(str))
        segments_df = segment_text_by_local_minima(transcript_text, sentences_per_paragraph=10)
        if max_segments is None:
            segments_df_subset = segments_df
        else:
            segments_df_subset = segments_df.head(max_segments)
        llama_70b_responses = extract_metadata_with_context(metadata_pipeline, segments_df_subset, metadata_schema)

        if llama_70b_responses.empty:
            print(f"No metadata extracted from {filename}")
        else:
            # Keep only the last row: it was extracted with the context of all
            # earlier segments. `.assign` returns a new frame, so no column is
            # written into a slice of the responses (SettingWithCopyWarning).
            metadata_extracted = llama_70b_responses.tail(1).assign(Filename=filename)
            all_metadata.append(metadata_extracted)

        time.sleep(0.5)  # Optional pause between processing files

    # Combine all metadata into a single DataFrame
    if all_metadata:
        final_metadata_df = pd.concat(all_metadata, ignore_index=True)
    else:
        final_metadata_df = pd.DataFrame()

    return final_metadata_df

if __name__ == "__main__":
    final_metadata_df = main()


In [None]:
final_metadata_df