In [1]:
! pip install nltk

Defaulting to user installation because normal site-packages is not writeable
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: nltk
Successfully installed nltk-3.9.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [1]:
#IMPORT STATEMENTS
import pandas as pd
import json
import nltk
from nltk.tokenize import sent_tokenize
import os
import transformers
import torch
from transformers import pipeline

nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
def chunk_transcript(data: pd.DataFrame) -> pd.DataFrame:
    
    # Ensure 'Transkript' column has no NaN values
    data['Transkript'] = data['Transkript'].fillna("").astype(str)

    # Chunking logic
    chunked_data = []
    current_speaker = None
    current_text = ""
    initial_timestamp = ""

    for index, row in data.iterrows():
        speaker = row['Sprecher']
        transcript = row['Transkript']
        timestamp = row['Timecode'] 

        if speaker != current_speaker:
            # Save previous chunk if exists
            if current_speaker is not None:
                chunked_data.append({
                    'Speaker': current_speaker,
                    'Transcript': current_text.strip(),
                    'Initial_Timestamp' : initial_timestamp,
                    'Current_Timestamp' : timestamp
                })
            
            # Start a new chunk
            current_speaker = speaker
            current_text = transcript
            initial_timestamp = timestamp 
        else:
            # Continue appending to the same speaker's chunk
            current_text += " " + transcript if isinstance(transcript, str) else ""

    # Save the last chunk
    if current_speaker is not None:
        chunked_data.append({
            'Speaker': current_speaker,
            'Transcript': current_text.strip(),
            'Initial_Timestamp' : initial_timestamp,
            'Current_Timestamp' : timestamp
        })

    # Convert to DataFrame
    chunked_df = pd.DataFrame(chunked_data)
    
    return chunked_df

In [3]:
def chunk_by_sentence(chunked_df: pd.DataFrame, min_tokens=256, max_tokens=512) -> pd.DataFrame:
    """
    Further splits chunks by sentence while ensuring each chunk is within a token range.

    Args:
        chunked_df (pd.DataFrame): Input DataFrame with 'Speaker' and 'Transcript' columns.
        min_tokens (int): Minimum number of tokens per chunk.
        max_tokens (int): Maximum number of tokens per chunk.

    Returns:
        pd.DataFrame: DataFrame with sentence-based chunked transcripts.
    """

    # Function to count tokens (approximate, assuming 1 word ≈ 1.2 tokens)
    def count_tokens(text):
        return len(text.split()) * 1.2  # Rough estimate

    # Initialize list for final merged chunks
    merged_chunks = []
    temp_chunk = []
    temp_token_count = 0
    speaker = None
    initial_timestamp = ""
    final_timestamp = ""

    # Process each row
    for _, row in chunked_df.iterrows():
        sentence = row['Transcript']
        sentence_tokens = count_tokens(sentence)

        # If adding this chunk keeps us within MAX_TOKENS
        if temp_token_count + sentence_tokens <= max_tokens:
            if not temp_chunk:
                speaker = row['Speaker']  # Store speaker only for new chunks
                initial_timestamp = row['Initial_Timestamp']
            temp_chunk.append(sentence)
            temp_token_count += sentence_tokens
        else:
            # Save the previous chunk before starting a new one
            if temp_chunk:
                merged_chunks.append({
                    'Speaker': speaker,
                    'Transcript': " ".join(temp_chunk),
                    'Initial_Timestamp' : initial_timestamp,
                    'Current_Timestamp' : row['Current_Timestamp']
                })

            # Start a new chunk with the current sentence
            temp_chunk = [sentence]
            temp_token_count = sentence_tokens
            speaker = row['Speaker']
            initial_timestamp = row['Initial_Timestamp']
            final_timestamp = row['Current_Timestamp']

    # Save last chunk if any content remains
    if temp_chunk:
        merged_chunks.append({
            'Speaker': speaker,
            'Transcript': " ".join(temp_chunk),
            'Initial_Timestamp' : initial_timestamp,
            'Current_Timestamp' : final_timestamp
        })

    # Convert to DataFrame
    final_merged_df = pd.DataFrame(merged_chunks)
    
    return final_merged_df

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

model_path = "models/meta-llama/Llama-3.3-70B-Instruct"

quantization_config = BitsAndBytesConfig(load_in_4bit=True)

model = AutoModelForCausalLM.from_pretrained(
    model_path, 
    device_map="auto", 
    torch_dtype=torch.bfloat16, 
    quantization_config=quantization_config,
    trust_remote_code=True  # Add this for some custom models
)

tokenizer = AutoTokenizer.from_pretrained(model_path)

metadata_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)


Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

In [5]:
import pandas as pd
import json
import re

def extract_metadate(metadata_pipeline, chunks: pd.DataFrame, metadata_schema: dict) -> pd.DataFrame:
    """
    Extract metadata from multiple chunks of a transcript using the specified model.

    Args:
        client: The AI client object.
        model_name: The AI model to use.
        chunks: A DataFrame with 'Speaker' and 'Transcript' columns.
        metadata_schema: A dictionary representing the metadata schema.

    Returns:
        pd.DataFrame: A structured DataFrame containing extracted metadata.
    """
    all_metadata = []

    if isinstance(metadata_schema, str):
        try:
            with open(metadata_schema, "r", encoding="utf-8") as f:
                metadata_schema = json.load(f)
        except Exception as e:
            print(f"❌ Error loading metadata schema: {e}")
            return pd.DataFrame()
                
    schema_json = json.dumps(metadata_schema, indent=2)

    for i, row in chunks.iloc[:4].iterrows():
        speaker = row["Speaker"]
        transcript = row["Transcript"]  # Prevent AI truncation
        timestamp = row['Timestamp']



          # JSON-structured prompt
        prompt = f"""
        You are an AI model specialized in extracting structured metadata from interview transcripts.  
        Return **only a valid JSON object** based on the given schema.  
        If information is missing, set the value to `???`.  
        For each extracted value, append the corresponding timestamp in parentheses (e.g., `Value (timestamp)`).  
        If the value is `???`, do not append the timestamp.  
        Do not add explanations, markdown formatting, or extra text.

        Erforderliches Format:
        {schema_json}

        Transkript:
        {transcript}

        Timestamp:
        {timestamp}

        JSON-Antwort:
        """

         # Generate response using the local LLaMA model
        try:
            response_text =  metadata_pipeline(prompt, max_new_tokens=1800,return_full_text=False,temperature=0.2)[0]["generated_text"]
        except Exception as e:
            print(f"❌ Error generating response for chunk {i+1}: {e}")
            response_text = "[]"  # Default to an empty JSON
        
        # Debugging: Print the raw AI response to identify issues
        print(f"\n🔍 DEBUG: Raw AI Response for chunk {i+1} data type {type(response_text)}:\n{response_text}\n")

        response_text = response_text.strip().strip("[]")

        # Extract the dictionary (remove list brackets)
        # data_dict = data_list[0] if data_list else {}


        metadata = response_text
        # print(f"\n🔍 DEBUG: Raw Metadata for chunk {i+1}:{type(metadata)}")
        # Parse metadata into a dictionary
        extracted_metadata = {"Speaker": speaker}  # Store speaker info
        for line in metadata.split("\n"):
            if ":" in line:
                key, value = line.split(":", 1)
                extracted_metadata[key.strip()] = value.strip()

        # Remove "%%%%" values before storing
        extracted_metadata = {k: (v.replace('"', "") if v.replace('"', "").replace(",", "") != "???" else "") for k, v in extracted_metadata.items()}

        if i < 4 - 1:  # If not the last iteration
            extracted_metadata = {k: v.strip() + "," if v else v.strip() for k, v in extracted_metadata.items()}

        # Append metadata for this chunk
        all_metadata.append(extracted_metadata)

    # Convert metadata list into a DataFrame
    return pd.DataFrame(all_metadata)
        

In [15]:
def chunk_transcript_optimized_with_timestamps(input_data: pd.DataFrame, model_name: str, max_tokens: int = 12000) -> pd.DataFrame:
    """
    Optimized chunking for LLaMA while preserving timestamps and minimizing chunk sizes.

    Args:
        input_data (pd.DataFrame): DataFrame containing 'Sprecher', 'Transkript', and 'Timecode'.
        model_name (str): Name of the LLaMA model to determine the tokenizer.
        max_tokens (int): Max tokens per chunk (default ~12,000).

    Returns:
        pd.DataFrame: Chunked transcript with timestamps and speakers.
    """
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # Ensure 'Transkript' column has no NaN values
    input_data['Transkript'] = input_data['Transkript'].fillna("").astype(str)
    
    chunked_data = []
    tokenized_text = []
    token_timestamps = []
    token_speakers = []
    
    for index, row in input_data.iterrows():
        transcript = row['Transkript']
        timestamp = row['Timecode']
        speaker = row['Sprecher']
        
        tokenized_segment = tokenizer(transcript, return_tensors="pt", truncation=False)["input_ids"].squeeze().tolist()
        tokenized_segment = tokenized_segment if isinstance(tokenized_segment, list) else [tokenized_segment]
        
        tokenized_text.extend(tokenized_segment)
        token_timestamps.extend([timestamp] * len(tokenized_segment))  # Align timestamps with tokens
        token_speakers.extend([speaker] * len(tokenized_segment))  # Align speakers with tokens
    
    start_idx = 0
    while start_idx < len(tokenized_text):
        # Get the next chunk within token limit
        end_idx = min(start_idx + max_tokens, len(tokenized_text))
        # Find the nearest sentence boundary
        sub_text = tokenizer.decode(tokenized_text[start_idx:end_idx])
        sentences = re.split(r'(?<=[.!?])\s+', sub_text)  # Split at sentence boundaries
 
        # Ensure the last sentence doesn't get cut off
        chunk = " ".join(sentences[:-1]) if len(sentences) > 1 else sub_text
        
        # Determine timestamps and speakers safely
        initial_timestamp = token_timestamps[start_idx] if start_idx < len(token_timestamps) else None
        current_timestamp = token_timestamps[end_idx - 1] if (end_idx - 1) < len(token_timestamps) else initial_timestamp
        speaker = token_speakers[start_idx] if start_idx < len(token_speakers) else None
        
        # Store chunk details
        chunked_data.append({
            'Speaker': speaker,
            'Transcript': chunk.strip(),
            'Initial_Timestamp': initial_timestamp,
            'Current_Timestamp': current_timestamp
        })
        
        # Move start index past processed text
        start_idx += len(tokenizer(chunk)["input_ids"][0])
    
    return pd.DataFrame(chunked_data)


In [16]:
import os
import time

folder_path = "Transcripts"
schema_file = "metadata_schema.json" 
# Load schema as dictionary
with open(schema_file, "r", encoding="utf-8") as f:
    metadata_schema = json.load(f) 
    
MODEL = "llama-3.3-70b-versatile"

# List to store metadata for all files
all_metadata = []
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    if os.path.isfile(file_path) & filename.endswith(".csv"):
        print(f"\nProcessing file: {filename}")
        input_data = pd.read_csv(file_path, sep=None, engine='python')
        # speaker_chunks_df = chunk_transcript(input_data)  # Stores speaker-based chunks
        # final_chunks_df = chunk_by_sentence(speaker_chunks_df)
        final_chunks_df = chunk_transcript_optimized_with_timestamps(input_data , model_path)
        final_chunks_df['Timestamp'] = final_chunks_df['Initial_Timestamp'] + " - " + final_chunks_df['Current_Timestamp'] 
        final_chunks_df.drop(columns=['Initial_Timestamp', "Current_Timestamp"], inplace=True)
        # print(final_chunks_df)
        

        # Extract metadata for the chunks
        llama_70b_responses = extract_metadate(metadata_pipeline, final_chunks_df, metadata_schema)
        # Ensure that the response DataFrame contains metadata columns
        if not llama_70b_responses.empty:
            # Merge chunk outputs into a single row 
            merged_metadata = llama_70b_responses.apply(lambda col: ' '.join(col.dropna().astype(str)))
            
            for column in merged_metadata.index:
                unique_values = set([value.strip() for value in merged_metadata[column].strip().split(",")])
                list_unique_values = list(filter(None, unique_values))
                merged_metadata[column] = " | ".join(list_unique_values)

            # Add filename for reference
            # merged_metadata["Filename"] = filename  

            # Append to list
            all_metadata.append(merged_metadata)
        else:
            print(f"No metadata extracted from {filename}")
         
        time.sleep(0.5)

# Convert list of metadata rows into a single DataFrame
final_metadata_df = pd.DataFrame(all_metadata)



Processing file: adg0001_er_2024_10_31.csv


TypeError: object of type 'int' has no len()

In [125]:
final_metadata_df

Unnamed: 0,Speaker,"""Standort""","""Archiv ID""","""PROBANDNR""","""DOK_ART""","""ARCHIVORT""","""PROVENIENZ""","""SPERRUNG""","""ENTSTZEIT""","""Zeitumfang 1""",...,"""PART_HERKU""","""PART_SCHUL""","""PART_AUSBI""","""PART_STAND""","""PART_BERUF""","""PART_POLOR""","""PART_PKONV""","""PART_ENGAG""","""KRIT10""","""GRÜNDE"""
0,INT_AH | IP_FA,Hemer im Sauerland,,,,,,,,,...,,,,,,,,,,


In [121]:
final_metadata_df.to_csv("metadata_results.csv", index=False)