In [1]:
#IMPORT STATEMENTS
import pandas as pd
import json
import nltk
from nltk.tokenize import sent_tokenize
import os
import transformers
import torch
from transformers import pipeline

nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
def chunk_transcript(data: pd.DataFrame) -> pd.DataFrame:
    
    # Ensure 'Transkript' column has no NaN values
    data['Transkript'] = data['Transkript'].fillna("").astype(str)

    # Chunking logic
    chunked_data = []
    current_speaker = None
    current_text = ""

    for index, row in data.iterrows():
        speaker = row['Sprecher']
        transcript = row['Transkript']

        if speaker != current_speaker:
            # Save previous chunk if exists
            if current_speaker is not None:
                chunked_data.append({
                    'Speaker': current_speaker,
                    'Transcript': current_text.strip(),
                })
            
            # Start a new chunk
            current_speaker = speaker
            current_text = transcript
        else:
            # Continue appending to the same speaker's chunk
            current_text += " " + transcript if isinstance(transcript, str) else ""

    # Save the last chunk
    if current_speaker is not None:
        chunked_data.append({
            'Speaker': current_speaker,
            'Transcript': current_text.strip(),
        })

    # Convert to DataFrame
    chunked_df = pd.DataFrame(chunked_data)
    
    return chunked_df

In [3]:
def chunk_by_sentence(chunked_df: pd.DataFrame, min_tokens=256, max_tokens=512) -> pd.DataFrame:
    """
    Further splits chunks by sentence while ensuring each chunk is within a token range.

    Args:
        chunked_df (pd.DataFrame): Input DataFrame with 'Speaker' and 'Transcript' columns.
        min_tokens (int): Minimum number of tokens per chunk.
        max_tokens (int): Maximum number of tokens per chunk.

    Returns:
        pd.DataFrame: DataFrame with sentence-based chunked transcripts.
    """

    # Function to count tokens (approximate, assuming 1 word ≈ 1.2 tokens)
    def count_tokens(text):
        return len(text.split()) * 1.2  # Rough estimate

    # Initialize list for final merged chunks
    merged_chunks = []
    temp_chunk = []
    temp_token_count = 0
    speaker = None

    # Process each row
    for _, row in chunked_df.iterrows():
        sentence = row['Transcript']
        sentence_tokens = count_tokens(sentence)

        # If adding this chunk keeps us within MAX_TOKENS
        if temp_token_count + sentence_tokens <= max_tokens:
            if not temp_chunk:
                speaker = row['Speaker']  # Store speaker only for new chunks
            temp_chunk.append(sentence)
            temp_token_count += sentence_tokens
        else:
            # Save the previous chunk before starting a new one
            if temp_chunk:
                merged_chunks.append({
                    'Speaker': speaker,
                    'Transcript': " ".join(temp_chunk),
                })

            # Start a new chunk with the current sentence
            temp_chunk = [sentence]
            temp_token_count = sentence_tokens
            speaker = row['Speaker']


    # Save last chunk if any content remains
    if temp_chunk:
        merged_chunks.append({
            'Speaker': speaker,
            'Transcript': " ".join(temp_chunk)
        })

    # Convert to DataFrame
    final_merged_df = pd.DataFrame(merged_chunks)
    
    return final_merged_df

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

model_path = "models/meta-llama/Llama-3.3-70B-Instruct"

quantization_config = BitsAndBytesConfig(load_in_4bit=True)

model = AutoModelForCausalLM.from_pretrained(
    model_path, 
    device_map="auto", 
    torch_dtype=torch.bfloat16, 
    quantization_config=quantization_config,
    trust_remote_code=True  # Add this for some custom models
)

tokenizer = AutoTokenizer.from_pretrained(model_path)

metadata_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)


Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

In [13]:
import pandas as pd
import json
import re

def extract_metadate(metadata_pipeline, chunks: pd.DataFrame, metadata_schema: dict) -> pd.DataFrame:
    """
    Extract metadata from multiple chunks of a transcript using the specified model.

    Args:
        client: The AI client object.
        model_name: The AI model to use.
        chunks: A DataFrame with 'Speaker' and 'Transcript' columns.
        metadata_schema: A dictionary representing the metadata schema.

    Returns:
        pd.DataFrame: A structured DataFrame containing extracted metadata.
    """
    all_metadata = []

    if isinstance(metadata_schema, str):
        try:
            with open(metadata_schema, "r", encoding="utf-8") as f:
                metadata_schema = json.load(f)
        except Exception as e:
            print(f"❌ Error loading metadata schema: {e}")
            return pd.DataFrame()
                
    schema_json = json.dumps(metadata_schema, indent=2)

    for i, row in chunks.iloc[:5].iterrows():
        speaker = row["Speaker"]
        transcript = row["Transcript"]  # Prevent AI truncation



          # JSON-structured prompt
        prompt = f"""
        You are an AI model specialized in extracting structured metadata from interview transcripts. 
        Return **only a valid JSON object** based on the given schema. 
        If information is missing, set the value to `???`. 
        Do not add explanations, markdown formatting, or extra text.

        Erforderliches Format:
        {schema_json}

        Transkript:
        {transcript}

        JSON-Antwort:
        """

         # Generate response using the local LLaMA model
        try:
            response_text =  metadata_pipeline(prompt, max_new_tokens=1800,return_full_text=False,temperature=0.2)[0]["generated_text"]
        except Exception as e:
            print(f"❌ Error generating response for chunk {i+1}: {e}")
            response_text = "{}"  # Default to an empty JSON

        # Debugging: Print the raw AI response to identify issues
        print(f"\n🔍 DEBUG: Raw AI Response for chunk {i+1}:\n{response_text}\n")
        


    return extracted_df


In [10]:
import os
import time

folder_path = "Transcripts"
schema_file = "metadata_schema.json" 
# Load schema as dictionary
with open(schema_file, "r", encoding="utf-8") as f:
    metadata_schema = json.load(f) 
    
MODEL = "llama-3.3-70b-versatile"

# List to store metadata for all files
all_metadata = []
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    if os.path.isfile(file_path) & filename.endswith(".csv"):
        print(f"\nProcessing file: {filename}")
        input_data = pd.read_csv(file_path, sep=None, engine='python')
        speaker_chunks_df = chunk_transcript(input_data)  # Stores speaker-based chunks
        final_chunks_df = chunk_by_sentence(speaker_chunks_df)
        # print(final_chunks_df)
        

        # Extract metadata for the chunks
        llama_70b_responses = extract_metadate(metadata_pipeline, final_chunks_df, metadata_schema)
        # Ensure that the response DataFrame contains metadata columns
        if not llama_70b_responses.empty:
            # Merge chunk outputs into a single row 
            merged_metadata = llama_70b_responses.apply(lambda col: ' '.join(col.dropna().astype(str)))
            
            for column in merged_metadata.index:
                unique_values = set([value.strip() for value in merged_metadata[column].strip().split(",")])
                list_unique_values = list(filter(None, unique_values))
                merged_metadata[column] = " | ".join(list_unique_values)

            # Add filename for reference
            # merged_metadata["Filename"] = filename  

            # Append to list
            all_metadata.append(merged_metadata)
        else:
            print(f"No metadata extracted from {filename}")
         
        time.sleep(0.5)

# Convert list of metadata rows into a single DataFrame
final_metadata_df = pd.DataFrame(all_metadata)



Processing file: adg0001_er_2024_10_31.csv

🔍 DEBUG: Raw AI Response for chunk 1:
 [
           {
             "Standort": "Hemer im Sauerland",
             "Archiv ID": "???",
             "PROBANDNR": "???",
             "DOK_ART": "???",
             "ARCHIVORT": "???",
             "PROVENIENZ": "???",
             "SPERRUNG": "???",
             "ENTSTZEIT": "???",
             "Zeitumfang 1": "???",
             "NAME": "???",
             "VORNAME": "???",
             "ORT": "Hemer im Sauerland",
             "Feld1": "???",
             "PSEUDONYM": "???",
             "GESCHLECHT": "???",
             "JAHRGANG": "1925",
             "IPV": "???",
             "DATENBOGEN": "???",
             "KURZBESCHR": "???",
             "TITEL": "???",
             "STRASSE": "???",
             "PLZ": "???",
             "TELEFON": "???",
             "GRUPPE": "???",
             "BERUF": "???",
             "HEUT_FAMST": "???",
             "INTERVIEWE": "???",
             "TIPPE

KeyboardInterrupt: 

In [14]:
import pandas as pd
import json
import re

def extract_metadate(metadata_pipeline, chunks: pd.DataFrame, metadata_schema: dict) -> pd.DataFrame:
    """
    Extract metadata from multiple chunks of a transcript using the specified model.

    Args:
        client: The AI client object.
        model_name: The AI model to use.
        chunks: A DataFrame with 'Speaker' and 'Transcript' columns.
        metadata_schema: A dictionary representing the metadata schema.

    Returns:
        pd.DataFrame: A structured DataFrame containing extracted metadata.
    """
    all_metadata = []

    if isinstance(metadata_schema, str):
        try:
            with open(metadata_schema, "r", encoding="utf-8") as f:
                metadata_schema = json.load(f)
        except Exception as e:
            print(f"❌ Error loading metadata schema: {e}")
            return pd.DataFrame()
                
    schema_json = json.dumps(metadata_schema, indent=2)

    for i, row in chunks.iloc[:5].iterrows():
        speaker = row["Speaker"]
        transcript = row["Transcript"]  # Prevent AI truncation



          # JSON-structured prompt
        prompt = f"""
        You are an AI model specialized in extracting structured metadata from interview transcripts. 
        Return **only a valid JSON object** based on the given schema. 
        If information is missing, set the value to `???`. 
        Do not add explanations, markdown formatting, or extra text.

        Erforderliches Format:
        {schema_json}

        Transkript:
        {transcript}

        JSON-Antwort:
        """

         # Generate response using the local LLaMA model
        try:
            response_text =  metadata_pipeline(prompt, max_new_tokens=1800,return_full_text=False,temperature=0.2)[0]["generated_text"]
        except Exception as e:
            print(f"❌ Error generating response for chunk {i+1}: {e}")
            response_text = "{}"  # Default to an empty JSON

        # Debugging: Print the raw AI response to identify issues
        print(f"\n🔍 DEBUG: Raw AI Response for chunk {i+1}:\n{response_text}\n")
        
        response_text = response_text.strip("[]")  # Handle array brackets if any
        try:
            metadata = json.loads(response_text)
            all_metadata.append(metadata)
        except json.JSONDecodeError as e:
            print(f"⚠️ JSON decoding error in chunk {i+1}: {e}")
            all_metadata.append({})  # Append empty JSON if parsing fails

    return all_metadata


In [16]:
import os
import time
import pandas as pd
import json

folder_path = "Transcripts"
schema_file = "metadata_schema.json"

# Load schema as dictionary
with open(schema_file, "r", encoding="utf-8") as f:
    metadata_schema = json.load(f)

MODEL = "llama-3.3-70b-versatile"

# List to store metadata for all files
all_metadata = []

for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    
    if os.path.isfile(file_path) and filename.endswith(".csv"):
        print(f"\nProcessing file: {filename}")

        # Load transcript file
        input_data = pd.read_csv(file_path, sep=None, engine='python')

        # Chunk transcript (assuming these functions exist)
        speaker_chunks_df = chunk_transcript(input_data)  
        final_chunks_df = chunk_by_sentence(speaker_chunks_df)

        # Extract metadata for the chunks
        llama_70b_responses = extract_metadate(metadata_pipeline, final_chunks_df, metadata_schema)

        if llama_70b_responses:  # Check if list is not empty
            merged_metadata = {}

            # Iterate over extracted metadata (list of JSON objects)
            for response in llama_70b_responses:
                for key, value in response.items():
                    if key not in merged_metadata:
                        merged_metadata[key] = set()  # Use a set to store unique values

                    # Ensure value is a string before processing
                    if isinstance(value, list):
                        value = ", ".join(str(v) for v in value)
                    elif not isinstance(value, str):
                        value = str(value)

                    # Add to the set
                    merged_metadata[key].add(value.strip())

            # Convert set values to a string separated by " | "
            merged_metadata = {key: " | ".join(sorted(values)) for key, values in merged_metadata.items()}

            # Add filename for reference
            merged_metadata["Filename"] = filename  

            # Append to list
            all_metadata.append(merged_metadata)
        else:
            print(f"No metadata extracted from {filename}")

        time.sleep(0.5)

# Convert list of metadata into a single DataFrame
final_metadata_df = pd.DataFrame(all_metadata)




Processing file: adg0001_er_2024_10_31.csv


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



🔍 DEBUG: Raw AI Response for chunk 1:
 [
           {
             "Standort": "Hemer im Sauerland",
             "Archiv ID": "???",
             "PROBANDNR": "???",
             "DOK_ART": "???",
             "ARCHIVORT": "???",
             "PROVENIENZ": "???",
             "SPERRUNG": "???",
             "ENTSTZEIT": "???",
             "Zeitumfang 1": "???",
             "NAME": "???",
             "VORNAME": "???",
             "ORT": "Hemer im Sauerland",
             "Feld1": "???",
             "PSEUDONYM": "???",
             "GESCHLECHT": "???",
             "JAHRGANG": "1925",
             "IPV": "???",
             "DATENBOGEN": "???",
             "KURZBESCHR": "???",
             "TITEL": "???",
             "STRASSE": "???",
             "PLZ": "???",
             "TELEFON": "???",
             "GRUPPE": "???",
             "BERUF": "???",
             "HEUT_FAMST": "???",
             "INTERVIEWE": "???",
             "TIPPER": "???",
             "Segmentierung": "??

In [17]:
final_metadata_df


Unnamed: 0,Filename
0,adg0001_er_2024_10_31.csv
