In [1]:
import nest_asyncio
import os
import time
import pandas as pd
import json
import os
import transformers
import torch
from transformers import pipeline


# NOTE(review): presumably applied so asyncio code can run inside the notebook's
# already-running event loop; nothing visible in this notebook uses asyncio —
# confirm this call is still needed.
nest_asyncio.apply()

In [11]:
# Load every transcript CSV in the folder; `input_data` ends up holding the
# last file read and is what the following cells inspect.
folder_path = "Transcripts"
# Sort the listing so "the last file" is the same on every run
# (os.listdir returns entries in arbitrary order).
for filename in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, filename)
    # Logical `and` (not bitwise `&`) so the suffix check is short-circuited.
    if os.path.isfile(file_path) and filename.endswith(".csv"):
        print(f"\nProcessing file: {filename}")
        # sep=None with the python engine makes pandas sniff the delimiter.
        input_data = pd.read_csv(file_path, sep=None, engine='python')
        print("\nFile Loaded!")


Processing file: adg0001_er_2024_10_31.csv

File Loaded!


In [21]:
# Preview the first five rows of the transcript CSV that was loaded last.
input_data.head(5)

Unnamed: 0,Band,Timecode,Sprecher,Transkript,Übersetzung,Hauptüberschrift,Zwischenüberschrift,Hauptüberschrift (Übersetzung),Zwischenüberschrift (Übersetzung),Registerverknüpfungen,Anmerkungen,Anmerkungen (Übersetzung)
0,1,00:00:06.00,INT_AH,Können wir anfangen?,,"Haushalt und Hausarbeit, Lebenslauf, Arbeit (a...",,,,90443688#90443736#90443668,,
1,1,00:00:06.00,INT_AH,"Also wäre schön, wenn Sie mit Kindheit beginne...",,,,,,,,
2,1,00:00:12.00,IP_FA,"Ich war das erste Enkelkind, einzige Enkelkind...",,,,,,,,
3,1,00:00:16.00,IP_FA,Ich bin gebürtig aus Hemer im Sauerland und bi...,,,,,,,,
4,1,00:00:43.00,IP_FA,"Und, ja, Kindheit verlief eigentlich an und fü...",,,,,,,,


In [5]:
# Flatten the "Transkript" column into one space-separated string,
# skipping rows where the cell is missing.
transcript = " ".join(input_data["Transkript"].dropna().astype(str))

In [13]:
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Download the NLTK tokenizer data that sent_tokenize relies on.
# NOTE(review): presumably 'punkt_tab' is what newer NLTK releases load and
# 'punkt' is kept for older ones — confirm against the installed version.
nltk.download('punkt')
nltk.download('punkt_tab')

def split_into_paragraphs(text, sentences_per_paragraph=10, language="german"):
    """
    Split text into pseudo-paragraphs holding a fixed number of sentences.

    Args:
        text: The full text to split.
        sentences_per_paragraph: Number of sentences joined into each
            paragraph. Must be at least 1.
        language: Name of the Punkt model handed to NLTK's ``sent_tokenize``.
            Defaults to "german" because the transcripts handled in this
            notebook are German; without it NLTK falls back to its English
            model.

    Returns:
        A list of strings, each the space-joined sentences of one paragraph.
        The final paragraph may hold fewer sentences. Empty when the
        tokenizer finds no sentences.

    Raises:
        ValueError: If ``sentences_per_paragraph`` is smaller than 1.
    """
    if sentences_per_paragraph < 1:
        # A zero step would crash range() with an opaque message and a
        # negative one would silently return nothing.
        raise ValueError("sentences_per_paragraph must be at least 1")

    sentences = sent_tokenize(text, language=language)
    return [
        " ".join(sentences[start:start + sentences_per_paragraph])
        for start in range(0, len(sentences), sentences_per_paragraph)
    ]

def segment_text_by_local_minima(text, sentences_per_paragraph=10):
    """
    Cut a transcript into segments at dips in paragraph-to-paragraph similarity.

    The text is first chunked into fixed-size paragraphs, each paragraph is
    turned into a TF-IDF vector, and the cosine similarity of every pair of
    neighbouring paragraphs is computed. A new segment starts after each
    similarity value that is strictly lower than both the value before and
    the value after it; the first and last similarity values are never
    treated as minima.

    Returns a DataFrame with a single column 'Segment', one row per segment.
    When the text yields at most one paragraph, the original text is returned
    unchanged as the only segment.
    """
    chunks = split_into_paragraphs(text, sentences_per_paragraph)

    # Fewer than two paragraphs leaves nothing to compare.
    if len(chunks) < 2:
        return pd.DataFrame({"Segment": [text]})

    # One TF-IDF row per paragraph.
    tfidf = TfidfVectorizer().fit_transform(chunks)

    # pair_scores[k] is the similarity between paragraph k and paragraph k + 1.
    pair_scores = [
        cosine_similarity(tfidf[row], tfidf[row + 1])[0][0]
        for row in range(tfidf.shape[0] - 1)
    ]

    # Segment start indices: always 0, plus the paragraph following each dip,
    # closed off by the total paragraph count.
    cut_points = [0]
    cut_points.extend(
        pos + 1
        for pos in range(1, len(pair_scores) - 1)
        if pair_scores[pos - 1] > pair_scores[pos] < pair_scores[pos + 1]
    )
    cut_points.append(len(chunks))

    # Glue the paragraphs between consecutive cut points back together.
    merged = [
        " ".join(chunks[begin:stop])
        for begin, stop in zip(cut_points, cut_points[1:])
    ]
    return pd.DataFrame({"Segment": merged})


[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [14]:
def _first_json_object(response_text: str) -> dict:
    """Parse the JSON object starting at the first '{' of the text; return {} on failure."""
    start = response_text.find("{")
    if start == -1:
        print("No JSON object found in the response.")
        return {}
    try:
        # raw_decode stops at the end of the first complete JSON value, so
        # chatter or extra objects after the JSON no longer break parsing.
        parsed, _ = json.JSONDecoder().raw_decode(response_text, start)
    except json.JSONDecodeError as e:
        print(f"JSON parsing error: {e}")
        return {}
    return parsed


def extract_metadata_with_context(metadata_pipeline, segments_df: pd.DataFrame, metadata_schema: dict) -> pd.DataFrame:
    """
    Ask a text-generation pipeline to fill a metadata schema for each segment.

    Segments are processed in order. Every prompt contains the schema, the
    segment text, and an "aggregated context": the ``key: value`` lines for
    all schema keys that earlier segments answered with something other than
    '???'. Identical lines are added to that context only once so the prompt
    does not grow with repeats.

    Args:
        metadata_pipeline: Callable invoked as
            ``metadata_pipeline(prompt, max_new_tokens=..., return_full_text=False,
            temperature=...)`` that returns a sequence whose first item has a
            "generated_text" entry.
        segments_df: DataFrame with a "Segment" column holding the text of
            each segment. Its index may be of any type.
        metadata_schema: Mapping of metadata key to a description of the key,
            or a path to a JSON file containing such a mapping.

    Returns:
        A DataFrame with one row per segment and one column per key the model
        returned. Double quotes are removed from values and '???' values
        become empty strings. A segment whose response could not be parsed
        contributes an empty row. An empty DataFrame is returned when the
        schema file cannot be loaded.
    """
    if isinstance(metadata_schema, str):
        try:
            with open(metadata_schema, "r", encoding="utf-8") as f:
                metadata_schema = json.load(f)
        except Exception as e:
            print(f"❌ Error loading metadata schema: {e}")
            return pd.DataFrame()

    schema_json = json.dumps(metadata_schema, indent=2)

    all_metadata = []
    context_lines = []          # distinct "key: value\n" lines, in first-seen order
    seen_context_lines = set()  # fast membership test for the list above

    # Number segments by position rather than by index label, so the log
    # messages also work for non-integer or filtered indexes.
    for segment_number, (_, row) in enumerate(segments_df.iterrows(), start=1):
        segment_text = row["Segment"]
        aggregated_context = "".join(context_lines)

        prompt = f"""
        You are an AI model specialized in extracting structured metadata from German interview transcripts.
        Below you have an interview transcript. Please analyze the interview transcript from the interviewee’s perspective.
        The provided transcript covers the interviewee's life experiences, including positions held, work-related events, and family interactions.
        Extract all relevant details from the text.
        For any detected date with a two-digit year, convert it to a four-digit year by assuming it falls in the 1900s (e.g., "67" becomes "1967").
        You will receive a JSON metadata object with predefined keys as input and values as the description of the keys.
        Carefully analyze the transcript and extract detailed information to populate as many keys as possible.
        Only assign '???' to a key if there is no explicit evidence in the transcript for that field.
        Return only a valid JSON object containing the keys with their corresponding extracted values.
        Do not include any additional text, markdown formatting, or explanations.
                        
        Aggregated Context from previous segments:
        {aggregated_context}

        Metadata Schema:
        {schema_json}

        Transcript:
        {segment_text}

        JSON-Antwort:
        """

        try:
            response_text = metadata_pipeline(prompt, max_new_tokens=1800, return_full_text=False, temperature=0.2)[0]["generated_text"]
        except Exception as e:
            print(f"❌ Error generating response for segment {segment_number}: {e}")
            response_text = "[]"  # Placeholder without a JSON object -> empty metadata

        print(f"\n🔍 DEBUG: Raw AI Response for segment {segment_number}:\n{response_text}\n")

        extracted_metadata = dict(_first_json_object(response_text))

        # Carry every answered schema key forward into the next prompt.
        for key in metadata_schema.keys():
            val = extracted_metadata.get(key, "???")
            if val == "???":
                continue
            line = f"{key}: {val}\n"
            if line not in seen_context_lines:
                seen_context_lines.add(line)
                context_lines.append(line)

        # Normalise values for the result table: drop double quotes and blank
        # out anything that is just the '???' placeholder.
        extracted_metadata = {
            k: (str(v).replace('"', "") if str(v).replace('"', "").replace(",", "") != "???" else "")
            for k, v in extracted_metadata.items()
        }
        all_metadata.append(extracted_metadata)

    return pd.DataFrame(all_metadata)


In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

# Local checkpoint directory of the model used for metadata extraction.
model_path = "models/meta-llama/Llama-3.3-70B-Instruct"

# Store weights in 4-bit and run the matmuls in bfloat16 so the compute dtype
# matches torch_dtype below (bitsandbytes otherwise computes in float32).
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# trust_remote_code is deliberately left at its default (False): the Llama
# architecture ships with transformers, so no code from the checkpoint
# directory needs to be executed.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,
)

tokenizer = AutoTokenizer.from_pretrained(model_path)

# Text-generation pipeline handed to extract_metadata_with_context.
metadata_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)


Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

In [None]:
import os
import time

pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns

folder_path = "Transcripts"
schema_file = "metadata_schema.json"
# Load schema as dictionary
with open(schema_file, "r", encoding="utf-8") as f:
    metadata_schema = json.load(f)


# One single-row DataFrame per transcript that yielded metadata
all_metadata = []
# Sorted so files are processed in the same order on every run.
for filename in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, filename)
    # Logical `and` (not bitwise `&`); anything that is not a CSV file is skipped silently.
    if os.path.isfile(file_path) and filename.endswith(".csv"):
        print(f"\nProcessing file: {filename}")
        input_data = pd.read_csv(file_path, sep=None, engine='python')
        print("\nFile Loaded!")
        transcript_text = " ".join(input_data["Transkript"].dropna().astype(str))
        segments_df = segment_text_by_local_minima(transcript_text, sentences_per_paragraph=10)
        # Only the first three segments are sent to the model.
        segments_df_subset = segments_df.head(3)
        llama_70b_responses = extract_metadata_with_context(metadata_pipeline, segments_df_subset, metadata_schema)

        if not llama_70b_responses.empty:
            # Keep the row of the final segment: its prompt carried the
            # context collected from all earlier segments.
            metadata_extracted = llama_70b_responses.tail(1)
            all_metadata.append(metadata_extracted)
        else:
            # Report the failure for the file that actually produced nothing
            # (previously this branch fired for every non-CSV entry instead).
            print(f"No metadata extracted from {filename}")


In [13]:
import os
import time
import json
import pandas as pd
import re

pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns

folder_path = "Transcripts"
schema_file = "metadata_schema.json"

# Load schema as dictionary
with open(schema_file, "r", encoding="utf-8") as f:
    metadata_schema = json.load(f)

# One single-row DataFrame per transcript that yielded metadata
all_metadata = []
# Sorted so files are processed in the same order on every run.
for filename in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, filename)
    if os.path.isfile(file_path) and filename.endswith(".csv"):
        print(f"\nProcessing file: {filename}")
        input_data = pd.read_csv(file_path, sep=None, engine='python')
        print("\nFile Loaded!")
        transcript_text = " ".join(input_data["Transkript"].dropna().astype(str))
        segments_df = segment_text_by_local_minima(transcript_text, sentences_per_paragraph=10)
        # Only the first three segments are sent to the model.
        segments_df_subset = segments_df.head(3)
        llama_70b_responses = extract_metadata_with_context(metadata_pipeline, segments_df_subset, metadata_schema)

        if not llama_70b_responses.empty:
            # Keep the row of the final segment: its prompt carried the
            # context collected from all earlier segments.
            metadata_extracted = llama_70b_responses.tail(1)
            all_metadata.append(metadata_extracted)
        else:
            # Report the failure for the file that actually produced nothing
            # (previously this branch fired for every non-CSV entry instead).
            print(f"No metadata extracted from {filename}")

    # NOTE(review): the purpose of this pause is not evident from the
    # notebook — confirm it is still needed.
    time.sleep(0.5)



Processing file: adg0001_er_2024_10_31.csv

File Loaded!

🔍 DEBUG: Raw AI Response for segment 1:
 {
          "Standort": "Hemer im Sauerland",
          "Archiv ID": "???",
          "PROBANDNR": "???",
          "DOK_ART": "Interview",
          "ARCHIVORT": "???",
          "PROVENIENZ": "???",
          "SPERRUNG": "???",
          "ENTSTZEIT": "???",
          "Zeitumfang 1": "1939",
          "NAME": "???",
          "VORNAME": "???",
          "ORT": "Hemer im Sauerland",
          "Feld1": "Enkelkind",
          "PSEUDONYM": "???",
          "GESCHLECHT": "weiblich",
          "JAHRGANG": "1925",
          "IPV": "???",
          "DATENBOGEN": "???",
          "KURZBESCHR": "Kindheitserinnerungen",
          "TITEL": "???",
          "STRASSE": "???",
          "PLZ": "???",
          "TELEFON": "???",
          "GRUPPE": "???",
          "BERUF": "Dienstmädchen",
          "HEUT_FAMST": "???",
          "INTERVIEWE": "???",
          "TIPPER": "???",
          "Segmentierung

In [14]:
# Display the single metadata row kept for the transcript processed last.
metadata_extracted

Unnamed: 0,Standort,Archiv ID,PROBANDNR,DOK_ART,ARCHIVORT,PROVENIENZ,SPERRUNG,ENTSTZEIT,Zeitumfang 1,NAME,VORNAME,ORT,Feld1,PSEUDONYM,GESCHLECHT,JAHRGANG,IPV,DATENBOGEN,KURZBESCHR,TITEL,STRASSE,PLZ,TELEFON,GRUPPE,BERUF,HEUT_FAMST,INTERVIEWE,TIPPER,Segmentierung,DATUM1,DATUM2,DATUM3,DAUER,online,AUSDRUCKSART,UNKAUSDRUC,KORRAUSDRU,SCHLAGWORT,KURZBIOGRA,KURZPROTOK,FOTOS,DOKUMENTE,VHS,DVD,IBM Server,Cloud,Format Cloud,DV,Beta,ORIGCASSET,CASSKOPIEN,FESTPLATTE,Dig Audiofiles,KONF_HEUTE,KONVERSION,WANN_KONV,HERKUNFT,WANN_ZUGEZ,GESCHWISTE,Schulabsch,ABGEBROCHE,WEITERBILD,AUSBILDUNG,STAND,WIRTSCHBER,BERUFSWECH,WANN_WECHS,BERUFSBEGI,BERUFSENDE,NICHTERWER,GRNDE,VON_BIS,ARBEITSLOS,VON_BIS_AL,FAM_STAND,HEIRAT1JHR,HEIRAT2JHR,HEIRAT3JHR,SCHEID1JHR,SCHEID2JHR,VERWIT1JHR,VERWIT2JHR,KINDERZAHL,GEB_JAHR1,GEB_JAHR2,GEB_JAHR_L,POLOR_HEUT,POL_KONVER,POLORIENT1,VON_BIS_1,POLORIENT2,VON_BIS_2,GEW_VERBAN,VON_BIS_GV,JUGENDORG1,VON_BIS_J1,JUGENDORG2,VON_BIS_J2,NS_ORGAN_1,VON_BISNS1,NS_ORGAN_2,VON_BISNS2,RAD_KLV_DV,VON_BISRAD,SONST_ENG,VON_BIS_SE,KRIEGSTEIL,VON_BIS_KR,MUTT_JG,MUTT_KONFESSION,MUTT_HERKU,MUTT_SCHUL,MUTT_AUSBI,MUTT_STAND,MUTT_POLOR,VAT_JG,VAT_KONFESSION,VAT_HERKUN,VAT_SCHULE,VAT_AUSBIL,VAT_STAND,VAT_POLOR,PART_JG,PART_KONFESSION,PART_HERKU,PART_SCHUL,PART_AUSBI,PART_STAND,PART_BERUF,PART_POLOR,PART_PKONV,PART_ENGAG
2,Hemer im Sauerland,,,Interview,,,,,1939,,,Hemer im Sauerland,Enkelkind,,weiblich,1925,,,Kindheitserinnerungen,,,,,,Dienstmädchen,,,,,1939,1940,1942,,,,,,Kindheitserinnerungen,,,,,,,,,,,,,,,,,,,Hemer im Sauerland,,1,Hauptschulabschluss,,Landjahr-Lager,Dienstmädchen,,Landwirtschaft,,,,,Migräne,Migräne,,,,,,,,,,,,1,,,,,,,,,,,,,,,,NSV,1940-1942,,,Landjahr-Lager,1942,,,,,,,,,,,,,,,,,,,,,,,,,,,,
