# 🔍 QualiVault: Validate Transcripts
**Goal:** Use Ollama (local LLM) to detect transcription errors and hallucinations.

1. Loads transcripts from CSV files.
2. Samples segments and sends them to Ollama for validation.
3. Flags potential errors: hallucinations, misheard words, artifacts.
4. Generates validation report with suggestions.

**Prerequisites:**
- Ollama must be installed and running (`ollama serve`)
- Install a model: `ollama pull llama3.1` or `ollama pull jobautomation/OpenEuroLLM-Danish:latest`


In [16]:
%load_ext autoreload
%autoreload 2
import yaml
from pathlib import Path
from qualivault.validation import OllamaValidator, validate_recipe_transcripts

# ============================================
# PROJECT CONFIGURATION
# ============================================
# Specify your project folder name here:
PROJECT_NAME = 'YOUR_PROJECT_NAME'  # <-- Change this to your project folder name

# The workspace root is a fixed path (it is not auto-detected); adjust it if
# the repository lives elsewhere on your machine.
workspace_root = Path(r'c:\dev\qualvalt')  # Workspace root
projects_dir = workspace_root / 'projects'
project_root = projects_dir / PROJECT_NAME

# Verify project exists. The error lists the project folders that do exist so
# a mistyped PROJECT_NAME is easy to correct.
if not project_root.exists():
    if projects_dir.is_dir():
        available_projects = sorted(p.name for p in projects_dir.iterdir() if p.is_dir())
    else:
        available_projects = []
    project_listing = "".join(f"\n     - {name}" for name in available_projects) or " (none found)"
    raise FileNotFoundError(
        f"‚ùå Project not found: {project_root}\n"
        f"   Available projects in {projects_dir}:{project_listing}"
    )

config_path = project_root / 'config.yml'
if not config_path.exists():
    raise FileNotFoundError(f"‚ùå Config not found: {config_path}")

print(f"üéØ Working on project: {PROJECT_NAME}")
print(f"üìÅ Project root:       {project_root}")
print(f"‚öôÔ∏è  Config file:        {config_path}")
print()

# 1. Load Configuration
# Read as UTF-8 explicitly: open() otherwise uses the platform's default
# encoding, which can mis-decode non-ASCII characters in the config.
with open(config_path, encoding='utf-8') as f:
    config = yaml.safe_load(f)

# The transcripts folder is configured relative to the project root.
recipe_path = project_root / "processing_recipe.yaml"
transcripts_dir = (project_root / config['paths']['output_base_folder']).resolve()

print(f"üìÇ Transcripts: {transcripts_dir}")
print(f"üìã Recipe: {recipe_path}")


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


FileNotFoundError: ‚ùå Project not found: c:\dev\qualvalt\projects\YOUR_PROJECT_NAME
   Available projects in c:\dev\qualvalt\projects:

In [None]:
# 2. Check Ollama Installation and Available Models
import requests
import json

OLLAMA_URL = "http://localhost:11434"


def check_ollama():
    """Probe the local Ollama server and report which models it offers.

    Sends a GET request to ``{OLLAMA_URL}/api/tags`` with a 5 second timeout,
    prints a status report, and prints a recommended model: a Danish/OpenEuro
    model if one is listed, otherwise a llama3 model, otherwise the first
    model in the list.

    Returns:
        tuple: ``(True, models)`` when the server answered with HTTP 200 and a
        non-empty model list; ``(False, [])`` in every other case (non-200
        status, empty model list, connection failure, or any other error).
    """
    print("üîç Checking Ollama installation...\n")

    def lowered_name(entry):
        # Lower-cased model name, or '' when the entry has no 'name' key.
        return entry.get('name', '').lower()

    try:
        reply = requests.get(f"{OLLAMA_URL}/api/tags", timeout=5)

        # Guard: any status other than 200 is reported as a failure.
        if reply.status_code != 200:
            print(f"‚ùå Ollama responded with error: {reply.status_code}")
            return False, []

        print("‚úÖ Ollama is running!")

        payload = reply.json()
        installed = payload.get('models', [])

        # Guard: the server is up but has nothing to run.
        if not installed:
            print("‚ö†Ô∏è  Ollama is running but no models are installed!")
            print("\nüì• Install a model:")
            print("   For Danish: ollama pull jobautomation/OpenEuroLLM-Danish:latest")
            print("   General:    ollama pull llama3.1")
            return False, []

        print(f"\nüì¶ Available models ({len(installed)}):\n")

        # One line per model; the size field is converted from bytes to GiB.
        for entry in installed:
            label = entry.get('name', 'Unknown')
            gigabytes = entry.get('size', 0) / (1024**3)
            lowered = label.lower()

            if 'danish' in lowered or 'openeuro' in lowered:
                print(f"   üá©üá∞ {label} ({gigabytes:.1f} GB) ‚Üê Recommended for Danish")
            elif 'llama3' in lowered:
                print(f"   ü¶ô {label} ({gigabytes:.1f} GB) ‚Üê Good general model")
            else:
                print(f"   ‚Ä¢ {label} ({gigabytes:.1f} GB)")

        print("\nüí° Recommendation:")

        # Preference order: Danish/OpenEuro, then llama3, then whatever is first.
        danish_like = [
            m for m in installed
            if 'danish' in lowered_name(m) or 'openeuro' in lowered_name(m)
        ]
        if danish_like:
            print(f"   Use: '{danish_like[0]['name']}' (Danish-optimized)")
            return True, installed

        llama_like = [m for m in installed if 'llama3' in lowered_name(m)]
        if llama_like:
            print(f"   Use: '{llama_like[0]['name']}' (general purpose)")
        else:
            print(f"   Use: '{installed[0]['name']}'")

        return True, installed

    except requests.exceptions.ConnectionError:
        print("‚ùå Cannot connect to Ollama!")
        print("\nüîß To fix:")
        print("   1. Install Ollama: https://ollama.ai")
        print("   2. Start Ollama: ollama serve")
        print("   3. Pull a model: ollama pull llama3.1")
        return False, []
    except Exception as err:
        print(f"‚ùå Error checking Ollama: {err}")
        return False, []


# Run the check
ollama_ok, available_models = check_ollama()


## Configuration

Adjust these settings to control validation:

- **model**: Ollama model to use (`llama2`, `mistral`, `llama3`, etc.)
- **sample_rate**: Fraction of segments to check (0.1 = 10%, 1.0 = 100%)
- **language**: Expected language of transcripts
- **ollama_url**: URL where Ollama is running (default: `http://localhost:11434`)
- **timeout**: Request timeout in seconds (increase for large models)

In [None]:
# Validation Settings
# Auto-select the best available model from the list reported by the Ollama
# check cell. If that cell was not run in this session, `ollama_ok` and
# `available_models` do not exist; treat that the same as "no models" so the
# fallback below is actually reachable instead of raising NameError.
try:
    _candidate_models = available_models if ollama_ok else []
except NameError:
    _candidate_models = []

if _candidate_models:
    # Prefer Danish models, then llama3, then first available
    danish_models = [
        m for m in _candidate_models
        if 'danish' in m.get('name', '').lower() or 'openeuro' in m.get('name', '').lower()
    ]
    llama3_models = [m for m in _candidate_models if 'llama3' in m.get('name', '').lower()]

    if danish_models:
        MODEL = danish_models[0]['name']
    elif llama3_models:
        MODEL = llama3_models[0]['name']
    else:
        MODEL = _candidate_models[0]['name']
else:
    MODEL = "llama3.1"  # Fallback if the check didn't run or found no models

SAMPLE_RATE = 0.1           # Check 10% of segments (faster, set to 1.0 for 100%)
LANGUAGE = "Danish"         # Expected language
OLLAMA_URL = "http://localhost:11434"
TIMEOUT = 120               # Timeout in seconds (increase for large models)

# Passed to the validator's validate_transcript call as `validation_params`.
VALIDATION_PARAMS = {
    "sample_rate": SAMPLE_RATE,
    "min_text_length": 8,
    "max_segments": None,
    "timeout": TIMEOUT,
    "ollama_options": {"temperature": 0.3, "top_p": 0.9},
}

print(f"ü§ñ Model: {MODEL}")
print(f"üìä Sample Rate: {SAMPLE_RATE * 100}%")
print(f"üåç Language: {LANGUAGE}")
print(f"‚è±Ô∏è  Timeout: {TIMEOUT}s")


## Test Ollama Connection

Make sure Ollama is running before proceeding.

In [None]:
# Build the validator with the configured TIMEOUT, matching how the
# validation run constructs it, so this check honours the same setting.
validator = OllamaValidator(model=MODEL, ollama_url=OLLAMA_URL, timeout=TIMEOUT)

# Quick test: send a trivial prompt; a falsy result is treated as
# "Ollama is not responding".
test_response = validator._query_ollama("Say 'Hello' in one word.")
if test_response:
    # Show at most the first 50 characters of the reply.
    print(f"‚úÖ Ollama is responding: '{test_response.strip()[:50]}'")
else:
    print("‚ùå Ollama is not responding. Make sure it's running: `ollama serve`")

## Validate All Transcripts

This will:
1. Find all transcript CSV files in the transcripts output folder
2. Sample segments from each CSV
3. Check each segment with Ollama for errors
4. Save an individual validation report for each interview
5. Print a per-file count of flagged vs. checked segments


In [None]:
# Run validation on all transcripts - Save individual report for each interview
from datetime import datetime
import json

print(f"üîç Starting validation at {datetime.now().strftime('%H:%M:%S')}")
print(f"   Model: {MODEL}")
print(f"   Scanning: {transcripts_dir}")
print("=" * 70)

# Create validation reports directory
reports_dir = project_root / "validation_reports"
reports_dir.mkdir(exist_ok=True)
print(f"üìÅ Reports saved to: {reports_dir}\n")

# Find all CSV files
csv_files = sorted(transcripts_dir.glob("*.csv"))

if not csv_files:
    print(f"‚ùå No CSV files found")
    results = []
else:
    print(f"Found {len(csv_files)} CSV files\n")
    results = []
    
    for i, csv_file in enumerate(csv_files, 1):
        interview_name = csv_file.stem
        report_file = reports_dir / f"{interview_name}_validation.json"
        
        print(f"[{i}/{len(csv_files)}] {interview_name}...", end=" ")
        
        validator = OllamaValidator(model=MODEL, ollama_url=OLLAMA_URL, timeout=TIMEOUT)
        report = validator.validate_transcript(
            csv_file,
            sample_rate=VALIDATION_PARAMS["sample_rate"],
            language=LANGUAGE,
            validation_params=VALIDATION_PARAMS
        )
        
        if report:
            results.append(report)
            
            # Save individual report
            with open(report_file, 'w', encoding='utf-8') as f:
                json.dump(report, f, indent=2, ensure_ascii=False)
            
            flagged = len(report.get('flagged_segments', []))
            checked = report.get('segments_checked', 0)
            print(f"‚úÖ {flagged}/{checked} segments")
        else:
            print(f"‚è≠Ô∏è  Skipped")

print("\n" + "=" * 70)
print(f"‚úÖ Validation complete at {datetime.now().strftime('%H:%M:%S')}")
print(f"   Reports saved to: {reports_dir}")


## Review Validation Reports

Inspect flagged segments and issues across all transcripts.


In [None]:
# Summary Statistics - Load all individual reports and aggregate
import pandas as pd

# Gather every per-interview report previously written to the reports folder.
reports_dir = project_root / "validation_reports"
report_files = sorted(reports_dir.glob("*_validation.json"))

print(f"üìÇ Loading {len(report_files)} validation reports\n")


def _flagged_total(entry):
    """Return the report's 'flagged_count', or the length of 'flagged_segments' if absent."""
    return entry.get('flagged_count', len(entry.get('flagged_segments', [])))


# Unreadable or malformed report files are reported and skipped.
all_results = []
for report_file in report_files:
    try:
        with open(report_file, 'r', encoding='utf-8') as handle:
            loaded = json.load(handle)
    except Exception as err:
        print(f"‚ö†Ô∏è  Could not load {report_file.name}: {err}")
    else:
        all_results.append(loaded)

# Totals across all loaded reports
total_flagged = sum(map(_flagged_total, all_results))
total_checked = sum(entry.get('segments_checked', 0) for entry in all_results)

divider = "=" * 70
print(f"üìä VALIDATION SUMMARY")
print(divider)
print(f"Files validated: {len(all_results)}")
print(f"Total segments checked: {total_checked}")
print(f"Total segments flagged: {total_flagged}")
if total_checked > 0:
    print(f"Flag rate: {(total_flagged / total_checked * 100):.1f}%")
print(divider)

# One table row per report; the rate is "N/A" when no segments were checked.
table_rows = []
for entry in all_results:
    n_checked = entry.get('segments_checked', 0)
    n_flagged = _flagged_total(entry)
    if n_checked > 0:
        rate_text = f"{(n_flagged / max(n_checked, 1) * 100):.1f}%"
    else:
        rate_text = "N/A"
    table_rows.append({
        'File': Path(entry.get('csv_file', 'unknown')).stem,
        'Checked': n_checked,
        'Flagged': n_flagged,
        'Rate': rate_text,
    })
df = pd.DataFrame(table_rows)

print("\nüìÅ Per-File Results:")
print(df.to_string(index=False))

# Write the per-file table next to the JSON reports.
summary_file = reports_dir / "SUMMARY.csv"
df.to_csv(summary_file, index=False)
print(f"\nüíæ Summary saved to: {summary_file}")

## Detailed Issue Review

Examine specific flagged segments.

In [None]:
# Show detailed issues for the first loaded validation report.
# Uses `all_results`, the list of report dicts that the summary cell loads
# from the saved *_validation.json files.
if not all_results:
    # Distinguish "nothing loaded" from "first transcript is clean".
    print("No validation reports loaded.")
elif all_results[0].get('flagged_segments'):
    report = all_results[0]
    print(f"\nüîç Detailed issues for: {report.get('csv_file', 'unknown')}\n")

    for seg in report['flagged_segments'][:10]:  # Show first 10
        print(f"Segment {seg['segment_index']} ({seg['start']:.1f}s - {seg['end']:.1f}s)")
        print(f"Speaker: {seg['speaker']}")
        print(f"Text: {seg['text']}")
        print(f"Issues: {', '.join(seg['issues'])}")
        print(f"Confidence: {seg['confidence']:.2f}")
        # Suggestions are optional; print them only when present and non-empty.
        if seg.get('suggestions'):
            print(f"Suggestions: {seg['suggestions']}")
        print()
else:
    print("No issues found in first transcript.")

## Export Validation Report

Create a master summary file that lists each individual validation report by file name.


In [None]:
# Create Master Summary Report
reports_dir = project_root / "validation_reports"

# Overall flag rate as text; "0%" when nothing was checked.
if total_checked > 0:
    overall_rate = f"{(total_flagged / total_checked * 100):.1f}%"
else:
    overall_rate = "0%"

# One entry per loaded report, naming the JSON file that holds its details.
per_file_entries = []
for entry in all_results:
    source_csv = Path(entry.get('csv_file', 'unknown'))
    per_file_entries.append({
        'file': source_csv.name,
        'segments_checked': entry.get('segments_checked', 0),
        'segments_flagged': entry.get('flagged_count', len(entry.get('flagged_segments', []))),
        'report_file': f"{source_csv.stem}_validation.json",
    })

# Run settings, aggregate totals, and the per-file entries in one document.
master_summary = {
    'validation_date': datetime.now().isoformat(),
    'model': MODEL,
    'sample_rate': SAMPLE_RATE,
    'language': LANGUAGE,
    'summary': {
        'total_files_validated': len(all_results),
        'total_segments_checked': total_checked,
        'total_segments_flagged': total_flagged,
        'flag_rate': overall_rate,
    },
    'per_file_reports': per_file_entries,
}

# Save master summary
summary_file = reports_dir / "VALIDATION_SUMMARY.json"
with open(summary_file, 'w', encoding='utf-8') as out:
    json.dump(master_summary, out, indent=2, ensure_ascii=False)

# Count the individual report files currently in the reports folder.
individual_count = sum(1 for _ in reports_dir.glob('*_validation.json'))

# Print an overview of what was written and how the folder is organised.
overview_lines = (
    f"üìã Master Summary Report",
    "=" * 70,
    f"üìÅ Location: {summary_file}",
    f"   ‚Ä¢ Master summary: {summary_file.name}",
    f"   ‚Ä¢ Individual reports: {individual_count} files",
    f"   ‚Ä¢ Summary CSV: SUMMARY.csv",
    f"\n‚úÖ Organization:",
    f"   {reports_dir}/",
    f"   ‚îú‚îÄ‚îÄ VALIDATION_SUMMARY.json  (master overview)",
    f"   ‚îú‚îÄ‚îÄ SUMMARY.csv              (per-file table)",
    f"   ‚îú‚îÄ‚îÄ Interview_1_validation.json",
    f"   ‚îú‚îÄ‚îÄ Interview_2_validation.json",
    f"   ‚îî‚îÄ‚îÄ ...",
)
for line in overview_lines:
    print(line)