## IMPORT & KONFIGURASI

In [7]:
import os
import glob
import whisper
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# --- CONFIGURATION ---
# FOLDER_VIDEO is the directory that is searched for '*.webm' files.
FOLDER_VIDEO = 'Vid'  # Make sure this folder contains the English-language interview videos
# Model size name handed to whisper.load_model().
MODEL_WHISPER_SIZE = 'base' 

# CHANGE 1: use a Sentence-BERT model intended for English text
# (author's note: this model is lighter and performs very well on English text)
MODEL_SBERT = 'all-MiniLM-L6-v2' 

# --- ANSWER-KEY DATASET (ENGLISH VERSION) ---
# Maps a video's base file name (without the '.webm' extension) to:
#   'ideal_answer' - reference answer used for the semantic-similarity score
#   'keywords'     - phrases looked up (case-insensitively) in the transcript
# Videos whose name has no entry here are skipped by the scoring loop.
kunci_jawaban_db = {
    'interview_question_1': {
        'ideal_answer': "Machine Learning is a subset of artificial intelligence that focuses on building systems that learn from data to improve their performance without being explicitly programmed.",
        'keywords': ['learn from data', 'improve', 'explicitly programmed']
    },
    'interview_question_2': {
        'ideal_answer': "In supervised learning, the algorithm is trained on labeled data, while unsupervised learning deals with unlabeled data to find hidden patterns.",
        'keywords': ['labeled data', 'unlabeled', 'patterns']
    },
    # NOTE(review): this entry is identical to 'interview_question_2' (same
    # ideal answer and keywords) - looks like a copy-paste placeholder; confirm
    # the intended answer key for question 3.
    'interview_question_3': {
        'ideal_answer': "In supervised learning, the algorithm is trained on labeled data, while unsupervised learning deals with unlabeled data to find hidden patterns.",
        'keywords': ['labeled data', 'unlabeled', 'patterns']
    },
    # Adjust these key names so they match your video file names
}

# --- MODEL LOADING ---
# Both models are loaded once at module level and reused for every video:
# stt_model does the speech-to-text transcription, nlp_model produces the
# sentence embeddings used for the semantic score.
print("Loading English AI Models...")
stt_model = whisper.load_model(MODEL_WHISPER_SIZE)
nlp_model = SentenceTransformer(MODEL_SBERT)
print("Models loaded successfully!\n")

def hitung_skor_keyword(transkrip, keywords):
    """Return the percentage of ``keywords`` that occur in ``transkrip``.

    Matching is a case-insensitive substring test, so a keyword also counts
    when it appears inside a longer word or phrase.

    Args:
        transkrip: Transcript text to search in.
        keywords: Sequence of keyword/phrase strings to look for.

    Returns:
        A value between 0 and 100. An empty ``keywords`` sequence yields 100,
        because there is nothing the transcript could have missed.
    """
    haystack = transkrip.lower()
    hits = sum(1 for phrase in keywords if phrase.lower() in haystack)

    total = len(keywords)
    if total == 0:
        return 100
    return (hits / total) * 100

def proses_penilaian_interview():
    """Transcribe and score every ``.webm`` interview video in ``FOLDER_VIDEO``.

    For each video whose base name has an entry in ``kunci_jawaban_db`` the
    audio is transcribed with ``stt_model``, then two scores are computed
    against that entry:

    * semantic score - cosine similarity (x100) between the ``nlp_model``
      embeddings of the transcript and of the ideal answer;
    * keyword score  - percentage of required keywords found in the
      transcript (see ``hitung_skor_keyword``).

    The final score is ``0.7 * semantic + 0.3 * keyword``. Videos without an
    answer key are reported with a ``[SKIP]`` line and left out of the result.

    Returns:
        pandas.DataFrame: one row per scored video with the columns
        'Video File', 'Candidate Transcript', 'Semantic Score',
        'Keyword Score' and 'FINAL SCORE'. The columns are present even when
        no video was scored, so callers can always select them.
    """
    kolom_laporan = [
        'Video File',
        'Candidate Transcript',
        'Semantic Score',
        'Keyword Score',
        'FINAL SCORE',
    ]

    video_paths = sorted(glob.glob(os.path.join(FOLDER_VIDEO, '*.webm')))
    hasil_penilaian = []

    print(f"Starting analysis for {len(video_paths)} videos...\n")

    for video_path in video_paths:
        # splitext removes only the trailing extension; str.replace would also
        # strip a '.webm' that happens to occur in the middle of the name.
        filename = os.path.splitext(os.path.basename(video_path))[0]

        if filename not in kunci_jawaban_db:
            print(f"[SKIP] No answer key found for: {filename}")
            continue

        print(f"--> Processing: {filename}")

        # CHANGE 2: pin the language to 'en' (English).
        # Author's note: this keeps Whisper from getting confused by accents.
        result = stt_model.transcribe(video_path, language='en')
        transkrip_kandidat = result['text']

        # Fetch the answer key for this video
        data_kunci = kunci_jawaban_db[filename]
        jawaban_ideal = data_kunci['ideal_answer']
        keywords_wajib = data_kunci['keywords']

        # Semantic score: cosine similarity of the two sentence embeddings
        embedding_kandidat = nlp_model.encode(transkrip_kandidat, convert_to_tensor=True)
        embedding_ideal = nlp_model.encode(jawaban_ideal, convert_to_tensor=True)

        semantic_score = util.pytorch_cos_sim(embedding_kandidat, embedding_ideal).item() * 100

        # Keyword score: share of required keywords present in the transcript
        keyword_score = hitung_skor_keyword(transkrip_kandidat, keywords_wajib)

        # Final score: weighted blend, 70% semantic + 30% keyword
        final_score = (semantic_score * 0.7) + (keyword_score * 0.3)

        hasil_penilaian.append({
            'Video File': filename,
            'Candidate Transcript': transkrip_kandidat,
            'Semantic Score': round(semantic_score, 2),
            'Keyword Score': round(keyword_score, 2),
            'FINAL SCORE': round(final_score, 2)
        })

    # Explicit columns keep the schema stable for an empty result; without
    # them pd.DataFrame([]) has no columns and column selection raises KeyError.
    return pd.DataFrame(hasil_penilaian, columns=kolom_laporan)

# --- RUN THE PIPELINE ---
# Make sure FFmpeg has been fixed/installed before running this
try:
    df_hasil = proses_penilaian_interview()
    print("\n=== AUTOMATED INTERVIEW ASSESSMENT REPORT ===")
    if df_hasil.empty:
        # With nothing scored the frame may lack the report columns, and
        # selecting them would raise KeyError - which the handler below would
        # then misreport as an FFmpeg problem.
        print("No videos were scored (no .webm files found or no matching answer keys).")
    else:
        print(df_hasil[['Video File', 'Candidate Transcript', 'FINAL SCORE']])
except Exception as e:
    # Top-level boundary: report the error and show the FFmpeg hint
    # (the messages are in Indonesian and are kept as-is).
    print(f"Terjadi Error: {e}")
    print("Tips: Pastikan FFmpeg sudah terinstall (lihat solusi sebelumnya).")

Loading English AI Models...
Models loaded successfully!

Starting analysis for 5 videos...

--> Processing: interview_question_1


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


--> Processing: interview_question_2


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


--> Processing: interview_question_3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[SKIP] No answer key found for: interview_question_4
[SKIP] No answer key found for: interview_question_5

=== AUTOMATED INTERVIEW ASSESSMENT REPORT ===
             Video File                               Candidate Transcript  \
0  interview_question_1   Okay, share this specific challenges you face...   
1  interview_question_2   Can you describe your experience with transfe...   
2  interview_question_3   Wait, what is this? We scroll the complexity ...   

   FINAL SCORE  
0        17.22  
1        13.46  
2        12.25  
