In [1]:
import pandas as pd
import os
import argparse
import json
from typing import List, Dict, Optional, Union
import re
import pandas as pd
import time
import numpy as np
from openai import OpenAI
from tqdm import tqdm


parquet_file_path = '/kaggle/input/newest/test-00000-of-00001.parquet'
data = pd.read_parquet(parquet_file_path)

csv_file_path = 'output.csv'
data.to_csv(csv_file_path, index=False)

print(f"Файл успешно сохранен как CSV: {csv_file_path}")


Файл успешно сохранен как CSV: output.csv


In [2]:
data= pd.read_csv('/kaggle/working/output.csv')

In [3]:
CATEGORIES = [
    "Knowledge",
    "Film & Television",
    "Sports Competition",
    "Artistic Performance",
    "Life Record",
    "Multilingual"
]

SUB_CATEGORIES = [
    "Humanity & History",
    "Literature & Art",
    "Biology & Medicine",
    "Finance & Commerce",
    "Astronomy",
    "Geography",
    "Law",
    "Life Tip",
    "Technology",
    "Animation",
    "Movie & TV Show",
    "Documentary",
    "News Report",
    "Esports",
    "Basketball",
    "Football",
    "Athletics",
    "Other Sports",
    "Stage Play",
    "Magic Show",
    "Variety Show",
    "Acrobatics",
    "Handicraft",
    "Food",
    "Fashion",
    "Daily Life",
    "Travel",
    "Pet & Animal",
    "Exercise",
    "Multilingual"
]

TASK_CATEGORIES = [
    "Temporal Perception",
    "Spatial Perception",
    "Attribute Perception",
    "Action Recognition",
    "Object Recognition",
    "OCR Problems",
    "Counting Problem",
    "Temporal Reasoning",
    "Spatial Reasoning",
    "Action Reasoning",
    "Object Reasoning",
    "Information Synopsis",
]


def extract_characters_regex(s):
    s = s.strip()
    answer_prefixes = [
        "The best answer is",
        "The correct answer is",
        "The answer is",
        "The answer",
        "The best option is"
        "The correct option is",
        "Best answer:"
        "Best option:",
        "Answer:",
        "Option:",
        "The correct answer",
        "The correct option",
    ]
    for answer_prefix in answer_prefixes:
        s = s.replace(answer_prefix, "")

    if len(s.split()) > 10 and not re.search("[ABCD]", s):
        return ""
    matches = re.search(r'[ABCD]', s)
    if matches is None:
        return ""
    return matches[0]


def eval_your_results(
        your_results_path: str, 
        video_types: Optional[Union[List[str], str]] = None,
        skip_missing: Optional[bool] = False,
        return_categories_accuracy: Optional[bool] = True,
        return_sub_categories_accuracy: Optional[bool] = False,
        return_task_types_accuracy: Optional[bool] = False,
        gt_answer_key: Optional[str] = "answer",
        your_answer_key: Optional[str] = "response"

    ):
    """
    Evaluate your results against the ground truth

    Args:
    - your_results_path (str): Path to your results file
    - video_types (Optional[List[str], str]): List of video types to evaluate. 
    - skip_missing (Optional[bool]): If True, missing files will be skipped. If False, an error will be raised if there are missing files.
    - return_categories_accuracy (Optional[bool]): If True, the accuracy for each video category will be returned.
    - return_sub_categories_accuracy (Optional[bool]): If True, the accuracy for each video sub category will be returned.
    - return_task_types_accuracy (Optional[bool]): If True, the accuracy for each task category will be returned.
    - gt_answer_key (Optional[str]): Key to access the ground truth answer in the results file.
    - your_answer_key (Optional[str]): Key to access your answer in the results file.
    """
    with open(your_results_path, 'r') as f:
        your_results = json.load(f)

    if isinstance(video_types, str):
        video_types = video_types.split(",")

    q_type_dict = {}
    v_type_dict = {}
    v_sub_type_dict = {}


    for video_type in video_types:

        # Filter your results based on video types
        your_results_video_type = [item for item in your_results if item["duration"] == video_type]
        q_type_dict[video_type] = {}

        for q_type in TASK_CATEGORIES:
            q_type_dict[video_type][q_type] = {"correct": 0, "answered": 0}

        v_type_dict[video_type] = {}
        for v_type in CATEGORIES:
            v_type_dict[video_type][v_type] = {"correct": 0, "answered": 0}
        
        v_sub_type_dict[video_type] = {}
        for v_sub_type in SUB_CATEGORIES:
            v_sub_type_dict[video_type][v_sub_type] = {"correct": 0, "answered": 0}

        if not skip_missing:
            assert len(your_results_video_type) == 300, f"Number of files in {video_type} is not 300. Check if there are missing files."

        for item in your_results_video_type:

            if skip_missing and item["missing"]:
                continue

            # Get the video category, sub category and question category
            video_category = item["domain"]
            video_sub_category = item["sub_category"]
            
            questions = item["questions"]

            for question in questions:
                q_type = question["task_type"]

                # Get the ground truth and your response
                gt_answer = question[gt_answer_key]
                response = question[your_answer_key]

                # Extract the answer from the response
                extration = extract_characters_regex(response)
    
                if extration != "":
                    q_type_dict[video_type][q_type]["answered"] += 1
                    q_type_dict[video_type][q_type]["correct"] += extration == gt_answer

                    v_type_dict[video_type][video_category]["answered"] += 1
                    v_type_dict[video_type][video_category]["correct"] += extration == gt_answer

                    v_sub_type_dict[video_type][video_sub_category]["answered"] += 1
                    v_sub_type_dict[video_type][video_sub_category]["correct"] += extration == gt_answer


    # results for each video type
    for video_type in video_types:

        print("=====================================")
        print(f"Evaluation on video Type: {video_type}")
        print("=====================================")
        if return_categories_accuracy:
            print("-------------------------------------")
            print("Video Categories")
            print("-------------------------------------")
            for v_type in v_type_dict[video_type]:
                print(f"{v_type}: {100 * v_type_dict[video_type][v_type]['correct'] / v_type_dict[video_type][v_type]['answered'] if v_type_dict[video_type][v_type]['answered'] > 0 else 0 : .1f}%")
        if return_sub_categories_accuracy:
            print("-------------------------------------")
            print("Video Sub Categories")
            print("-------------------------------------")
            for v_sub_type in v_sub_type_dict[video_type]:
                print(f"{v_sub_type}: {100 * v_sub_type_dict[video_type][v_sub_type]['correct'] / v_sub_type_dict[video_type][v_sub_type]['answered'] if v_sub_type_dict[video_type][v_sub_type]['answered'] > 0 else 0 : .1f}%")
        if return_task_types_accuracy:
            print("-------------------------------------")
            print("Task Categories")
            print("-------------------------------------")
            for q_type in q_type_dict[video_type]:
                print(f"{q_type}: {100 * q_type_dict[video_type][q_type]['correct'] / q_type_dict[video_type][q_type]['answered'] if q_type_dict[video_type][q_type]['answered'] > 0 else 0 : .1f}%")
        
        print("-------------------------------------")
        print("Overall Performance")
        print("-------------------------------------")
        total_correct = sum([q_type_dict[video_type][q_type]["correct"] for q_type in TASK_CATEGORIES])
        total_answered = sum([q_type_dict[video_type][q_type]["answered"] for q_type in TASK_CATEGORIES])
        print(f"Overall: {100 * total_correct / total_answered if total_answered > 0 else 0 : .1f}%")

        print("\n")

    # results for the entire dataset
    print("=====================================")
    print("Evaluation on the entire dataset")
    print("=====================================")

    if return_categories_accuracy:
        print("-------------------------------------")
        print("Video Domains")
        print("-------------------------------------")
        for v_type in CATEGORIES:
            total_correct = sum([v_type_dict[video_type][v_type]["correct"] for video_type in video_types])
            total_answered = sum([v_type_dict[video_type][v_type]["answered"] for video_type in video_types])
            print(f"{v_type}: {100 * total_correct / total_answered if total_answered > 0 else 0 : .1f}%")
    

    if return_sub_categories_accuracy:
        print("-------------------------------------")
        print("Video Sub Categories")
        print("-------------------------------------")

        for v_sub_type in SUB_CATEGORIES:
            total_correct = sum([v_sub_type_dict[video_type][v_sub_type]["correct"] for video_type in video_types])
            total_answered = sum([v_sub_type_dict[video_type][v_sub_type]["answered"] for video_type in video_types])
            print(f"{v_sub_type}: {100 * total_correct / total_answered if total_answered > 0 else 0 : .1f}%")


    if return_task_types_accuracy:
        print("-------------------------------------")
        print("Task Categories")
        print("-------------------------------------")
        for q_type in TASK_CATEGORIES:

            total_correct = sum([q_type_dict[video_type][q_type]["correct"] for video_type in video_types])
            total_answered = sum([q_type_dict[video_type][q_type]["answered"] for video_type in video_types])
            print(f"{q_type}: {100 * total_correct / total_answered if total_answered > 0 else 0 : .1f}%")

    print("-------------------------------------")
    print("Overall Performance")
    print("-------------------------------------")
    total_correct = sum([sum([q_type_dict[video_type][q_type]["correct"] for q_type in TASK_CATEGORIES]) for video_type in video_types])
    total_answered = sum([sum([q_type_dict[video_type][q_type]["answered"] for q_type in TASK_CATEGORIES]) for video_type in video_types])
    print(f"Overall: {100 * total_correct / total_answered if total_answered > 0 else 0 : .1f}%")


In [4]:
class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.bool_):
            return bool(obj)
        return super(NumpyEncoder, self).default(obj)

api_key = "api_key"
client = OpenAI(api_key=api_key)

def get_multiple_choice_answer(question, options, model_name="gpt-4", retries=3, delay=2):
    """Get a multiple-choice answer from GPT-4"""
    for attempt in range(retries):
        try:
            if not question or pd.isna(question):
                return "Invalid Question"
            
            # options are properly formatted
            options_list = []
            for opt in options:
                if isinstance(opt, (str, int, float)) and not pd.isna(opt):
                    options_list.append(str(opt))
                else:
                    options_list.append("No option")
            
            while len(options_list) < 4:
                options_list.append("No option")
            
            # messages with clear instructions
            messages = [
                {"role": "system", "content": "You are a helpful assistant that answers multiple choice questions with just the letter of the correct answer."},
                {"role": "user", "content": f"""
Question: {question}
Options:
A) {options_list[0]}
B) {options_list[1]}
C) {options_list[2]}
D) {options_list[3]}

Important: Respond with ONLY the letter of the correct answer (A, B, C, or D).
"""}
            ]
            
            response = client.chat.completions.create(
                model=model_name,
                messages=messages,
                max_tokens=10,
                temperature=0
            )
            
            # Extract and process the answer
            answer_text = response.choices[0].message.content.strip().upper()
            if any(letter in answer_text for letter in ["A", "B", "C", "D"]):
                for letter in ["A", "B", "C", "D"]:
                    if letter in answer_text:
                        return letter
                return answer_text
            else:
                print(f"Invalid answer format: '{answer_text}'")
                if attempt < retries - 1:
                    time.sleep(delay * (attempt + 1))
                    continue
                return "Invalid response"
                
        except Exception as e:
            print(f"Error on attempt {attempt+1}: {str(e)}")
            if attempt < retries - 1:
                time.sleep(delay * (attempt + 1))
            else:
                return f"Error: {str(e)}"
    
    return "Error: No answer after retries"

def process_video_data(data_path, output_path, model_name="gpt-4", sample_size=None):
    """Process the video data, get model responses, and save results"""
    print(f"Loading data from {data_path}...")
    # Load the data
    if data_path.endswith('.parquet'):
        df = pd.read_parquet(data_path)
    elif data_path.endswith('.csv'):
        df = pd.read_csv(data_path)
    else:
        raise ValueError("Unsupported file format. Use .parquet or .csv")
    
    print(f"Data loaded. Total records: {len(df)}")
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].astype(str)
    
    #NaN values
    df = df.fillna("None")
    if sample_size:
        df = df.sample(sample_size, random_state=42)
        print(f"Using sample of {sample_size} records")
    
    results = []
    required_columns = ['video_id', 'duration', 'domain', 'sub_category', 'url', 'question_id', 'task_type', 'question', 'options', 'answer']
    for col in required_columns:
        if col not in df.columns:
            print(f"Warning: Column '{col}' not found in the dataset. Using placeholder values.")
    
    # Add missing columns with placeholder values if needed
    for col in required_columns:
        if col not in df.columns:
            df[col] = "placeholder"
    
    # Group by video to maintain the structure required by the evaluation script
    print("Grouping data by video...")
    groupby_columns = ['video_id', 'duration', 'domain', 'sub_category', 'url']
    video_groups = df.groupby([col for col in groupby_columns if col in df.columns])
    print(f"Total videos to process: {len(video_groups)}")
    
    for video_info, group in tqdm(video_groups, desc="Processing videos"):
        if not isinstance(video_info, tuple):
            video_info = (video_info,)
        
        #  dictionary with video info
        video_data = {}
        for i, col in enumerate([col for col in groupby_columns if col in df.columns]):
            if i < len(video_info):
                video_data[col] = video_info[i]
            else:
                video_data[col] = "placeholder"
        for col in groupby_columns:
            if col not in video_data:
                video_data[col] = "placeholder"
        
        video_data["missing"] = False
        video_data["questions"] = []
        
        # Process each question for this video
        for _, row in group.iterrows():
            try:
                options = row['options']
                if pd.isna(options):
                    options = ["No option"] * 4
                elif isinstance(options, str):
                    # parse as JSON
                    try:
                        options = json.loads(options)
                    except json.JSONDecodeError:
                        options = [opt.strip() for opt in options.strip('[]').split(',')]

                    if not isinstance(options, list):
                        options = [str(options)] * 4
                else:
                    options = [str(options)] * 4
            except:
                options = ["No option"] * 4
            
            if len(options) < 4:
                options.extend(["No option"] * (4 - len(options)))
            elif len(options) > 4:
                options = options[:4]
            
            question_text = str(row['question']) if 'question' in row else "No question provided"
            model_answer = get_multiple_choice_answer(question_text, options, model_name)
            question_data = {
                "question_id": str(row['question_id']) if 'question_id' in row else "unknown",
                "task_type": str(row['task_type']) if 'task_type' in row else "unknown",
                "question": question_text,
                "options": options,
                "answer": str(row['answer']) if 'answer' in row else "A",
                "response": f"The answer is {model_answer}"
            }
            
            video_data["questions"].append(question_data)
        results.append(video_data)

    print(f"Saving results to {output_path}...")
    try:
        with open(output_path, 'w') as f:
            json.dump(results, f, indent=2, cls=NumpyEncoder)
        print(f"Results successfully saved to {output_path}")
    except Exception as e:
        print(f"Error saving results: {str(e)}")
        try:
            simplified_results = []
            for video in results:
                video_simple = {}
                for k, v in video.items():
                    if k != "questions":
                        video_simple[k] = str(v)
                    else:
                        video_simple[k] = []
                        for q in v:
                            q_simple = {}
                            for qk, qv in q.items():
                                if qk != "options":
                                    q_simple[qk] = str(qv)
                                else:
                                    q_simple[qk] = [str(opt) for opt in qv[:4]]
                            video_simple[k].append(q_simple)
                simplified_results.append(video_simple)
            
            fallback_path = output_path.replace('.json', '_simplified.json')
            with open(fallback_path, 'w') as f:
                json.dump(simplified_results, f, indent=2)
            print(f"Saved simplified results to {fallback_path}")
            return fallback_path
        except Exception as backup_error:
            print(f"Fallback save also failed: {str(backup_error)}")
            raise
    
    return output_path

def run_evaluation(results_path, video_types="short,medium,long", return_categories=True, 
                   return_sub_categories=False, return_task_types=False):
    """Run the evaluation script on the results"""
    # Import the eval function from the script
    try:
        
        # Run the evaluation
        eval_your_results(
            results_path,
            video_types=video_types,
            skip_missing=True,
            return_categories_accuracy=return_categories,
            return_sub_categories_accuracy=return_sub_categories,
            return_task_types_accuracy=return_task_types,
            gt_answer_key="answer",
            your_answer_key="response"
        )
    except ImportError:
        print("Could not import eval_your_results_2.py. Make sure the file is in the current directory.")
    except Exception as e:
        print(f"Error during evaluation: {str(e)}")


data_path = '/kaggle/input/newest/test-00000-of-00001.parquet'  # Update this path
output_path = '/kaggle/working/model_responses.json'
model_name = 'gpt-4'
sample_size = None # Set to None to process all data
run_eval = True
video_types = "short,medium,long"
return_categories = True
return_sub_categories = False
return_task_types = True


print("Starting data processing...")
results_path = process_video_data(
    data_path, 
    output_path,
    model_name=model_name,
    sample_size=sample_size
)

if run_eval:
    print("\nRunning evaluation...")
    run_evaluation(
        results_path,
        video_types=video_types,
        return_categories=return_categories,
        return_sub_categories=return_sub_categories,
        return_task_types=return_task_types
    )

print("Processing complete!")

Starting data processing...
Loading data from /kaggle/input/newest/test-00000-of-00001.parquet...
Data loaded. Total records: 2700
Grouping data by video...
Total videos to process: 900


Processing videos: 100%|██████████| 900/900 [31:17<00:00,  2.09s/it]

Saving results to /kaggle/working/model_responses.json...
Results successfully saved to /kaggle/working/model_responses.json

Running evaluation...
Evaluation on video Type: short
-------------------------------------
Video Categories
-------------------------------------
Knowledge:  33.0%
Film & Television:  22.5%
Sports Competition:  23.3%
Artistic Performance:  27.5%
Life Record:  24.3%
Multilingual:  23.3%
-------------------------------------
Task Categories
-------------------------------------
Temporal Perception:  38.9%
Spatial Perception:  23.3%
Attribute Perception:  20.5%
Action Recognition:  29.8%
Object Recognition:  23.2%
OCR Problems:  21.1%
Counting Problem:  23.2%
Temporal Reasoning:  46.2%
Spatial Reasoning:  29.6%
Action Reasoning:  34.0%
Object Reasoning:  35.0%
Information Synopsis:  31.7%
-------------------------------------
Overall Performance
-------------------------------------
Overall:  26.9%


Evaluation on video Type: medium
-------------------------------


