In [None]:
import requests
import json
import pandas as pd
import os


# Remote location of the raw JSONL dataset (one JSON record per line).
DATASET_URL = "https://raw.githubusercontent.com/AREEG94FAHAD/TaskComplexityEval-24/main/problems_data.jsonl"
# Local path where load_data() stores the download and reads it back from.
LOCAL_FILE = "problems_data.jsonl"

def load_data():
    """Load the TaskComplexity dataset (JSONL) and derive training columns.

    The file is downloaded from ``DATASET_URL`` to ``LOCAL_FILE`` if it is not
    already present on disk, then parsed with pandas.

    Expected keys per record: 'title', 'description', 'input_description',
    'output_description', 'problem_class', 'problem_score'.

    Returns:
        A ``pandas.DataFrame`` containing the original columns plus:

        * ``full_text``   -- title, description and input/output descriptions
          joined into one string (``[SEP]`` between the first three parts).
        * ``label_cls``   -- integer class id (easy=0, medium=1, hard=2).
        * ``label_score`` -- float score; missing or non-numeric values
          become 1.0.

        Rows whose ``problem_class`` is not one of easy/medium/hard are
        dropped. Returns ``None`` (after printing a message) if the download
        or the parsing fails.
    """
    # 1. Download if missing
    if not os.path.exists(LOCAL_FILE):
        print(f"📥 Downloading dataset from {DATASET_URL}...")
        # Write to a temporary name first so an interrupted write never leaves
        # a truncated LOCAL_FILE that later runs would treat as a valid cache.
        tmp_path = LOCAL_FILE + ".part"
        try:
            # requests has no default timeout; without one a stalled
            # connection would block forever.
            r = requests.get(DATASET_URL, timeout=30)
            if r.status_code != 200:
                print(f"⚠️ Download failed (Status {r.status_code}). Please download 'problems_data.jsonl' manually.")
                return None
            with open(tmp_path, "wb") as f:
                f.write(r.content)
            os.replace(tmp_path, LOCAL_FILE)
            print("✅ Download complete.")
        except Exception as e:
            # Best-effort cleanup of a partially written temporary file.
            if os.path.exists(tmp_path):
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass
            print(f"⚠️ Error downloading: {e}")
            return None

    # 2. Parse JSONL (Line-delimited JSON)
    try:
        # lines=True is crucial for .jsonl files
        df = pd.read_json(LOCAL_FILE, lines=True)

        # 3. Create 'full_text' for BERT
        # Concatenate Title + Description + Input/Output
        df['full_text'] = (
            df['title'].fillna('') + " [SEP] " +
            df['description'].fillna('') + " [SEP] " +
            df['input_description'].fillna('') + " " +
            df['output_description'].fillna('')
        )

        # 4. Map 'problem_class' to integers
        # Normalize to lowercase just in case (e.g. "Hard" -> "hard");
        # anything outside the map becomes NaN and is dropped below.
        class_map = {'easy': 0, 'medium': 1, 'hard': 2}
        df['label_cls'] = df['problem_class'].str.lower().map(class_map)

        # 5. Ensure 'problem_score' is float
        # NOTE: missing/unparseable scores are replaced by 1.0 here, so they
        # are never NaN by the time dropna runs.
        df['label_score'] = pd.to_numeric(df['problem_score'], errors='coerce').fillna(1.0)

        # Drop rows with missing targets (in practice: unrecognised classes).
        df = df.dropna(subset=['label_cls', 'label_score'])

        # map() yields float64 whenever any class was unmapped; restore the
        # integer dtype now that those rows are gone.
        df = df.astype({'label_cls': 'int64'})

        return df

    except Exception as e:
        print(f"❌ Error processing data: {e}")
        return None

if __name__ == "__main__":
    # Smoke test: load the dataset and preview the raw target columns.
    df = load_data()
    # load_data() returns None after printing its own error message.
    if df is not None:
        print(f"✅ Loaded {len(df)} samples successfully.")
        print(df[['title', 'problem_class', 'problem_score']].head())

📥 Downloading dataset from https://raw.githubusercontent.com/AREEG94FAHAD/TaskComplexityEval-24/main/problems_data.jsonl...
✅ Download complete.
✅ Loaded 4112 samples successfully.
                       title problem_class  problem_score
0                        Uuu          hard            9.7
1             House Building          hard            9.7
2             Mario or Luigi          hard            9.6
3             The Wire Ghost          hard            9.6
4  Barking Up The Wrong Tree          hard            9.6
