# Data Cleaning

## Load Data

In [7]:
from pathlib import Path

import pandas as pd

# Parquet file of each dataset split, relative to the dataset repository root.
splits = {
    'train': 'data/train-00000-of-00001.parquet',
    'validation': 'data/validation-00000-of-00001.parquet',
    'test': 'data/test-00000-of-00001.parquet'
}

# Load every split listed in `splits` from the Hugging Face dataset into a
# dict of DataFrames keyed by split name.
dfs = {}
try:
    for split_name, split_path in splits.items():
        dfs[split_name] = pd.read_parquet("hf://datasets/tau/commonsense_qa/" + split_path)
    print("Dataset loaded successfully.")
except Exception as e:
    # Chain explicitly so the underlying read error stays attached as the cause.
    raise RuntimeError(f"Failed to load dataset: {e}") from e


Dataset loaded successfully.


## Clean Data

In [8]:
REQUIRED_FIELDS = ['id', 'question', 'question_concept', 'choices', 'answerKey']

def validate_entry(entry):
    """Check that every field in REQUIRED_FIELDS is present and non-null.

    Args:
        entry: Mapping-like record (supports ``in``, ``[]`` and ``.get``).

    Returns:
        True when all required fields are present and non-null.

    Raises:
        ValueError: If any required field is absent or holds a null scalar
            (None, NaN, pd.NA, NaT). The message lists the offending fields
            and the entry's id, or 'unknown' when there is no id.
    """
    missing = []
    for field in REQUIRED_FIELDS:
        if field not in entry:
            missing.append(field)
            continue
        value = entry[field]
        # pd.isna() is element-wise on list-likes and returns an array whose
        # truth value is ambiguous, so only scalars are null-checked here.
        # Containers (dict, list, ndarray) count as present.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            missing.append(field)
    if missing:
        raise ValueError(f"Missing fields {missing} in entry with id: {entry.get('id', 'unknown')}")
    return True

def clean_entry(entry):
    """Perform basic cleaning on the entry and update choices."""
    try:
        # Trim whitespace on basic string fields.
        for key in ['id', 'question', 'question_concept', 'answerKey']:
            if key in entry and isinstance(entry[key], str):
                entry[key] = entry[key].strip()
                
        # Process the choices field if it is a dict.
        if 'choices' in entry and isinstance(entry['choices'], dict):
            # Define expected labels.
            expected_labels = ['A', 'B', 'C', 'D', 'E']
            
            # Check for 'label' key and verify its content.
            if 'label' in entry['choices']:
                labels = entry['choices']['label']
                # Convert numpy array to list if necessary.
                if hasattr(labels, 'tolist'):
                    labels = labels.tolist()
                if labels != expected_labels:
                    raise ValueError(f"Unexpected labels: {labels}")
            
            # Process the 'text' key, ensuring it's a list of strings.
            if 'text' in entry['choices']:
                # Clean the text entries and replace the choices dict with just the text list.
                entry['choices'] = [choice.strip() for choice in entry['choices']['text']]
            else:
                raise ValueError("Missing 'text' in choices")
        else:
            raise ValueError("Field 'choices' is missing or not a dict")
            
    except Exception as e:
        raise ValueError(f"Error cleaning entry {entry.get('id', 'unknown')}: {e}")
    return entry

In [9]:

# Validate and clean every row of each split. A split is written to CSV only
# when all of its rows pass; otherwise its errors are printed and nothing is
# saved for that split.
for key, df in dfs.items():
    cleaned_entries = []
    errors = []
    for index, row in df.iterrows():
        entry = row.to_dict()
        try:
            validate_entry(entry)
            cleaned_entry = clean_entry(entry)
            cleaned_entries.append(cleaned_entry)
        except Exception as e:
            # Record the failure and continue so every bad row is reported in one pass.
            errors.append(f"Row {index}: {e}")

    if errors:
        print("Some entries had issues:")
        for error in errors:
            print(error)
    else:
        print("All entries validated and cleaned successfully.")

        # Convert cleaned entries to a DataFrame and save as CSV
        cleaned_df = pd.DataFrame(cleaned_entries)
        output_file = f'data/cleaned/{key}.csv'
        # to_csv does not create missing directories; make sure the target
        # folder exists so the cell also runs on a fresh checkout.
        Path(output_file).parent.mkdir(parents=True, exist_ok=True)
        cleaned_df.to_csv(output_file, index=False)
        print(f"Cleaned data saved to {output_file}")


All entries validated and cleaned successfully.
Cleaned data saved to data/cleaned/train.csv
All entries validated and cleaned successfully.
Cleaned data saved to data/cleaned/validation.csv
All entries validated and cleaned successfully.
Cleaned data saved to data/cleaned/test.csv
