In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# 🧹 AGI Voice Agent - Data Preprocessing (Notebook 2)

### Purpose:
Prepare the 73 datasets for efficient:
- Loading
- Cleaning
- Format standardization (CSV, Parquet, JSONL)
- Indexing for retrieval in RAG



📌 1. Load Libraries

import pandas as pd
import json
import os
from pathlib import Path




📌 2. Define Dataset Paths

# Example for 3 datasets, extend for all 73
dataset_files = {
    "AI_Reasoning": "data/ai_reasoning.csv",
    "Hellaswag_Test": "data/hellaswag_test.jsonl.txt",
    "Hellaswag_Val": "data/hellaswag_val.jsonl.txt",
    # Extend up to 73 datasets
}




📌 3. Load & Preview Datasets

✅ Example: Load CSV

df_ai_reasoning = pd.read_csv(dataset_files["AI_Reasoning"])
df_ai_reasoning.head()

✅ Example: Load JSONL

def load_jsonl(file_path):
    with open(file_path, 'r') as f:
        return [json.loads(line) for line in f]

hellaswag_test_data = load_jsonl(dataset_files["Hellaswag_Test"])
hellaswag_test_data[:2]




📌 4. Data Cleaning Example

Remove nulls & duplicates:

df_ai_reasoning.dropna(inplace=True)
df_ai_reasoning.drop_duplicates(inplace=True)

For JSONL:

def clean_jsonl(data):
    cleaned = [entry for entry in data if entry]  # Remove empty entries
    return cleaned

clean_hellaswag = clean_jsonl(hellaswag_test_data)
len(clean_hellaswag)




📌 5. Save to Efficient Formats

Save to Parquet (Optimized):

df_ai_reasoning.to_parquet("processed/ai_reasoning.parquet", compression='gzip')

with open('processed/hellaswag_test_cleaned.jsonl', 'w') as f:
    for item in clean_hellaswag:
        f.write(json.dumps(item) + '\n')




📌 6. Automate Preprocessing of All Datasets

Path("processed").mkdir(exist_ok=True)

def preprocess_all_datasets(dataset_files):
    for name, path in dataset_files.items():
        if path.endswith('.csv'):
            df = pd.read_csv(path).dropna().drop_duplicates()
            df.to_parquet(f'processed/{name}.parquet', compression='gzip')
            print(f"Processed {name} to Parquet.")
        
        elif path.endswith('.jsonl.txt'):
            data = load_jsonl(path)
            cleaned = clean_jsonl(data)
            with open(f'processed/{name}_cleaned.jsonl', 'w') as f:
                for item in cleaned:
                    f.write(json.dumps(item) + '\n')
            print(f"Processed {name} to cleaned JSONL.")

preprocess_all_datasets(dataset_files)




📌 7. Next Steps

✅ Use processed datasets in the RAG model pipeline

✅ Proceed to:

03_Dataset_Balance_Checker.ipynb

or directly to Feature Engineering

SyntaxError: invalid character '📌' (U+1F4CC) (89657685.py, line 12)