In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# 🛠️ AGI Voice Agent - Feature Engineering (Notebook 5)

### Purpose:
- Extract structured features from text, reasoning data, or multimodal sources.
- Normalize and transform features for model training and retrieval pipelines (RAG/LLMs).





📌 1. Libraries

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
import json




📌 2. Load Example Dataset

df = pd.read_parquet('processed/ai_reasoning.parquet')
df.head()




📌 3. TF-IDF Feature Extraction (for text columns)

text_column = 'question'  # Example column name

tfidf = TfidfVectorizer(max_features=100)
tfidf_matrix = tfidf.fit_transform(df[text_column].fillna(''))

tfidf_feature_names = tfidf.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_feature_names)

tfidf_df.head()




📌 4. Sentence Embedding with Transformer Models

model = SentenceTransformer('all-MiniLM-L6-v2')

embeddings = model.encode(df[text_column].fillna(''))

print("Embedding shape:", embeddings.shape)




📌 5. Add Embeddings to Dataset

embedding_columns = [f'embedding_{i}' for i in range(embeddings.shape[1])]
embedding_df = pd.DataFrame(embeddings, columns=embedding_columns)

df_with_embeddings = pd.concat([df.reset_index(drop=True), embedding_df], axis=1)
df_with_embeddings.head()




📌 6. Save Feature-Enhanced Dataset

df_with_embeddings.to_parquet('processed/ai_reasoning_features.parquet', compression='gzip')
print("✅ Saved feature-enhanced dataset")




📌 7. Automate Feature Extraction for All Datasets

dataset_files = {
    "AI_Reasoning": "processed/ai_reasoning.parquet",
    # Extend to all datasets
}

def process_dataset_features(name, path, text_column='question'):
    df = pd.read_parquet(path)
    embeddings = model.encode(df[text_column].fillna(''))
    embedding_columns = [f'embedding_{i}' for i in range(embeddings.shape[1])]
    embedding_df = pd.DataFrame(embeddings, columns=embedding_columns)
    df_feat = pd.concat([df.reset_index(drop=True), embedding_df], axis=1)
    df_feat.to_parquet(f'processed/{name}_features.parquet', compression='gzip')
    print(f"✅ Processed features for {name}")

for name, path in dataset_files.items():
    process_dataset_features(name, path)

SyntaxError: invalid character '📌' (U+1F4CC) (2518880055.py, line 11)