In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder


In [None]:
# Goal: create datasets that are compatible with three gradient-boosting models (XGBoost, LightGBM, CatBoost)
"""
X_xgb: Dense, Target Encoded, Statistical Text Features.
X_lgb: Integer Encoded, SVD Text Features, Aggregate Features.
X_cat: Raw Categoricals, Raw Text.
"""

In [None]:
def create_datasets(original_df, text_col='Description', n_svd_components=10,
                    tfidf_max_features=1000):
    """Build three model-specific feature frames from one raw frame.

    The input is copied, so ``original_df`` is never modified. All three
    returned frames share the index of ``original_df`` and are therefore
    row-aligned with each other and with the caller's target column.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Raw data. Must contain ``text_col``, 'Name', 'Breed2', 'Color1',
        'Color2' and 'Color3'; the categorical columns listed in
        ``cat_columns`` below are processed only when present.
    text_col : str, default 'Description'
        Name of the free-text column used for the length feature and for
        the TF-IDF + SVD features.
    n_svd_components : int, default 10
        Number of TruncatedSVD components (``svd_0`` ... ``svd_{k-1}``) added
        to the LightGBM frame.
    tfidf_max_features : int, default 1000
        Vocabulary size cap passed to ``TfidfVectorizer``.

    Returns
    -------
    (X_xgb, X_lgb, X_cat) : tuple of pandas.DataFrame
        X_xgb : categoricals label-encoded to integers, raw text dropped.
        X_lgb : categoricals as pandas ``category`` dtype, plus SVD text
                components, raw text dropped.
        X_cat : categoricals as strings with missing values replaced by
                "Missing"; ``text_col`` is kept as raw text.

    Raises
    ------
    KeyError
        If one of the required columns named above is absent.

    Notes
    -----
    Encoders, TF-IDF and SVD are fitted on the frame that is passed in, so
    train and test must be processed together (or the fitting must be
    handled by the caller) to get consistent encodings.
    """
    df = original_df.copy()

    # --- COMMON PREPROCESSING ---
    # Fill N/As for text so the vectorizer and .str accessors see strings only.
    df[text_col] = df[text_col].fillna('none')

    # Basic feature engineering shared by all three datasets.
    # These run BEFORE the string cast below because they compare against the
    # numeric sentinel 0.
    df['name_length'] = df['Name'].str.len().fillna(0)
    df['desc_length'] = df[text_col].str.len().fillna(0)
    df['is_mixed_breed'] = (df['Breed2'] != 0).astype(int)
    df['num_colors'] = (df[['Color1', 'Color2', 'Color3']] != 0).sum(axis=1)

    cat_columns = ['Type', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2', 'Color3',
                   'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
                   'Sterilized', 'Health', 'State', 'RescuerID']
    present_cats = [c for c in cat_columns if c in df.columns]

    # Remember where values were missing before the string cast: casting can
    # turn NaN into the literal text 'nan', after which fillna() no longer
    # finds anything to fill.
    missing_masks = {c: df[c].isna().to_numpy() for c in present_cats}

    # Convert all categoricals to string first (safer for encoding).
    for c in present_cats:
        df[c] = df[c].astype(str)

    # Columns that must never reach a model as features.
    non_feature_cols = ['Name', 'PetID', 'AdoptionSpeed']
    # Raw text is dropped for XGBoost/LightGBM. 'Description' stays in the
    # list for backward compatibility when a different text_col is used;
    # dict.fromkeys de-duplicates while preserving order.
    dense_drop_cols = list(dict.fromkeys(non_feature_cols + ['Description', text_col]))

    # -------------------------------------
    # 1. XGBOOST DATASET (Dense, Numerical)
    # -------------------------------------
    X_xgb = df.copy()

    # Label-encode categoricals for XGBoost.
    # (Note: for production, fit the encoder on train and transform test.
    #  Here we assume df is the full dataset or this is handled before split.)
    for c in present_cats:
        X_xgb[c] = LabelEncoder().fit_transform(X_xgb[c])

    X_xgb = X_xgb.drop(columns=dense_drop_cols, errors='ignore')

    # ---------------------------------------------
    # 2. LIGHTGBM DATASET (Integer Cats + SVD Text)
    # ---------------------------------------------
    X_lgb = df.copy()

    # Pandas category dtype lets LightGBM treat these columns as categorical.
    for c in present_cats:
        X_lgb[c] = X_lgb[c].astype('category')

    # TF-IDF + SVD for the text column.
    tfidf = TfidfVectorizer(max_features=tfidf_max_features, stop_words='english')
    svd = TruncatedSVD(n_components=n_svd_components, random_state=42)

    text_vectors = tfidf.fit_transform(df[text_col])
    svd_vectors = svd.fit_transform(text_vectors)

    # Give the SVD frame the same index as X_lgb so the concat is a pure
    # column append and the caller's index is preserved (no reset_index).
    svd_df = pd.DataFrame(
        svd_vectors,
        columns=[f'svd_{i}' for i in range(svd_vectors.shape[1])],
        index=X_lgb.index,
    )
    X_lgb = pd.concat([X_lgb, svd_df], axis=1)

    X_lgb = X_lgb.drop(columns=dense_drop_cols, errors='ignore')

    # ----------------------------------------
    # 3. CATBOOST DATASET (Raw Text & Strings)
    # ----------------------------------------
    X_cat = df.copy()

    # CatBoost wants raw strings for categorical features; mark originally
    # missing values as "Missing" using the masks captured before the cast.
    for c in present_cats:
        X_cat.loc[missing_masks[c], c] = "Missing"

    # Keep the text column: CatBoost can consume it via text_features.
    X_cat = X_cat.drop(columns=non_feature_cols, errors='ignore')

    return X_xgb, X_lgb, X_cat

# Usage Example:
# 1. Combine Train/Test temporarily for consistent encoding (or fit/transform separately carefully)
# all_data = pd.concat([pet_df, test_df], sort=False).reset_index(drop=True)
# X_xgb_all, X_lgb_all, X_cat_all = create_datasets(all_data)

# 2. Split back logic (simplified)
# train_len = len(pet_df)
# X_train_xgb = X_xgb_all[:train_len]
# X_test_xgb = X_xgb_all[train_len:]
# Same for lgb and cat...