## Import thư viện


In [55]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

## Load và tiền xử lý dữ liệu House Prices

In [56]:
def preprocess_house_prices_data(train_path, test_path):
    """Load the train/test CSV files and split off the target and Id columns.

    Args:
        train_path: Path of the training CSV; it must contain a
            ``SalePrice`` column.
        test_path: Path of the test CSV.

    Returns:
        A tuple ``(X_train, X_test, y, train_ids, test_ids)`` where ``y`` is
        the ``SalePrice`` column of the training file, the two feature frames
        have ``SalePrice`` / ``Id`` removed, and each ``*_ids`` entry is a
        copy of the ``Id`` column, or ``None`` when that file has no ``Id``.

    Raises:
        ValueError: If the training file has no ``SalePrice`` column.
    """
    train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

    print("Original data shapes:")
    print(f"Train: {train_df.shape}, Test: {test_df.shape}")
    print(f"Train columns: {train_df.columns.tolist()[:10]}...")

    # Guard clause: nothing below makes sense without the target column.
    if 'SalePrice' not in train_df.columns:
        raise ValueError("SalePrice column not found in training data")

    def split_off_id(frame):
        # Return (frame without 'Id', copy of the 'Id' column or None).
        if 'Id' not in frame.columns:
            return frame, None
        return frame.drop('Id', axis=1), frame['Id'].copy()

    y = train_df['SalePrice']
    X_train, train_ids = split_off_id(train_df.drop('SalePrice', axis=1))
    X_test, test_ids = split_off_id(test_df.copy())

    print("\nSau khi tách target:")
    print(f"X_train: {X_train.shape}, X_test: {X_test.shape}, y: {y.shape}")
    return X_train, X_test, y, train_ids, test_ids

## Thực hiện feature engineering

In [57]:
def feature_engineering(X_train, X_test):
    """Add derived House Prices features to copies of the train/test frames.

    A derived column is created only when every source column it needs is
    present in BOTH frames, so the two returned frames always gain the same
    set of new columns and a column missing from ``X_test`` no longer raises
    ``KeyError``. The input frames themselves are left untouched.

    Derived columns (each is optional, depending on available sources):
        TotalSF       TotalBsmtSF + 1stFlrSF + 2ndFlrSF
        HouseAge      YrSold - YearBuilt
        RemodelAge    YrSold - YearRemodAdd
        IsRemodeled   1 when YearRemodAdd != YearBuilt, else 0
        TotalBath     full baths + 0.5 * half baths (above grade and basement);
                      needs at least two of the four bath columns
        TotalPorchSF  row-wise sum of the available deck/porch area columns
        OverallScore  OverallQual * OverallCond

    Args:
        X_train: Training feature DataFrame.
        X_test: Test feature DataFrame.

    Returns:
        ``(X_train, X_test)`` as new DataFrames with the derived columns added.
    """
    # Work on copies so the caller's frames are not modified as a side effect.
    X_train = X_train.copy()
    X_test = X_test.copy()
    frames = (X_train, X_test)

    def in_both(cols):
        # True when every column in `cols` exists in the train AND test frame.
        return all(col in frame.columns for frame in frames for col in cols)

    # Total floor area. Plain `+` propagates NaN, so a row with a missing
    # component gets a missing TotalSF.
    if in_both(['TotalBsmtSF', '1stFlrSF', '2ndFlrSF']):
        for frame in frames:
            frame['TotalSF'] = frame['TotalBsmtSF'] + frame['1stFlrSF'] + frame['2ndFlrSF']
        print(" Added TotalSF feature")

    # Age of the house at sale time.
    if in_both(['YrSold', 'YearBuilt']):
        for frame in frames:
            frame['HouseAge'] = frame['YrSold'] - frame['YearBuilt']
        print(" Added HouseAge feature")

    # Years between the last remodel and the sale.
    if in_both(['YrSold', 'YearRemodAdd']):
        for frame in frames:
            frame['RemodelAge'] = frame['YrSold'] - frame['YearRemodAdd']
        print(" Added RemodelAge feature")

    # Flag: remodel year differs from the build year.
    if in_both(['YearRemodAdd', 'YearBuilt']):
        for frame in frames:
            frame['IsRemodeled'] = (frame['YearRemodAdd'] != frame['YearBuilt']).astype(int)
        print(" Added IsRemodeled feature")

    # Total bathrooms: half baths count as 0.5. Only columns shared by both
    # frames contribute, so train and test are computed the same way.
    bath_weights = {'FullBath': 1.0, 'HalfBath': 0.5, 'BsmtFullBath': 1.0, 'BsmtHalfBath': 0.5}
    shared_bath = [col for col in bath_weights if in_both([col])]
    if len(shared_bath) >= 2:
        for frame in frames:
            frame['TotalBath'] = sum(bath_weights[col] * frame[col] for col in shared_bath)
        print(" Added TotalBath feature")

    # Total porch/deck area. `.sum(axis=1)` skips NaN, unlike the `+` sums above.
    porch_cols = ['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']
    shared_porch = [col for col in porch_cols if in_both([col])]
    if shared_porch:
        for frame in frames:
            frame['TotalPorchSF'] = frame[shared_porch].sum(axis=1)
        print(" Added TotalPorchSF feature")

    # Combined quality/condition score.
    if in_both(['OverallQual', 'OverallCond']):
        for frame in frames:
            frame['OverallScore'] = frame['OverallQual'] * frame['OverallCond']
        print(" Added OverallScore feature")

    return X_train, X_test

## Xử lý missing values

In [58]:
def handle_missing_values(X_train, X_test):
    """Impute missing values in the train/test frames, in place.

    Columns are classified by their dtype in ``X_train``: numeric columns are
    imputed with the median and ``object`` columns with the most frequent
    value. Each imputer is fitted on ``X_train`` only and then applied to
    ``X_test``.

    Args:
        X_train: Training feature DataFrame (modified in place).
        X_test: Test feature DataFrame (modified in place).

    Returns:
        ``(X_train, X_test, numeric_cols, categorical_cols)`` where the last
        two items are the lists of column names handled by each strategy.
    """
    print("\n=== HANDLING MISSING VALUES ===")

    # Split the columns by dtype, as seen in the training frame.
    numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

    print(f"Numeric columns: {len(numeric_cols)}")
    print(f"Categorical columns: {len(categorical_cols)}")

    # Report the number of missing cells before imputing.
    print("\nMissing values trước khi xử lý:")
    for label, frame in (("X_train", X_train), ("X_test", X_test)):
        print(f"{label}: {frame.isnull().sum().sum()}")

    # One pass per column group: (columns, imputation strategy, status message).
    imputation_plan = (
        (numeric_cols, 'median', "✓ Imputed numeric columns"),
        (categorical_cols, 'most_frequent', "✓ Imputed categorical columns"),
    )
    for cols, strategy, done_message in imputation_plan:
        if not cols:
            continue
        imputer = SimpleImputer(strategy=strategy)
        X_train[cols] = imputer.fit_transform(X_train[cols])
        X_test[cols] = imputer.transform(X_test[cols])
        print(done_message)

    return X_train, X_test, numeric_cols, categorical_cols

## Encoding categorical variables

In [59]:
def encode_categorical_features(X_train, X_test, categorical_cols):
    """Label-encode the given categorical columns of both frames, in place.

    For every column a separate ``LabelEncoder`` is fitted on the
    concatenation of the train and test values, so both frames share one
    mapping. Progress is printed for the first five columns, followed by a
    total when more than five columns were encoded.

    Args:
        X_train: Training feature DataFrame (modified in place).
        X_test: Test feature DataFrame (modified in place).
        categorical_cols: Names of the columns to encode; when empty the
            frames are returned unchanged.

    Returns:
        ``(X_train, X_test)``.
    """
    if not categorical_cols:
        return X_train, X_test

    preview_limit = 5  # only the first few columns are reported individually
    fitted = {}
    for name in categorical_cols:
        encoder = LabelEncoder()
        # Fit on train + test together so both frames use the same codes.
        encoder.fit(pd.concat([X_train[name], X_test[name]], axis=0))
        X_train[name] = encoder.transform(X_train[name])
        X_test[name] = encoder.transform(X_test[name])
        fitted[name] = encoder
        if len(fitted) <= preview_limit:
            print(f" Encoded {name}: {len(encoder.classes_)} categories")

    if len(fitted) > preview_limit:
        print(f" Encoded {len(fitted)} categorical columns total")

    return X_train, X_test

## Chuẩn hóa numerical features


In [60]:
def scale_features(X_train, X_test):
    """Standardize every numeric column of both frames, in place.

    The columns are chosen by their dtype in ``X_train``. A single
    ``StandardScaler`` is fitted on the training frame only and then used to
    transform both frames.

    Args:
        X_train: Training feature DataFrame (modified in place).
        X_test: Test feature DataFrame (modified in place).

    Returns:
        ``(X_train, X_test, scaler)`` where ``scaler`` is the fitted
        ``StandardScaler``.
    """
    scaler = StandardScaler()
    scaled_names = X_train.select_dtypes(include=[np.number]).columns.tolist()

    # Fit on the training data only, then apply the same transform to both frames.
    scaler.fit(X_train[scaled_names])
    for frame in (X_train, X_test):
        frame[scaled_names] = scaler.transform(frame[scaled_names])

    print(f" Scaled {len(scaled_names)} numerical features")

    return X_train, X_test, scaler

## Lưu dữ liệu đã xử lý

In [61]:
def save_processed_data(X_train, X_test, y, output_dir='../exps/feature_Labels/'):
    """Write the processed features and target to CSV files in ``output_dir``.

    The directory is created when it does not exist. Three files are written
    without the index: ``ex2_train.csv``, ``ex2_test.csv`` and
    ``y_train.csv``. A short summary with each file's shape is printed
    afterwards.

    Args:
        X_train: Processed training features.
        X_test: Processed test features.
        y: Training target; wrapped in a DataFrame before it is saved.
        output_dir: Destination directory.
    """
    import os

    os.makedirs(output_dir, exist_ok=True)

    # The target is saved as a one-off DataFrame.
    y_df = pd.DataFrame(y)

    outputs = (
        ('ex2_train.csv', X_train),
        ('ex2_test.csv', X_test),
        ('y_train.csv', y_df),
    )
    for file_name, frame in outputs:
        frame.to_csv(os.path.join(output_dir, file_name), index=False)

    print("\n=== SAVING PROCESSED DATA ===")
    for file_name, frame in outputs:
        print(f" {file_name}: {frame.shape}")
    print(f" Saved to: {output_dir}")

## Thực thi tiền xử lý dữ liệu

In [62]:
# Input locations of the raw train/test CSV files.
train_path, test_path = '../data/train.csv', '../data/test.csv'

# Run the whole preprocessing pipeline; any failure is reported with a
# traceback instead of propagating out of the cell.
try:
    print("BẮT ĐẦU TIỀN XỬ LÝ DỮ LIỆU", "=" * 50, sep="\n")

    # Step 1: load the data and split off the target / Id columns.
    X_train, X_test, y, train_ids, test_ids = preprocess_house_prices_data(
        train_path, test_path
    )

    # Step 2: add the derived features.
    X_train, X_test = feature_engineering(X_train, X_test)

    # Step 3: impute missing values.
    X_train, X_test, numeric_cols, categorical_cols = handle_missing_values(
        X_train, X_test
    )

    # Step 4: encode the categorical columns.
    X_train, X_test = encode_categorical_features(X_train, X_test, categorical_cols)

    # Step 5: scale the features.
    X_train, X_test, scaler = scale_features(X_train, X_test)

    # Step 6: persist the processed data.
    save_processed_data(X_train, X_test, y)

    print()
    print("=" * 50)
    print("TIỀN XỬ LÝ HOÀN TẤT!")
    print(f"Final shapes - X_train: {X_train.shape}, X_test: {X_test.shape}")
    print("Đã sẵn sàng cho modeling!")

except Exception as exc:
    import traceback

    print(f"Lỗi trong quá trình tiền xử lý: {exc}")
    traceback.print_exc()

BẮT ĐẦU TIỀN XỬ LÝ DỮ LIỆU
Original data shapes:
Train: (1460, 81), Test: (1459, 80)
Train columns: ['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities']...

Sau khi tách target:
X_train: (1460, 79), X_test: (1459, 79), y: (1460,)
 Added TotalSF feature
 Added HouseAge feature
 Added RemodelAge feature
 Added IsRemodeled feature
 Added TotalBath feature
 Added TotalPorchSF feature
 Added OverallScore feature

=== HANDLING MISSING VALUES ===
Numeric columns: 43
Categorical columns: 43

Missing values trước khi xử lý:
X_train: 7829
X_test: 7881
✓ Imputed numeric columns
✓ Imputed categorical columns
 Encoded MSZoning: 5 categories
 Encoded Street: 2 categories
 Encoded Alley: 2 categories
 Encoded LotShape: 4 categories
 Encoded LandContour: 4 categories
 Encoded 43 categorical columns total
 Scaled 86 numerical features

=== SAVING PROCESSED DATA ===
 ex2_train.csv: (1460, 86)
 ex2_test.csv: (1459, 86)
 y_train.csv: (1460, 