## Predicting house sale prices with the Kaggle House Prices (Advanced Regression Techniques) dataset. We will explore the data, preprocess it, split it into training and validation sets, and apply machine learning models (ElasticNet, Lasso, Random Forest, XGBoost) to determine which model fits the data best.


In [None]:
# Import libraries
import pandas as pd
import numpy as np
from scipy.stats import skew
import seaborn as sns
import matplotlib.pyplot as plt
from typing import Dict, List, Tuple
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import time
from sklearn.preprocessing import LabelEncoder


### 1. Read data description and summary



In [None]:
# Print the dataset's data dictionary so the meaning of each column is at hand.
description_path = "/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt"
with open(description_path, "r") as description_file:
    content = description_file.read()
print(content)

### 2. Load the dataset and explore the data



In [None]:
# Load the three competition CSV files from the Kaggle input directory.
DATA_DIR = "/kaggle/input/house-prices-advanced-regression-techniques"
train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")
sample_df = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [None]:
# Attach the 'SalePrice' column of the sample submission to the raw test rows.
# NOTE(review): sample_submission.csv usually carries placeholder/baseline
# prices rather than real sale prices — confirm before treating the merged
# 'SalePrice' column as ground truth.

# Explicit raises instead of `assert`: assertions are stripped under `python -O`.
if 'Id' not in test_df.columns:
    raise KeyError("❌ test_df no column 'Id'")
if 'Id' not in sample_df.columns:
    raise KeyError("❌ sample_df no column 'Id'")

# validate='one_to_one' makes pandas raise if an Id is duplicated on either
# side; a silent duplicate would add rows and break any later positional
# assignment of per-row predictions onto this frame.
test_full_df = test_df.merge(
    sample_df[['Id', 'SalePrice']],
    on='Id',
    how='left',
    validate='one_to_one',
)

print(f"shape after concat: {test_full_df.shape}")

test_full_df.head()


In [None]:
# Report (rows, columns) of the raw train and test frames.
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

In [None]:
# Remove pandas' cap on displayed columns so wide frames show every column,
# then display the last rows of the training frame.
pd.set_option('display.max_columns', None)
train_df.tail()

In [None]:
# Column dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Column dtypes and non-null counts of the test frame.
test_df.info()

#### `Id` is an identifier variable, so convert it to object


In [None]:
# Id only labels rows; storing it as object keeps it out of the numeric columns.
for frame in (train_df, test_df):
    frame['Id'] = frame['Id'].astype('object')

#### `MSSubClass`: statistically it is a categorical variable, so convert it to object.

In [None]:
# MSSubClass holds class codes, not quantities: treat it as categorical in both frames.
for frame in (train_df, test_df):
    frame['MSSubClass'] = frame['MSSubClass'].astype('object')

#### Handle missing values

In [None]:
def safe_imputer_sync(df_to_fit: pd.DataFrame, df_to_transform: pd.DataFrame) -> pd.DataFrame:
    """
    Impute missing values in df_to_transform using parameters learned from df_to_fit.

    Columns that are numeric in df_to_fit are filled with the median computed on
    df_to_fit; columns that are object/category in df_to_fit are filled with the
    constant string "None". Only columns present in both frames that actually
    contain missing values in df_to_transform are touched. Neither input frame
    is modified.

    Args:
        df_to_fit (pd.DataFrame): The DataFrame used to learn the medians and the
            column typing (TRAIN).
        df_to_transform (pd.DataFrame): The DataFrame that needs imputation
            (TRAIN itself or TEST).

    Returns:
        pd.DataFrame: An imputed copy of df_to_transform.
    """

    df_transformed = df_to_transform.copy()

    # 1. HANDLE NUMERIC COLUMNS - USE MEDIAN
    # The set of numeric columns is decided by the fit set, not the transform set.
    numeric_cols = df_to_fit.select_dtypes(include=np.number).columns

    for col in numeric_cols:
        if col not in df_transformed.columns or not df_transformed[col].isnull().any():
            continue

        # Learn the parameter (Median) ONLY from the FIT set
        median_value = df_to_fit[col].median()
        if pd.isna(median_value):
            # The fit column has no observed value, so there is nothing to fill
            # with; say so instead of reporting a successful fill with NaN.
            print(f"⚠️ Skipped '{col}': no Median available in TRAIN.")
            continue

        df_transformed[col] = df_transformed[col].fillna(median_value)
        print(f"✅ Filled '{col}' with Median from TRAIN ({median_value:.2f}).")

    # 2. HANDLE CATEGORICAL COLUMNS - USE "NONE"
    categorical_cols = df_to_fit.select_dtypes(include=['object', 'category']).columns

    for col in categorical_cols:
        if col not in df_transformed.columns or not df_transformed[col].isnull().any():
            continue

        column = df_transformed[col]
        # A pandas categorical rejects fill values outside its categories, so
        # register "None" first when the column is of category dtype.
        if isinstance(column.dtype, pd.CategoricalDtype) and "None" not in column.cat.categories:
            column = column.cat.add_categories("None")

        df_transformed[col] = column.fillna("None")
        print(f"✅ Filled '{col}' with 'None'.")

    print("\n--- SYNCHRONIZED IMPUTATION SUMMARY ---")
    print("✅ Missing Data filled successfully.")
    print(f"Total Missing Values remaining: {df_transformed.isnull().sum().sum()}")
    return df_transformed

# --- Impute both splits with statistics taken from TRAIN only ---

# TRAIN is both the source of the medians and the frame being filled.
train_df_imputed = safe_imputer_sync(train_df, train_df)

# TEST is filled with medians read from the already-imputed TRAIN frame.
test_df_imputed = safe_imputer_sync(train_df_imputed, test_df)

### House Price Distribution
##### Now let us take a look at how the house prices are distributed.

In [None]:
# Summary statistics and distribution of the target variable.
print(train_df['SalePrice'].describe())
plt.figure(figsize=(9, 8))
# sns.distplot is deprecated and scheduled for removal from seaborn; histplot
# with a KDE overlay and density normalisation draws the equivalent figure.
sns.histplot(train_df['SalePrice'], color='g', bins=100, kde=True, stat='density', alpha=0.4);

In [None]:
# Re-inspect train_df. The imputer works on a copy stored in train_df_imputed,
# so train_df itself still shows its original missing values here.
train_df.info()

### Handle ordinal values

In [None]:
import pandas as pd
from typing import Dict, List

def apply_complete_ordinal_encoding(df: pd.DataFrame) -> pd.DataFrame:
    """
    Encode the known quality/rank columns as integers that preserve their order.

    Each listed column is mapped through a fixed label -> rank dictionary.
    Labels that are not in the dictionary (and any remaining missing values)
    become 0; unknown labels are reported so the fallback is not silent.
    Columns that are already numeric are left untouched, which makes the
    function safe to apply twice (for example when a notebook cell is re-run).
    Listed columns that are absent from the frame are skipped with a message.

    Assumption: missing values (NA) in these columns have already been imputed
    with the string 'None'.

    Args:
        df (pd.DataFrame): The input DataFrame. It is not modified.

    Returns:
        pd.DataFrame: A copy of df with the ordinal columns encoded to integers.
    """

    df_encoded = df.copy()

    # 1. MAPPING DEFINITIONS (the numerical rank for each category)

    # Quality/Condition Scale (6 levels: None/Poorest -> Excellent)
    # Used for general quality ratings (e.g., ExterQual, BsmtQual)
    qual_cond_mapping_6 = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}

    # LotShape (Irregular -> Regular)
    lot_shape_mapping = {'IR3': 1, 'IR2': 2, 'IR1': 3, 'Reg': 4, 'None': 0}

    # Utilities (Worst -> Best)
    utilities_mapping = {'ELO': 1, 'NoSeWa': 2, 'NoSewr': 3, 'AllPub': 4, 'None': 0}

    # LandSlope (Severe -> Gentle)
    land_slope_mapping = {'Sev': 1, 'Mod': 2, 'Gtl': 3, 'None': 0}

    # BsmtExposure (No Exposure -> Good Exposure)
    bsmt_exposure_mapping = {'None': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}

    # BsmtFinType (Unfinished -> Good Living Quarters)
    bsmt_fin_mapping_7 = {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}

    # GarageFinish (Unfinished -> Finished)
    garage_finish_mapping = {'None': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}

    # Functional Rating (Salvage -> Typical)
    # 'None' is listed explicitly so an imputed missing value keeps the rank 0
    # it previously received through the fallback, without being reported as
    # an unknown label.
    # NOTE(review): rank 0 places a missing rating below 'Sal' — confirm this
    # is the intended treatment of missing Functional values.
    functional_mapping = {
        'None': 0,
        'Sal': 1, 'Sev': 2, 'Maj2': 3, 'Maj1': 4,
        'Mod': 5, 'Min2': 6, 'Min1': 7, 'Typ': 8
    }

    # Fence Quality (No Fence -> Good Privacy)
    fence_mapping = {'None': 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4}

    # ----------------------------------------------------------------------
    # 2. COLUMN APPLICATION LIST (Complete 19 Ordinal Features)
    # ----------------------------------------------------------------------

    column_mappings: List[tuple[str, Dict[str, int]]] = [
        # Quality/Condition - 6-Point Scale
        ('ExterQual', qual_cond_mapping_6), ('ExterCond', qual_cond_mapping_6),
        ('HeatingQC', qual_cond_mapping_6), ('KitchenQual', qual_cond_mapping_6),
        ('BsmtQual', qual_cond_mapping_6), ('BsmtCond', qual_cond_mapping_6),
        ('FireplaceQu', qual_cond_mapping_6), ('PoolQC', qual_cond_mapping_6),
        ('GarageQual', qual_cond_mapping_6), ('GarageCond', qual_cond_mapping_6),

        # Specific Mappings
        ('LotShape', lot_shape_mapping),
        ('Utilities', utilities_mapping),
        ('LandSlope', land_slope_mapping),
        ('BsmtExposure', bsmt_exposure_mapping),
        ('BsmtFinType1', bsmt_fin_mapping_7),
        ('BsmtFinType2', bsmt_fin_mapping_7),
        ('GarageFinish', garage_finish_mapping),
        ('Functional', functional_mapping),

        # Added Fence
        ('Fence', fence_mapping),
    ]

    print("--- STARTING CUSTOM ORDINAL ENCODING ---")

    for col, mapping in column_mappings:
        if col not in df_encoded.columns:
            print(f"⚠️ Column '{col}' not found. Skipping.")
            continue

        column = df_encoded[col]

        # Mapping integers through a string-keyed dict matches nothing, so a
        # second pass would overwrite the whole column with 0. Leave columns
        # that are already numeric alone.
        if pd.api.types.is_numeric_dtype(column):
            print(f"⚠️ Column '{col}' is already numeric. Skipping.")
            continue

        # Cast to object first so a category-typed column maps like plain labels.
        mapped = column.astype(object).map(mapping)

        # Labels the mapping does not know still fall back to 0, but are
        # reported so typos or new categories do not pass unnoticed.
        unknown_labels = column[mapped.isna() & column.notna()].unique()
        if len(unknown_labels) > 0:
            print(f"⚠️ '{col}': unmapped values {sorted(map(str, unknown_labels))} encoded as 0.")

        df_encoded[col] = mapped.fillna(0).astype(int)
        print(f"✅ Encoded: {col}")

    # 3. OverallQual and OverallCond are already numerical (int) and are left alone.
    print("--- ORDINAL ENCODING COMPLETE ---")

    return df_encoded

# Apply the same fixed ordinal mappings to the imputed train and test frames.
train_df_ordinal_encoded = apply_complete_ordinal_encoding(train_df_imputed)
test_df_ordinal_encoded = apply_complete_ordinal_encoding(test_df_imputed)

In [None]:
# Check dtypes after ordinal encoding: the mapped columns should now be integers.
train_df_ordinal_encoded.info()

In [None]:

from typing import Optional


def apply_nominal_label_encoding_after_ordinal_filtered(
    df: pd.DataFrame,
    reference_df: Optional[pd.DataFrame] = None,
) -> pd.DataFrame:
    """
    Integer-encode the remaining 'object' (nominal) columns of df.

    Every object column is converted to string and each distinct label is
    replaced by its position in the sorted label vocabulary (the same numbering
    a LabelEncoder produces). Ordinal columns that were already converted to
    numbers are skipped implicitly, because only 'object' columns are selected.

    The vocabulary must be shared between TRAIN and TEST: building it separately
    per frame gives the same category different codes whenever the two frames do
    not contain exactly the same labels. Pass the TRAIN frame as reference_df
    when encoding TEST so both frames use one numbering.

    Args:
        df (pd.DataFrame): The DataFrame after Ordinal Encoding. Not modified.
        reference_df (pd.DataFrame, optional): Frame whose labels define the
            vocabulary of each column. When omitted, or for a column it does
            not contain, the vocabulary is taken from df itself. Labels of df
            that are absent from the reference vocabulary are encoded as -1.

    Returns:
        pd.DataFrame: A copy of df in which the object columns are replaced by
                      integer codes.
    """

    df_encoded = df.copy()

    # Only columns still typed 'object' are nominal; ordinal columns have
    # already been converted to int and are therefore not selected.
    nominal_cols_to_label = df_encoded.select_dtypes(include=['object']).columns.tolist()

    if not nominal_cols_to_label:
        print("⚠️ No Nominal columns found that require Label Encoding.")
        return df_encoded

    print("🔧 --- STARTING LABEL ENCODING (NOMINAL FEATURES) ---")
    print(f"📊 Columns to be encoded: {nominal_cols_to_label}")

    for col in nominal_cols_to_label:
        # Convert to string explicitly to handle mixed types safely before encoding
        labels = df_encoded[col].astype(str)

        if reference_df is not None and col in reference_df.columns:
            vocabulary_source = reference_df[col].astype(str)
        else:
            vocabulary_source = labels

        # Sorted unique labels -> codes 0..n-1; labels outside the vocabulary
        # receive the pandas sentinel code -1.
        vocabulary = sorted(vocabulary_source.unique())
        codes = pd.Categorical(labels, categories=vocabulary).codes

        unseen_count = int((codes == -1).sum())
        if unseen_count:
            print(f"⚠️ '{col}': {unseen_count} value(s) not present in the reference data encoded as -1.")

        df_encoded[col] = codes.astype('int64')

    print("✅ --- LABEL ENCODING COMPLETE ---")
    print(f"📈 Total columns after encoding: {df_encoded.shape[1]}")
    print("🎯 DataFrame now contains only numerical data, ready for Scaling/Training.")

    return df_encoded


# TRAIN defines the label vocabulary; TEST is encoded against that same
# vocabulary so identical categories share identical integer codes.
train_df_final_encoded = apply_nominal_label_encoding_after_ordinal_filtered(train_df_ordinal_encoded)
test_df_final_encoded = apply_nominal_label_encoding_after_ordinal_filtered(
    test_df_ordinal_encoded,
    reference_df=train_df_ordinal_encoded,
)

In [None]:
# Check dtypes after label encoding: no 'object' columns should remain.
train_df_final_encoded.info()

In [None]:
from sklearn.preprocessing import MinMaxScaler 
import pandas as pd
import numpy as np

from typing import Optional


def apply_final_scaling(
    df_final_encoded: pd.DataFrame,
    target_col: str = 'SalePrice',
    fit_df: Optional[pd.DataFrame] = None,
) -> pd.DataFrame:
    """
    Min-max scale the non-binary numeric feature columns of a fully encoded frame.

    The columns to scale and the per-column minimum/maximum are both learned
    from one frame and then applied to df_final_encoded. Scaling TEST with its
    own minimum/maximum would put the two frames on different scales, so pass
    the unscaled TRAIN frame as fit_df when scaling TEST.

    Args:
        df_final_encoded (pd.DataFrame): The frame to scale. Not modified.
        target_col (str): Column excluded from scaling; ignored when absent.
        fit_df (pd.DataFrame, optional): Frame the scaler is fitted on. When
            omitted, df_final_encoded is used (fit and transform on the same
            data). With a separate fit_df, scaled values can fall outside
            [0, 1] where df_final_encoded exceeds the range seen in fit_df.

    Returns:
        pd.DataFrame: A copy of df_final_encoded with the selected columns scaled.
    """

    df_scaled = df_final_encoded.copy()
    reference = df_scaled if fit_df is None else fit_df

    numeric_cols = reference.select_dtypes(include=np.number).columns

    cols_to_check = numeric_cols.drop(target_col, errors='ignore').tolist()

    # Columns with at most two distinct values (binary flags) are left as they
    # are; the decision is taken on the reference frame so TRAIN and TEST
    # always scale the same set of columns.
    final_cols_to_scale = [
        col for col in cols_to_check
        if col in df_scaled.columns and reference[col].nunique() > 2
    ]

    scaler = MinMaxScaler()

    # Guard the empty case: the scaler cannot be fitted on zero columns.
    if final_cols_to_scale:
        scaler.fit(reference[final_cols_to_scale])
        df_scaled[final_cols_to_scale] = scaler.transform(df_scaled[final_cols_to_scale])

    print("--- SCALING Done ---")
    print("✅ Use MinMaxScaler.")
    print(f"✅ Scaling has been applied to {len(final_cols_to_scale)} column (Ordinal và Continuous).")

    return df_scaled


# The scaler is fitted on TRAIN only and the same transformation is applied to TEST.
train_df_final_scaled = apply_final_scaling(train_df_final_encoded, target_col='SalePrice')
test_df_final_scaled = apply_final_scaling(
    test_df_final_encoded,
    target_col='SalePrice',
    fit_df=train_df_final_encoded,
)

In [None]:
# Train on log1p(SalePrice); predictions are mapped back with np.expm1 in the
# training cell. Re-running this cell applies the transform a second time.
train_df_final_scaled['SalePrice']= np.log1p(train_df_final_scaled['SalePrice'])


#### Split train and validation data

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from typing import Tuple
def split_train_validation(df_scaled: pd.DataFrame, target_col: str = 'SalePrice', test_size: float = 0.2, random_state: int = 42) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """
    Separate the target column from the features and split both into a
    training part and a validation part.

    Args:
        df_scaled (pd.DataFrame): Frame holding the features and the target.
        target_col (str): Name of the target column.
        test_size (float): Fraction of rows assigned to the validation part.
        random_state (int): Seed forwarded to train_test_split.

    Returns:
        Tuple: (X_train, X_val, y_train, y_val).
    """
    features = df_scaled.drop(columns=[target_col])
    target = df_scaled[target_col]

    pieces = train_test_split(features, target, test_size=test_size, random_state=random_state)
    X_train, X_val, y_train, y_val = pieces

    print("--- DATA SET SPLIT COMPLETE ---")
    print(f"Shape train data (X_train): {X_train.shape}")
    print(f"Shape validation data(X_val): {X_val.shape}")
    print(f"split ratio: {1 - test_size}:{test_size} (Training: Assessment)")

    return X_train, X_val, y_train, y_val


# 80/20 split of the scaled training frame.
X_train, X_val, y_train, y_val = split_train_validation(train_df_final_scaled, test_size=0.2)

#### Train and evaluate model

In [None]:

# ElasticNet and Lasso are used below but are not imported by the notebook's
# import cell; import them here so this cell runs on a fresh kernel.
from sklearn.linear_model import ElasticNet, Lasso


def _score_on_validation(model, X_val, y_val):
    """Score a fitted model on the validation split.

    Returns (rmse_log, r2, rmse_real): RMSE and R² computed on the log1p
    target, and RMSE after mapping truth and prediction back with expm1.
    """
    y_pred_val_log = model.predict(X_val)

    rmse_log = np.sqrt(mean_squared_error(y_val, y_pred_val_log))
    r2_val = r2_score(y_val, y_pred_val_log)

    # Inverse transform to original currency unit
    rmse_real = np.sqrt(mean_squared_error(np.expm1(y_val), np.expm1(y_pred_val_log)))
    return rmse_log, r2_val, rmse_real


# 'Id' is a row identifier, not a property of a house. It reaches this cell
# label-encoded like any other object column, so drop it before fitting to
# keep the models from learning noise. errors='ignore' makes a re-run of this
# cell harmless.
X_train = X_train.drop(columns=['Id'], errors='ignore')
X_val = X_val.drop(columns=['Id'], errors='ignore')

# ==========================================
# ⚙️ 1️⃣ MODEL CONFIGURATION
# ==========================================
linear_params = {
    'ElasticNet': {
        'model': ElasticNet(max_iter=5000, random_state=42),
        'param_grid': {'alpha': [0.0001, 0.0005, 0.001], 'l1_ratio': [0.5, 0.7, 0.9]}
    },
    "Lasso_Reg": {
        'model': Lasso(max_iter=5000, random_state=42),
        'param_grid': {'alpha': [0.0003, 0.0005, 0.0008, 0.001]}
    }
}

tree_models = {
    "RandomForest": RandomForestRegressor(n_estimators=1000, max_depth=8, random_state=42, n_jobs=-1),
    "XGBoost": XGBRegressor(n_estimators=2000, learning_rate=0.01, max_depth=3, random_state=42, n_jobs=-1)
}

results = {}

# ==========================================
# 🧮 2️⃣ TRAINING ON LOG(SALEPRICE)
# ==========================================
print("\n--- 🔁 MODEL TRAINING & HYPERPARAMETER TUNING ---")

for name, config in linear_params.items():
    start_time = time.time()
    print(f"\n⚙️ GridSearchCV for {name} ...")

    grid_search = GridSearchCV(
        config['model'],
        config['param_grid'],
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)  # y_train is already log(SalePrice)
    best_model = grid_search.best_estimator_

    rmse_log, r2_val, rmse_real = _score_on_validation(best_model, X_val, y_val)

    results[name] = {
        "RMSE_Log": round(rmse_log, 4),
        "R2_Score": round(r2_val, 4),
        "RMSE_Original($)": round(rmse_real, 2),
        "Best_Alpha": grid_search.best_params_.get('alpha'),
        "Best_L1_Ratio": grid_search.best_params_.get('l1_ratio', 'N/A'),
        "Model": best_model
    }
    print(f"✅ {name} complete. | RMSE_Log: {rmse_log:.4f} | Time: {time.time() - start_time:.2f}s")

# ---------------------------------------------------
# 🌳 3️⃣ TREE-BASED MODELS
# ---------------------------------------------------
for name, model in tree_models.items():
    start_time = time.time()
    print(f"\n🌲 Training {name} ...")

    model.fit(X_train, y_train)

    rmse_log, r2_val, rmse_real = _score_on_validation(model, X_val, y_val)

    results[name] = {
        "RMSE_Log": round(rmse_log, 4),
        "R2_Score": round(r2_val, 4),
        "RMSE_Original($)": round(rmse_real, 2),
        "Model": model
    }
    print(f"✅ {name} complete. | RMSE_Log: {rmse_log:.4f} | Time: {time.time() - start_time:.2f}s")

# ---------------------------------------------------
# 📊 4️⃣ BEST MODEL SELECTION
# ---------------------------------------------------
# Lowest validation RMSE on the log target wins.
results_df = pd.DataFrame(results).T.sort_values(by="RMSE_Log")
print("\n--- 🧾 PERFORMANCE ON VALIDATION ---")
print(results_df.drop(columns=['Model']))

best_model_name = results_df.index[0]
final_model = results_df.loc[best_model_name, 'Model']

# ---------------------------------------------------
# 🧠 5️⃣ RETRAIN ON FULL TRAIN SET
# ---------------------------------------------------
X_full = pd.concat([X_train, X_val], axis=0, ignore_index=True)
y_full = pd.concat([y_train, y_val], axis=0, ignore_index=True)  # Still log(SalePrice)

final_model.fit(X_full, y_full)
print(f"\n✅ Retrained best model ({best_model_name}) on full train data.")

# ---------------------------------------------------
# 🧩 6️⃣ PREPARE FINAL TEST SET
# ---------------------------------------------------
# Select exactly the training features, in training order, from the
# processed and scaled test frame.
X_test_final = test_df_final_scaled[X_full.columns]

# ---------------------------------------------------
# 🔮 7️⃣ PREDICTION & INVERSE TRANSFORM
# ---------------------------------------------------
y_test_pred_log = final_model.predict(X_test_final)
y_test_pred_real = np.expm1(y_test_pred_log)

# test_full_df is the raw test frame merged with the sample submission. The
# predictions are a plain array in test-row order, so they are assigned by
# position.
test_full_df['Predicted_SalePrice'] = y_test_pred_real

# ---------------------------------------------------
# 📈 8️⃣ EVALUATION ON TEST (IF TRUE PRICES EXIST)
# ---------------------------------------------------
# NOTE(review): the 'SalePrice' column of test_full_df comes from the sample
# submission file — confirm it holds real prices before reading these numbers
# as test performance.
if 'SalePrice' in test_full_df.columns and test_full_df['SalePrice'].notna().sum() > 0:
    y_true = test_full_df['SalePrice']
    y_pred = test_full_df['Predicted_SalePrice']

    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100

    print("\n--- 📊 PERFORMANCE ON TEST ---")
    print(f"RMSE : {rmse:,.2f}")
    print(f"MAE  : {mae:,.2f}")
    print(f"MAPE : {mape:.2f}%")
else:
    print("\n⚠️ Evaluation skipped: True SalePrice is not available in the test set.")

# ---------------------------------------------------
# 💾 9️⃣ SUBMISSION FILE
# ---------------------------------------------------
submission = test_full_df[['Id', 'Predicted_SalePrice']].rename(columns={'Predicted_SalePrice': 'SalePrice'})
submission.to_csv("submission.csv", index=False)
print("\n✅ Saved submission.csv")

# ---------------------------------------------------
# 🔍 🔟 SAMPLE RESULTS
# ---------------------------------------------------
print("\n--- SAMPLE PREDICTIONS ---")
print(test_full_df[['Id', 'SalePrice', 'Predicted_SalePrice']].head())

In [None]:

# Scatter of reference price against predicted price with the y = x line.
# The two series are read from test_full_df rather than from y_true / y_pred,
# which are only defined when the evaluation branch of the training cell ran.
plot_columns = ['SalePrice', 'Predicted_SalePrice']
if set(plot_columns).issubset(test_full_df.columns) and test_full_df['SalePrice'].notna().any():
    plot_df = test_full_df[plot_columns].dropna()
    actual_price = plot_df['SalePrice']
    predicted_price = plot_df['Predicted_SalePrice']
    price_range = [actual_price.min(), actual_price.max()]

    plt.figure(figsize=(6,6))
    plt.scatter(actual_price, predicted_price, alpha=0.6)
    # Diagonal reference: points on this line are predicted exactly.
    plt.plot(price_range, price_range, 'r--')
    plt.xlabel("Actual price")
    plt.ylabel("Predict price")
    plt.title("📊 Compare real SalePrice vs prediction (Test set)")
    plt.show()
else:
    print("⚠️ Plot skipped: 'SalePrice' and 'Predicted_SalePrice' are not both available in test_full_df.")
