In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error
import joblib

def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 4) -> float:
    """Return the root mean squared logarithmic error (RMSLE), rounded.

    Args:
        y_test: Ground-truth target values.
        y_pred: Predicted target values.
        precision: Number of decimal places kept in the result.

    Returns:
        The square root of ``mean_squared_log_error(y_test, y_pred)``,
        rounded to ``precision`` decimal places.
    """
    msle = mean_squared_log_error(y_test, y_pred)
    return round(np.sqrt(msle), precision)
    
# Locate the project root: the parent of the directory containing this file
# when __file__ is defined, otherwise (e.g. inside a notebook kernel) the
# parent of the current working directory.
if "__file__" in globals():
    BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
else:
    BASE_DIR = os.path.abspath("..")

DATA_DIR = os.path.join(BASE_DIR, "data")
TRAIN_PATH, TEST_PATH = (
    os.path.join(DATA_DIR, csv_name) for csv_name in ("train.csv", "test.csv")
)

# Load the training data and take a first look at it.
df = pd.read_csv(TRAIN_PATH)
print("Dataset shape:", df.shape)
print(df.head(10))

# Summary statistics of the regression target.
print('\nSalePrice descriptive stats:')
print(df['SalePrice'].describe())

# Features fed to the model, grouped by how they are preprocessed.
CONTINUOUS = ['GrLivArea', 'TotalBsmtSF']
CATEGORICAL = ['Neighborhood', 'HouseStyle']
REQUIRED_COLUMNS = [*CONTINUOUS, *CATEGORICAL, 'SalePrice']

# Fail early if the dataset lacks any column used below.
missing_cols = [name for name in REQUIRED_COLUMNS if name not in df.columns]
if len(missing_cols) > 0:
    raise ValueError(f"Required columns missing from dataset: {missing_cols}. Please pick other features or check dataset.")

# Work on an independent copy restricted to the columns of interest.
df_sub = df[REQUIRED_COLUMNS].copy()

print('\nSubset head:')
print(df_sub.head(10))

# Fill missing values column by column: the median for continuous features,
# the most frequent value for categorical ones. Columns without missing
# values are left untouched.
imputation_plan = (
    (CONTINUOUS, lambda series: series.median()),
    (CATEGORICAL, lambda series: series.mode()[0]),
)
for columns, fill_statistic in imputation_plan:
    for col in columns:
        if df_sub[col].isna().any():
            df_sub[col] = df_sub[col].fillna(fill_statistic(df_sub[col]))

# Separate the features from the regression target.
X = df_sub.drop(columns=['SalePrice'])
y = df_sub['SalePrice'].values

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print('\nTrain shape:', X_train.shape, 'Val shape:', X_val.shape)

# One-hot encode the categorical features. The training split defines the
# encoding: drop_first=True removes one reference level per feature there.
# dtype=float keeps the indicator matrix numeric, so `.values` below never
# degrades to an object array when zero-filled columns are added.
X_train_cat = pd.get_dummies(X_train[CATEGORICAL], drop_first=True, dtype=float)

# Encode the validation rows with ALL of their levels, then project onto the
# training columns. Running get_dummies(drop_first=True) on the validation
# split independently would drop the first level *present in that split*;
# whenever that differs from the training reference level, the column would be
# zero-filled during alignment and those rows would silently be encoded as the
# reference level. Reindexing discards the reference/unseen levels and adds
# all-zero columns for levels absent from the validation split.
X_val_cat = pd.get_dummies(X_val[CATEGORICAL], dtype=float).reindex(
    columns=X_train_cat.columns, fill_value=0.0
)

# Standardise the continuous features using statistics of the training split
# only, and apply the same transformation to the validation split.
scaler = StandardScaler()
X_train_cont = scaler.fit_transform(X_train[CONTINUOUS])
X_val_cont = scaler.transform(X_val[CONTINUOUS])

# Final design matrices: scaled continuous columns followed by the indicators.
X_train_proc = np.hstack([X_train_cont, X_train_cat.values])
X_val_proc = np.hstack([X_val_cont, X_val_cat.values])

print('\nProcessed feature matrix shapes (train, val):', X_train_proc.shape, X_val_proc.shape)

# The model is trained on log1p(SalePrice); predictions are mapped back with expm1.
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)

# Fit a random forest on the log-transformed target.
rf_params = dict(n_estimators=100, random_state=42, n_jobs=-1)
model = RandomForestRegressor(**rf_params)
model.fit(X_train_proc, y_train_log)

# Persist everything needed to reproduce the preprocessing and the predictions:
# the fitted model, the fitted scaler and the one-hot column order.
cat_columns = list(X_train_cat.columns)
os.makedirs('models', exist_ok=True)
for artifact, destination in (
    (model, 'models/rf_log_target.joblib'),
    (scaler, 'models/standard_scaler.joblib'),
    (cat_columns, 'models/cat_columns.joblib'),
):
    joblib.dump(artifact, destination)

print('\nModel trained and saved to models/')

# Predict in log space, map back to the original scale and bound below by zero.
y_val_log_pred = model.predict(X_val_proc)
y_val_pred = np.clip(np.expm1(y_val_log_pred), a_min=0, a_max=None)

# Score the hold-out predictions against the untransformed targets.
rmsle_val = compute_rmsle(y_val, y_val_pred, 4)
print(f"Validation RMSLE: {rmsle_val}")

# Side-by-side look at the first few true and predicted values.
preview_rows = slice(None, 10)
comp = pd.DataFrame({
    'y_true': y_val[preview_rows],
    'y_pred': y_val_pred[preview_rows].round(0),
})
print('\nComparison (first 10 rows of validation set):')
print(comp)

# Score the held-out test file, if present, and write the predictions to
# submission.csv in the current working directory.
if os.path.exists(TEST_PATH):
    df_test = pd.read_csv(TEST_PATH)

    # 'Id' is required to build the submission, so it is checked together with
    # the feature columns instead of failing later with a KeyError.
    test_missing = [c for c in ['Id'] + CONTINUOUS + CATEGORICAL if c not in df_test.columns]
    if test_missing:
        print('Warning: test file is missing columns:', test_missing)
    else:
        X_test = df_test[CONTINUOUS + CATEGORICAL].copy()

        # Impute with statistics of the training split (the data the scaler
        # and the model were fitted on) rather than statistics of the test
        # file itself, so test rows are preprocessed consistently with training.
        for col in CONTINUOUS:
            X_test[col] = X_test[col].fillna(X_train[col].median())
        for col in CATEGORICAL:
            X_test[col] = X_test[col].fillna(X_train[col].mode()[0])

        # Encode every level found in the test file, then project onto the
        # training one-hot columns. Using drop_first=True here would drop the
        # first level present in the *test* data, which is not necessarily the
        # training reference level; such rows would end up encoded as the
        # reference level. Reindexing drops reference/unseen levels and adds
        # all-zero columns for levels the test file does not contain.
        X_test_cat = pd.get_dummies(X_test[CATEGORICAL], dtype=float).reindex(
            columns=cat_columns, fill_value=0.0
        )

        X_test_cont = scaler.transform(X_test[CONTINUOUS])
        X_test_proc = np.hstack([X_test_cont, X_test_cat.values])

        # Predict in log space and map back; bound below by zero exactly as is
        # done for the validation predictions.
        y_test_log_pred = model.predict(X_test_proc)
        y_test_pred = np.clip(np.expm1(y_test_log_pred), a_min=0, a_max=None)

        submission = pd.DataFrame({'Id': df_test['Id'], 'SalePrice': y_test_pred})
        submission.to_csv('submission.csv', index=False)
        print('Saved submission.csv (first 5 lines):')
        print(submission.head())
else:
    print('\nNo test.csv found in data/ — skipping submission cell.')


Dataset shape: (1460, 81)
   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   
5   6          50       RL         85.0    14115   Pave   NaN      IR1   
6   7          20       RL         75.0    10084   Pave   NaN      Reg   
7   8          60       RL          NaN    10382   Pave   NaN      IR1   
8   9          50       RM         51.0     6120   Pave   NaN      Reg   
9  10         190       RL         50.0     7420   Pave   NaN      Reg   

  LandContour Utilities  ... PoolArea PoolQC  Fence MiscFeature MiscVal  \
0         Lvl    AllPub  ...        0    NaN    NaN         NaN       0   
1        