In [33]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder ,StandardScaler
from sklearn.compose import ColumnTransformer

def load_data(path="Housing.csv"):
    """Load the housing CSV and derive the feature matrix and target.

    Parameters
    ----------
    path : str or path-like, default "Housing.csv"
        CSV file to read. The columns ``price``, ``bedrooms``,
        ``bathrooms`` and ``stories`` are accessed by name.

    Returns
    -------
    X : pandas.DataFrame
        Every column of the file except ``price``, ``bedrooms`` and
        ``bathrooms``, plus the four engineered columns ``TotalRooms``,
        ``rooms_per_bedroom``, ``rooms_per_stories`` and
        ``bath_to_totalroom_ratio``.
    y : pandas.Series
        ``log1p`` of the ``price`` column.
    """
    house = pd.read_csv(path)

    # Small offset added to every ratio denominator to avoid dividing by zero.
    eps = 1e-6
    bedrooms = house["bedrooms"]
    bathrooms = house["bathrooms"]

    house["TotalRooms"] = bedrooms + bathrooms
    house["rooms_per_bedroom"] = house["TotalRooms"] / (bedrooms + eps)
    # NOTE(review): despite its name this column is stories / bedrooms. It is
    # kept unchanged so the feature set stays the same -- confirm the intent.
    house["rooms_per_stories"] = house["stories"] / (bedrooms + eps)
    house["bath_to_totalroom_ratio"] = bathrooms / (house["TotalRooms"] + eps)

    # The raw counts are dropped from the features; only the derived columns
    # built from them remain.
    X = house.drop(["price", "bedrooms", "bathrooms"], axis=1)
    y = np.log1p(house["price"])

    return X, y

def pre_process(X, y, test_size=0.2, random_state=42):
    """Split the features and target into train and test partitions.

    Forwards ``X`` and ``y`` to ``train_test_split`` together with
    ``test_size`` and ``random_state`` and returns its result unchanged.
    """
    split_options = {"test_size": test_size, "random_state": random_state}
    return train_test_split(X, y, **split_options)

def build_pipeline(X):
    """Assemble the preprocessing + XGBoost regression pipeline for ``X``.

    Column lists are taken from the dtypes of ``X``: numeric columns are
    mean-imputed and standardised, ``object`` columns are imputed with the
    most frequent value and one-hot encoded. The returned pipeline is not
    fitted.
    """
    num_cols = X.select_dtypes(include=np.number).columns.tolist()
    cat_cols = X.select_dtypes(include="object").columns.tolist()

    num_steps = [
        ("Imputer", SimpleImputer(strategy="mean")),
        ("Scaler", StandardScaler()),
    ]
    cat_steps = [
        ("Imputer", SimpleImputer(strategy="most_frequent")),
        # Categories not seen during fit are ignored at transform time
        # instead of raising.
        ("Onehot", OneHotEncoder(handle_unknown="ignore")),
    ]

    column_prep = ColumnTransformer(
        transformers=[
            ("num", Pipeline(steps=num_steps), num_cols),
            ("cat", Pipeline(steps=cat_steps), cat_cols),
        ]
    )

    return Pipeline(
        steps=[
            ("Preprocessor", column_prep),
            ("model", XGBRegressor(random_state=42)),
        ]
    )
    
def evaluate(model, X_test, y_test):
    """Print and return regression metrics on the back-transformed scale.

    Both the model's predictions and ``y_test`` are passed through
    ``np.expm1`` before scoring, so both are expected to be on a ``log1p``
    scale.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X_test : features passed to ``model.predict``
    y_test : true target values on the ``log1p`` scale

    Returns
    -------
    dict
        The printed values, keyed ``"MAE"``, ``"MSE"``, ``"RMSE"`` and
        ``"R^2"``.
    """
    predicted = np.expm1(model.predict(X_test))
    actual = np.expm1(y_test)

    mse = mean_squared_error(actual, predicted)
    metrics = {
        "MAE": mean_absolute_error(actual, predicted),
        "MSE": mse,
        # Square root of the MSE, i.e. the RMSE (previously mislabelled "RMAE").
        "RMSE": np.sqrt(mse),
        "R^2": r2_score(actual, predicted),
    }

    for label, value in metrics.items():
        print(f"{label}: {value}")

    return metrics
    
    
if __name__ == "__main__":
    # End-to-end run: load the data, hold out a test split, tune the
    # pipeline with a randomized search, then report test-set metrics.
    X, y = load_data()
    X_train, X_test, y_train, y_test = pre_process(X, y)

    pipe = build_pipeline(X_train)

    # The "model__" prefix routes each setting to the pipeline's "model" step.
    param_dist = {
        'model__n_estimators': [200, 500, 800],
        'model__max_depth': [1, 3, 5, 7, 9],
        'model__learning_rate': [0.01, 0.05, 0.1],
        'model__subsample': [0.4, 0.6, 0.8, 1.0],
        'model__colsample_bytree': [0.4, 0.6, 0.8, 1.0],
        'model__min_child_weight': [0.5, 1, 3, 5],
    }
    random_search = RandomizedSearchCV(
        pipe,
        param_dist,
        n_iter=50,
        cv=5,
        scoring='r2',
        n_jobs=-1,
        random_state=42,
    )

    random_search.fit(X_train, y_train)

    best_model = random_search.best_estimator_

    evaluate(best_model, X_test, y_test)

    # best_score_ is the mean cross-validated R^2 of the best candidate on the
    # training folds, scored against the log1p-transformed target. It is not
    # comparable to the test-set R^2 printed by evaluate(), which is computed
    # after expm1, so the label states exactly what is reported.
    print("Best cross-validated R^2 (log-price scale):", random_search.best_score_)

    print("Parameters found: ", random_search.best_params_)
    
  
    

MAE: 984095.1972477062
MSE: 1910341502713.8352
RMAE: 1382151.041932044
R^2: 0.6220565924294653
R^2 score after tuning: 0.6764864967805101
Parameters found:  {'model__subsample': 0.8, 'model__n_estimators': 200, 'model__min_child_weight': 0.5, 'model__max_depth': 3, 'model__learning_rate': 0.05, 'model__colsample_bytree': 0.6}
