In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import root_mean_squared_error, r2_score

import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import numpy as np
import math
from scipy.stats import skew

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
PATH_TRAIN = r"..\datasets\train.csv"
PATH_TEST = r"..\datasets\test.csv"

In [3]:
df_train = pd.read_csv(PATH_TRAIN)
df_test = pd.read_csv(PATH_TEST)

In [4]:
print("Shape Train:", df_train.shape)
print("Shape Test:", df_test.shape)

Shape Train: (1460, 81)
Shape Test: (1459, 80)


In [16]:
DROPPED_COL = ["Id", "Utilities"]
X = df_train.drop(["SalePrice", *DROPPED_COL], axis=1)
y = df_train["SalePrice"]

In [17]:
from sklearn.base import BaseEstimator, TransformerMixin


class GroupMedianImputer(BaseEstimator, TransformerMixin):
    def __init__(self, group_col, target_col):
        self.group_col = group_col
        self.target_col = target_col

    def fit(self, X, y=None):
        self.median_values = X.groupby(self.group_col)[
            self.target_col].median()
        self.global_median_ = X[self.target_col].median()
        return self

    def transform(self, X):
        df = X.copy()
        df[self.target_col] = df[self.target_col].fillna(
            df[self.group_col].map(self.median_values))
        df[self.target_col] = df[self.target_col].fillna(self.global_median_)
        return df

In [18]:
categorical_columns = X.select_dtypes("object").columns
numerical_columns = X.select_dtypes("number").columns

In [19]:
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
one_hot_encoded = encoder.fit_transform(X[categorical_columns])

one_hot_X = pd.DataFrame(
    one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))


X_encoded = pd.concat([X.reset_index(
    drop=True), one_hot_X.reset_index(drop=True)], axis=1)

X_encoded = X_encoded.drop(categorical_columns, axis=1)

In [21]:
rfr = XGBRegressor()
rfr.fit(X_encoded, y)

rmse_score = cross_val_score(
    rfr, X_encoded, y, scoring='neg_root_mean_squared_error')

rmse_score.mean()

np.float64(-29105.558984375)

Prepare df_test

In [22]:
id_test = df_test["Id"]
df_test = df_test.drop(DROPPED_COL, axis=1)

KeyError: 'Id'

In [12]:
one_hot_encoded = encoder.transform(df_test[categorical_columns])

one_hot_X = pd.DataFrame(
    one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))


X_test_encoded = pd.concat([df_test.reset_index(
    drop=True), one_hot_X.reset_index(drop=True)], axis=1)

X_test_encoded = X_test_encoded.drop(categorical_columns, axis=1)

In [13]:
df_submision = pd.DataFrame()
df_submision["Id"] = id_test
df_submision["SalePrice"] = rfr.predict(X_test_encoded)

In [14]:
df_submision.to_csv("sub4_test.csv", index=False)