In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import root_mean_squared_error, r2_score

import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import numpy as np
import math
from scipy.stats import skew

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
PATH_TRAIN = r"..\datasets\train.csv"
PATH_TEST = r"..\datasets\test.csv"

In [3]:
df_train = pd.read_csv(PATH_TRAIN)
df_test = pd.read_csv(PATH_TEST)

In [4]:
print("Shape Train:", df_train.shape)
print("Shape Test:", df_test.shape)

Shape Train: (1460, 81)
Shape Test: (1459, 80)


In [5]:
DROPPED_COL = ["Id", "Utilities"]
X = df_train.drop(["SalePrice", *DROPPED_COL], axis=1)
y = df_train["SalePrice"]

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin


class GroupMedianImputer(BaseEstimator, TransformerMixin):
    def __init__(self, group_col, target_col):
        self.group_col = group_col
        self.target_col = target_col

    def fit(self, X, y=None):
        self.median_values = X.groupby(self.group_col)[
            self.target_col].median()
        self.global_median_ = X[self.target_col].median()
        return self

    def transform(self, X):
        df = X.copy()
        df[self.target_col] = df[self.target_col].fillna(
            df[self.group_col].map(self.median_values))
        df[self.target_col] = df[self.target_col].fillna(self.global_median_)
        return df

In [None]:
ORDINAL_MAPS = {

{   "columns" : ["Functional"],
    "ordinalMap":{
    'Typ': 8,
    'Min1': 7,
    'Min2': 6,
    'Mod': 5,
    'Maj1': 4,
    'Maj2': 3,
    'Sev': 2,
    'Sal': 1
    }
},

{   "columns" : ["BsmtFinType1","BsmtFinType2"],
    "ordinalMap":{
    'GLQ': 6,
    'ALQ': 5,
    'BLQ': 4,
    'Rec': 3,
    'LwQ': 2,
    'Unf': 1
    }
},

{   "columns" : ["Electrical"],
    "ordinalMap":{
    "SBrkr": 5,
    "FuseA": 4,
    "FuseF": 3,
    "Mix": 2,
    "FuseP": 1
    }
},
{   "columns" : ["ExterQual","ExterCond","BsmtQual","BsmtCond","HeatingQC","KitchenQual","FireplaceQu","GarageQual","GarageCond","PoolQC","BsmtExposure"],
    "ordinalMap":{
    'Ex': 5,
    'Gd': 4,
    'TA': 3,
    'Fa': 2,
    'Po': 1,

    'Av': 3,
    'Mn': 2,
    'No': 1,
    }
},
{   "columns" : ["Fence"],
    "ordinalMap":{
    'GdPrv': 4,
    'MnPrv': 3,
    'GdWo': 2,
    'MnWw': 1,
    }
},
{   "columns" : ["LotShape"],
    "ordinalMap":{
    "Reg": 4,
    "IR1": 3,
    "IR2": 2,
    "IR3": 1,
    }
},
{   "columns" : ["LandContour"],
    "ordinalMap":{
    "Lvl": 4,
    "Bnk": 3,
    "HLS": 2,
    "Low": 1,
    }
},
{   "columns" : ["Utilities"],
    "ordinalMap":{
    "AllPub": 4,
    "NoSewr": 3,
    "NoSeWa": 2,
    "ELO": 1,
    }
},

    

    






    'Fin': 3,
    'RFn': 2,
    'Unf': 1,

    'Gtl': 3,
    'Mod': 2,
    'Sev': 1,

    'Y': 2,
    'P': 1,
    'N': 0,

    "Pave": 1,
    "Grvl": 0,

}

X = X.map(lambda x: ORDINAL_MAPS.get(x, x))

In [8]:
categorical_columns = X.select_dtypes("object").columns
numerical_columns = X.select_dtypes("number").columns

In [10]:
X[categorical_columns]

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,Foundation,Heating,CentralAir,Electrical,GarageType,PavedDrive,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,,Reg,Lvl,Inside,Gtl,CollgCr,Norm,Norm,...,PConc,GasA,Y,SBrkr,Attchd,Y,,,WD,Normal
1,RL,Pave,,Reg,Lvl,FR2,Gtl,Veenker,Feedr,Norm,...,CBlock,GasA,Y,SBrkr,Attchd,Y,,,WD,Normal
2,RL,Pave,,IR1,Lvl,Inside,Gtl,CollgCr,Norm,Norm,...,PConc,GasA,Y,SBrkr,Attchd,Y,,,WD,Normal
3,RL,Pave,,IR1,Lvl,Corner,Gtl,Crawfor,Norm,Norm,...,BrkTil,GasA,Y,SBrkr,Detchd,Y,,,WD,Abnorml
4,RL,Pave,,IR1,Lvl,FR2,Gtl,NoRidge,Norm,Norm,...,PConc,GasA,Y,SBrkr,Attchd,Y,,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,RL,Pave,,Reg,Lvl,Inside,Gtl,Gilbert,Norm,Norm,...,PConc,GasA,Y,SBrkr,Attchd,Y,,,WD,Normal
1456,RL,Pave,,Reg,Lvl,Inside,Gtl,NWAmes,Norm,Norm,...,CBlock,GasA,Y,SBrkr,Attchd,Y,MnPrv,,WD,Normal
1457,RL,Pave,,Reg,Lvl,Inside,Gtl,Crawfor,Norm,Norm,...,Stone,GasA,Y,SBrkr,Attchd,Y,GdPrv,Shed,WD,Normal
1458,RL,Pave,,Reg,Lvl,Inside,Gtl,NAmes,Norm,Norm,...,CBlock,GasA,Y,FuseA,Attchd,Y,,,WD,Normal


In [9]:
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
one_hot_encoded = encoder.fit_transform(X[categorical_columns])

one_hot_X = pd.DataFrame(
    one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))


X_encoded = pd.concat([X.reset_index(
    drop=True), one_hot_X.reset_index(drop=True)], axis=1)

X_encoded = X_encoded.drop(categorical_columns, axis=1)

TypeError: Encoders require their input argument must be uniformly strings or numbers. Got ['int', 'str']

In [15]:
rfr = XGBRegressor()
rfr.fit(X_encoded, y)

rmse_score = cross_val_score(
    rfr, X_encoded, y, scoring='neg_root_mean_squared_error')

rmse_score.mean()

# -28367.549609375

np.float64(-28274.288671875)

Prepare df_test

In [32]:
id_test = df_test["Id"]
df_test = df_test.drop(DROPPED_COL, axis=1)

df_test = df_test.map(lambda x: ORDINAL_MAPS.get(x, x))

In [33]:
one_hot_encoded = encoder.transform(df_test[categorical_columns])

one_hot_X = pd.DataFrame(
    one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))


X_test_encoded = pd.concat([df_test.reset_index(
    drop=True), one_hot_X.reset_index(drop=True)], axis=1)

X_test_encoded = X_test_encoded.drop(categorical_columns, axis=1)

In [34]:
df_submision = pd.DataFrame()
df_submision["Id"] = id_test
df_submision["SalePrice"] = rfr.predict(X_test_encoded)

In [35]:
df_submision.to_csv("sub4_test.csv", index=False)