In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# read the dataset
house_df = pd.read_csv('train.csv')

#identify input and target column
input_cols = list(house_df.columns)[1:-1]
target_cols = 'SalePrice'

input_df = house_df[input_cols].copy()
targets = house_df[target_cols]

#numeric_cols and categorical_cols separated
import numpy as np

numeric_cols = input_df.select_dtypes(include=np.number).columns.tolist()
category_cols = input_df.select_dtypes('object').columns.tolist()

#fill the missing value by imputer
imputer = SimpleImputer(strategy='mean').fit(input_df[numeric_cols])
input_df[numeric_cols] = imputer.transform(input_df[numeric_cols])

#scale the value between 0 to 1
scaler = MinMaxScaler().fit(input_df[numeric_cols])
input_df[numeric_cols] = scaler.transform(input_df[numeric_cols])

#converted categorical column to numerical column
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore').fit(input_df[category_cols].fillna('unknown'))
encoded_cols = list(encoder.get_feature_names_out(category_cols))
input_df[encoded_cols] = encoder.transform(input_df[category_cols])

In [72]:
#split the train val data
train_inputs, val_inputs, train_targets, val_targets = train_test_split(input_df[numeric_cols + encoded_cols],
                                                                        targets,
                                                                        test_size=0.25,
                                                                        random_state=42)

#train the model
model = Ridge().fit(train_inputs, train_targets)

#prediction the train model
train_preds = model.predict(train_inputs)
train_preds

array([172599.43235557, 175802.15443601, 104368.76915641, ...,
       121535.1721255 , 173830.94792569, 190771.98152057])

In [None]:
#prediction the val model

model = Ridge().fit(val_inputs, val_targets)
val_pred = model.predict(val_inputs)
val_pred

In [74]:
#calculate rmse loss

train_rmse = mean_squared_error(train_targets, train_preds, squared=False)
val_rmse = mean_squared_error(val_targets, val_pred, squared=False)

print(train_rmse)
print(val_rmse)

21885.980021843327
17804.58029618141


In [78]:
def predict_input(single_input):
    input_df = pd.DataFrame([single_input])
    input_df[numeric_cols] = imputer.transform(input_df[numeric_cols])
    input_df[numeric_cols] = scaler.transform(input_df[numeric_cols])
    input_df[encoded_cols] = encoder.transform(input_df[category_cols].values)
    X_input = input_df[numeric_cols + encoded_cols]
    return model.predict(X_input)[0]

In [79]:
sample_input = { 'MSSubClass': 20, 'MSZoning': 'RL', 'LotFrontage': 77.0, 'LotArea': 9320,
 'Street': 'Pave', 'Alley': None, 'LotShape': 'IR1', 'LandContour': 'Lvl', 'Utilities': 'AllPub',
 'LotConfig': 'Inside', 'LandSlope': 'Gtl', 'Neighborhood': 'NAmes', 'Condition1': 'Norm', 'Condition2': 'Norm',
 'BldgType': '1Fam', 'HouseStyle': '1Story', 'OverallQual': 4, 'OverallCond': 5, 'YearBuilt': 1959,
 'YearRemodAdd': 1959, 'RoofStyle': 'Gable', 'RoofMatl': 'CompShg', 'Exterior1st': 'Plywood',
 'Exterior2nd': 'Plywood', 'MasVnrType': 'None','MasVnrArea': 0.0,'ExterQual': 'TA','ExterCond': 'TA',
 'Foundation': 'CBlock','BsmtQual': 'TA','BsmtCond': 'TA','BsmtExposure': 'No','BsmtFinType1': 'ALQ',
 'BsmtFinSF1': 569,'BsmtFinType2': 'Unf','BsmtFinSF2': 0,'BsmtUnfSF': 381,
 'TotalBsmtSF': 950,'Heating': 'GasA','HeatingQC': 'Fa','CentralAir': 'Y','Electrical': 'SBrkr', '1stFlrSF': 1225,
 '2ndFlrSF': 0, 'LowQualFinSF': 0, 'GrLivArea': 1225, 'BsmtFullBath': 1, 'BsmtHalfBath': 0, 'FullBath': 1,
 'HalfBath': 1, 'BedroomAbvGr': 3, 'KitchenAbvGr': 1,'KitchenQual': 'TA','TotRmsAbvGrd': 6,'Functional': 'Typ',
 'Fireplaces': 0,'FireplaceQu': np.nan,'GarageType': np.nan,'GarageYrBlt': np.nan,'GarageFinish': np.nan,'GarageCars': 0,
 'GarageArea': 0,'GarageQual': np.nan,'GarageCond': np.nan,'PavedDrive': 'Y', 'WoodDeckSF': 352, 'OpenPorchSF': 0,
 'EnclosedPorch': 0,'3SsnPorch': 0, 'ScreenPorch': 0, 'PoolArea': 0, 'PoolQC': np.nan, 'Fence': np.nan, 'MiscFeature': 'Shed',
 'MiscVal': 400, 'MoSold': 1, 'YrSold': 2010, 'SaleType': 'WD', 'SaleCondition': 'Normal'}

In [None]:
predicted_price = predict_input(sample_input)
predicted_price

In [82]:
print('The predicted sale price of the house is ${}'.format(predicted_price))

The predicted sale price of the house is $118955.48078837915
