In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV


In [29]:

data = pd.read_csv('train.csv')


data.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [31]:
columns_to_drop = ['PoolQC', 'MiscFeature', 'Alley', 'Fence']
data_cleaned = data.drop(columns=columns_to_drop)

data_cleaned['LotFrontage'].fillna(data_cleaned['LotFrontage'].median(), inplace=True)
data_cleaned['GarageYrBlt'].fillna(data_cleaned['GarageYrBlt'].median(), inplace=True)
data_cleaned['GarageCond'].fillna(data_cleaned['GarageCond'].mode()[0], inplace=True)
data_cleaned['GarageType'].fillna(data_cleaned['GarageType'].mode()[0], inplace=True)
data_cleaned['FireplaceQu'].fillna(data_cleaned['FireplaceQu'].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_cleaned['LotFrontage'].fillna(data_cleaned['LotFrontage'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_cleaned['GarageYrBlt'].fillna(data_cleaned['GarageYrBlt'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace metho

In [33]:

categorical_cols = data_cleaned.select_dtypes(include=['object']).columns
numeric_cols = data_cleaned.select_dtypes(exclude=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), numeric_cols),
        ('cat', Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                                ('onehot', OneHotEncoder(handle_unknown='ignore'))]), categorical_cols)
    ])

X_preprocessed = preprocessor.fit_transform(data_cleaned.drop(columns=['SalePrice']))
y = data_cleaned['SalePrice']


ValueError: A given column is not a column of the dataframe

In [35]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

categorical_cols = data_cleaned.select_dtypes(include=['object']).columns
numeric_cols = data_cleaned.select_dtypes(exclude=['object']).columns.drop('SalePrice')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), numeric_cols),
        ('cat', Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                                ('onehot', OneHotEncoder(handle_unknown='ignore'))]), categorical_cols)
    ])

X_preprocessed = preprocessor.fit_transform(data_cleaned.drop(columns=['SalePrice']))
y = data_cleaned['SalePrice']

rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_preprocessed, y)

importances = rf_model.feature_importances_
indices = np.argsort(importances)[-20:]

print("Top 20 Feature Indices:", indices)


Top 20 Feature Indices: [ 28   5   0  11   8 246   2  23   7  19   6   3  27  13  26   9  14  12
  16   4]


In [36]:
numeric_features = numeric_cols.tolist()
categorical_features = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_cols).tolist()

all_feature_names = numeric_features + categorical_features

top_20_features = [all_feature_names[i] for i in indices]

print("Top 20 Feature Names:", top_20_features)


Top 20 Feature Names: ['WoodDeckSF', 'OverallCond', 'Id', 'BsmtUnfSF', 'MasVnrArea', 'GarageFinish_Unf', 'LotFrontage', 'TotRmsAbvGrd', 'YearRemodAdd', 'FullBath', 'YearBuilt', 'LotArea', 'GarageArea', '1stFlrSF', 'GarageCars', 'BsmtFinSF1', '2ndFlrSF', 'TotalBsmtSF', 'GrLivArea', 'OverallQual']


In [37]:
top_20_features_without_id = [feature for feature in top_20_features if feature != 'Id']

if len(top_20_features_without_id) < 20:

    next_feature_index = np.argsort(importances)[-21]
    next_feature_name = all_feature_names[next_feature_index]
    top_20_features_without_id.append(next_feature_name)

print("Top 20 Feature Names (without Id):", top_20_features_without_id)


Top 20 Feature Names (without Id): ['WoodDeckSF', 'OverallCond', 'BsmtUnfSF', 'MasVnrArea', 'GarageFinish_Unf', 'LotFrontage', 'TotRmsAbvGrd', 'YearRemodAdd', 'FullBath', 'YearBuilt', 'LotArea', 'GarageArea', '1stFlrSF', 'GarageCars', 'BsmtFinSF1', '2ndFlrSF', 'TotalBsmtSF', 'GrLivArea', 'OverallQual', 'OpenPorchSF']


In [41]:
data_cleaned['TotalBathrooms'] = data_cleaned['FullBath'] + data_cleaned['BsmtFullBath']
data_cleaned['TotalPorchArea'] = data_cleaned['OpenPorchSF'] + data_cleaned['EnclosedPorch'] + data_cleaned['3SsnPorch'] + data_cleaned['ScreenPorch']
data_cleaned['HouseAge'] = data_cleaned['YrSold'] - data_cleaned['YearBuilt']
data_cleaned['RemodeledAge'] = data_cleaned['YrSold'] - data_cleaned['YearRemodAdd']
data_cleaned['TotalSF'] = data_cleaned['1stFlrSF'] + data_cleaned['2ndFlrSF'] + data_cleaned['TotalBsmtSF']
data_cleaned['GarageInteraction'] = data_cleaned['GarageCars'] * data_cleaned['GarageArea']
data_cleaned['RoomsPerSF'] = data_cleaned['TotRmsAbvGrd'] / data_cleaned['TotalSF']

data_cleaned[['TotalBathrooms', 'TotalPorchArea', 'HouseAge', 'RemodeledAge', 'TotalSF', 'GarageInteraction', 'RoomsPerSF']].head()


Unnamed: 0,TotalBathrooms,TotalPorchArea,HouseAge,RemodeledAge,TotalSF,GarageInteraction,RoomsPerSF
0,3,61,5,5,2566,1096,0.003118
1,2,0,31,31,2524,920,0.002377
2,3,42,7,6,2706,1216,0.002217
3,2,307,91,36,2473,1926,0.002831
4,3,84,8,8,3343,2508,0.002692


In [43]:
data_cleaned['TotalBathrooms'] = data_cleaned['FullBath'] + data_cleaned['BsmtFullBath']
data_cleaned['TotalPorchArea'] = data_cleaned['OpenPorchSF'] + data_cleaned['EnclosedPorch'] + data_cleaned['3SsnPorch'] + data_cleaned['ScreenPorch']
data_cleaned['HouseAge'] = data_cleaned['YrSold'] - data_cleaned['YearBuilt']
data_cleaned['RemodeledAge'] = data_cleaned['YrSold'] - data_cleaned['YearRemodAdd']
data_cleaned['TotalSF'] = data_cleaned['1stFlrSF'] + data_cleaned['2ndFlrSF'] + data_cleaned['TotalBsmtSF']
data_cleaned['GarageInteraction'] = data_cleaned['GarageCars'] * data_cleaned['GarageArea']
data_cleaned['RoomsPerSF'] = data_cleaned['TotRmsAbvGrd'] / data_cleaned['TotalSF']

selected_features = ['TotalSF', 'GrLivArea', 'OverallQual', 'GarageInteraction', 'TotalBathrooms',
                     'TotalPorchArea', 'HouseAge', 'RemodeledAge', 'RoomsPerSF', 'LotFrontage',
                     'LotArea', 'GarageCars', '1stFlrSF', 'GarageArea', 'BsmtFinSF1']

selected_features.append('SalePrice')

selected_data = data_cleaned[selected_features]
output_file = 'selected_house_features.csv'
selected_data.to_csv(output_file, index=False)

print(f"Selected features have been saved to {output_file}")


Selected features have been saved to selected_house_features.csv


In [45]:

data_cleaned['TotalBathrooms'] = data_cleaned['FullBath'] + data_cleaned['BsmtFullBath']
data_cleaned['TotalPorchArea'] = data_cleaned['OpenPorchSF'] + data_cleaned['EnclosedPorch'] + data_cleaned['3SsnPorch'] + data_cleaned['ScreenPorch']
data_cleaned['HouseAge'] = data_cleaned['YrSold'] - data_cleaned['YearBuilt']
data_cleaned['RemodeledAge'] = data_cleaned['YrSold'] - data_cleaned['YearRemodAdd']
data_cleaned['TotalSF'] = data_cleaned['1stFlrSF'] + data_cleaned['2ndFlrSF'] + data_cleaned['TotalBsmtSF']
data_cleaned['GarageInteraction'] = data_cleaned['GarageCars'] * data_cleaned['GarageArea']
data_cleaned['RoomsPerSF'] = data_cleaned['TotRmsAbvGrd'] / data_cleaned['TotalSF']


selected_features = [
    
    'TotalSF', 'GarageInteraction', 'TotalBathrooms', 'TotalPorchArea', 'HouseAge', 
    'RemodeledAge', 'RoomsPerSF',
    

    'WoodDeckSF', 'OverallCond', 'BsmtUnfSF', 'MasVnrArea', 'GarageFinish', # Use 'GarageFinish' instead of 'GarageFinish_Unf'
    'LotFrontage', 'TotRmsAbvGrd', 'GrLivArea', 'OverallQual', 'LotArea', 'BsmtFinSF1'
]


selected_features.append('SalePrice')

preprocessed_data = data_cleaned[selected_features]
output_file = 'preprocessed_house_features.csv'
preprocessed_data.to_csv(output_file, index=False)

print(f"Preprocessed file with combined and uncombined features saved as {output_file}")


Preprocessed file with combined and uncombined features saved as preprocessed_house_features.csv


In [49]:
import pandas as pd
import numpy as np
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

file_path = 'preprocessed_final_14_selected_features.csv' 
data = pd.read_csv(file_path)

X = data.drop(columns=['SalePrice'])
y = data['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

elastic_net = ElasticNet(random_state=42)

elastic_net_params = {
    'alpha': [0.01, 0.1, 1, 10, 100],
    'l1_ratio': [0.1, 0.5, 0.9]
}

elastic_net_grid = GridSearchCV(elastic_net, elastic_net_params, cv=5, scoring='neg_mean_squared_error')
elastic_net_grid.fit(X_train, y_train)

best_elastic_net = elastic_net_grid.best_estimator_

gboost = GradientBoostingRegressor(random_state=42)

gboost_params = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5]
}


gboost_grid = GridSearchCV(gboost, gboost_params, cv=5, scoring='neg_mean_squared_error')
gboost_grid.fit(X_train, y_train)


best_gboost = gboost_grid.best_estimator_

y_pred_elastic_net = best_elastic_net.predict(X_test)
y_pred_gboost = best_gboost.predict(X_test)

y_pred_combined = (y_pred_elastic_net + y_pred_gboost) / 2

combined_rmse = np.sqrt(mean_squared_error(y_test, y_pred_combined))
combined_r2 = r2_score(y_test, y_pred_combined)

print("Elastic Net RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_elastic_net)))
print("Elastic Net R²:", r2_score(y_test, y_pred_elastic_net))
print("Gradient Boosting RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_gboost)))
print("Gradient Boosting R²:", r2_score(y_test, y_pred_gboost))
print("Combined Model RMSE:", combined_rmse)
print("Combined Model R²:", combined_r2)


Elastic Net RMSE: 40702.51187910997
Elastic Net R²: 0.7840125141331331
Gradient Boosting RMSE: 38044.86499203308
Gradient Boosting R²: 0.8112972364974786
Combined Model RMSE: 37490.52628585595
Combined Model R²: 0.8167562208927359


In [48]:
import pickle


models = {
    'best_elastic_net': best_elastic_net,
    'best_gboost': best_gboost
}


pickle_file_path = './trained_models.pkl'
with open(pickle_file_path, 'wb') as file:
    pickle.dump(models, file)


pickle_file_path


'./trained_models.pkl'