# Import Library

In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score



In [2]:
# Load dataset 
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Data Cleaning

### Train Data Cleaning

In [3]:
# Checking the Missing Values for the Train Data
train_missing_value= train.isnull().sum()
missing_percentage = (train_missing_value / len(train)) *100
print(missing_percentage[missing_percentage > 0].sort_values(ascending=False))

PoolQC          99.520548
MiscFeature     96.301370
Alley           93.767123
Fence           80.753425
MasVnrType      59.726027
FireplaceQu     47.260274
LotFrontage     17.739726
GarageType       5.547945
GarageYrBlt      5.547945
GarageFinish     5.547945
GarageQual       5.547945
GarageCond       5.547945
BsmtFinType2     2.602740
BsmtExposure     2.602740
BsmtFinType1     2.534247
BsmtCond         2.534247
BsmtQual         2.534247
MasVnrArea       0.547945
Electrical       0.068493
dtype: float64


In [4]:
# Transforming the type of "STR" null values with None
# Transforming the type of "Numerical" null values with None
for column in train.columns:
    if train[column].dtype == 'object':  # 如果是字符串类型
        train[column] = train[column].fillna('None')
    else:  # 如果是数值类型
        train[column] = train[column].fillna(train[column].median())


# Feature Engineering

In [5]:
pd.set_option("display.max_columns", None)
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


In [6]:

# One-hot_encoding for "STR" variables - TRAIN dataset
train = pd.get_dummies(train, drop_first=True)

# Define target variable
X = train.drop(columns=["Id","SalePrice"])
y = train["SalePrice"]

# Data Standardlization
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Data Splitting
X_train, X_test, y_train, y_test = train_test_split(X_scaled,y, test_size=0.2, random_state =42)


# Model Training and Evaluating

In [7]:

models = {
    "LinearRegression":LinearRegression(),
    "RandomForestRegressor":RandomForestRegressor(),
    "GradientBoostingRegressor":GradientBoostingRegressor(),
    "SVR":SVR(),
    "XGBoost Regressor":xgb.XGBRegressor()
}

# storing the score of each model 
model_scores={}



# Training and evalutating each models
for model_name, model in models.items():
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    
# Calculating the performance metrics for each model 
    RMSE= np.sqrt(mean_squared_error(y_test,y_pred))
    r2 = r2_score(y_test,y_pred)
    model_scores[model_name] = {"RMSE":RMSE, "R2":r2}


# Setting the display format of float values to be rounded to 4 decimal places 
pd.options.display.float_format = "{:,.4f}".format

# Changing the dataset to DataFrame for betther readability
model_df = pd.DataFrame(model_scores).T
# Sorting the values of R2
model_df.sort_values("R2",ascending = False).head()


Unnamed: 0,RMSE,R2
GradientBoostingRegressor,27286.9712,0.9029
XGBoost Regressor,28257.522,0.8959
RandomForestRegressor,28573.6893,0.8936
LinearRegression,83088.7606,0.0999
SVR,88646.8233,-0.0245


# Hyperparameter Tuning and Training the best models 

In [8]:
# As the result from GradientBoostingRegressor, XGBoost Regressor, RandomForestRegressor have the similar R2 score
# We decided to applying Hyperparameter tuning for these models




# Parameter grids
param_dist_gb = {
    'n_estimators': np.arange(100, 500, 100),
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'subsample': [0.8, 0.9, 1.0]
}

param_dist_xgb = {
    'n_estimators': np.arange(100, 500, 100),
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6, 7],
    'min_child_weight': [1, 2, 3],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2]
}

param_dist_rf = {
    'n_estimators': np.arange(100, 500, 100),
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2'],
    'bootstrap': [True, False]
}

# Define Regression Models
gb = GradientBoostingRegressor()
xgb_reg = xgb.XGBRegressor()
rf = RandomForestRegressor()

# Correct usage of RandomizedSearchCV
randomized_gb = RandomizedSearchCV(gb, param_distributions=param_dist_gb, n_iter=100, cv=5, n_jobs=-1, verbose=1, random_state=42)
randomized_xgb = RandomizedSearchCV(xgb_reg, param_distributions=param_dist_xgb, n_iter=100, cv=5, n_jobs=-1, verbose=1, random_state=42)
randomized_rf = RandomizedSearchCV(rf, param_distributions=param_dist_rf, n_iter=100, cv=5, n_jobs=-1, verbose=1, random_state=42)

# Fitting the models
randomized_gb.fit(X_train, y_train)
randomized_xgb.fit(X_train, y_train)
randomized_rf.fit(X_train, y_train)


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits


0,1,2
,estimator,RandomForestRegressor()
,param_distributions,"{'bootstrap': [True, False], 'max_depth': [10, 20, ...], 'max_features': [None, 'sqrt', ...], 'min_samples_leaf': [1, 2, ...], ...}"
,n_iter,100
,scoring,
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,5
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,False


In [9]:
# Printing the R2 of each model
print(f'GradientBoostingRegressor R2 score: {randomized_gb.best_score_}')
print(f'XGBRegressor R2 score: {randomized_xgb.best_score_}')
print(f'RandomForestRegressor R2 score: {randomized_rf.best_score_}')


GradientBoostingRegressor R2 score: 0.8683088470575344
XGBRegressor R2 score: 0.864346182346344
RandomForestRegressor R2 score: 0.8469434250847424


In [10]:

# Fitting the Best parameters to models
best_gb=randomized_gb.best_estimator_
best_xgb=randomized_xgb.best_estimator_
best_rf=randomized_rf.best_estimator_

# Creating Voting Regressor
voting_reg = VotingRegressor(estimators=[("gb",best_gb), 
                                         ("xgb", best_xgb),
                                         ("rf",best_rf)])

# Training the model
voting_reg.fit(X_train,y_train)

# Make predictions
y_pred = voting_reg.predict(X_test)

# Evaluating the R2
r2 = r2_score(y_test,y_pred)
print(f"Voting Regressor R2 score: {r2:.4f}")





Voting Regressor R2 score: 0.9006


# Predicting the TEST

## Test data Cleaning

In [11]:
 # Transforming the type of "STR" null values with None
# Transforming the type of "Numerical" null values with None

for column in test.columns:
    if test[column].dtype == "object":
        test[column] = test[column].fillna("None")
    else:
        test[column] = test[column].fillna(test[column].median())

# One-hot_encoding for "STR" variables - TEST dataset
test = pd.get_dummies(test, drop_first=True)

# Extracting the features from training dataset ("ID", SalePrice) Not included
train_cols = X.columns

# Removing the Id Columns
test_features = test.drop(columns=["Id"])

 # Applying the reindex method, to removing the unmatch Columns from TEST dataset
final_X_test_aligned = test_features.reindex(columns=train_cols, fill_value=0)


In [12]:
# Standardize the features using the same scaler from the training set 
final_X_test_scaled = scaler.transform(final_X_test_aligned)

# Make predictions using the Voting Regressor model
final_y_pred_test = voting_reg.predict(final_X_test_scaled)

# Create submission files
submission = pd.DataFrame({"Id": test["Id"], "SalePrice": final_y_pred_test})
submission.to_csv("submission.csv", index=False)

print("Submission file generated: submission.csv")

Submission file generated: submission.csv
