# **Regression competition:** predicting housing prices

## Import libraries

In [None]:
import pandas as pd

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_selection import VarianceThreshold, RFECV
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, ColumnTransformer #using ColumnTranformer in the preprocessing pipelines as want to assign own names

from sklearn.impute import SimpleImputer

from sklearn import set_config # configure scikit-learn so that every transformer returns a pandas DataFrame, keeping column names through each pipeline step
set_config(transform_output='pandas')

## Reading data

In [None]:
# Load the train and test CSVs from Google Drive.
# Each share link is turned into a direct-download URL by taking the
# second-to-last "/"-separated segment of the link as the file id
# (assumes a share link of the form .../d/<file_id>/view -- confirm for your links).

# reading train data
train_url = "google_drive_link" #update with actual location of data in Google Drive
train_path = 'https://drive.google.com/uc?export=download&id='+train_url.split('/')[-2]
train_data= pd.read_csv(train_path)


# reading test data
# NOTE: this variable must be named test_url -- it is the one read on the next line.
test_url = "google_drive_link" #update with actual location of data in Google Drive
test_path = 'https://drive.google.com/uc?export=download&id='+test_url.split('/')[-2]
test_data= pd.read_csv(test_path)

In [None]:
train_data.columns # list the column names to decide which column to "pop" as the target

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [None]:
# Dropping unnecessary column(s): "Id" is not used as a predictor.
# errors="ignore" keeps this cell idempotent -- re-running it after "Id" has
# already been removed would otherwise raise a KeyError.
# drop() already returns a new DataFrame, so no extra .copy() is needed.
train_data = train_data.drop(columns=["Id"], errors="ignore")

## Splitting data

In [None]:
# Separate the target from the predictors; train_data itself is left untouched.
y_train = train_data['SalePrice'].copy()
X_train = train_data.drop(columns='SalePrice')

## Explore data

In [None]:
# Review the feature columns (after dropping "Id" and popping the target):
# dtypes and non-null counts show which columns need imputing or encoding
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

## Full pipeline

In [None]:
# Build and fit the complete modelling pipeline:
# preprocessing -> constant-feature removal -> recursive feature elimination -> model.

# One seed for every stochastic estimator below, so that Restart & Run All
# reproduces the same selected features and the same predictions.
SEED = 5979

# Step 1: Identify numerical and categorical columns
train_cat_columns = X_train.select_dtypes(exclude="number").columns
train_num_columns = X_train.select_dtypes(include="number").columns

# Step 2: Define Preprocessing Pipeline
# A. create categorical pipeline
#    Missing categories become their own "missing" level; categories seen only
#    at predict time are encoded as all zeros instead of raising an error.
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OneHotEncoder(sparse_output=False, handle_unknown='ignore'))


# B. create numerical pipeline
#    MinMaxScaler is used rather than standardisation because standardising
#    gives every non-constant feature unit variance, which would make any
#    non-zero variance threshold further down meaningless.
numeric_pipe = make_pipeline(
    SimpleImputer(strategy="mean"),
    MinMaxScaler())


# C. create preprocessor
#    Keep the order (categorical first, numeric second): the auto-generated step
#    names ("pipeline-1", "pipeline-2") are referenced by the search parameters.
preprocessor = make_column_transformer(
    (categoric_pipe, train_cat_columns),
    (numeric_pipe, train_num_columns))


# Step 3: Define the full Pipeline
full_pipeline = make_pipeline(
    preprocessor,
    VarianceThreshold(0),  # threshold 0 removes only constant columns
    # Seed the estimator used for feature ranking as well; left unseeded, the
    # set of selected features could differ between runs.
    RFECV(estimator=GradientBoostingRegressor(random_state=SEED)),
    GradientBoostingRegressor(random_state=SEED, n_estimators=150)) # model

full_pipeline.fit(X_train, y_train)

In [None]:
# Display the fitted pipeline to inspect its steps and their parameters
full_pipeline

In [None]:
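# # NOTE: the search below has already been run; the model parameters in the pipeline were then adapted to match the best parameters it found, so the code is kept commented out for reference only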
# # Step 4: Define parameters
# param_grid = {
#     #'columntransformer__pipeline-2__simpleimputer__strategy': ["mean", "median"],
#     #'variancethreshold__threshold': [0.00, 0.01],
#     #'rfecv__min_features_to_select': range(1, 20),
#     #'gradientboostingregressor__n_estimators': range(1, 150, 5),
# }

# # Step 5: Initialize and run RandomizedSearchCV
# grid_search = RandomizedSearchCV(
#     estimator=full_pipeline,
#     param_distributions=param_grid,
#     #cv=5,  # 5-fold cross-validation
#     verbose=2, #2 gives even more details
#     scoring = 'neg_root_mean_squared_error',
#     n_jobs=-1
# )

# # Step 6: Fit grid search on training data
# grid_search.fit(X_train, y_train) #we only fit the pipeline once it has been fully assembled


# # Step 7: Get the best model and parameters
# print("Best Parameters:", grid_search.best_params_)
# print("Best score based on RMSE:", grid_search.best_score_)


# # Step 8: Transform X_test and get predictions using the best model
# best_model = grid_search.best_estimator_

# # Best Parameters: {'columntransformer__pipeline-2__simpleimputer__strategy': 'mean'}
# # Best score based on RMSE: -28454.1901432759

### Making predictions on test data

In [None]:
# Make predictions for the test set.
# test_data still contains "Id": the column transformer was built from X_train's
# column names, so only those columns are selected (make_column_transformer's
# remainder defaults to "drop").
y_test_pred= full_pipeline.predict(test_data)
y_test_pred

array([122278.59482144, 163750.97154213, 183209.16743048, ...,
       161900.49356631, 119071.5495335 , 227502.12719011])

### Saving results

In [None]:
# Mount Google Drive at /content/drive so the submission file can be written there.
# NOTE(review): Colab-only import placed outside the import cell -- consider moving
# it to the top of the notebook with the other imports.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Assemble the submission table (one row per test house: its Id and predicted
# SalePrice) and write it to Google Drive without the DataFrame index.
submission_path = '/content/drive/MyDrive/.../regression_prediction.csv' #update MyDrive location with actual location of folder to save output

regression_prediction = pd.DataFrame(
    {
        "Id": test_data["Id"],
        "SalePrice": y_test_pred,
    }
)
regression_prediction.to_csv(submission_path, index=False)

In [None]:
# Verify that the csv has been saved by reading it back and checking its
# row count, columns and dtypes.
# - read_csv has no `index` keyword (that option belongs to to_csv); passing it
#   raises a TypeError, so it is not used here.
# - The path must be exactly the one the file was written to with to_csv.
csv_verify = pd.read_csv('/content/drive/MyDrive/.../regression_prediction.csv') #update MyDrive location with actual location of folder the output was saved to
csv_verify.info() # info() prints its report itself and returns None, so it is not wrapped in print()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Id         1459 non-null   int64  
 1   SalePrice  1459 non-null   float64
dtypes: float64(1), int64(1)
memory usage: 22.9 KB
None


### Leaderboard score (per competition website; an error metric for this regression task, so lower is better)

0.13481