# **Streamlit App:** predicting housing prices

## Import libraries

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold, RFECV
from sklearn.pipeline import make_pipeline

import pickle

from sklearn.impute import SimpleImputer 

# Configure scikit-learn so that transformers output pandas DataFrames
# whenever data is transformed.
from sklearn import set_config
set_config(transform_output='pandas') 

## Reading data

In [None]:
# Load the housing data set into a DataFrame.
# pandas is already imported as `pd` in the imports cell at the top of the notebook,
# so it is not re-imported here.
housing = pd.read_csv('[your_file_location]/housing-deployment-reg.csv') # replace [your_file_location] with actual file location

In [None]:
# Inspect the column names to identify the target column to separate out (SalePrice)
housing.columns

Index(['LotArea', 'TotalBsmtSF', 'BedroomAbvGr', 'GarageCars', 'SalePrice'], dtype='object')

In [43]:
# Feature matrix: every column of the data set except the target (SalePrice)
X = housing.drop("SalePrice", axis=1).copy()

## Splitting data

In [46]:
# Target vector: the sale price of each house
y = housing["SalePrice"]

# Hold out 15% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=8,
)

## Explore data

In [None]:
# Summarise the training features: column dtypes, non-null counts and memory usage
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1241 entries, 1008 to 451
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   LotArea       1241 non-null   int64
 1   TotalBsmtSF   1241 non-null   int64
 2   BedroomAbvGr  1241 non-null   int64
 3   GarageCars    1241 non-null   int64
dtypes: int64(4)
memory usage: 48.5 KB


## Full pipeline

In [None]:
# Preprocessing + model pipeline.
# make_pipeline names each step after its lower-cased class name; those names
# are the prefixes used in the parameter grid below.
pipe = make_pipeline(
    SimpleImputer(),
    StandardScaler(),
    VarianceThreshold(0),
    # An RFECV(estimator=GradientBoostingRegressor()) step was tried here;
    # the model proved to be better without it.
    RandomForestRegressor(random_state=8),  # model; seeded so the forest is reproducible
)

# Candidate hyper-parameter values for the randomized search
param_grid = {
    'simpleimputer__strategy': ["mean", "median"],
    'standardscaler__with_mean': [True, False],
    'variancethreshold__threshold': [0.00, 0.01],
    'randomforestregressor__n_estimators': [100, 300, 500],
    'randomforestregressor__max_depth': [10, 20, 30, None],
    'randomforestregressor__min_samples_split': [2, 5, 10],
    'randomforestregressor__min_samples_leaf': [1, 2, 4],
    'randomforestregressor__max_features': ['sqrt', 'log2', None],
    'randomforestregressor__bootstrap': [True, False],
}

# Randomized search with 5-fold cross-validation over the candidates above.
# random_state fixes which parameter combinations are sampled, so the reported
# best parameters and test score can be reproduced on a re-run.
trained_pipe = RandomizedSearchCV(
    pipe,
    param_grid,
    cv=5,
    random_state=8,
)

# Run the randomized search on the training data
trained_pipe.fit(X_train, y_train)

# See which parameter combination performed best
print("Best Parameters:", trained_pipe.best_params_)


Best Parameters: {'variancethreshold__threshold': 0.0, 'standardscaler__with_mean': True, 'simpleimputer__strategy': 'median', 'randomforestregressor__n_estimators': 500, 'randomforestregressor__min_samples_split': 10, 'randomforestregressor__min_samples_leaf': 2, 'randomforestregressor__max_features': 'log2', 'randomforestregressor__max_depth': 20, 'randomforestregressor__bootstrap': True}


### Making predictions on test data

In [86]:
# Predict sale prices for the held-out test features
y_pred = trained_pipe.predict(X_test)

# R² score of the predictions against the true test prices
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(r2)

0.7342738123212923


### Storing pipeline as "Pickle"

In [None]:
# Store the trained pipeline (the fitted search object) on disk.
# The `with` block closes the file handle as soon as the dump finishes, so the
# pickle is fully flushed to disk and no handle is left open in the kernel.
with open(file='[your_file_location]/trained_pipe_randomforestregressor.sav', # replace [your_file_location] with actual file location
          mode='wb') as model_file:
    pickle.dump(trained_pipe, model_file)

### Testing the model locally

In [None]:
# House values to test with the model and get predictions.
# Column names match the feature columns the pipeline was trained on.
new_house = pd.DataFrame({
    'LotArea':[9000],
    'TotalBsmtSF':[1000],
    'BedroomAbvGr':[5],
    'GarageCars':[4]
})

# Load the stored pipeline; the `with` block closes the file handle after reading.
# NOTE: pickle.load can execute arbitrary code - only load files you created yourself.
with open('[your_file_location]/trained_pipe_randomforestregressor.sav', 'rb') as model_file: # replace [your_file_location] with actual file location
    loaded_model = pickle.load(model_file)

# prediction
loaded_model.predict(new_house)

### Installing Streamlit

In [None]:
#temporary installation, if necessary (can permanently install through terminal)
!pip install streamlit