# House price prediction project from Kaggle

- [Based on the Kaggle competition](https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques)


* Steps to take:
    * Read the data
    * Visualize and make sense of the data
    * Clean the data
    * Build models
    * Pick one and tune it
    * Export the model (optional)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the competition's train/test CSV files from the working directory.
housing_data_train, housing_data_test = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

In [None]:
# Alias, not a copy: train_set and housing_data_train are the same DataFrame,
# so an in-place change through either name is visible through the other.
train_set = housing_data_train

In [None]:
# Preview the first rows.
housing_data_train.head()

In [None]:
# Per-column dtype and non-null count summary.
housing_data_train.info()

In [None]:
# Missing-value count per column (first 10 columns only).
housing_data_train.isna().sum().head(10)

In [None]:
# dtype of every column.
housing_data_train.dtypes

In [None]:
## Sale price against lot area (top) and against year built (bottom).
## Prices and lot areas are divided by 10,000 for readable axes.
sale_price_10k = housing_data_train["SalePrice"] / 10000
lot_area_10k = housing_data_train["LotArea"] / 10000

fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(10, 6))
fig.subplots_adjust(hspace=0.5)  # extra vertical space between the two axes

scatter = ax1.scatter(x=lot_area_10k, y=sale_price_10k, c="blue")
ax1.set(
    xlim=(0, 13),
    ylim=(0, 100),
    title="relationship between Lot area & sale price",
    xlabel="Lot area(10K)",
    ylabel="saleprice(10K)$",
)

scatter = ax2.scatter(x=housing_data_train["YearBuilt"], y=sale_price_10k, c="g")
ax2.set(
    xlim=(1870, 2015),
    ylim=(0, 60),
    title="relationship between Yearbuilt and sale price",
    xlabel="yearbuilt",
    ylabel="saleprice(10K)$",
);



In [None]:
# Work on a copy so the helper columns below never reach the training frame.
housing_data_train_copy = housing_data_train.assign(**{
    "LotArea/10k": housing_data_train["LotArea"] / 10000,
    "SalePrice/1k": housing_data_train["SalePrice"] / 1000,
})

# Bubble chart: x = lot area, y = year built; marker size and colour both
# encode the sale price.
jet = plt.get_cmap("jet")
housing_data_train_copy.plot.scatter(
    x="LotArea/10k",
    y="YearBuilt",
    s=housing_data_train_copy["SalePrice/1k"],
    c=housing_data_train_copy["SalePrice"],
    cmap=jet,
    colorbar=True,
    alpha=0.4,
    xlim=(0, 5),
    figsize=(16, 10),
);

## Cleaning the data

In [None]:
# Stand-alone median imputer, fitted below on the numeric columns. The numeric
# pipeline defined further down creates its own SimpleImputer, so this
# instance is not referenced again in the code shown here.
imputer = SimpleImputer(strategy="median")

In [None]:
# Numeric columns only, with the target (SalePrice) removed.
housing_num=train_set.select_dtypes(include=["number"]).drop("SalePrice",axis=1)

In [None]:
# Learn each numeric column's median.
imputer.fit(housing_num)

## Create full pipeline to process numerical & categorical data

In [None]:
# Numeric preprocessing: fill missing values with the column median, then
# standardize each column.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("standard scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

In [None]:
# Names of the object-dtype columns (treated as categorical), and how many
# of them there are.
cat_attributes=train_set.select_dtypes(include=["object"]).columns
len(cat_attributes)

In [None]:
# Column names that go through the numeric pipeline.
num_attributes = list(housing_num)

# Numeric columns are imputed and scaled; categorical columns are one-hot
# encoded. Columns not listed in either group are dropped (ColumnTransformer's
# default).
full_pipeline_train = ColumnTransformer([
    ("num", num_pipeline, num_attributes),
    # handle_unknown="ignore": a category that was not present when the
    # encoder was fitted is encoded as all zeros instead of raising, so the
    # fitted pipeline can later be applied to other data (e.g. the test set).
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attributes),
])

housing_prepared = full_pipeline_train.fit_transform(train_set)
housing_prepared

In [None]:
# cat_attributes_test=cros_val.select_dtypes(include=["object"]).columns
# num_attributes_test = list(housing_num_test)

# print("//////", len(cat_attributes_test), len(cat_attributes))

# full_pipeline_test = ColumnTransformer([
#     ("num_test", num_pipeline, num_attributes_test),
#     ("cat_test", OneHotEncoder(), cat_attributes_test)
# ])

# housing_prepared_test = full_pipeline_test.fit_transform(cros_val)
# housing_prepared_test

# # print("///////", OneHotEncoder(cat_attributes_test).shape)

In [None]:
# Wrap the preprocessed matrix in a sparse-backed DataFrame and show the first
# five rows transposed. from_spmatrix expects a SciPy sparse matrix, so this
# assumes the ColumnTransformer returned sparse output.
df_housing_prepared = pd.DataFrame.sparse.from_spmatrix(housing_prepared)
df_housing_prepared.head().T

In [None]:
# df_housing_prepared_test = pd.DataFrame.sparse.from_spmatrix(housing_prepared_test)
# df_housing_prepared_test.head().T

## Build model

### Build simple models

In [None]:
# Regression target: the sale price of every training row.
y_train = train_set["SalePrice"]

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary least squares on the preprocessed training features.
reg = LinearRegression()
reg.fit(df_housing_prepared, y_train)

In [None]:
# Number of columns in the test frame.
len(housing_data_test.columns)

In [None]:
# Number of columns in the training frame.
len(housing_data_train.columns)

In [None]:
# Number of rows in the training frame.
len(housing_data_train)

In [None]:
# test_preprocessed = full_pipeline_test.fit_transform(cros_val)
# df_housing_prepared_test = pd.DataFrame(test_preprocessed)
# print(df_housing_prepared_test.shape)

In [None]:
# Fit the preprocessing on the training set again and wrap the result in a
# sparse-backed DataFrame; the later model cells train on this frame.
train_preprocessed = full_pipeline_train.fit_transform(train_set)
df_housing_prepared_train = pd.DataFrame.sparse.from_spmatrix(train_preprocessed)

In [None]:
# In-sample predictions: these are the same training rows the model was
# fitted on.
prediction = reg.predict(df_housing_prepared_train)
prediction

In [None]:
# First five predictions next to the first five true prices.
prediction[:5], y_train[:5]

## Evaluation metrics

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# NOTE: despite its name, linreg_mse_error ends up holding the *root* mean
# squared error (the square root is applied to the MSE).
linreg_mse_error = np.sqrt(mean_squared_error(y_train, prediction))
linreg_mae_error = mean_absolute_error(y_train, prediction)

# Displayed as (MAE, RMSE).
linreg_mae_error, linreg_mse_error

In [None]:
# Mean of the target (average training sale price).
y_train.mean()

### Prepare test data

In [None]:
housing_prepared = full_pipeline_train.fit_transform(housing_data_test)
df_test= pd.DataFrame.sparse.from_spmatrix(train_preprocessed)
df_test.head().T

### Create function to evaluate in different ways 

In [None]:
from sklearn.metrics import *

def evaluate(model):
    test_preds = model.predict(df_test)
    scores = {
        "Test RMSE": np.sqrt(mean_squared_error(y_train, test_preds)),
        "Test MAE": mean_absolute_error(y_train, test_preds),
        "Test MSLE": mean_squared_log_error(y_train, test_preds),
        "Test R^2": r2_score(y_train, test_preds),
    }
    return scores

### Use advanced models

In [None]:
# RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor

# Fixed seed so the scores are reproducible between runs, in line with the
# decision-tree and MLP cells of this notebook, which also pin their seeds.
reg = RandomForestRegressor(random_state=42)
reg.fit(df_housing_prepared_train, y_train)

print(evaluate(reg))

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score

reg = DecisionTreeRegressor(random_state=42).fit(df_housing_prepared_train, y_train)

# 10-fold cross-validation. The scorer returns *negated* MSE, so the sign is
# flipped before taking the square root to obtain one RMSE per fold.
scores = cross_val_score(
    estimator=reg,
    X=df_housing_prepared_train,
    y=y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
tree_rmse_scores = np.sqrt(-scores)

# print(evaluate(reg))

In [None]:
def display_scores(scores):
    """Print the scores, then their mean and standard deviation.

    ``scores`` must provide ``.mean()`` and ``.std()`` (e.g. a NumPy array).
    """
    print("Scores:", scores)
    average = scores.mean()
    print("Mean:", average)
    spread = scores.std()
    print("Standard deviation:", spread)

# Summarize the per-fold RMSE values from the decision-tree cross-validation.
display_scores(tree_rmse_scores)

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression with default settings. Note this cell binds `regr`, not
# `reg` like the other model cells.
regr = Ridge().fit(df_housing_prepared_train, y_train)

print(evaluate(regr))

In [None]:
from sklearn.svm import LinearSVR

# Fixed seed so the scores are reproducible between runs, in line with the
# decision-tree and MLP cells of this notebook, which also pin their seeds.
reg = LinearSVR(random_state=42)
reg.fit(df_housing_prepared_train, y_train)

print(evaluate(reg))

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Fixed seed so the scores are reproducible between runs, in line with the
# decision-tree and MLP cells of this notebook, which also pin their seeds.
reg = GradientBoostingRegressor(random_state=42)
reg.fit(df_housing_prepared_train, y_train)

print(evaluate(reg))

In [None]:
from xgboost import XGBRegressor

# XGBoost regressor with library defaults, fitted on the training features.
reg = XGBRegressor().fit(df_housing_prepared_train, y_train)

print(evaluate(reg))

In [None]:
from sklearn.neural_network import MLPRegressor

# Multi-layer perceptron; seed pinned and iteration cap set to 500.
reg = MLPRegressor(random_state=1, max_iter=500)
reg.fit(df_housing_prepared_train, y_train)

print(evaluate(reg))

In [None]:
from sklearn.ensemble import AdaBoostRegressor

# Fixed seed so the scores are reproducible between runs, in line with the
# decision-tree and MLP cells of this notebook, which also pin their seeds.
reg = AdaBoostRegressor(random_state=42).fit(df_housing_prepared_train, y_train)

print(evaluate(reg))

## Tuning the parameters
### Our best-performing models are XGBoost and RandomForestRegressor (RFR), so we will tune these two

In [None]:
# from sklearn.model_selection import RandomizedSearchCV

# parameter_space = \
# {
# "max_depth": [4, 5, 6, 7],
# "learning_rate": [0.005, 0.01 ,0.1, 1],
# "n_estimators": [700, 1000, 2500, 3000],
# "booster": ["gbtree",],
# "gamma": [7, 25, 100,200],
# "subsample": [0.3, 0.6, 1],
# "colsample_bytree": [0.5, 0.7,2],
# "colsample_bylevel": [0.5, 0.7,3],
# "reg_alpha": [0.5, 1, 10, 33,100],
# "reg_lambda": [1, 3, 10,25],
# }
# reg = RandomizedSearchCV(XGBRegressor(random_state=3),
# parameter_space, cv=5, n_jobs=-1,
# scoring="neg_mean_squared_error",
# random_state=3, n_iter=20)

# reg.fit(df_housing_prepared_train,y_train)

# print(reg.best_params_)
# regXGBR= XGBRegressor(**reg.best_params_)
# regXGBR.fit(df_housing_prepared_train,y_train)
# my_model_pred= regXGBR.predict(df_test)

# print(evaluate(regXGBR))

In [None]:
## use grid search
from sklearn.model_selection import GridSearchCV

# NOTE: this search is exhaustive. The lists below multiply out to 138,240
# parameter combinations; with cv=10 that is 1,382,400 model fits, each with
# 700-3000 trees. Consider trimming the grid before running this cell.
parameter_space = {
    "max_depth": [4, 5, 6, 7],
    "learning_rate": [0.005, 0.01, 0.1, 1],
    "n_estimators": [700, 1000, 2500, 3000],
    "booster": ["gbtree"],
    "gamma": [7, 25, 100, 200],
    "subsample": [0.3, 0.6, 1],
    # colsample_* are column *fractions* and must lie in (0, 1]; XGBoost
    # rejects larger values, so the upper candidate is 1 (use all columns).
    "colsample_bytree": [0.5, 0.7, 1],
    "colsample_bylevel": [0.5, 0.7, 1],
    "reg_alpha": [0.5, 1, 10, 33, 100],
    "reg_lambda": [1, 3, 10, 25],
}
reg = GridSearchCV(
    XGBRegressor(random_state=3),
    parameter_space,
    cv=10,
    n_jobs=-1,
    scoring="neg_mean_squared_error",
)

reg.fit(df_housing_prepared_train, y_train)

print(reg.best_params_)
# GridSearchCV refits the best candidate on the whole training set by default
# (refit=True). Reusing it avoids a second, redundant fit and keeps the
# random_state=3 that the search itself was run with.
regXGBR = reg.best_estimator_
# df_test is expected to hold the preprocessed *test-set* features, since
# these predictions feed the submission file. TODO confirm before submitting.
my_model_pred = regXGBR.predict(df_test)

print(evaluate(regXGBR))

### Submit to Kaggle

In [None]:
import os
import warnings

# The submission needs exactly one prediction per test row. A different count
# usually means the predictions were not made on the test features, so flag it
# instead of silently slicing it away.
n_test_rows = len(housing_data_test)
if len(my_model_pred) != n_test_rows:
    warnings.warn(
        f"my_model_pred has {len(my_model_pred)} values but the test set has "
        f"{n_test_rows} rows; only the first {n_test_rows} predictions are "
        "used. Check that the predictions were made on the test features."
    )
my_submission = pd.DataFrame({'Id': housing_data_test.Id, 'SalePrice': my_model_pred[:n_test_rows]})

path = "./submission.csv"
# An existing submission file is never overwritten; say so instead of doing
# nothing silently. to_csv opens the file itself, so no separate open() is
# needed.
if not os.path.exists(path):
    my_submission.to_csv(path, index=False)
else:
    print(f"{path} already exists; not overwriting it.")