# Aprendizaje de Máquina - Predicción de precios de casas
- Source: Ames, Iowa: Alternative to the Boston Housing Data as an End of Semester Regression Project

# Imports

#### Python libraries

In [1]:
import pandas as pd
pd.set_option('display.max_columns', 50)

import numpy as np

import sys

import seaborn as sns

import matplotlib.pyplot as plt

from sklearn import (
    linear_model,
    datasets
)

from sklearn.model_selection import (
    train_test_split,
    cross_val_score
)

from sklearn.preprocessing import (
    OrdinalEncoder,
    StandardScaler,
    OneHotEncoder
)

from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline

from sklearn.metrics import (
    mean_squared_error,
    mean_squared_log_error
)

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

#### Ancillary modules

In [2]:
%load_ext autoreload
%autoreload 2

sys.path.append("../utils")

## Functions module
from houses_funcs import (
    json_dump_dict, 
    features_dictrionary,
    clean_col_names,
    data_profiling_numeric,
    data_profiling_categ,
    display_scores,
    clean_data,
    format_predicts,
    lists_by_type_of_var
)

## Parameters module
from houses_params import (
    features_dict,
    data_path_from_main,
    train_data,
    test_data
)

## ML module
from houses_ml import(
    num_pipeline,
    cat_pipeline,
    select_model
)

# Import datasets

In [3]:
## Sample submission: reference file whose "SalePrice" column is compared
## against this notebook's own predictions in the final evaluation cells.
df_samplesub = pd.read_csv("../data/sample_submission.csv")
df_samplesub

Unnamed: 0,id,SalePrice
0,1,180401.23
1,2,180401.23
2,3,180401.23
3,4,180401.23
4,5,180401.23
...,...,...
1198,1199,180401.23
1199,1200,180401.23
1200,1201,180401.23
1201,1202,180401.23


In [4]:
## Test dataset (houses to predict; the rendered frame ends with an `id`
## column and has no "SalePrice" column).
df_test = pd.read_csv("../data/casas_prueba.csv")
df_test

Unnamed: 0,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,Neighborhood,Condition 1,Condition 2,Bldg Type,House Style,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Roof Style,Roof Matl,Exterior 1st,Exterior 2nd,Mas Vnr Type,...,Fireplaces,Fireplace Qu,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Garage Qual,Garage Cond,Paved Drive,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,id
0,60,RL,,10316,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,,...,1,TA,Attchd,2000.0,RFn,3,839,TA,TA,Y,0,184,0,0,0,0,,,,0,6,2008,WD,Normal,1
1,120,RL,43.0,7052,Pave,,IR1,Lvl,AllPub,Inside,Gtl,NridgHt,Norm,Norm,TwnhsE,1Story,7,5,2005,2005,Gable,CompShg,VinylSd,VinylSd,Stone,...,1,Gd,Attchd,2005.0,RFn,2,484,TA,TA,Y,192,36,0,0,0,0,,,,0,6,2006,WD,Normal,2
2,20,RL,76.0,8243,Pave,,IR1,Lvl,AllPub,CulDSac,Gtl,NAmes,Norm,Norm,1Fam,1Story,5,7,1961,1961,Gable,CompShg,VinylSd,VinylSd,BrkFace,...,1,Fa,Detchd,1985.0,Fin,2,784,TA,TA,Y,170,0,0,0,0,0,,GdPrv,,0,2,2007,WD,Normal,3
3,60,RL,,18275,Pave,,IR1,HLS,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,2Story,7,8,1962,1998,Gable,CompShg,Plywood,Plywood,,...,2,Gd,Attchd,1962.0,RFn,2,441,TA,TA,Y,520,102,0,0,0,0,,,,0,9,2006,WD,Normal,4
4,20,RL,100.0,15263,Pave,,IR1,Lvl,AllPub,Inside,Gtl,ClearCr,Feedr,Norm,1Fam,1Story,5,5,1959,1959,Gable,CompShg,HdBoard,HdBoard,BrkFace,...,2,Gd,Attchd,1959.0,Unf,1,365,TA,TA,Y,0,132,0,0,0,0,,,,0,5,2010,WD,Normal,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1198,20,RL,90.0,10454,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,1957,1957,Hip,CompShg,Plywood,Plywood,Stone,...,0,,Detchd,1957.0,Unf,1,308,TA,TA,Y,0,0,0,0,0,0,,,,0,4,2009,WD,Normal,1199
1199,120,RL,51.0,3635,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Blmngtn,Norm,Norm,TwnhsE,1Story,7,5,2007,2007,Hip,CompShg,VinylSd,VinylSd,BrkFace,...,1,TA,Attchd,2007.0,RFn,3,660,TA,TA,Y,143,20,0,0,0,0,,,,0,5,2009,WD,Normal,1200
1200,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,Norm,1Fam,1.5Fin,5,5,1993,1995,Gable,CompShg,VinylSd,VinylSd,,...,0,,Attchd,1993.0,Unf,2,480,TA,TA,Y,40,30,0,320,0,0,,MnPrv,Shed,700,10,2009,WD,Normal,1201
1201,20,RL,60.0,11664,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Artery,Norm,1Fam,1Story,6,5,1948,1950,Gable,CompShg,MetalSd,MetalSd,BrkFace,...,1,Gd,Detchd,1948.0,Unf,1,240,TA,TA,Y,0,130,0,0,0,0,,,,0,11,2007,WD,Normal,1202


In [5]:
## Training dataset (same attributes as the test data plus the "SalePrice"
## label column).
df_train = pd.read_csv("../data/casas_entrena.csv")
df_train

Unnamed: 0,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,Neighborhood,Condition 1,Condition 2,Bldg Type,House Style,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Roof Style,Roof Matl,Exterior 1st,Exterior 2nd,Mas Vnr Type,...,Fireplaces,Fireplace Qu,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Garage Qual,Garage Cond,Paved Drive,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,120,RM,,3072,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Blmngtn,Norm,Norm,TwnhsE,1Story,7,5,2004,2004,Hip,CompShg,VinylSd,VinylSd,BrkFace,...,1,TA,Attchd,2004.0,Fin,2,388,TA,TA,Y,143,20,0,0,0,0,,,,0,9,2006,WD,Normal,225000
1,120,RL,53.0,4045,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Blmngtn,Norm,Norm,TwnhsE,1Story,7,5,2006,2006,Hip,CompShg,VinylSd,VinylSd,BrkFace,...,1,Gd,Attchd,2006.0,Fin,3,648,TA,TA,Y,161,20,0,0,0,0,,,,0,10,2006,New,Partial,246578
2,120,RL,43.0,3013,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Blmngtn,Norm,Norm,TwnhsE,1Story,7,5,2005,2005,Gable,CompShg,VinylSd,VinylSd,BrkFace,...,1,Gd,Attchd,2005.0,Fin,2,440,TA,TA,Y,142,20,0,0,0,0,,,,0,4,2006,WD,Normal,213490
3,120,RL,53.0,3922,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Blmngtn,Norm,Norm,TwnhsE,1Story,7,5,2006,2007,Gable,CompShg,WdShing,Wd Shng,BrkFace,...,1,Gd,Attchd,2007.0,Fin,3,648,TA,TA,Y,144,16,0,0,0,0,,,,0,6,2007,New,Partial,172500
4,120,RL,,3196,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Blmngtn,Norm,Norm,TwnhsE,1Story,8,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,...,1,TA,Attchd,2003.0,Fin,2,400,TA,TA,Y,143,20,0,0,0,0,,,,0,5,2006,WD,Normal,215000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1462,20,RL,80.0,12000,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Veenker,Norm,Norm,1Fam,1Story,7,6,1980,1980,Hip,CompShg,VinylSd,MetalSd,BrkFace,...,1,TA,Attchd,1980.0,RFn,2,546,Gd,TA,Y,180,16,0,0,0,0,,,,0,3,2007,WD,Normal,255000
1463,20,RL,,14694,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Veenker,Norm,Norm,1Fam,1Story,8,9,1977,2008,Gable,CompShg,MetalSd,MetalSd,BrkFace,...,1,Gd,Attchd,1977.0,Fin,2,642,TA,TA,Y,501,120,0,225,0,0,,,,0,6,2009,WD,Normal,318750
1464,60,RL,,19522,Pave,,IR1,Bnk,AllPub,Inside,Gtl,Veenker,Norm,Norm,1Fam,2Story,7,5,1990,1990,Gable,CompShg,HdBoard,HdBoard,BrkFace,...,1,TA,Attchd,1990.0,RFn,2,564,TA,TA,Y,0,99,0,0,182,0,,,,0,2,2007,WD,Normal,300000
1465,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,...,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500


In [6]:
## Name of feature that will be predicted.
## `predict_feature` is reset first so that a stale value left in the kernel
## by an earlier run cannot be picked up when no feature is flagged.
predict_feature = None
for feat in features_dict:
    ## The exact `== True` comparison is kept on purpose so the set of
    ## matching entries is the same as before.
    if features_dict[feat]["ml_label"] == True:  # noqa: E712
        ## When several features are flagged, the last one wins.
        predict_feature = feat

## Fail here, with a clear message, rather than later with a NameError.
if predict_feature is None:
    raise ValueError('No feature in `features_dict` has "ml_label" set to True.')

---

In [7]:
## Categories of "Exter Cond" present in the training data.
df_train["Exter Cond"].unique()

array(['TA', 'Fa', 'Gd', 'Po', 'Ex'], dtype=object)

In [8]:
## Categories of "Exter Cond" present in the test data (to compare with the
## training categories).
df_test["Exter Cond"].unique()

array(['TA', 'Gd', 'Fa', 'Ex'], dtype=object)

# Initial EDA + GEDA

#### What does the correlation matrix look like?

In [None]:
## Correlation of every numeric column with the sale price, strongest first.
## The frame also holds text columns (e.g. "MS Zoning"); they are filtered out
## explicitly because recent pandas versions raise when `corr` meets them,
## while older versions silently ignored them.
df_train.select_dtypes(include=["number", "bool"]).corr()["SalePrice"].sort_values(ascending=False)

#### How does the "Sale Condition" attribute affect the Sale Price?

In [None]:
## Columns needed to relate living area and sale price by sale condition.
rc = ["Sale Condition", "SalePrice", "Gr Liv Area"]

## Independent copy of the training data restricted to those columns.
dfx = df_train[rc].copy()

## Scatter plot with one fitted regression line per sale condition.
sns.lmplot(x="Gr Liv Area", y="SalePrice", hue="Sale Condition", data=dfx, height=15)

#### Does the "MS Zoning" column have a relevant influence on the sale price?

In [None]:
## Columns needed to relate living area and sale price by zoning class.
rc = ["MS Zoning", "SalePrice", "Gr Liv Area"]

## Independent copy of the training data restricted to those columns.
dfx = df_train[rc].copy()

## Living area vs. sale price, coloured by zoning class.
f, ax = plt.subplots(figsize=(15, 10))
sns.scatterplot(x="Gr Liv Area", y="SalePrice", hue="MS Zoning", data=dfx, ax=ax)

In [None]:
## Distribution of sale prices per zoning class.
## Reuses the `dfx` frame built in the previous cell.
f, ax = plt.subplots(figsize=(15, 10))
sns.boxplot(x="MS Zoning", y="SalePrice", data=dfx, ax=ax)

#### Is "Lot Area" a good predictor of sale price?

In [None]:
## Columns needed to relate lot area and sale price by zoning class.
rc = ["Lot Area", "SalePrice", "MS Zoning"]

## Independent copy of the training data restricted to those columns.
dfx = df_train[rc].copy()

## Lot area vs. sale price, coloured by zoning class.
f, ax = plt.subplots(figsize=(15, 10))
sns.scatterplot(x="Lot Area", y="SalePrice", hue="MS Zoning", data=dfx, ax=ax)

# Data preparation

## Simplifying data based on definitions dictionary

In [9]:
## Clean the training data according to the definitions dictionary and, in
## the same step, attach the training labels (joined on the row index).
housingc = clean_data(df_train).join(df_train[predict_feature])

In [10]:
## Cleaned training frame: selected features plus the label column.
housingc

Unnamed: 0,MS Zoning,Lot Area,Street,Overall Qual,Year Built,Exter Qual,Exter Cond,Total Bsmt SF,1st Flr SF,Gr Liv Area,Garage Cars,Garage Area,SalePrice
0,0,3072,Pave,7,2004,Gd,TA,1365,1548,1548,2,388,225000
2,1,3013,Pave,7,2005,Gd,TA,1362,1506,1506,2,440,213490
4,1,3196,Pave,8,2003,Gd,TA,1273,1456,1456,2,400,215000
6,1,3182,Pave,7,2007,Gd,TA,1266,1266,1266,2,388,159895
7,1,3203,Pave,7,2006,Gd,TA,1145,1145,1145,2,437,160000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1462,1,12000,Pave,7,1980,Gd,TA,2002,2362,2362,2,546,255000
1463,1,14694,Pave,8,1977,Ex,Ex,1694,1694,1694,2,642,318750
1464,1,19522,Pave,7,1990,Gd,TA,1223,1271,2503,2,564,300000
1465,1,9600,Pave,6,1976,TA,TA,1262,1262,1262,2,460,181500


### Separating training data from labels.

In [11]:
## Training labels, read from the raw training frame for the rows kept in
## `housingc`. Reading them from `df_train` (instead of from `housingc`) lets
## this cell be re-run after the label column has already been removed.
housingc_labs = df_train.loc[housingc.index, predict_feature]

## Training data: rebind instead of dropping in place. `errors="ignore"`
## makes a second execution a no-op instead of a KeyError.
housingc = housingc.drop(columns=predict_feature, errors="ignore")

### Reviewing information about this data

In [None]:
## Reviewing obtained dataframe: column dtypes and non-null counts of the
## training features (labels already separated).
housingc.info()

In [None]:
## Number of training rows per "MS Zoning" value after cleaning.
housingc["MS Zoning"].value_counts()

In [None]:
## Number of training rows per "Exter Qual" category after cleaning.
housingc["Exter Qual"].value_counts()

### Specifying numerical and categorical columns

In [12]:
## Lists of numerical and categorical column names, derived from the
## features dictionary. The helper also prints both lists.
housingc_num, housingc_cat = lists_by_type_of_var(features_dict)

Numerical columns: ['Lot Area', 'Overall Qual', 'Year Built', 'Total Bsmt SF', '1st Flr SF', 'Gr Liv Area', 'Garage Cars', 'Garage Area']

Categorical columns: ['MS Zoning', 'Exter Qual', 'Exter Cond']




In [13]:
## "Exter Cond" is left out of the categorical features because its set of
## categories differs between the training and the test data.
## A new list is built (instead of calling `.remove`) so that this cell can be
## re-run without a ValueError and the list returned by
## `lists_by_type_of_var` is not mutated.
housingc_cat = [col for col in housingc_cat if col != "Exter Cond"]

## Creating and executing pipelines

### Full pipeline

In [15]:
## Route each group of columns through its own preprocessing pipeline.
## With the default settings of ColumnTransformer, columns listed in neither
## group are dropped from the output.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, housingc_num),
        ("cat", cat_pipeline, housingc_cat),
    ]
)

### Preparing data

In [16]:
## Fit the preprocessing pipeline on the training features and transform
## them. This is the matrix every model below is trained on.
housingc_prp = full_pipeline.fit_transform(housingc)

---

## Training and evaluating models

### Simple evaluation

#### Linear regression

In [None]:
## Defining and training model.
## The model is obtained through `select_model("lr")`, which replaced the
## former direct `LinearRegression()` construction.
lin_reg = select_model("lr")
lin_reg.fit(housingc_prp, housingc_labs)

In [None]:
## Evaluating error with MSLE (mean squared log error), measured on the same
## data the model was trained on.
## `abs` turns negative predictions into positive ones: MSLE cannot be
## computed on negative values and the linear model produced one.
housingc_predicts = lin_reg.predict(housingc_prp)
lin_msle = mean_squared_log_error(housingc_labs, abs(housingc_predicts))
lin_msle

## NOTA:
#### Hay un valor que salió negativo y lo estoy cambiando a positivo arbitrariamente.

#### Decision tree regressor

In [None]:
## Defining and training model (obtained through `select_model("dt")`).
tree_reg = select_model("dt")
tree_reg.fit(housingc_prp, housingc_labs)

In [None]:
## Evaluating error with MSLE (mean squared log error), measured on the same
## data the model was trained on.
housingc_predicts = tree_reg.predict(housingc_prp)
tree_msle = mean_squared_log_error(housingc_labs, housingc_predicts)
tree_msle

#### Random forest

In [None]:
## Defining and training model (obtained through `select_model("rf")`).
forest_reg = select_model("rf")
forest_reg.fit(housingc_prp, housingc_labs)

In [None]:
## Evaluating error with MSLE (mean squared log error), measured on the same
## data the model was trained on.
housingc_predicts = forest_reg.predict(housingc_prp)
forest_msle = mean_squared_log_error(housingc_labs, housingc_predicts)
forest_msle

### Using cross validation

In [None]:
## Linear regression
## The features are passed exactly as produced by the preprocessing pipeline:
## taking their absolute value would make the cross-validated models train on
## different inputs from the ones used everywhere else in the notebook.
## MSLE cannot be computed on negative predictions, so the `abs` workaround
## used in the in-sample evaluation is applied to the *predictions* instead.
def neg_msle_abs_predictions(estimator, X, y):
    """Return the negated MSLE of `estimator` on (X, y), using |predictions|."""
    return -mean_squared_log_error(y, np.abs(estimator.predict(X)))


lin_cv_scores = cross_val_score(lin_reg,
                                housingc_prp,
                                housingc_labs,
                                scoring=neg_msle_abs_predictions,
                                cv=10)
## Scores are negated MSLE values; flip the sign back before displaying.
display_scores(-lin_cv_scores)

In [None]:
## Decision tree
## The features are passed exactly as produced by the preprocessing pipeline:
## taking their absolute value would make the cross-validated models train on
## different inputs from the ones used everywhere else in the notebook.
tree_cv_scores = cross_val_score(tree_reg,
                                 housingc_prp,
                                 housingc_labs,
                                 scoring="neg_mean_squared_log_error",
                                 cv=10)
## Scores are negated MSLE values; flip the sign back before displaying.
display_scores(-tree_cv_scores)

In [None]:
## Random forest
## The features are passed exactly as produced by the preprocessing pipeline:
## taking their absolute value would make the cross-validated models train on
## different inputs from the ones used everywhere else in the notebook.
forest_cv_scores = cross_val_score(forest_reg,
                                   housingc_prp,
                                   housingc_labs,
                                   scoring="neg_mean_squared_log_error",
                                   cv=10)
## Scores are negated MSLE values; flip the sign back before displaying.
display_scores(-forest_cv_scores)

## Predictions on test data

### Initial cleaning of data

In [None]:
## Cleaning data based on definitions dictionary.
## NOTE(review): on the training data `clean_data` returned fewer rows than
## it received (index labels such as 1 and 3 are absent from its output). If
## it also drops rows here, the predictions will no longer line up one-to-one
## with the ids of the sample submission. Confirm in houses_funcs.
housingc_test = clean_data(df_test)

### Passing test data through constructed pipeline

In [None]:
## Apply the preprocessing that was already fitted on the training data.
## Calling `fit_transform` here would re-learn the transformers from the test
## set, so the model would receive features prepared differently from the ones
## it was trained on (and information from the test set would shape them).
housingc_test_prp = full_pipeline.transform(housingc_test)

### Making predictions with constructed model

In [None]:
## Predict sale prices for the test houses with the trained random forest.
housingc_test_predicts = forest_reg.predict(housingc_test_prp)

### Formatting predictions

In [None]:
## Put the raw predictions into the submission layout.
housingc_test_pred_form = format_predicts(housingc_test_predicts)
## Export is kept disabled; uncomment to write the submission file.
# housingc_test_pred_form.to_csv("sub_rp_1022_v3.csv")

In [None]:
## Formatted predictions for the test houses.
housingc_test_pred_form

### Comparing with other results

In [None]:
## Put the sample submission and this notebook's predictions side by side.
## The join is made on the row index; columns of the predictions frame whose
## names clash with the sample submission receive the "_Robs" suffix.
df_final_eval = df_samplesub.join(housingc_test_pred_form, rsuffix="_Robs")
df_final_eval

In [None]:
## Keep only the two price columns: the reference one and the predicted one.
price_columns = ["SalePrice", "SalePrice_Robs"]
df_final_eval = df_final_eval[price_columns]
df_final_eval

In [None]:
## Plot both price columns against the row index.
df_final_eval.plot()

---

# Notes

## Bugs to solve

- [ ] The code to eliminate "SalePrice" from numerical features is "hard-coded".
- [ ] No estoy seguro si estoy cayendo en data leaking al momento de trabajar con los datos de prueba. No se si sí debería estar aplicando mi función de clean.
- [ ] Uno de los features ("Exter Cond") tiene diferente número de categorías en el entrenamiento que en la prueba. Una posible solución podría ser convertir esas categorías en números. (Por lo pronto se va a eliminar).

## Possible transformations

- Price as logarithm

## Features dictionary

#### Creating dictionary

In [None]:
## Current contents of the features dictionary, before adding the notes.
features_dict

In [None]:
## Give every feature entry a "notes" field holding the placeholder "-".
## Any existing note is overwritten.
for feature_spec in features_dict.values():
    feature_spec["notes"] = "-"

In [None]:
## Features dictionary after adding the "notes" field.
features_dict

In [None]:
## Hand the modified dictionary to `json_dump_dict`.
## Presumably this persists it as JSON; confirm the target file in houses_funcs.
json_dump_dict(features_dict)

#### Modifying dictionary

In [None]:
## Current contents of the features dictionary, before modifying it.
features_dict

In [None]:
## Set the "ml_label" flag of every feature to False.
## NOTE(review): this also clears the flag of the feature that the
## `predict_feature` cell near the top of the notebook looks for. If the
## dictionary is then persisted, a later run may find no label feature at all.
## Run this cell only on purpose.
for key in features_dict:
    features_dict[key]["ml_label"] = False

In [None]:
## Features dictionary after resetting the "ml_label" flags.
features_dict

In [None]:
## Hand the modified dictionary to `json_dump_dict`.
## Presumably this persists it as JSON; confirm the target file in houses_funcs.
json_dump_dict(features_dict)

---

## Test 1
- SkLearn Linear Regression (Housing Prices Example): https://www.youtube.com/watch?v=JTj-WgWLKFM

In [None]:
## Load the Boston housing dataset bundled with scikit-learn.
## NOTE(review): `load_boston` appears to be unavailable in recent
## scikit-learn releases; confirm against the installed version.
boston = datasets.load_boston()

In [None]:
## Inspect the loaded dataset object.
boston

In [None]:
## Features as a frame with named columns.
df_x = pd.DataFrame(boston.data, columns=boston.feature_names)
## Target as a one-column frame. No column name is given, so the column is
## labelled 0.
df_y = pd.DataFrame(boston.target)

In [None]:
## Summary statistics of the features.
df_x.describe()

In [None]:
## Plain linear regression model for this test.
reg = linear_model.LinearRegression()

In [None]:
## Hold out 20% of the rows for testing; `random_state` fixes the split so it
## is reproducible.
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=4)

In [None]:
## Train the model on the training split.
reg.fit(x_train, y_train)

In [None]:
## Coefficients of the fitted linear model.
reg.coef_

In [None]:
## Predictions for the held-out split.
a = reg.predict(x_test)

In [None]:
## Fifth prediction (position 4) of the held-out split.
a[4]

In [None]:
## NOTE(review): `y_test` is a DataFrame whose only column is labelled 0, so
## this selects the whole target column of the held-out split, not a single
## row. Use `.iloc[...]` if one observation was intended.
y_test[0]

In [None]:
# MSE (mean squared error) of the predictions on the held-out split
np.mean((a - y_test)**2)

---