# Predicting Housing Data
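This notebook trains a random forest regressor on the processed housing data, compares several forest sizes with cross-validation, and exports the fitted model, its parameters, and the predictions for the test set.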

## Import Libraries

In [None]:
# DML
import json
import pickle
import pandas as pd

# ML
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

## Load data

In [None]:
# Load the processed training and testing splits from disk
df_train = pd.read_csv('../../data/processed/train.csv')
df_test = pd.read_csv('../../data/processed/test.csv')

## Fit Model

In [None]:
# Slice data: 'SalePrice' is the target, every other column is a feature
y = df_train['SalePrice']
X = df_train.drop(['SalePrice'], axis=1)

# Candidate numbers of trees to compare
param_grid = [
    100,
    200,
    300,
    400,
]

for n in param_grid:

    # Init model (fixed seed so the candidates are compared on equal terms)
    model = RandomForestRegressor(
        n_estimators=n,
        random_state=42
    )

    # CV Score
    # cross_val_score clones the estimator and fits the clone on each of the
    # 10 folds, so fitting `model` on the full data beforehand is wasted work
    # and is intentionally not done here. With `scoring` left unset, the
    # estimator's own `score` method is used (R^2 for a regressor).
    score = cross_val_score(
        estimator=model,
        X=X,
        y=y,
        cv=10
    )

    # Report the mean score across folds for this forest size
    print(
        f'Random Forest with {n} trees:',
        round(
            score.mean(),
            3
        )
    )

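Refit the best model, chosen from the CV scores above, on the full training set, then export its parameters and the fitted model.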

In [None]:
# Init model
# NOTE(review): n_estimators is hard-coded to 100; keep it in sync with the
# configuration selected from the cross-validation scores.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42
)

# Fit model on the full training set
model.fit(
    X=X,
    y=y
)

# Parameters to string
params = json.dumps(
    model.get_params()
)

# String to json
with open('../../results/model_params.json', 'w') as out:
    out.write(params)

# Export model as binary
# A context manager guarantees the handle is flushed and closed; passing a
# bare open(...) to pickle.dump leaves the file open until garbage collection.
with open('../../results/model.pkl', 'wb') as out:
    pickle.dump(model, out)

## Final Predictions

In [None]:
# Load only the 'Id' column of the raw test file to label the predictions
# NOTE(review): assumes the raw test rows are in the same order as the
# processed test rows in df_test - confirm against the preprocessing step.
df_pred = pd.read_csv('../../data/raw/test.csv', usecols=['Id'])

# Attach the model's predictions as the 'SalePrice' column
df_pred = df_pred.assign(SalePrice=model.predict(df_test))

# Write the submission file without the DataFrame index
df_pred.to_csv('../../results/predictions.csv', index=False)