### House Pricing Prediction 

In [None]:
## Loading Generic Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#### Loading Federal Reserve Data

In [None]:
# Federal Reserve series, listed in the order their columns are later
# labelled "interest", "vacancy" and "cpi".
fed_files = [
    "MORTGAGE30US.csv",
    "RRVRUSQ156N.csv",
    "CPIAUCSL.csv",
]
# Use the first CSV column as the index and let pandas parse it as dates.
dfs = [
    pd.read_csv(csv_name, index_col=0, parse_dates=True)
    for csv_name in fed_files
]

In [None]:
dfs[0]

In [None]:
dfs[1]

In [None]:
dfs[2]

In [None]:
# Place the three series side by side as columns, aligned on their date index.
fed_data = pd.concat(dfs, axis="columns")

In [None]:
fed_data

In [None]:
fed_data.tail(50)

In [None]:
# Forward-fill: each missing cell takes the most recent earlier value in its
# column.  The column-wise concat leaves NaN wherever a series has no row for
# a date (presumably because the series are published at different
# frequencies -- confirm).
fed_data= fed_data.ffill()

In [None]:
fed_data.tail(50)

#### Loading Zillow House Price Data

In [None]:
# Zillow exports (per the file names: a weekly median sale price file and a
# monthly ZHVI file).  `dfs` is reused here; the Federal Reserve frames have
# already been combined into `fed_data`.
zillow_files = [
    "Metro_median_sale_price_uc_sfrcondo_week.csv",
    "Metro_zhvi_uc_sfrcondo_tier_0.33_0.67_month.csv",
]
dfs = [pd.read_csv(csv_name) for csv_name in zillow_files]

In [None]:
dfs[0]

In [None]:
# Keep only the first row of each file, skip its first five columns, and turn
# what is left into a one-column DataFrame indexed by the remaining column
# labels.  NOTE(review): presumably the first row is the national aggregate
# and the skipped columns are region metadata -- confirm against the CSVs.
dfs = [df.iloc[0, 5:].to_frame() for df in dfs]

In [None]:
dfs[0]

In [None]:
dfs[1]

In [None]:
# Convert each frame's row labels to timestamps and add a "month" column with
# the calendar-month period of every row; "month" is the key the two frames
# are merged on afterwards.
for df in dfs:
    df.index = pd.to_datetime(df.index)
    df["month"] = df.index.to_period("M")

In [None]:
dfs[0]

In [None]:
# Pair every row of the first frame with the second frame's row for the same
# calendar month (default inner join on the "month" column).
price_data = pd.merge(dfs[0], dfs[1], on="month")

In [None]:
# Merging on a column discards the original index, so put the first frame's
# date index back.  NOTE(review): this assumes the merged frame has exactly
# one row per row of dfs[0], in the same order -- confirm; a length mismatch
# makes this assignment fail.
price_data.index = dfs[0].index

In [None]:
price_data

In [None]:
# The merge key is no longer needed; remove it in place and give the two
# remaining columns readable names.
price_data.drop(columns="month", inplace=True)
price_data.columns = ["price", "value"]

In [None]:
price_data

#### Preparing Data For Machine Learning

In [None]:
# Drop every row that still has a missing value after the forward fill
# (forward filling cannot fill rows that precede a column's first value).
fed_data = fed_data.dropna()

In [None]:
fed_data

In [None]:
fed_data.tail(20)

In [None]:
from datetime import timedelta

# Move every Federal Reserve timestamp forward by two days.
# NOTE(review): presumably this lines the dates up with the Zillow dates used
# by the index-on-index merge that follows -- confirm.
# Re-running this cell shifts the index by another two days.
fed_data.index += timedelta(days=2)

In [None]:
fed_data

In [None]:
# Join the Federal Reserve columns with the Zillow columns on their date
# indexes; only dates present in both frames survive (default inner join).
price_data = pd.merge(fed_data, price_data, left_index=True, right_index=True)

In [None]:
price_data

#### Setting Up Machine Learning Target

In [None]:
# Positional rename: the first three names cover the Federal Reserve columns
# (in the order of `fed_files`), the last two are the Zillow price and value.
price_data.columns = ["interest", "vacancy", "cpi", "price", "value"]

In [None]:
price_data

In [None]:
price_data.plot.line(y="price", use_index=True)


In [None]:
 # Express the sale price relative to the CPI column (scaled by 100) so that
 # prices from different dates are comparable after inflation.
 price_data["adj_price"] = price_data["price"] / price_data["cpi"] * 100

In [None]:
price_data.plot.line(y="adj_price", use_index=True)

In [None]:
# Same CPI adjustment as for the price: value divided by CPI, scaled by 100.
price_data["adj_value"] = price_data["value"].div(price_data["cpi"]).mul(100)

In [None]:
price_data.plot.line(y="adj_value", use_index=True)

In [None]:
# For every row, look up the adjusted price 13 rows later (presumably one
# quarter of weekly rows -- confirm); the final 13 rows get NaN.
price_data["next_quarter"] = price_data["adj_price"].shift(periods=-13)

In [None]:
price_data

In [None]:
# Remove, in place, every row with a missing value.  This includes the final
# 13 rows, whose "next_quarter" is NaN after the shift(-13).
price_data.dropna(inplace=True)

In [None]:
price_data

In [None]:
# Binary label: 1 when the future adjusted price ("next_quarter") is strictly
# higher than the current adjusted price, otherwise 0.
price_data["change"] = price_data["next_quarter"].gt(price_data["adj_price"]).astype(int)

In [None]:
price_data

In [None]:
price_data["change"].value_counts()

In [None]:
# Feature columns given to the model.  The raw "price", "value" and "cpi"
# columns are left out; only their CPI-adjusted forms are used.
predictors = ["interest", "vacancy", "adj_price", "adj_value"]
# Label column: the 0/1 "change" flag.
target = "change"

#### Creating A Machine Learning Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Number of leading rows used only for training: the backtest makes its first
# prediction at this row position.  (260 = 5 * 52, presumably five years of
# weekly rows -- confirm.)
START = 260
# Number of rows predicted per backtest iteration before the model is refit
# on the enlarged history.
STEP = 52


def predict(train, test, predictors, target):
    """Fit a random forest on ``train`` and return its labels for ``test``.

    Args:
        train: DataFrame of rows to fit on.
        test: DataFrame of rows to predict.
        predictors: Column names used as features.
        target: Name of the label column in ``train``.

    Returns:
        The array produced by ``RandomForestClassifier.predict`` for the
        predictor columns of ``test``.
    """
    features, labels = train[predictors], train[target]
    # random_state is pinned so repeated runs build the same forest.
    model = RandomForestClassifier(min_samples_split=10, random_state=1)
    model.fit(features, labels)
    return model.predict(test[predictors])

#### Creating A Backtesting Engine

In [None]:
def backtest(data, predictors, target):
    """Walk forward through ``data`` and score out-of-sample predictions.

    The first ``START`` rows are used only for training.  From there the
    function repeatedly trains on every row before position ``i`` and
    predicts the following ``STEP`` rows, advancing ``i`` by ``STEP`` each
    time, so every prediction is made from earlier rows only.

    Args:
        data: DataFrame holding the predictor and target columns, in the
            order the rows should be walked.
        predictors: Column names passed to ``predict`` as features.
        target: Name of the label column.

    Returns:
        A ``(preds, accuracy)`` tuple: the concatenated predictions for
        ``data.iloc[START:]`` and their ``accuracy_score`` against the
        true labels of those rows.

    Raises:
        ValueError: If ``data`` has no rows after the first ``START``.
    """
    n_rows = data.shape[0]
    if n_rows <= START:
        # np.concatenate would fail on an empty list with a less helpful
        # message; report the actual cause instead.
        raise ValueError(
            f"backtest needs more than {START} rows, got {n_rows}"
        )

    all_preds = []
    for i in range(START, n_rows, STEP):
        # Slice the frame that was passed in, not the notebook-level
        # ``price_data`` global, so the function honours its argument.
        train = data.iloc[:i]
        test = data.iloc[i:(i + STEP)]
        all_preds.append(predict(train, test, predictors, target))

    preds = np.concatenate(all_preds)
    return preds, accuracy_score(data.iloc[START:][target], preds)

#### Measuring Error

In [None]:
preds, accuracy = backtest(price_data, predictors, target)

In [None]:
accuracy

#### Improving Our Accuracy

In [None]:
# Trailing mean of every column over the current row and up to 51 rows
# before it; min_periods=1 lets the earliest rows average over whatever
# history exists instead of producing NaN.
yearly = price_data.rolling(window=52, min_periods=1).mean()

In [None]:
yearly

In [None]:
# One "<predictor>_year" column per predictor: the current value divided by
# its trailing mean held in `yearly`.
yearly_ratios = [f"{name}_year" for name in predictors]
price_data[yearly_ratios] = price_data[predictors].div(yearly[predictors])

In [None]:
price_data

In [None]:
preds, accuracy = backtest(price_data, predictors + yearly_ratios, target)

In [None]:
accuracy

#### Running Diagnostics On Our Model

In [None]:
# Boolean Series (indexed like the predicted rows): True where the prediction
# equals the actual label for the rows from START onward.
pred_match = price_data[target].iloc[START:] == preds

In [None]:
## Red points mark rows where the model's prediction was wrong; green points
## mark rows where it was right.
# Switch to object dtype first: writing strings straight into a boolean
# Series is an incompatible-dtype assignment, which pandas has deprecated
# (it warns since 2.1 and is slated to raise).
pred_match = pred_match.astype(object)
# `== True` / `== False` are deliberate: after the first assignment the
# Series holds both strings and booleans, so plain truthiness cannot be used.
pred_match[pred_match == True] = "green"
pred_match[pred_match == False] = "red"

In [None]:
# Plot only the rows that were predicted (everything from START onward).
plot_data = price_data.iloc[START:].copy()
# The scatter relies on reset_index() exposing the dates as an "index"
# column; each point is coloured by the matching entry of `pred_match`.
plot_data.reset_index().plot.scatter(x="index", y="adj_price", color=pred_match)
# Call show(): the bare attribute `plt.show` only references the function and
# never renders the figure when the code runs outside an inline backend.
plt.show()

In [None]:
## Finding the most important variables on which our model depends
from sklearn.inspection import permutation_importance
# Fit a fresh forest on every row, using the base `predictors` only (the
# "*_year" ratio columns are not part of this list).
rf = RandomForestClassifier(min_samples_split = 10, random_state =1)
rf.fit(price_data[predictors], price_data[target])
# Permute each predictor 10 times (n_repeats) with a fixed seed.
# NOTE(review): importances are measured on the same rows the forest was
# fitted on, so they describe the fitted model rather than held-out
# performance -- confirm this is intended.
result = permutation_importance(rf, price_data[predictors],price_data[target], n_repeats=10, random_state =1  )

In [None]:
result["importances_mean"]

In [None]:
predictors