In [None]:
import pandas as pd
from pathlib import Path
import numpy as np

path = Path("data/Metro_ZORI_AllHomesPlusMultifamily_Smoothed.csv")

In [None]:
def start_pipe(dataf:pd.DataFrame) -> pd.DataFrame:
    """Open a ``.pipe`` chain with a copy of ``dataf``.

    Returning a copy means later pipeline steps work on their own frame
    rather than on the object the caller passed in.
    """
    duplicate = dataf.copy()
    return duplicate

def remove_columns(dataf):
    """Drop the row labelled 0 and the ``RegionID`` / ``SizeRank`` columns.

    Returns a new frame; ``dataf`` itself is not modified.
    """
    without_first_row = dataf.drop(index=0)
    return without_first_row.drop(columns=["RegionID", "SizeRank"])

def parse_dates(dataf):
    """Return a copy of ``dataf`` whose ``Date`` column is parsed to datetimes.

    The ``infer_datetime_format`` flag is intentionally not passed: it is
    deprecated in pandas 2.0 (strict format inference is the default there)
    and only triggers a warning.
    """
    return dataf.assign(Date=pd.to_datetime(dataf['Date']))

def melt_df(dataf):
    """Reshape the wide frame to long format.

    ``RegionName`` is kept as the identifier; every other column becomes one
    row, with the former column label in ``Date`` and the cell value in
    ``RentIndex``. The ``Date`` values are left as-is (no parsing here).
    """
    return pd.melt(
        dataf,
        id_vars=["RegionName"],
        var_name="Date",
        value_name="RentIndex",
    )

def interpolate_data_and_lag(dataf, lag=12, targets=6):
    """Build a per-region monthly series with lag features and future targets.

    For every distinct ``RegionName`` (in order of first appearance) the rows
    are resampled to month-end means, gaps are linearly interpolated and any
    leading gap is back-filled. Then, per region:

    * ``t-i`` holds ``RentIndex`` shifted back ``i`` months and ``t-iDiff``
      holds ``RentIndex - t-i`` for ``i`` in ``1..lag``;
    * ``t+i`` holds ``RentIndex`` shifted forward ``i`` months for ``i`` in
      ``1..targets``;
    * ``TimeIndex`` is the 0-based position of the row within its region.

    Rows near the start/end of each region contain NaN in the lag/target
    columns because the shifted value does not exist.

    Parameters
    ----------
    dataf : DataFrame with ``RegionName``, ``Date`` (datetime) and ``RentIndex``.
    lag : number of lag / lag-difference column pairs to create.
    targets : number of forward-looking target columns to create.

    Returns
    -------
    DataFrame with the per-region frames concatenated under a fresh
    0..n-1 index.
    """
    # An offset object instead of the "M" alias: the alias is deprecated in
    # newer pandas, while the object form is accepted by every version.
    month_end = pd.offsets.MonthEnd()

    interpol = []
    for c in dataf['RegionName'].unique():
        mask = dataf['RegionName'] == c
        chunk = (dataf[mask]
                 # keyword form; the positional ``axis`` argument no longer
                 # exists in pandas 2.x
                 .drop(columns='RegionName')
                 .set_index('Date')
                 .resample(rule=month_end)
                 .mean()
                 .interpolate()
                 .bfill()
                 )
        for i in range(1, lag + 1):
            chunk[f't-{i}'] = chunk['RentIndex'].shift(i)
            chunk[f't-{i}Diff'] = chunk['RentIndex'] - chunk[f't-{i}']
        for i in range(1, targets + 1):
            chunk[f't+{i}'] = chunk['RentIndex'].shift(-i)
        chunk['RegionName'] = c
        # First reset turns the Date index into a column, the second turns the
        # resulting 0..n-1 positions into the per-region TimeIndex column.
        chunk = chunk.reset_index().reset_index().rename({"index": "TimeIndex"}, axis=1)
        interpol.append(chunk)

    return pd.concat(interpol, ignore_index=True)


def extract_month_and_year(dataf):
    """Add calendar features derived from the ``Date`` column.

    New columns:

    * ``MonthSin`` / ``MonthCos`` - month number mapped onto a 12-step circle,
      so December and January end up adjacent;
    * ``Year`` - calendar year;
    * ``Covid`` - True for every date from March 2020 onward.

    Returns a new frame; ``dataf`` is not modified.
    """
    month = dataf['Date'].dt.month
    year = dataf['Date'].dt.year
    angle = month * (2 * np.pi / 12)

    # "March 2020 or later". Testing year and month independently would mark
    # January/February of every later year as pre-Covid.
    covid = (year > 2020) | ((year == 2020) & (month >= 3))

    return dataf.assign(
        MonthSin=np.sin(angle),
        MonthCos=np.cos(angle),
        Year=year,
        Covid=covid,
        )

from category_encoders import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Module-level encoder/scaler instances. The pipeline cell further down
# creates fresh instances under these same two names before running.
encoder = OneHotEncoder()
scaler = StandardScaler()

def scale_and_encode(dataf, df_scaler, df_encoder, n_lag = 12, n_target=6):
    """Scale the lag/target columns and encode the frame.

    ``df_scaler`` is fitted on ``RentIndex`` only, and that single fit is then
    applied to every ``t-i``, ``t-iDiff`` and ``t+i`` column, so all of them
    share the ``RentIndex`` mean/scale. Because of that, the fitted scaler can
    later be used to map predictions for any ``t+i`` column back to the
    original units.

    The frame is then passed through ``df_encoder.fit_transform`` and the
    original ``RegionName`` column is re-attached afterwards so rows can
    still be identified.

    Parameters
    ----------
    dataf : DataFrame containing ``RentIndex``, ``RegionName`` and the
        lag/target columns for the given ``n_lag`` / ``n_target``.
    df_scaler : object with ``fit`` and ``transform`` (e.g. StandardScaler);
        fitted in place.
    df_encoder : object with ``fit_transform``; fitted in place.
    n_lag, n_target : number of lag and target columns present in ``dataf``.

    Returns
    -------
    The encoded frame with scaled columns. ``dataf`` itself is left untouched.
    """
    # NOTE(review): 'RentIndex' itself is not in this list and therefore stays
    # in original units, although the lags derived from it are scaled.
    # Confirm whether that is intended.
    col_to_scale = (
        [f't-{t}' for t in range(1, n_lag+1)]
        + [f't-{t}Diff' for t in range(1, n_lag+1)]
        + [f't+{t}' for t in range(1, n_target+1)]
    )

    # Work on a copy so the caller's frame is not modified.
    dataf = dataf.copy()

    # Fit and transform on plain arrays: the scaler is fitted on one column
    # and applied to differently named columns, which scikit-learn rejects
    # as a feature-name mismatch when DataFrames are passed.
    df_scaler.fit(dataf[['RentIndex']].to_numpy())

    for col in col_to_scale:
        dataf[col] = df_scaler.transform(dataf[[col]].to_numpy()).ravel()

    regions = dataf['RegionName']
    # Use the encoder that was passed in, not a module-level one.
    dataf = df_encoder.fit_transform(dataf)
    dataf['RegionName'] = regions
    return dataf

def drop_id_columns(dataf):
    """Strip the ``RegionName`` and ``Date`` columns, then discard every row
    that still contains a missing value.
    """
    features_only = dataf.drop(columns=['RegionName', 'Date'])
    return features_only.dropna()




In [None]:
# Read the raw CSV located at `path`.
df = pd.read_csv(path)

# Fresh encoder/scaler for this run. `scaler` is fitted inside the pipeline
# and reused by later cells to map predictions back to original units.
encoder = OneHotEncoder()
scaler = StandardScaler()

# Number of lag features and of forecast targets built per row.
n_lag, n_target = 12,6

# `cleaned` keeps RegionName and Date (and rows containing NaN) so results can
# be joined back to a region/date later on.
cleaned = (df.pipe(start_pipe)
        .pipe(remove_columns)
        .pipe(melt_df)
        .pipe(parse_dates)
        .pipe(interpolate_data_and_lag, lag=n_lag, targets=n_target)
        .pipe(extract_month_and_year)
        .pipe(scale_and_encode, scaler, encoder, n_lag=n_lag, n_target=n_target)
        )
# NOTE: `df` is rebound here from the raw CSV contents to the model-ready
# frame (id columns and incomplete rows removed).
df = (cleaned.pipe(drop_id_columns))
df

In [None]:
# Time-based split on the per-region TimeIndex.
max_time = df['TimeIndex'].max()

val_cutoff = int(max_time - 6)    # validation = time steps after this one
last_cutoff = int(max_time - 1)   # `last` = time steps after this one

# `<=` on the training side so the cutoff time step itself is kept: with a
# strict `<` here and a strict `>` below, rows at exactly the cutoff would
# belong to neither frame and be silently discarded.
# `columns=` keyword: the positional axis argument is gone in pandas 2.x.
train = df[df['TimeIndex'] <= val_cutoff].drop(columns='TimeIndex')
val = df[df['TimeIndex'] > val_cutoff].drop(columns='TimeIndex')
full = df[df['TimeIndex'] <= last_cutoff].drop(columns='TimeIndex')
last = df[df['TimeIndex'] > last_cutoff].drop(columns='TimeIndex')

In [None]:
def split_target(df, n_target=6):
    """Split ``df`` into features and forecast targets.

    Parameters
    ----------
    df : DataFrame containing the columns ``t+1`` .. ``t+n_target``.
    n_target : number of target columns.

    Returns
    -------
    (features, targets) : ``df`` without the target columns, and the target
    columns on their own, in ``t+1`` .. ``t+n_target`` order.
    """
    targets = [f't+{t}' for t in range(1, n_target+1)]
    # `columns=` keyword: the positional axis argument is gone in pandas 2.x.
    return (df.drop(columns=targets), df[targets])


In [None]:
from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Separate the feature columns from the t+k target columns for the
# validation and training frames.
val_x, val_y = split_target(val)
train_x, train_y = split_target(train)

In [None]:
train_y

In [None]:
# Multi-output random forest: one fit predicts all t+k targets at once.
# random_state is fixed so a Restart & Run All reproduces the same numbers.
model = RandomForestRegressor(n_estimators=2500, n_jobs=-1, random_state=0)
# No dropna() on the targets alone: incomplete rows were already removed
# before the split, and dropping rows from y without touching X would only
# ever produce a length mismatch.
model.fit(train_x, train_y)

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
preds = model.predict(val_x)
# scikit-learn metrics take (y_true, y_pred). MAPE is not symmetric: the
# error is divided by the first argument, so the actuals must come first.
mape(scaler.inverse_transform(val_y), scaler.inverse_transform(preds))

In [None]:
# Look up region and date for every validation row; `cleaned` still carries
# both columns under the same row labels.
val_idx = val_x.index.to_numpy()
in_validation = cleaned.index.isin(val_idx)
ids = cleaned.loc[in_validation, ['RegionName', 'Date']]

In [None]:
# Validation predictions mapped back through the fitted scaler, one column
# per horizon (t+1 .. t+n_target), labelled with the validation row index.
projections = pd.DataFrame(scaler.inverse_transform(preds), columns=[f't+{t}' for t in range(1, n_target+1)], index=val_idx)
projections

In [None]:
projections.join(ids)

In [None]:
# Look at the untouched CSV again. It is bound to its own name so the
# model-ready `df` built by the pipeline cell is not overwritten (rebinding
# `df` here would break a re-run of the train/validation split cell).
# `path` is already defined in the first cell.
raw_df = pd.read_csv(path)
raw_df

In [None]:
# Naive baseline: repeat the t-1 value once per forecast horizon, so the
# frame has as many columns as val_y instead of a hard-coded six.
base = pd.concat([val_x['t-1']] * val_y.shape[1], axis=1)

In [None]:
# MSE of the model on the validation set, in original units.
# Arguments follow scikit-learn's (y_true, y_pred) order. The `squared`
# keyword is left out: plain MSE is the default and the keyword has been
# removed from recent scikit-learn releases.
a = mean_squared_error(scaler.inverse_transform(val_y), scaler.inverse_transform(preds))


In [None]:
# MSE of the naive baseline on the validation set, in original units.
# Arguments follow scikit-learn's (y_true, y_pred) order. The `squared`
# keyword is left out: plain MSE is the default and the keyword has been
# removed from recent scikit-learn releases.
b = mean_squared_error(scaler.inverse_transform(val_y), scaler.inverse_transform(base))

In [None]:
# Percentage by which the model MSE (a) is lower than the baseline MSE (b);
# a negative value means the model is worse than the baseline.
(b-a) / b *100

In [None]:
# Feature/target split for the `full` frame (used for the refit) and for the
# `last` frame (the most recent time step).
all_x, all_y = split_target(full)
last_x, last_y = split_target(last)

In [None]:
# Refit the same `model` object on the `full` data; this replaces the fit
# that was evaluated on the validation set above.
model.fit(all_x, all_y)


In [None]:
# Predictions for the rows of the most recent time step (still scaled).
pred = model.predict(last_x)

In [None]:
# First row of the predictions, converted back to original units.
# NOTE(review): the name suggests this row is the New York metro - confirm
# that row 0 of `last_x` really is that region.
ny_proj = pd.DataFrame(scaler.inverse_transform(pred)).iloc[0,:]
ny_proj

In [None]:
# Actual target values for the same (first) row, in original units.
ny_val = pd.DataFrame(scaler.inverse_transform(last_y)).iloc[0,:]

In [None]:
# Two-row frame: actual vs. projected values, one column per horizon.
data= pd.DataFrame([ny_val, ny_proj ], index=['Actual','Projected'])

In [None]:
last_x

In [None]:
# History leading up to the forecast for row label 77: the n_lag lagged
# values, oldest first, converted back to original units.
# The column list is generated rather than typed out, so no lag is repeated
# or skipped.
lag_columns = [f't-{k}' for k in range(n_lag, 0, -1)]
prior_data = pd.DataFrame(
    scaler.inverse_transform(last_x.loc[77][lag_columns]),
    # x positions: t-1 sits at 13, position 14 stays free for the current
    # month, and the later cells place the t+1.. projections at 15 onward.
    index=[14 - k for k in range(n_lag, 0, -1)],
)

In [None]:
prior_data

In [None]:
# History has no projected values; add the column filled with NaN.
prior_data['Projected'] = np.nan

In [None]:
# Give the value column (default label 0) the name 'Actual'.
prior_data=prior_data.rename({0:'Actual'},axis=1)

In [None]:
data.T.set_index(pd.Index([i+15 for i in range(6)]))

In [None]:
# Stack the history on top of the Actual/Projected frame; the latter is
# transposed (one row per horizon) and given the index values 15..20.
combined_data = pd.concat([prior_data,data.T.set_index(pd.Index([i+15 for i in range(6)]))])
combined_data

In [None]:
# Long format for plotting: one row per (index, variable) pair, where
# `variable` is the former column name and `value` the cell content.
melted = combined_data.reset_index().melt(id_vars='index')
melted

In [None]:
from plotly import express as px

In [None]:
# Line chart of `value` against `index`, one line per `variable`.
px.line(melted, x = 'index', y = 'value', color='variable')

In [None]:
# Distinct region names in order of first appearance. Read from `cleaned`:
# the feature frames (all_x etc.) no longer have a 'RegionName' column
# because drop_id_columns removes it before the split.
cities = pd.DataFrame(cleaned['RegionName'].unique())

In [None]:
# Write the region list to cities.csv in the working directory (default
# to_csv settings, so the row index is written as the first column).
cities.to_csv("cities.csv")

In [None]:
cities[0].str.split(', ',expand=True).sort_values(1)

In [None]:
# Take the part of each region name after ", ", de-duplicate it and write the
# result to states.csv.
# presumably that part is the state abbreviation - TODO confirm
pd.DataFrame(cities[0].str.split(', ',expand=True)[1].unique()).to_csv('states.csv')

In [None]:
cleaned

In [None]:
cleaned.shape

In [None]:
cleaned.iloc[8483,:]

In [None]:
# Feature vector for row label 8477 as a single-row 2-D array.
# The target columns and TimeIndex are removed because the model was fitted
# without them; leaving them in gives predict() the wrong number of features.
target_cols = [f't+{t}' for t in range(1, n_target+1)]
latest = (
    cleaned.pipe(drop_id_columns)
    .loc[8477]
    .drop(target_cols + ['TimeIndex'])
    .to_numpy()
    .reshape(1, -1)
)

In [None]:
model.predict(latest)

In [None]:
# Predict for row label 8477: id columns and incomplete rows are removed by
# drop_id_columns, then the target columns and TimeIndex are dropped from the
# row, which is reshaped to a single-row 2-D array for predict().
latest_pred = model.predict(cleaned.pipe(drop_id_columns).loc[8477].drop([f't+{t}' for t in range(1, n_target+1)]+['TimeIndex']).to_numpy().reshape(1,-1))

In [None]:
scaler.inverse_transform(latest_pred)

In [None]:
# NOTE(review): `col1` is not assigned anywhere in the visible notebook, so
# this cell presumably raises NameError on a fresh kernel - define it or
# remove the cell.
set(col1)

In [None]:
val_x.columns