In [None]:
import pandas as pd
from pathlib import Path
import numpy as np

def process_df() -> pd.DataFrame:
    path = Path("data/Metro_ZORI_AllHomesPlusMultifamily_Smoothed.csv")
    df = pd.read_csv(path)
    
    # Drop United States, extra columns
    df = df.drop(0)
    df = df.drop(["RegionID", "SizeRank"], axis =1)
    
    # Melt data and parse dates.
    df = df.melt(["RegionName"], var_name="Date", value_name = "RentIndex")
    df['Date'] = pd.to_datetime(df['Date'], infer_datetime_format=True)
    
    # Add interpolate missing data
    cities = df["RegionName"].unique()
    interpol = []
    num = 0

    for c in cities:

        mask = df['RegionName'] == c
        chunk = (df[mask]
                 .drop('RegionName', 1)
                 .set_index('Date')
                 .resample(rule="M")
                 .mean()
                 .interpolate()
                 .reset_index()
                 .reset_index()
                 .rename({"index":"TimeIndex"}, axis=1))
        chunk['RegionName'] = c
        num+=1
        interpol.append(chunk)

    df = pd.concat(interpol, ignore_index=True)

    # Region Name to Categorical
    #     df['RegionName'] = df['RegionName'].astype("category")
    
    # Extract month and year
    
    day = 24*60*60
    year = (365.2425)*day

    df['MonthSin'] = np.sin(df['Date'].dt.month * (2 * np.pi / 12))
    df['MonthCos'] = np.cos(df['Date'].dt.month * (2 * np.pi / 12))
#     df['Month'] = df['Date'].dt.month
    df['Year'] = df['Date'].dt.year
    df['target'] = df['RentIndex'].shift(-3)
    df = df.drop('Date', 1)
    df=df.dropna()

    return(df)

In [None]:
df = process_df()
df

In [None]:
from category_encoders import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()
df['RentIndex']= scaler.fit_transform(df[['RentIndex']])

In [None]:
encoder = OrdinalEncoder()
df['RegionName'] = encoder.fit_transform(df['RegionName'])

In [None]:
df

In [None]:
import tensorflow as tf
from tensorflow import keras

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten, GRU, Conv1D

In [None]:
target = df.pop('target')

In [None]:
target

In [None]:
df.dtypes

In [None]:
dataset = tf.data.Dataset.from_tensor_slices((tf.expand_dims(df.to_numpy(), -1), target.to_numpy()))
# dataset = tf.data.Dataset.from_tensor_slices((df.to_numpy(), target.to_numpy()))

In [None]:
def get_compiled_model():
  model = tf.keras.Sequential([
#       Dense(100, activation='relu'),
      
#       Dense(1000, activation='relu'),
#       Dense(1000, activation='relu'),
#     tf.keras.layers.GRU(4, return_sequences=True),
      tf.keras.layers.GRU(100, return_sequences=True),
      tf.keras.layers.GRU(100, return_sequences=True),
#       tf.keras.layers.GRU(100,),
#       tf.keras.layers.GRU(1000, return_sequences=True, batch_size=15),
#       tf.keras.layers.GRU(4, return_sequences=True, batch_size=15),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(1)),
      
  ])

  model.compile(loss=tf.losses.MeanAbsolutePercentageError(),
                optimizer=tf.optimizers.Adam(),
                metrics=[tf.metrics.MeanAbsolutePercentageError()])

            
  return model

In [None]:
from tqdm.keras import TqdmCallback

In [None]:
# class ResidualWrapper(tf.keras.Model):
#   def __init__(self, model):
#     super().__init__()
#     self.model = model

#   def call(self, inputs, *args, **kwargs):
#     delta = self.model(inputs, *args, **kwargs)

#     # The prediction for each timestep is the input
#     # from the previous time step plus the delta
#     # calculated by the model.
#     return inputs + delta

# residual_lstm = ResidualWrapper(
#     tf.keras.Sequential([
#     tf.keras.layers.LSTM(32, return_sequences=True),
#     tf.keras.layers.LSTM(32, return_sequences=True),
#     tf.keras.layers.Dense(
#         1,
#         # The predicted deltas should start small
#         # So initialize the output layer with zeros
#         kernel_initializer=tf.initializers.zeros)
# ]))

# residual_lstm.compile(loss=tf.losses.MeanAbsolutePercentageError(),
#                 optimizer=tf.optimizers.Adam(),
#                 metrics=[tf.metrics.MeanAbsolutePercentageError()])
# residual_lstm.fit(dataset.batch(84), epochs=2500,
#          callbacks=[tf.keras.callbacks.EarlyStopping(
#     monitor='loss', min_delta=0, patience=25, verbose=0,
#     mode='auto', baseline=None, restore_best_weights=True),
#                    TqdmCallback(verbose=1)],
#          verbose=0)


In [None]:
model = get_compiled_model()
model.fit(dataset.batch(1024), epochs=2500,
         callbacks=[tf.keras.callbacks.EarlyStopping(
    monitor='loss', min_delta=0, patience=25, verbose=0,
    mode='auto', baseline=None, restore_best_weights=True),
                   TqdmCallback(verbose=0)],
         verbose=0)

1,066,943
1,722,103

In [None]:
base = pd.DataFrame([target, target.shift(12)]).T.dropna()

In [None]:
val = tf.expand_dims(pd.DataFrame(df.iloc[-1,:]).T.to_numpy(), -1)

In [None]:
residual_lstm.predict(val)

In [None]:
base

In [None]:
from sklearn.metrics import mean_absolute_percentage_error as MAPE

In [None]:
MAPE(base.iloc[:,0], base.iloc[:,1])

In [None]:
column_indices = {name: i for i, name in enumerate(df.columns)}

n = len(df)
train_df = df[0:int(n*0.7)]
val_df = df[int(n*0.7):int(n*0.9)]
test_df = df[int(n*0.9):]

num_features = df.shape[1]


In [None]:
train_df

In [167]:
  def __init__(self, input_width, label_width, shift,
               train_df=train_df, val_df=val_df, test_df=test_df,
               label_columns=None):
    # Store the raw data.
    self.train_df = train_df
    self.val_df = val_df
    self.test_df = test_df

    # Work out the label column indices.
    self.label_columns = label_columns
    if label_columns is not None:
      self.label_columns_indices = {name: i for i, name in
                                    enumerate(label_columns)}
    self.column_indices = {name: i for i, name in
                           enumerate(train_df.columns)}

    # Work out the window parameters.
    self.input_width = input_width
    self.label_width = label_width
    self.shift = shift

    self.total_window_size = input_width + shift

    self.input_slice = slice(0, input_width)
    self.input_indices = np.arange(self.total_window_size)[self.input_slice]

    self.label_start = self.total_window_size - self.label_width
    self.labels_slice = slice(self.label_start, None)
    self.label_indices = np.arange(self.total_window_size)[self.labels_slice]

  def __repr__(self):
    return '\n'.join([
        f'Total window size: {self.total_window_size}',
        f'Input indices: {self.input_indices}',   
        f'Label indices: {self.label_indices}',
        f'Label column name(s): {self.label_columns}']) 


NameError: name 'train_df' is not defined

# Random Forest

In [435]:
import pandas as pd
from pathlib import Path
import numpy as np

def process_df() -> pd.DataFrame:
    path = Path("data/Metro_ZORI_AllHomesPlusMultifamily_Smoothed.csv")
    df = pd.read_csv(path)
    
    # Drop United States, extra columns
    df = df.drop(0)
    df = df.drop(["RegionID", "SizeRank"], axis =1)
    
    # Melt data and parse dates.
    df = df.melt(["RegionName"], var_name="Date", value_name = "RentIndex")
    df['Date'] = pd.to_datetime(df['Date'], infer_datetime_format=True)
    
    # Add interpolate missing data
    cities = df["RegionName"].unique()
    interpol = []
    num = 0

    for c in cities:

        mask = df['RegionName'] == c
        chunk = (df[mask]
                 .drop('RegionName', 1)
                 .set_index('Date')
                 .resample(rule="M")
                 .mean()
                 .interpolate()
                    )
        for i in range(1,13):
            chunk[f't-{i}'] = chunk['RentIndex'].shift(i)
            chunk[f't-{i}Diff'] = chunk['RentIndex'] - chunk[f't-{i}']
        for i in range(1,7):
            chunk[f't+{i}'] = chunk['RentIndex'].shift(-i)
        chunk['RegionName'] = c
        chunk=chunk.dropna().reset_index().reset_index().rename({"index":"TimeIndex"}, axis=1)
        num+=1
        interpol.append(chunk)

    df = pd.concat(interpol, ignore_index=True)

    # Region Name to Categorical
    #     df['RegionName'] = df['RegionName'].astype("category")
    
    # Extract month and year
    
    day = 24*60*60
    year = (365.2425)*day

    df['MonthSin'] = np.sin(df['Date'].dt.month * (2 * np.pi / 12))
    df['MonthCos'] = np.cos(df['Date'].dt.month * (2 * np.pi / 12))
#     df['Month'] = df['Date'].dt.month
    df['Year'] = df['Date'].dt.year
#     df['target'] = df['RentIndex'].shift(-3)
    # df = df.drop('Date', 1)
    # df['Covid'] = (df['Date'].dt.year >= 2020) & (df['Date'].dt.month >= 3)
    df=df.dropna()

    return(df)

In [436]:
df = process_df()

In [437]:
df['Covid'] = (df['Date'].dt.year >= 2020) & (df['Date'].dt.month >= 3)

In [438]:
from sklearn.preprocessing import StandardScaler
from category_encoders import OneHotEncoder

In [439]:
def encode_and_scale(df):
    col_to_scale = ['RentIndex', 't-1', 't-1Diff', 't-2', 't-2Diff', 't-3',
       't-3Diff', 't-4', 't-4Diff', 't-5', 't-5Diff', 't-6', 't-6Diff', 't-7',
       't-7Diff', 't-8', 't-8Diff', 't-9', 't-9Diff', 't-10', 't-10Diff',
       't-11', 't-11Diff', 't-12', 't-12Diff', 't+1', 't+2', 't+3', 't+4',
       't+5', 't+6']
    
    scaler1 = StandardScaler()
    scaler1.fit(df[['RentIndex']])
    lags= [n for n in col_to_scale if "Diff" not in n]
    for col in lags:
        df[col] = scaler1.transform(df[[col]])
        
    scaler2 = StandardScaler()
    diffs = [n for n in col_to_scale if "Diff" in n]
    scaler2.fit(df[['t-1Diff']])
    for col in diffs:
        df[col] = scaler2.transform(df[[col]])
    
    regions = df[['RegionName']]
    encoder = OneHotEncoder(use_cat_names=True)
    
    df = pd.concat(
        [df,
         encoder.fit_transform(pd.DataFrame(regions))
        ],
    axis=1)
   
    

    return(df, scaler1, encoder)
    

In [440]:
df, scaler, encoder = encode_and_scale(df)

In [408]:
df['Covid']

0       False
1       False
2       False
3       False
4       False
        ...  
6655    False
6656     True
6657     True
6658     True
6659     True
Name: Covid, Length: 6660, dtype: bool

In [441]:
max_time = max(df['TimeIndex'])

In [442]:
train = df[df['TimeIndex'] < int(max_time - 6)].drop('TimeIndex',1)
val = df[df['TimeIndex'] > int(max_time - 6)].drop('TimeIndex',1)
full = last = df[df['TimeIndex'] < int(max_time - 1)].drop('TimeIndex',1)
last = df[df['TimeIndex'] > int(max_time - 1)].drop('TimeIndex',1)

In [443]:
def split_target(df):
    targets = ['t+1', 't+2', 't+3','t+4','t+5','t+6']
    return (df.drop(targets,1), df[targets])


In [444]:
from sklearn.metrics import mean_absolute_percentage_error as mape

In [445]:
val_x, val_y = split_target(val)

In [None]:
val_y

In [119]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [446]:
train_x, train_y = split_target(train)

In [447]:
model = RandomForestRegressor(1000, n_jobs=-1)

In [448]:
model.fit(train_x.drop(['RegionName', 'Date'],1), train_y)

RandomForestRegressor(n_estimators=1000, n_jobs=-1)

In [190]:
from sklearn.metrics import mean_squared_error

In [191]:
val_x

Unnamed: 0,Date,RentIndex,t-1,t-1Diff,t-2,t-2Diff,t-3,t-3Diff,t-4,t-4Diff,...,"RegionName_Melbourne, FL","RegionName_Chattanooga, TN","RegionName_Spokane, WA","RegionName_Provo, UT","RegionName_Durham, NC","RegionName_Port St. Lucie, FL","RegionName_Fort Collins, CO","RegionName_Boulder, CO","RegionName_Greeley, CO","RegionName_Gainesville, GA"
60,2020-01-31,3.260221,3.245915,0.206615,3.272142,-0.998146,3.319829,-3.188622,3.319829,-3.188622,...,0,0,0,0,0,0,0,0,0,0
61,2020-02-29,3.322213,3.260221,2.397091,3.245915,3.054233,3.272142,1.849472,3.319829,-0.341004,...,0,0,0,0,0,0,0,0,0,0
62,2020-03-31,3.365132,3.322213,1.520900,3.260221,4.368519,3.245915,5.025661,3.272142,3.820900,...,0,0,0,0,0,0,0,0,0,0
63,2020-04-30,3.331751,3.365132,-1.983860,3.322213,-0.012432,3.260221,2.835186,3.245915,3.492328,...,0,0,0,0,0,0,0,0,0,0
64,2020-05-31,3.293601,3.331751,-2.202908,3.365132,-3.736241,3.322213,-1.764813,3.260221,1.082805,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6655,2020-02-29,0.012750,0.031824,-1.326718,-0.034937,1.739948,-0.039706,1.958996,-0.056396,2.725662,...,0,0,0,0,0,0,0,0,0,1
6656,2020-03-31,-0.013478,0.012750,-1.655289,0.031824,-2.531479,-0.034937,0.535186,-0.039706,0.754234,...,0,0,0,0,0,0,0,0,0,1
6657,2020-04-30,-0.013478,-0.013478,-0.450528,0.012750,-1.655289,0.031824,-2.531479,-0.034937,0.535186,...,0,0,0,0,0,0,0,0,0,1
6658,2020-05-31,-0.042090,-0.013478,-1.764813,-0.013478,-1.764813,0.012750,-2.969574,0.031824,-3.845765,...,0,0,0,0,0,0,0,0,0,1


In [451]:
preds = model.predict(val_x.drop(['RegionName', 'Date'],1))
mape(scaler.inverse_transform(preds), scaler.inverse_transform(val_y))

0.013313749788021599

In [221]:
ids = val_x[['RegionName','Date']].reset_index().drop('index',1)
ids

Unnamed: 0,RegionName,Date
0,"New York, NY",2020-01-31
1,"New York, NY",2020-02-29
2,"New York, NY",2020-03-31
3,"New York, NY",2020-04-30
4,"New York, NY",2020-05-31
...,...,...
595,"Gainesville, GA",2020-02-29
596,"Gainesville, GA",2020-03-31
597,"Gainesville, GA",2020-04-30
598,"Gainesville, GA",2020-05-31


In [449]:
projections = pd.DataFrame(scaler.inverse_transform(preds), columns=['t+1', 't+2', 't+3','t+4','t+5','t+6'])
projections

Unnamed: 0,t+1,t+2,t+3,t+4,t+5,t+6
0,2700.090,2710.985,2724.910,2737.720,2740.894,2737.584
1,2709.322,2720.300,2733.498,2743.449,2745.342,2740.150
2,2850.663,2864.362,2877.007,2886.707,2892.843,2893.158
3,2721.797,2733.999,2744.562,2753.140,2754.975,2753.726
4,2696.779,2706.290,2716.638,2725.213,2728.538,2729.344
...,...,...,...,...,...,...
595,1353.374,1361.087,1368.505,1375.731,1381.102,1387.013
596,1346.974,1351.295,1354.941,1360.434,1365.731,1369.926
597,1345.320,1348.285,1351.289,1354.934,1357.680,1361.391
598,1334.828,1341.341,1348.556,1351.200,1360.424,1366.116


In [226]:
projections.join(ids)

Unnamed: 0,t+1,t+2,t+3,t+4,t+5,t+6,RegionName,Date
0,2702.458,2713.214,2726.690,2738.686,2740.824,2736.990,"New York, NY",2020-01-31
1,2712.531,2723.431,2735.801,2744.602,2745.964,2740.979,"New York, NY",2020-02-29
2,2851.551,2864.929,2877.954,2887.554,2894.429,2894.198,"New York, NY",2020-03-31
3,2726.842,2738.986,2748.921,2755.661,2756.288,2754.430,"New York, NY",2020-04-30
4,2699.520,2709.539,2719.796,2727.709,2729.222,2729.137,"New York, NY",2020-05-31
...,...,...,...,...,...,...,...,...
595,1353.502,1361.418,1368.870,1375.703,1381.490,1387.010,"Gainesville, GA",2020-02-29
596,1346.724,1351.228,1354.916,1359.737,1364.940,1369.018,"Gainesville, GA",2020-03-31
597,1345.001,1347.508,1350.456,1354.371,1357.556,1361.644,"Gainesville, GA",2020-04-30
598,1334.622,1341.275,1348.251,1350.322,1360.221,1365.540,"Gainesville, GA",2020-05-31


In [165]:
region_names = val_x[encoder.feature_names].idxmax(1).str.slice(start=len('regionname_'))

In [166]:
pd.concat([projections, dates,region_names],1)

Unnamed: 0,0,1,2,3,4,5,Date,0.1
0,2701.672,2712.109,2726.598,2740.086,2743.757,2741.682,2015-01-31,
1,2710.979,2721.855,2734.992,2745.038,2747.131,2742.618,2015-02-28,
2,2857.123,2870.687,2884.398,2895.765,2904.184,2905.560,2015-03-31,
3,2727.290,2738.451,2749.107,2757.233,2759.425,2758.806,2015-04-30,
4,2699.868,2708.276,2718.935,2728.269,2731.553,2732.983,2015-05-31,
...,...,...,...,...,...,...,...,...
6655,,,,,,,2020-02-29,"Gainesville, GA"
6656,,,,,,,2020-03-31,"Gainesville, GA"
6657,,,,,,,2020-04-30,"Gainesville, GA"
6658,,,,,,,2020-05-31,"Gainesville, GA"


In [None]:
path = Path("data/Metro_ZORI_AllHomesPlusMultifamily_Smoothed.csv")
df = pd.read_csv(path)
df

In [450]:
base = pd.DataFrame([val_x['t-1'],val_x['t-1'],val_x['t-1'],val_x['t-1'],val_x['t-1'],val_x['t-1']]).T

In [452]:
a = mean_squared_error(scaler.inverse_transform(preds), scaler.inverse_transform(val_y), squared=True)


In [453]:
b = mean_squared_error(scaler.inverse_transform(base), scaler.inverse_transform(val_y), squared=True)

In [454]:
(b-a) / b *100

13.76424662777896

In [None]:
pd.DataFrame(scaler.inverse_transform(preds))

In [None]:
pd.DataFrame(scaler.inverse_transform(val_y))

In [412]:
all_x, all_y = split_target(full)
last_x, last_y = split_target(last)

In [414]:
last

Unnamed: 0,Date,RentIndex,t-1,t-1Diff,t-2,t-2Diff,t-3,t-3Diff,t-4,t-4Diff,...,"RegionName_Melbourne, FL","RegionName_Chattanooga, TN","RegionName_Spokane, WA","RegionName_Provo, UT","RegionName_Durham, NC","RegionName_Port St. Lucie, FL","RegionName_Fort Collins, CO","RegionName_Boulder, CO","RegionName_Greeley, CO","RegionName_Gainesville, GA"
65,2020-06-30,3.248299,3.293601,-2.531479,3.331751,-4.283860,3.365132,-5.817192,3.322213,-3.845765,...,0,0,0,0,0,0,0,0,0,0
131,2020-06-30,2.945488,2.931182,0.206615,2.976484,-1.874337,3.028940,-4.283860,3.028940,-4.283860,...,0,0,0,0,0,0,0,0,0,0
197,2020-06-30,0.770970,0.770970,-0.450528,0.770970,-0.450528,0.778123,-0.779099,0.756664,0.206615,...,0,0,0,0,0,0,0,0,0,0
263,2020-06-30,0.565917,0.553995,0.097091,0.565917,-0.450528,0.580223,-1.107670,0.570685,-0.669575,...,0,0,0,0,0,0,0,0,0,0
329,2020-06-30,0.520614,0.522998,-0.560051,0.532536,-0.998146,0.530152,-0.888623,0.506308,0.206615,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6395,2020-06-30,0.894956,0.964101,-3.626717,0.801966,3.820900,0.909262,-1.107670,0.902109,-0.779099,...,0,0,0,0,0,1,0,0,0,0
6461,2020-06-30,0.508692,0.456237,1.958996,0.458621,1.849472,0.446700,2.397091,0.394244,4.806614,...,0,0,0,0,0,0,1,0,0,0
6527,2020-06-30,1.696094,1.703247,-0.779099,1.679403,0.316139,1.727090,-1.874337,1.727090,-1.874337,...,0,0,0,0,0,0,0,1,0,0
6593,2020-06-30,0.601682,0.606450,-0.669575,0.592144,-0.012432,0.632678,-1.874337,0.613603,-0.998146,...,0,0,0,0,0,0,0,0,1,0


In [416]:
model.fit(all_x.drop(['Date', 'RegionName'],1), all_y)
pred = model.predict(last_x.drop(['Date', 'RegionName']))

KeyError: "['Date' 'RegionName'] not found in axis"

In [None]:
ny_proj = pd.DataFrame(scaler.inverse_transform(pred)).iloc[0,:]

In [None]:
ny_val = pd.DataFrame(scaler.inverse_transform(last_y)).iloc[0,:]

In [None]:
data= pd.DataFrame([ny_val, ny_proj ], index=['Actual','Projected'])

In [None]:
prior_data = pd.DataFrame(scaler.inverse_transform(last_x.loc[65][['t-12', 't-11','t-10','t-9','t-8','t-7','t-6','t-5','t-6','t-5','t-4','t-3','t-2','t-1','RentIndex']]))

In [421]:
prior_data['Projected'] = np.nan

In [422]:
prior_data=prior_data.rename({0:'Actual'},axis=1)

In [423]:
data.T.set_index(pd.Index([i+15 for i in range(6)]))

Unnamed: 0,Actual,Projected
15,2673.0,2723.701
16,2607.0,2717.384
17,2556.0,2700.683
18,2501.0,2666.01
19,2479.0,2631.242
20,2469.0,2590.838


In [424]:
combined_data = pd.concat([prior_data,data.T.set_index(pd.Index([i+15 for i in range(6)]))])

In [425]:
melted = combined_data.reset_index().melt(id_vars='index')
melted

Unnamed: 0,index,variable,value
0,0,Actual,2734.0
1,1,Actual,2743.0
2,2,Actual,2745.0
3,3,Actual,2736.0
4,4,Actual,2736.0
5,5,Actual,2716.0
6,6,Actual,2705.0
7,7,Actual,2711.0
8,8,Actual,2705.0
9,9,Actual,2711.0


In [426]:
from plotly import express as px

In [427]:
px.line(melted, x = 'index', y = 'value', color='variable')

In [240]:
cities= pd.DataFrame(all_x['RegionName'].unique())

In [242]:
cities.to_csv("cities.csv")

In [271]:
cities[0].str.split(', ',expand=True).sort_values(1)

Unnamed: 0,0,1
47,Birmingham,AL
71,Little Rock,AR
51,Tucson,AZ
13,Phoenix,AZ
72,Stockton,CA
...,...,...
35,Virginia Beach,VA
93,Spokane,WA
14,Seattle,WA
37,Milwaukee,WI


In [287]:
pd.DataFrame(cities[0].str.split(', ',expand=True)[1].unique()).to_csv('states.csv')