In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import io
import imageio
from IPython.display import Image, display
from ipywidgets import widgets, Layout, HBox
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from matplotlib.figure import Figure
import math
import imageio
import datetime
from os import walk
from scipy import stats

from ConvLSTM import ConvLSTM
from SlimAutoencoder import SlimAutoencoderBuilder
from rea_cd_autoencoder import REA_CD_Autoencoder
from dataloader import hillsborough_county_rea_cd_encoded as hc_rea_cd_encoded
from datawrangler import DataWrangler


# scikit-learn estimators (KNN, linear, tree, random forest), pipelines and model selection
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from sklearn.metrics import mean_squared_error
import random

#file io
import pickle

import multiprocessing
from IPython.core.display import display, HTML
# Inject CSS so the notebook's .container element spans the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Turn off pandas' truncation of displayed columns and rows.
# NOTE(review): with display.max_rows=None, displaying a large frame renders every
# row; prefer .head()/.sample() when inspecting big frames.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [145]:
# Load the pre-built dataset from disk (file name suggests one-hot-encoded
# residential conveyances from Jan 2000 onward -- TODO confirm).
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle('../Datasets/residential_conveyances_jan2000_and_up_ohe.pickle')
# Quick size check: (rows, columns).
df.shape

(1286573, 753)

In [119]:
# Scratch check: select rows whose S_DATE_x is later than 2020-08-20 and take the shape.
# NOTE(review): this expression is not the last one in the cell, so its value is
# computed and then discarded -- nothing is displayed for it.
df[ pd.to_datetime(df['S_DATE_x']) > pd.to_datetime('08-20-2020')].shape
# Confirm what type pd.to_datetime returns for a single date string.
print(type(pd.to_datetime('08-20-2020')))

<class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [392]:
def adjustDates(df, date_col, new_end_date):
    '''
    Return a copy of df with an added 'daysago' column.

    df           - source frame; it is left unmodified
    date_col     - name of the column holding the dates (parsed with pd.to_datetime)
    new_end_date - indexable whose first element is the reference timestamp

    'daysago' holds the whole number of days between each row's date and
    new_end_date[0] (reference minus row date).
    '''
    reference_day = new_end_date[0]
    parsed_dates = pd.to_datetime(df[date_col])

    result = df.copy()
    result['daysago'] = (reference_day - parsed_dates).dt.days
    return result

In [393]:
def getSliceAndChangeDates(df, start_date, end_date):
    '''
    Return the rows of df whose 'S_DATE_x' falls in [start_date[0], end_date[0]).

    df         - frame with an 'S_DATE_x' column (parsed with pd.to_datetime)
    start_date - indexable whose first element is the inclusive lower bound
    end_date   - indexable whose first element is the exclusive upper bound

    Despite the name, no dates are changed here; the function only filters.
    '''
    sale_dates = pd.to_datetime(df['S_DATE_x'])
    lower_bound, upper_bound = start_date[0], end_date[0]

    on_or_after_start = sale_dates >= lower_bound
    before_end = sale_dates < upper_bound
    return df[on_or_after_start & before_end]

In [434]:
def sliceIntoHistoricalWindows(df, history_length_in_days=365*5, step_size_in_days=365/2, pickle_path='../Datasets/windows/'):
    '''
    Cut df into overlapping look-back windows and pickle each window to disk.

    df                     - frame with an 'S_DATE_x' column; the first and last rows are
                             taken as the earliest and latest dates, so df is assumed to be
                             sorted by 'S_DATE_x' (NOTE(review): confirm upstream)
    history_length_in_days - the number of days to look back and generate a history for
    step_size_in_days      - the number of days we step backwards to generate a new history
    pickle_path            - directory the window pickles are written to (created if missing)

    Starting from the last row's date, each window is selected with
    getSliceAndChangeDates (start inclusive, end exclusive) and given a 'daysago'
    column by adjustDates, measured from the window's end date. The end date then
    moves back by step_size_in_days until a window would start before the first
    row's date. Files are named df.<start>..<end>.pickle. Returns None.

    Raises ValueError if step_size_in_days is not positive, since the window
    would never move backwards and the loop would not terminate.
    '''
    import os  # local import: the file-level imports only bring in `walk` from os

    if step_size_in_days <= 0:
        raise ValueError(f"step_size_in_days must be positive, got {step_size_in_days}")
    if len(df) == 0:
        # no rows means there is no date range to build windows over
        return

    if pickle_path:
        os.makedirs(pickle_path, exist_ok=True)

    # No defensive df.copy() here: df is only read, and both helpers return new frames.
    # Length-1 DatetimeIndex values are kept because the helpers index them with [0].
    start_date = pd.to_datetime(df[:1]['S_DATE_x'].values)
    end_date = pd.to_datetime(df[-1:]['S_DATE_x'].values)

    # Loop-invariant offsets, built once.
    history_length = pd.to_timedelta(f"{history_length_in_days} days")
    step_size = pd.to_timedelta(f"{step_size_in_days} days")

    current_end_date = end_date
    current_start_date = current_end_date - history_length
    while current_start_date[0] >= start_date[0]:
        # grab the data from the previous history_length_in_days
        thisSlice = getSliceAndChangeDates(df, current_start_date, current_end_date)

        # create a 'daysago' column measuring time from the current_end_date
        thisSlice = adjustDates(thisSlice, 'S_DATE_x', current_end_date)

        filename = f"df.{current_start_date.strftime('%Y-%m-%d')[0]}..{current_end_date.strftime('%Y-%m-%d')[0]}.pickle"
        print(f"writing {filename}")
        thisSlice.to_pickle(os.path.join(pickle_path, filename))

        # slide window back
        current_end_date = current_end_date - step_size
        current_start_date = current_end_date - history_length


In [435]:
# Generate and pickle every historical window using the function's default
# history length, step size and output directory.
sliceIntoHistoricalWindows(df)

writing df.2016-08-12..2021-08-11.pickle
writing df.2016-02-11..2021-02-09.pickle
writing df.2015-08-13..2020-08-11.pickle
writing df.2015-02-11..2020-02-10.pickle
writing df.2014-08-13..2019-08-12.pickle
writing df.2014-02-11..2019-02-10.pickle
writing df.2013-08-13..2018-08-12.pickle
writing df.2013-02-11..2018-02-10.pickle
writing df.2012-08-13..2017-08-12.pickle
writing df.2012-02-12..2017-02-10.pickle
writing df.2011-08-14..2016-08-12.pickle
writing df.2011-02-12..2016-02-11.pickle
writing df.2010-08-14..2015-08-13.pickle
writing df.2010-02-12..2015-02-11.pickle
writing df.2009-08-14..2014-08-13.pickle
writing df.2009-02-12..2014-02-11.pickle
writing df.2008-08-14..2013-08-13.pickle
writing df.2008-02-13..2013-02-11.pickle
writing df.2007-08-15..2012-08-13.pickle
writing df.2007-02-13..2012-02-12.pickle
writing df.2006-08-15..2011-08-14.pickle
writing df.2006-02-13..2011-02-12.pickle
writing df.2005-08-15..2010-08-14.pickle
writing df.2005-02-13..2010-02-12.pickle
writing df.2004-