In [20]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from math import sqrt

from statsmodels.tsa.vector_ar.var_model import VAR

# Bring in raw data and prepare for VAR model

In [22]:
# Read the prepared dataset; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='Resources/Clean_Data/Final_Data.csv')

# Preview the first five rows.
data.head(n=5)

Unnamed: 0,Zip_Code,Date,Date_Code,Year,Month,Sale_Price,Interest_Rate,Property_Tax,Rent_Price,Household_Income,...,FTE_Employed,Unemployed,Expense_Index,Average_Commute,Crime_Index,Loan_Amount,Loan_Term,Loan_R,Loan_Payment,Home_Affordability
0,32003,2019-01-01,201901,2019,1,244950.0,4.464,3778,1113,94154.0,...,10671,1069,104,35,28,195960.0,360.0,1.00372,988.713278,0.126012
1,32003,2019-02-01,201902,2019,2,270000.0,4.37,3778,1113,94154.0,...,10671,1069,104,35,28,216000.0,360.0,1.003642,1077.819242,0.137369
2,32003,2019-03-01,201903,2019,3,275000.0,4.265,3778,1113,94154.0,...,10671,1069,104,35,28,220000.0,360.0,1.003554,1084.200547,0.138182
3,32003,2019-04-01,201904,2019,4,264500.0,4.1425,3778,1113,94154.0,...,10671,1069,104,35,28,211600.0,360.0,1.003452,1027.671397,0.130978
4,32003,2019-05-01,201905,2019,5,281000.0,4.072,3778,1113,94154.0,...,10671,1069,104,35,28,224800.0,360.0,1.003393,1082.581718,0.137976


In [23]:
# Keep only the identifier columns and the features/targets used further down.
ml_data = data[[
    'Zip_Code',
    'Date_Code',
    'Mobility_Rate',
    'Expense_Index',
    'Crime_Index',
    'Total_Vacant',
    'Total_Dwellings',
    'Total_Sales',
    'FHA_Count',
    'Home_Affordability',
    'Rent_Affordability',
    'Sale_Price',
]]

# Preview the first five rows of the reduced frame.
ml_data.head(n=5)

Unnamed: 0,Zip_Code,Date_Code,Mobility_Rate,Expense_Index,Crime_Index,Total_Vacant,Total_Dwellings,Total_Sales,FHA_Count,Home_Affordability,Rent_Affordability,Sale_Price
0,32003,201901,0.182327,104,28,704,11724,32,7,0.126012,0.141853,244950.0
1,32003,201902,0.182327,104,28,704,11724,41,2,0.137369,0.141853,270000.0
2,32003,201903,0.182327,104,28,704,11724,21,7,0.138182,0.141853,275000.0
3,32003,201904,0.182327,104,28,704,11724,42,3,0.130978,0.141853,264500.0
4,32003,201905,0.182327,104,28,704,11724,66,8,0.137976,0.141853,281000.0


In [24]:
# Build the modelling frame in one chain: order rows by zip code and date,
# turn the zip code into a string label, and use it as the index.
var_data = (
    ml_data
    .sort_values(by=['Zip_Code', 'Date_Code'], ascending=[True, True])
    .assign(Zip_Code=lambda frame: frame['Zip_Code'].astype(str))
    .set_index('Zip_Code')
    # Drop zipcodes with no 2021 data
    .drop(index=['32061', '32072', '32079', '32332', '32361', '32426',
                 '32449', '32463', '32542', '32639', '32697', '33122'])
)
var_data

Unnamed: 0_level_0,Date_Code,Mobility_Rate,Expense_Index,Crime_Index,Total_Vacant,Total_Dwellings,Total_Sales,FHA_Count,Home_Affordability,Rent_Affordability,Sale_Price
Zip_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
32003,201901,0.182327,104,28,704,11724,32,7,0.126012,0.141853,244950.0
32003,201902,0.182327,104,28,704,11724,41,2,0.137369,0.141853,270000.0
32003,201903,0.182327,104,28,704,11724,21,7,0.138182,0.141853,275000.0
32003,201904,0.182327,104,28,704,11724,42,3,0.130978,0.141853,264500.0
32003,201905,0.182327,104,28,704,11724,66,8,0.137976,0.141853,281000.0
...,...,...,...,...,...,...,...,...,...,...,...
34997,202011,0.143471,91,113,2976,21551,132,11,0.173497,0.247189,259750.0
34997,202012,0.143471,91,113,2976,21551,142,16,0.196301,0.247189,297000.0
34997,202101,0.143471,91,113,2976,21551,81,10,0.202930,0.247189,305000.0
34997,202102,0.143471,91,113,2976,21551,116,7,0.201553,0.247189,300000.0


# Loop through and run VAR model on all Florida Zip Codes

In [25]:
# Distinct Zip_Code labels found in var_data's index; the modelling loop iterates over these.
zip_codes = var_data.index.unique()

In [36]:
# Fit one VAR model per zip code and collect, for each zip code, a small table of
# held-out observations next to the model's forecasts.
#
# Outputs of this cell:
#   final_data       -- list with one 2-D array per successfully modelled zip code.
#                       Column order of every array (relied on by the cell that
#                       flattens final_data):
#                         0 Date_Code, 1 Testing_Sales, 2 Testing_Count, 3 Testing_Price,
#                         4 Prediction_Sales, 5 Prediction_Count, 6 Prediction_Price,
#                         7 Sales_Mean_Error, 8 Count_Mean_Error, 9 Price_Mean_Error,
#                         10 Zip_Code
#   skipped_zipcodes -- zip codes for which any step below raised an exception.
#   skip_reasons     -- zip code -> "<ExceptionType>: <message>" for every skipped code,
#                       so failures can be inspected instead of silently vanishing.
final_data = []
skipped_zipcodes = []
skip_reasons = {}

# Columns that are not part of the VAR system.
unused_columns = ['Mobility_Rate', 'Expense_Index', 'Crime_Index', 'Total_Vacant',
                  'Total_Dwellings', 'Rent_Affordability', 'Home_Affordability']

# Names given to the four modelled series on the observed and on the forecast side.
testing_columns = ['Date_Code', 'Testing_Sales', 'Testing_Count', 'Testing_Price']
prediction_columns = ['Prediction_Sales', 'Prediction_Count', 'Prediction_Price']

# (error column, observed column, forecast column) -- order fixes output columns 7..9.
error_specs = [
    ('Sales_Mean_Error', 'Testing_Sales', 'Prediction_Sales'),
    ('Count_Mean_Error', 'Testing_Count', 'Prediction_Count'),
    ('Price_Mean_Error', 'Testing_Price', 'Prediction_Price'),
]

for code in zip_codes:
    try:
        # Drop unneeded columns for model.
        # NOTE(review): Date_Code stays in the frame and is therefore modelled as an
        # endogenous series alongside the three targets -- confirm this is intended.
        var_data_slim = var_data.loc[code].drop(columns=unused_columns)
        X = var_data_slim.values

        # First 24 rows train the model, the remainder is held out.
        # Assumes one row per month starting 2019-01, so the hold-out starts at
        # 2021-01 -- TODO confirm for zip codes with missing months.
        train = X[:24]
        test = X[24:]

        # Create model and fit
        ar_model = VAR(endog=train)
        ar_model_fit = ar_model.fit()

        # Forecast four steps past the training window (extends to April)
        prediction = ar_model_fit.forecast(ar_model_fit.endog, steps=4)

        # Column 0 is the forecast of Date_Code itself, which is not reported.
        pred = pd.DataFrame(prediction[:, 1:], columns=prediction_columns)

        # Held-out observations with the forecasts aligned row by row (index join).
        zip_df = pd.DataFrame(test, columns=testing_columns).join(pred)

        # Root mean squared log error per target, broadcast to every hold-out row.
        # mean_squared_log_error raises on negative values; such zip codes are skipped.
        for error_column, observed_column, forecast_column in error_specs:
            zip_df[error_column] = np.sqrt(mean_squared_log_error(
                zip_df[observed_column].values, zip_df[forecast_column].values))

        # April has no observations yet: use zero placeholders for the testing
        # columns and the last forecast step for the prediction columns. The error
        # columns are left missing (NaN) for this row.
        april_row = {'Date_Code': 202104, 'Testing_Sales': 0,
                     'Testing_Count': 0, 'Testing_Price': 0}
        april_row.update(pred.iloc[-1].to_dict())
        april_data = pd.DataFrame([april_row])

        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        zip_df = pd.concat([zip_df, april_data], ignore_index=True)
        zip_df['Zip_Code'] = code

        # Append final dataframe to variable
        final_data.append(zip_df.values)

    except Exception as exc:
        # Best effort: a failing zip code must not stop the run, but keep the reason.
        # Catching Exception (not a bare except) lets KeyboardInterrupt through.
        skipped_zipcodes.append(code)
        skip_reasons[code] = '{}: {}'.format(type(exc).__name__, exc)

  omega = sse / df_resid
  omega = sse / df_resid


# Loop through Final Data and append to variables

In [37]:
# Flatten the per-zip-code arrays into one sequence of rows, then read each
# output column off by position (0 = date ... 10 = zip code).
flat_rows = [record for block in final_data for record in block]

date = [record[0] for record in flat_rows]
testing_total_sales = [record[1] for record in flat_rows]
testing__fha_count = [record[2] for record in flat_rows]
testing_sales_price = [record[3] for record in flat_rows]
prediction_total_sales = [record[4] for record in flat_rows]
prediction_fha_count = [record[5] for record in flat_rows]
prediction_sales_price = [record[6] for record in flat_rows]
sales_mean_error = [record[7] for record in flat_rows]
count_mean_error = [record[8] for record in flat_rows]
price_mean_error = [record[9] for record in flat_rows]
zipcode = [record[10] for record in flat_rows]

# Create DataFrame with all final data

In [38]:
# Assemble the flattened column lists into the final results table.
# Keyword order defines the column order of the frame.
var_model_predict = pd.DataFrame(dict(
    Zip_Code=zipcode,
    Date_Code=date,
    Testing_Total_Sales=testing_total_sales,
    Testing_FHA_Count=testing__fha_count,
    Testing_Sale_Price=testing_sales_price,
    Total_Sales_Prediction=prediction_total_sales,
    FHA_Count_Prediction=prediction_fha_count,
    Sale_Price_Prediction=prediction_sales_price,
    Sales_Mean_Error=sales_mean_error,
    FHA_Count_Mean_Error=count_mean_error,
    Sale_Price_Mean_Error=price_mean_error,
))
var_model_predict

Unnamed: 0,Zip_Code,Date_Code,Testing_Total_Sales,Testing_FHA_Count,Testing_Sale_Price,Total_Sales_Prediction,FHA_Count_Prediction,Sale_Price_Prediction,Sales_Mean_Error,FHA_Count_Mean_Error,Sale_Price_Mean_Error
0,32003,202101.0,39.0,6.0,299900.0,46.486150,7.030309,298619.427118,0.351924,0.328928,0.094037
1,32003,202102.0,30.0,8.0,333800.0,46.310881,7.919587,299917.831882,0.351924,0.328928,0.094037
2,32003,202103.0,70.0,4.0,339950.0,46.392005,7.691014,300696.568632,0.351924,0.328928,0.094037
3,32003,202104.0,0.0,0.0,0.0,46.335468,7.811788,301194.146141,,,
4,32011,202101.0,28.0,4.0,220450.0,24.836821,5.108163,205008.521054,0.250680,0.156858,0.313080
...,...,...,...,...,...,...,...,...,...,...,...
3110,34996,202104.0,0.0,0.0,0.0,51.033321,0.348912,339190.742927,,,
3111,34997,202101.0,81.0,10.0,305000.0,124.397458,13.576777,276938.709501,1.608839,0.342077,0.467749
3112,34997,202102.0,116.0,7.0,300000.0,123.990744,12.454764,271037.002628,1.608839,0.342077,0.467749
3113,34997,202103.0,7.0,12.0,599000.0,124.546443,12.513559,269696.282665,1.608839,0.342077,0.467749


# Export Final DataFrame to CSV

In [29]:
# Persist the results table; the path is relative to the notebook's working directory.
var_model_predict.to_csv(path_or_buf='Resources/ML_Outputs/var_model_predict.csv')