In [1]:
import re
import quandl
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from matplotlib import style
style.use('fivethirtyeight')
# Read the Quandl API key from a local file. The context manager closes the
# handle deterministically, and strip() drops the trailing newline that text
# editors usually append so it is not sent as part of the key.
with open('quandlapikey.txt', 'r') as key_file:
    api_key = key_file.read().strip()

In [2]:
def state_list():
    """Return the second column of the first table on Wikipedia's U.S. states page.

    The page is downloaded and parsed with ``pd.read_html`` (network access is
    required); the selected column is returned as a pandas Series.

    NOTE(review): the caller uses these values as column keys of the FMAC/HPI
    frame, so this column presumably holds postal abbreviations -- confirm
    against the live page, whose table layout can change.
    """
    url = 'https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States'
    tables = pd.read_html(url)
    first_table = tables[0]
    return first_table.iloc[:, 1]


def grab_initial_state_data():
    """Download the FMAC/HPI table, rebase each state column, and pickle it.

    Every column named by ``state_list()`` is converted to percent change
    relative to its own first observation. The resulting frame (one column
    per downloaded series) is written to ``fifty_states4.pickle``; nothing is
    returned.

    Raises:
        KeyError: if a name from ``state_list()`` is not a column of the
            downloaded frame.
    """
    states = state_list()

    main_df = quandl.get('FMAC/HPI', authtoken=api_key)
    for abbv in states:
        # .iloc[0] is explicitly positional; plain [0] relies on pandas'
        # deprecated positional fallback for label-based indexing.
        baseline = main_df[abbv].iloc[0]
        main_df[abbv] = (main_df[abbv] - baseline) / baseline * 100.0

    # A single frame is pickled. Joining the frame onto itself inside the loop
    # would duplicate every column once per state without adding information.
    with open('fifty_states4.pickle', 'wb') as pickle_out:
        pickle.dump(main_df, pickle_out)

    
def HPI_Benchmark():
    """Fetch the FMAC/HPI_USA dataset and return it with a rebased benchmark column.

    'NSA Value' is converted to percent change from its first observation and
    renamed to 'US_HPI'. Any other downloaded columns are returned unchanged.

    Returns:
        pandas.DataFrame: the downloaded frame with 'NSA Value' replaced by 'US_HPI'.
    """
    df = quandl.get("FMAC/HPI_USA", authtoken=api_key)
    # .iloc[0] is explicitly positional; plain [0] relies on pandas'
    # deprecated positional fallback for label-based indexing.
    baseline = df['NSA Value'].iloc[0]
    df['NSA Value'] = (df['NSA Value'] - baseline) / baseline * 100.0
    return df.rename(columns={'NSA Value': 'US_HPI'})
    
def mortgage_30y():
    """Fetch the FMAC/MORTG series from 1975 onward as a monthly 'M30' column.

    'Value' is converted to percent change from its first observation, renamed
    to 'M30', and resampled to monthly means.

    Returns:
        pandas.DataFrame: a single-column frame named 'M30'.
    """
    df = quandl.get("FMAC/MORTG", trim_start='1975-01-01', authtoken=api_key)
    # .iloc[0] is explicitly positional; plain [0] relies on pandas'
    # deprecated positional fallback for label-based indexing.
    baseline = df['Value'].iloc[0]
    df['Value'] = (df['Value'] - baseline) / baseline * 100.0
    df = df.rename(columns={'Value': 'M30'}).resample('M').mean()
    # Keeps the single-column contract explicit: this assignment raises if the
    # download ever carries more than one column.
    df.columns = ['M30']
    return df



In [3]:
# Load a previously pickled frame.
# NOTE(review): nothing visible in this notebook writes 'fiddy_states3.pickle'
# (grab_initial_state_data writes 'fifty_states4.pickle'), and HPI_data is not
# used in the cells shown -- confirm the file name and whether it is needed.
HPI_data = pd.read_pickle('fiddy_states3.pickle')
m30 = mortgage_30y()
HPI_Bench = HPI_Benchmark()
m30.columns=['M30']  # mortgage_30y() already names its single column 'M30'
HPI = HPI_Bench.join(m30)  # index-aligned left join onto the benchmark frame
print(HPI.head())

              US_HPI   SA Value       M30
Date                                     
1975-01-31  0.000000  23.587964  0.000000
1975-02-28  0.620939  23.738887 -3.393425
1975-03-31  1.666642  23.954488 -5.620361
1975-04-30  3.034074  24.224848 -6.468717
1975-05-31  3.851987  24.367024 -5.514316


In [4]:
# Pairwise correlation between the columns of the joined frame.
print(HPI.corr())

            US_HPI  SA Value       M30
US_HPI    1.000000  0.999811 -0.777935
SA Value  0.999811  1.000000 -0.778349
M30      -0.777935 -0.778349  1.000000


In [5]:
# Persist the joined frame so later cells can reload it without re-downloading.
HPI.to_pickle('HPI.pickle')

## Rolling Apply and Mapping Functions 

In [6]:
# Reload the frame written by HPI.to_pickle('HPI.pickle').
housing_data = pd.read_pickle('HPI.pickle')

In [7]:
# Preview the reloaded frame before any transformation.
housing_data.head()

Unnamed: 0_level_0,US_HPI,SA Value,M30
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1975-01-31,0.0,23.587964,0.0
1975-02-28,0.620939,23.738887,-3.393425
1975-03-31,1.666642,23.954488,-5.620361
1975-04-30,3.034074,24.224848,-6.468717
1975-05-31,3.851987,24.367024,-5.514316


### Percentage change over the whole data

In [8]:
# Replace every column with its row-over-row fractional change.
# US_HPI and M30 both start at 0.0, so their first computed change divides by
# zero and comes out as +/-inf; the first row is NaN because it has no predecessor.
# NOTE: this rebinds housing_data, so re-running the cell applies pct_change again.
housing_data = housing_data.pct_change()

In [9]:
# Inspect the result of pct_change, including the NaN and inf rows at the top.
housing_data.head()

Unnamed: 0_level_0,US_HPI,SA Value,M30
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1975-01-31,,,
1975-02-28,inf,0.006398,-inf
1975-03-31,1.684066,0.009082,0.65625
1975-04-30,0.820471,0.011286,0.150943
1975-05-31,0.269576,0.005869,-0.147541


### Dealing with inf and -inf values

In [10]:
# Turn both +inf and -inf into NaN so a single dropna() can remove them.
housing_data.replace([np.inf, -np.inf], np.nan, inplace=True)

### Dealing with NaN values

In [11]:
# Drop every row that contains at least one NaN (mutates housing_data in place).
housing_data.dropna(inplace=True)

In [12]:
# Confirm the NaN/inf rows are gone.
housing_data.head()

Unnamed: 0_level_0,US_HPI,SA Value,M30
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1975-03-31,1.684066,0.009082,0.65625
1975-04-30,0.820471,0.011286,0.150943
1975-05-31,0.269576,0.005869,-0.147541
1975-06-30,0.038142,-0.001025,0.038462
1975-07-31,0.057269,0.001231,0.0


In [13]:
# Align the next row's 'SA Value' onto the current row (shift(-1) moves values
# up by one), which leaves NaN in the final row.
# NOTE(review): the column is named US_HPI_Future but is built from 'SA Value',
# not 'US_HPI' -- confirm this is intended.
housing_data['US_HPI_Future'] = housing_data['SA Value'].shift(-1)

In [14]:
# Drop rows containing NaN -- here the final row, which has no future value after the shift.
housing_data.dropna(inplace=True)

In [15]:
# Show the future value next to the current one to check the alignment.
housing_data[['US_HPI_Future', 'SA Value']].head()

Unnamed: 0_level_0,US_HPI_Future,SA Value
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1975-03-31,0.011286,0.009082
1975-04-30,0.005869,0.011286
1975-05-31,-0.001025,0.005869
1975-06-30,0.001231,-0.001025
1975-07-31,0.004444,0.001231


In [16]:
def create_labels(cur_hpi, fut_hpi):
    """Label whether the future value exceeds the current one.

    Returns 1 when ``fut_hpi`` is strictly greater than ``cur_hpi``, otherwise
    0 (ties, and comparisons involving NaN, fall in the 0 case).
    """
    return 1 if fut_hpi > cur_hpi else 0

In [17]:
# Pair each row's current 'SA Value' with its future value and label the row
# 1 when the future value is higher, else 0.
housing_data['label'] = [
    create_labels(current, future)
    for current, future in zip(housing_data['SA Value'], housing_data['US_HPI_Future'])
]

In [18]:
# Preview the frame with the new 'label' column.
print(housing_data.head())

              US_HPI  SA Value       M30  US_HPI_Future  label
Date                                                          
1975-03-31  1.684066  0.009082  0.656250       0.011286      1
1975-04-30  0.820471  0.011286  0.150943       0.005869      0
1975-05-31  0.269576  0.005869 -0.147541      -0.001025      0
1975-06-30  0.038142 -0.001025  0.038462       0.001231      1
1975-07-31  0.057269  0.001231  0.000000       0.004444      1


In [19]:
from statistics import mean

In [20]:
def moving_average(values):
    """Return the arithmetic mean of ``values`` as computed by ``statistics.mean``."""
    average = mean(values)
    return average

In [21]:
# 10-row rolling mean of M30 computed through a custom function.
# raw=True hands each window to moving_average as a NumPy array; stating it
# explicitly avoids the FutureWarning older pandas emits when `raw` is unset.
housing_data['ma_apply_exam'] = housing_data['M30'].rolling(10).apply(moving_average, raw=True)


  """Entry point for launching an IPython kernel.


In [22]:
# Show the most recent rows, including the rolling-mean column.
print(housing_data.tail())

              US_HPI  SA Value  M30  US_HPI_Future  label  ma_apply_exam
Date                                                                    
2019-01-31  0.001378  0.003768  0.0       0.004404      1            0.0
2019-02-28  0.006430  0.004404  0.0       0.001670      0            0.0
2019-03-31  0.010017  0.001670  0.0       0.002218      1            0.0
2019-04-30  0.011743  0.002218  0.0       0.003128      1            0.0
2019-05-31  0.011412  0.003128  0.0       0.002673      0            0.0


In [23]:
from sklearn import svm, preprocessing
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split

In [41]:
# Re-run the inf -> NaN cleanup now that new columns have been added.
housing_data.replace([np.inf, -np.inf], np.nan, inplace=True)

In [42]:
# Drop rows with NaN; the 10-row rolling window leaves its first 9 rows without a value.
housing_data.dropna(inplace=True)

In [43]:
# Feature matrix: every column except the target ('label') and the look-ahead
# column it was derived from ('US_HPI_Future').
# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
X = np.array(housing_data.drop(columns=['label', 'US_HPI_Future']))
# Standardise each feature column.
X = preprocessing.scale(X)

In [44]:
# Target vector: the 0/1 label produced by create_labels.
y = np.array(housing_data['label'])

In [45]:
# Sanity check: (rows, features).
X.shape

(522, 4)

In [46]:
# Sanity check: the row count must match X.
y.shape

(522,)

In [47]:
# Hold out 30% of the rows for testing. The fixed random_state makes the
# shuffle, and therefore the reported score, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [48]:
# Support-vector classifier with a linear kernel; all other settings are library defaults.
clf = svm.SVC(kernel='linear')

In [49]:
# Fit the classifier on the training split.
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [32]:
# Mean accuracy on the held-out split.
# NOTE(review): this cell's execution count (32) is lower than the fit cell's
# (49), so the recorded score may predate the current fit -- re-run top to bottom.
print(clf.score(X_test, y_test))

0.5625
