In [1]:
# Initial imports
import pandas as pd
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from matplotlib import style
style.use("ggplot")
from sklearn.linear_model import LinearRegression
%matplotlib inline
from sklearn.linear_model import LogisticRegression



# Needed for decision tree visualization
import pydotplus
from IPython.display import Image

In [2]:
# Read the state-level median listing price CSV (all homes) into a DataFrame
# and preview the first rows.

file_path = Path("../Resources/State_MedianListingPrice_AllHomes.csv")
df_zillow = pd.read_csv(filepath_or_buffer=file_path)
df_zillow.head(n=5)

Unnamed: 0,RegionName,SizeRank,2010-01,2010-02,2010-03,2010-04,2010-05,2010-06,2010-07,2010-08,...,2016-12,2017-01,2017-02,2017-03,2017-04,2017-05,2017-06,2017-07,2017-08,2017-09
0,California,1,,,,,,,,,...,465000.0,468888,475000,494900.0,499900.0,499999.0,499900.0,499950.0,499900.0,499950
1,Texas,2,160000.0,161000.0,162500.0,163990.0,163000.0,164000.0,160000.0,160692.0,...,249999.0,249900,255000,263900.0,269900.0,272000.0,274681.0,270000.0,269900.0,269000
2,New York,3,,,,,,,,,...,329000.0,339400,349000,349900.0,350000.0,349174.5,345000.0,339900.0,329999.0,339000
3,Florida,4,,,,,,,,,...,264900.0,269900,274000,275000.0,279000.0,279900.0,279990.0,279999.0,279900.0,283990
4,Illinois,5,,,,,,,,,...,185000.0,185000,197900,209876.5,215900.0,219900.0,224900.0,219900.0,215000.0,214900


In [3]:
# Count the missing values in each column (isna is the same check as isnull)
df_zillow.isna().sum()

RegionName     0
SizeRank       0
2010-01       15
2010-02       15
2010-03       14
              ..
2017-05        0
2017-06        0
2017-07        0
2017-08        0
2017-09        0
Length: 95, dtype: int64

In [4]:
# Remove the SizeRank column. Rebinding the name (instead of inplace=True) and
# passing errors="ignore" keeps this cell idempotent: re-running it after the
# column is already gone is a no-op rather than a KeyError.
df_zillow = df_zillow.drop(columns="SizeRank", errors="ignore")

In [5]:
# Use the state name (RegionName) as the row index; set_index returns a new
# frame, so df_zillow itself is left unchanged.
zillow = df_zillow.set_index(keys="RegionName")

zillow.head(n=5)

Unnamed: 0_level_0,2010-01,2010-02,2010-03,2010-04,2010-05,2010-06,2010-07,2010-08,2010-09,2010-10,...,2016-12,2017-01,2017-02,2017-03,2017-04,2017-05,2017-06,2017-07,2017-08,2017-09
RegionName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
California,,,,,,,,,,,...,465000.0,468888,475000,494900.0,499900.0,499999.0,499900.0,499950.0,499900.0,499950
Texas,160000.0,161000.0,162500.0,163990.0,163000.0,164000.0,160000.0,160692.0,169990.0,169385.0,...,249999.0,249900,255000,263900.0,269900.0,272000.0,274681.0,270000.0,269900.0,269000
New York,,,,,,,,,,,...,329000.0,339400,349000,349900.0,350000.0,349174.5,345000.0,339900.0,329999.0,339000
Florida,,,,,,,,,,,...,264900.0,269900,274000,275000.0,279000.0,279900.0,279990.0,279999.0,279900.0,283990
Illinois,,,,,,,,,,,...,185000.0,185000,197900,209876.5,215900.0,219900.0,224900.0,219900.0,215000.0,214900


In [6]:
# Keep only the rows (states) that have a value in every column; any row
# containing at least one NaN is discarded.
zillow = zillow.dropna(axis="index", how="any")

zillow

Unnamed: 0_level_0,2010-01,2010-02,2010-03,2010-04,2010-05,2010-06,2010-07,2010-08,2010-09,2010-10,...,2016-12,2017-01,2017-02,2017-03,2017-04,2017-05,2017-06,2017-07,2017-08,2017-09
RegionName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Texas,160000.0,161000.0,162500.0,163990.0,163000.0,164000.0,160000.0,160692.0,169990.0,169385.0,...,249999.0,249900,255000,263900.0,269900.0,272000.0,274681.0,270000.0,269900.0,269000
Pennsylvania,169900.0,169900.0,174900.0,174500.0,170000.0,169900.0,169900.0,169900.0,169999.0,169200.0,...,170000.0,169900,174900,189000.0,195000.0,195000.0,194900.0,194900.0,189900.0,189900
North Carolina,189900.0,189900.0,189900.0,189000.0,187500.0,185000.0,184000.0,179900.0,184900.0,179000.0,...,229000.0,225000,229900,235000.0,245000.0,249000.0,249900.0,249900.0,249000.0,249900
Virginia,230000.0,231750.0,234950.0,235000.0,234900.0,232000.0,230000.0,229900.0,235000.0,229900.0,...,269900.0,269000,274999,289000.0,299000.0,299900.0,299950.0,299900.0,295000.0,297970
Washington,259900.0,259000.0,260000.0,257500.0,250000.0,250000.0,254300.0,250000.0,254950.0,239900.0,...,309000.0,305950,315000,329006.0,338950.0,349000.0,350000.0,350000.0,350000.0,350000
Massachusetts,289900.0,304900.0,299900.0,299900.0,299900.0,299000.0,299000.0,299000.0,299900.0,299900.0,...,384900.0,379900,389900,409900.0,422000.0,429000.0,429000.0,424900.0,419000.0,419900
Indiana,124900.0,125000.0,127000.0,126900.0,126500.0,125900.0,124900.0,124900.0,125000.0,124900.0,...,139900.0,139900,140000,149900.0,159900.0,164900.0,165000.0,165900.0,164900.0,167000
Tennessee,159900.0,159000.0,156900.0,155000.0,154900.0,154900.0,154900.0,154900.0,163900.0,159900.0,...,199000.0,195000,199900,208000.0,214900.0,219900.0,222900.0,224950.0,224900.0,225000
Missouri,140000.0,141250.0,140000.0,139999.0,139900.0,139900.0,139900.0,139900.0,144900.0,142500.0,...,149900.0,149900,150825,159500.0,164900.0,168600.0,169900.0,169900.0,169900.0,169900
Maryland,245000.0,246000.0,245000.0,249000.0,243900.0,239900.0,234900.0,229900.0,239800.0,237500.0,...,275000.0,274900,279900,290790.0,299900.0,309000.0,307000.0,309000.0,300000.0,302900


In [7]:
#Build Linear Regression
#from sklearn.linear_model import LinearRegression
#model = LinearRegression()
#model

In [8]:
# Separate the data into the X and y components
# In this case, y will be the `Price` and X will be the remaining columns

#X = zdata.index.values.reshape(-1,1)
#y = zdata['Price']



In [9]:
# Fit the model to the data

#model.fit(X, y)

In [10]:
#predicted_y_values = model.predict(X)
#predicted_y_values

In [11]:
#Extracting coef, intercept

#model.coef_
#model.intercept_



In [12]:
#Plotting Results

#plt.scatter(X, y)
#plt.plot(X, predicted_y_values, color='red')


In [13]:
#Metrics - score, R2, mse, rmse, std

#from sklearn.metrics import mean_squared_error, r2_score

#score = model.score(X, y, sample_weight=None)
#r2 = r2_score(y, predicted_y_values)
#mse = mean_squared_error(y, predicted_y_values)
#rmse = np.sqrt(mse)
#std = np.std(y)

In [14]:
#Build Logistic Regression

#train test split
#from sklearn.model_selection import train_test_split

#X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, stratify=y)

In [15]:
#Create Model
#classifier = LogisticRegression(solver='lbfgs', random_state=78)
#classifier

In [16]:
# Train the data
#classifier.fit(X_train, y_train)

In [17]:
# Score the model
#print(f"Training Data Score: {classifier.score(X_train, y_train)}")
#print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

In [18]:
# Predict outcomes for test data set
#predictions = classifier.predict(X_test)
#pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

In [19]:
#Confusion Matrix
#from sklearn.metrics import confusion_matrix
#confusion_matrix(y_test, predictions)

In [20]:
#Classification Report
#from sklearn.metrics import classification_report
#print(classification_report(y_test, predictions))

In [21]:
#Build LSTM Model

#def window_data(df, window, feature_col_number, target_col_number):
#    X = []
#    y = []
#    for i in range(len(df) - window - 1):
#        features = df.iloc[i:(i + window), feature_col_number]
#        target = df.iloc[(i + window), target_col_number]
#        X.append(features)
#        y.append(target)
#    return np.array(X), np.array(y).reshape(-1, 1)

In [22]:
# Predict Closing Prices using a 10 day window of previous fng values
# Then, experiment with window sizes anywhere from 1 to 10 and see how the model performance changes
#window_size = 10

# Column index 0 is the 'fng_value' column
# Column index 1 is the `Close` column
#feature_column = 0
#target_column = 1
#X, y = window_data(df, window_size, feature_column, target_column)

In [23]:
# Use 70% of the data for training and the remainder for testing
#split = int(0.7 * len(X))

#X_train = X[: split]
#X_test = X[split:]
#y_train = y [: split]
#y_test = y[split:]

In [24]:
from sklearn.preprocessing import MinMaxScaler
# Use the MinMaxScaler to scale data between 0 and 1.
#scaler = MinMaxScaler()
#scaler.fit(X)
#X_train = scaler.transform(X_train)
#X_test = scaler.transform(X_test)
#scaler.fit(y)
#y_train = scaler.transform(y_train)
#y_test = scaler.transform(y_test)



In [25]:
# Reshape the features for the model
#X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
#X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

In [26]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

In [27]:
# Build the LSTM model. 
# The return sequences need to be set to True if you are adding additional LSTM layers,
# but you don't have to do this for the final layer.
# Note: The dropouts help prevent overfitting
# Note: The input shape is the number of time steps and the number of indicators
# Note: Batching inputs has a different input shape of Samples/TimeSteps/Features

#model = Sequential()

#number_units = 5
#dropout_fraction = 0.2


# Layer 1
#model.add(LSTM(
#    units=number_units,
#    return_sequences=True,
#    input_shape=(X_train.shape[1], 1))
#    )
#model.add(Dropout(dropout_fraction))

# Layer 2
#model.add(LSTM(units=number_units, return_sequences=True))
#model.add(Dropout(dropout_fraction))

# Layer 3
#model.add(LSTM(units=number_units))
#model.add(Dropout(dropout_fraction))

# Output layer
#model.add(Dense(1))

In [28]:
# Compile the model
#model.compile(optimizer='adam', loss="mean_squared_error")

In [29]:
# Summarize the model
#model.summary()

In [30]:
# Train the model
# Use at least 10 epochs
# Do not shuffle the data
# Experiment with the batch size, but a smaller batch size is recommended
#model.fit(X_train, y_train, epochs=10, shuffle=False, batch_size=1, verbose=1)

In [31]:
# Evaluate the model
#model.evaluate(X_test, y_test)

In [32]:
# Make some predictions
#predicted = model.predict(X_test)

In [33]:
# Recover the original prices instead of the scaled version
#predicted_prices = scaler.inverse_transform(predicted)
#real_prices = scaler.inverse_transform(y_test.reshape(-1, 1))

In [34]:
# Create a DataFrame of Real and Predicted values
#housing = pd.DataFrame({
#    "Real": real_prices.ravel(),
#    "Predicted": predicted_prices.ravel()
#}, index = df.index[-len(real_prices): ]) 

#housing.head()

In [35]:
# Plot the real vs predicted values as a line chart
#housing.plot()