In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# wrangling
import pandas as pd
import numpy as np

# preparing
from sklearn.model_selection import train_test_split

# visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# exploring
import scipy.stats as stats
import pandas_profiling

# modeling
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression
from statsmodels.formula.api import ols
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.feature_selection import f_regression
from sklearn.neighbors import KNeighborsRegressor
from math import sqrt

# 3D projection
from mpl_toolkits.mplot3d import Axes3D

pd.options.display.float_format = '{:20,.2f}'.format

# my modules
import acquire
import summarize
import prepare

# default pandas decimal number display format
pd.options.display.float_format = '{:20,.2f}'.format

### Acquire df

- Query brought in 52,169

In [2]:
# df = acquire.get_zillow_data()

In [3]:
df = pd.read_csv("zillow_dataframe.csv")

In [4]:
df.shape

(52169, 21)

- $H_0$: The Null Hypothesis is that the numeric values from the Zillow data do not predict the logerror.

- $H_a$: The Null Hypothesis is that the numeric values from the Zillow data do not predict the logerror.

### Summarize df

In [5]:
summarize.df_summary(df)

--- Shape: (52169, 21)
--- Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52169 entries, 0 to 52168
Data columns (total 21 columns):
Unnamed: 0                      52169 non-null int64
tax_rate                        52164 non-null float64
bathroomcnt                     52169 non-null float64
bedroomcnt                      52169 non-null float64
calculatedfinishedsquarefeet    52161 non-null float64
fips                            52169 non-null float64
garagecarcnt                    17966 non-null float64
garagetotalsqft                 17966 non-null float64
latitude                        52169 non-null float64
longitude                       52169 non-null float64
lotsizesquarefeet               51815 non-null float64
poolcnt                         11064 non-null float64
poolsizesum                     860 non-null float64
taxvaluedollarcnt               52168 non-null float64
yearbuilt                       52129 non-null float64
landtaxvaluedollarcnt           52168 

(-52.169, 5216.8]                    5217
(5216.8, 10433.6]                    5217
(10433.6, 15650.4]                   5217
(15650.4, 20867.2]                   5217
(20867.2, 26084.0]                   5217
(26084.0, 31300.8]                   5216
(31300.8, 36517.6]                   5217
(36517.6, 41734.4]                   5217
(41734.4, 46951.2]                   5217
(46951.2, 52168.0]                   5217
(-0.0017230000000000001, 0.0817]    52125
(0.0817, 0.163]                        30
(0.163, 0.245]                          3
(0.245, 0.327]                          2
(0.327, 0.408]                          1
(0.408, 0.49]                           1
(0.49, 0.572]                           1
(0.572, 0.653]                          0
(0.653, 0.735]                          0
(0.735, 0.816]                          1
(0.982, 2.7]                        36120
(2.7, 4.4]                          13787
(4.4, 6.1]                           2032
(6.1, 7.8]                        

- Here I use a function that takes in a dataframe of observations and attributes and returns a df where each row is an atttribute name, the first column is the number of rows with missing values for that attribute, and the second column is percent of total rows that have missing values for that attribute. Run the function and document takeaways from this on how you want to handle missing values.

In [6]:
summarize.nulls_by_col(df)

Unnamed: 0,num_rows_missing,pct_rows_missing
Unnamed: 0,0,0.0
tax_rate,5,0.0
bathroomcnt,0,0.0
bedroomcnt,0,0.0
calculatedfinishedsquarefeet,8,0.0
fips,0,0.0
garagecarcnt,34203,0.66
garagetotalsqft,34203,0.66
latitude,0,0.0
longitude,0,0.0


#### Takeaways from nulls in columns function

- I can see that there are columns that have no data in them, and those I will certainly drop. 


- There are others that are more than 50% NULL values, and I'm going to drop those as well. That is too high of a percentage of Nulls to make the data meaningful.

- Write a function that takes in a dataframe and returns a dataframe with 3 columns: the number of columns missing, percent of columns missing, number of rows with n columns missing. Run the function and document takeaways from this on how you want to handle missing values.

In [7]:
summarize.nulls_by_row(df)

Unnamed: 0,num_cols_missing,pct_cols_missing,num_rows
0,0,0.0,5
1,1,4.761904761904762,62
2,2,9.523809523809524,1092
3,3,14.285714285714285,3580
4,4,19.047619047619047,14909
5,5,23.809523809523807,6571
6,6,28.57142857142857,25687
7,7,33.33333333333333,256
8,8,38.095238095238095,7


#### Takeaways from the nulls by row function

- For my first iteration of the pipeline, I'm going to drop any rows that have missing values. 


- I will go back and add back and possibly impute values after my first iteration.

- This function will drop columns that are not 50% non-missing values and rows that are not 75% non-missing values

In [8]:
df = prepare.handle_missing_values(df)

In [9]:
df.isnull().sum()

Unnamed: 0                        0
tax_rate                          5
bathroomcnt                       0
bedroomcnt                        0
calculatedfinishedsquarefeet      8
fips                              0
latitude                          0
longitude                         0
lotsizesquarefeet               354
taxvaluedollarcnt                 1
yearbuilt                        40
landtaxvaluedollarcnt             1
logerror                          0
transactions                      0
county_name                       0
dtype: int64

- I am removing columns for the following reasons...

    - "unitcnt", "propertyzoningdesc", "heatingorsystemdesc", 
      "heatingorsystemtypeid", "buildingqualitytypeid" : missing 18,451 values
    
    - "assessmentyear": constant value of 2016
    
    - "calculatedbathnbr": has a correlation of 1.0 with bathroomcnt
    
    - "finishedsquarefeet12": has a correlcation of 1.0 with calculatedsquarefeet
    
    - "propertylandusedesc", "propertylandusetypeid": these are all "261" or single family residential.
    
    - "rawcensustractandblack": is highly correlated with fips/county
    
    - "state": are all California; not useful in analysis
    
    - "regionidcity": is represented by zipcodes
    
    - "roomcnt": over 36,000 values are 0 rooms
    
    - "parcelid": this is not needed beyond joining tables

In [10]:
# cols_to_remove = ["unitcnt", "propertyzoningdesc", "heatingorsystemdesc", 
#                   "heatingorsystemtypeid", "buildingqualitytypeid", "assessmentyear",
#                   "calculatedbathnbr", "finishedsquarefeet12", "propertylandusedesc", 
#                   "propertylandusetypeid", "rawcensustractandblock", "censustractandblock" , "regionidcity",
#                   "state", "id", "fullbathcnt", "roomcnt", "parcelid"]

- This function removes the columns above from the dataframe for the reasons I listed above.

In [11]:
#df = prepare.remove_columns(df, cols_to_remove)

- Drop rows with missing values using a df.dropna(), so I can run through my first iteration of exploration with visuals using numeric or categorical data types without any missing values. 


- I may decide later to impute values and save rows for the following columns.


    - "tax_rate": has five rows with missing values in tax_rate

    - "yearbuilt": has 40 missing values that I can't impute in a meaningful way. Dropping 40 rows out of the df should not be a problem.
    
    - "censustractandblock": has 112 values I can't impute in a meaningful way. I'll drop these rows.
    
    - "lotsizesquarefeet": 
    
    - "regionidcity":
    
    - "regionidzip":
    
    - "taxvaluedollarcnt":
    
    - "yearbuilt":
    
    - "structuretaxvaluedollarcnt":
    
    - "landtaxvaluedollarcnt":
    
    - "taxamount":

- Now I will drop remaining rows with missing values to get to a df that I can use to run a LR and DT baseline model

In [12]:
df.dropna(inplace=True)

- Verify there are no more missing values in my dataframe.


    - I'll be ready for visual exploration and transforming data types.

In [13]:
df.isnull().sum()

Unnamed: 0                      0
tax_rate                        0
bathroomcnt                     0
bedroomcnt                      0
calculatedfinishedsquarefeet    0
fips                            0
latitude                        0
longitude                       0
lotsizesquarefeet               0
taxvaluedollarcnt               0
yearbuilt                       0
landtaxvaluedollarcnt           0
logerror                        0
transactions                    0
county_name                     0
dtype: int64

- I have 51, 657 rows and 26 columns remaining in my df, and now I will deal with data types to get to a df of numeric values only.

In [14]:
df.shape

(51770, 15)

In [15]:
df.dtypes

Unnamed: 0                        int64
tax_rate                        float64
bathroomcnt                     float64
bedroomcnt                      float64
calculatedfinishedsquarefeet    float64
fips                            float64
latitude                        float64
longitude                       float64
lotsizesquarefeet               float64
taxvaluedollarcnt               float64
yearbuilt                       float64
landtaxvaluedollarcnt           float64
logerror                        float64
transactions                      int64
county_name                      object
dtype: object

- For the first iteration, I'm including transactiondate in column conversion to category. If I need to convert it to a datetime later, I will come back and run this.


- Later, I may decide to convert transactiondate column to data type datetime using the code below.

- Convert the following columns to category data types using a function.

In [16]:
cols = ["county", "transactiondate",  "fips", "propertycountylandusecode", "regionidcounty", "regionidzip", "yearbuilt", "transactions"]

In [17]:
df = prepare.numeric_to_category(df, cols)

KeyError: "['county', 'transactiondate', 'propertycountylandusecode', 'regionidcounty', 'regionidzip'] not in index"

In [None]:
df.dtypes

- Scale latitude and longitude, so they are not such large numbers for baseline models.

In [None]:
df.latitude = df.latitude / 1_000_000 

In [None]:
df.longitude = df.longitude / 1_000_000 

In [None]:
df.shape

- Remove outliers using Q1 - IQR * 3 and Q3 + IQR * 3

In [None]:
df = prepare.remove_outliers_iqr(df, columns=["bedroomcnt", "bathroomcnt"])
df.shape

### Split df into train, test - Whole df

In [21]:
train, test = train_test_split(df, test_size=.30, random_state=123)

In [22]:
train.shape

(36239, 15)

In [23]:
train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,36239.0,25995.12,15110.09,0.0,12880.5,25958.0,39086.0,52167.0
tax_rate,36239.0,0.01,0.01,0.0,0.01,0.01,0.01,0.49
bathroomcnt,36239.0,2.3,1.01,1.0,2.0,2.0,3.0,13.0
bedroomcnt,36239.0,3.31,0.93,1.0,3.0,3.0,4.0,10.0
calculatedfinishedsquarefeet,36239.0,1917.8,994.61,152.0,1268.0,1656.0,2296.0,20612.0
fips,36239.0,6049.17,21.03,6037.0,6037.0,6037.0,6059.0,6111.0
latitude,36239.0,34021346.21,273647.36,33341224.0,33826144.5,34022742.0,34186598.0,34818767.0
longitude,36239.0,-118193247.54,356115.45,-119475416.0,-118400296.5,-118152731.0,-117929795.0,-117554636.0
lotsizesquarefeet,36239.0,11128.43,86789.47,236.0,5566.5,6822.0,8750.0,6971010.0
taxvaluedollarcnt,36239.0,528439.64,735844.11,3254.0,193163.0,373000.0,618535.0,25381250.0


- Create x and y variables for whole df train and test to prepare to model

In [24]:
X_train = train.drop(columns="logerror")

In [25]:
y_train = train[["logerror"]]

In [26]:
X_test = test.drop(columns="logerror")

In [27]:
y_test = test[["logerror"]]

- Create a train and test df that is numeric dtypes only for modeling and correlation exploration.

In [28]:
nums_train = train.select_dtypes(exclude="category")

In [29]:
nums_test = test.select_dtypes(exclude="category")

In [30]:
nums_train.shape

(36239, 15)

In [31]:
nums_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36239 entries, 35698 to 15849
Data columns (total 15 columns):
Unnamed: 0                      36239 non-null int64
tax_rate                        36239 non-null float64
bathroomcnt                     36239 non-null float64
bedroomcnt                      36239 non-null float64
calculatedfinishedsquarefeet    36239 non-null float64
fips                            36239 non-null float64
latitude                        36239 non-null float64
longitude                       36239 non-null float64
lotsizesquarefeet               36239 non-null float64
taxvaluedollarcnt               36239 non-null float64
yearbuilt                       36239 non-null float64
landtaxvaluedollarcnt           36239 non-null float64
logerror                        36239 non-null float64
transactions                    36239 non-null int64
county_name                     36239 non-null object
dtypes: float64(12), int64(2), object(1)
memory usage: 4.4+ MB


- Create x and y variables for numeric df train and test to prepare to model

In [None]:
X_nums_train = nums_train.drop(columns="logerror")

In [None]:
y_nums_train = nums_train[["logerror"]]

In [None]:
X_nums_test = nums_test.drop(columns="logerror")

In [None]:
y_nums_test = nums_test[["logerror"]]

In [None]:
X_nums_train.head().T

In [None]:
X_nums_train.corr()

In [None]:
plt.figure(figsize=(14,8))
sns.heatmap(X_nums_train.corr(), annot=True)

In [None]:
sns.boxplot(X_nums_train.bedroomcnt, X_nums_train.calculatedfinishedsquarefeet)

In [None]:
sns.boxplot(X_nums_train.bathroomcnt, X_nums_train.calculatedfinishedsquarefeet)

In [None]:
sns.lmplot(data=X_nums_train, y="calculatedfinishedsquarefeet", X="bedroomcnt")

In [None]:
y_nums_train.head().T

### Model df - Linear Regression Model

In [None]:
# create the LR object

lm1 = LinearRegression()
print(lm1)

In [None]:
# fit/train the model

model = lm1.fit(X_nums_train, y_nums_train)

In [None]:
# create yhat variable, numpy array, in-sample prediction values

yhat = model.predict(X_nums_train)
type(yhat)
yhat

In [None]:
# create df, actual in-sample target values

actual = y_nums_train
type(actual)
actual

- add yhat as a column to actual predictions df

In [None]:
actual['yhat'] = yhat

### Evaluate LR Model

In [None]:
actual.describe()

In [None]:
actual["residual"] = actual["yhat"] - actual["logerror"]

In [None]:
sns.distplot(actual["residual"])

In [None]:
sns.scatterplot(actual["logerror"], actual["yhat"])

In [None]:
actual["residual^2"] = actual["residual"] ** 2

In [None]:
actual.head()

In [None]:
sse = sum(actual["residual^2"])
print(f"sse = {sse}")
mse = sse / len(actual)
print(f"mse = {mse}")
rmse = sqrt(mse)
print(f"rmse = {rmse}")

In [None]:
df_eval = pd.DataFrame(np.array(["SSE", "MSE", "RMSE"]), columns=["metric"])
df_eval["model_error"] = np.array([sse, mse, rmse])
df_eval

- Create a numpy array from the y_df_nums_train df, so I can run eval metrics using yhat(prediction array) and y_array (array of actual y values)

In [None]:
y_array = np.array(y_nums_train)
type(y_array)

In [None]:
r2_score(actual.logerror, actual.yhat)

In [None]:
mse_lm1 = mean_squared_error(actual.logerror, actual.yhat)
print("linear model\n  mean squared error: {:.3}".format(mse_lm1)) 

r2_lm1 = r2_score(actual.logerror, actual.yhat)
print('  {:.2%} of the variance in the logerror can be explained by variables.'.format(r2_lm1))

### Takeaways from the Linear Regression Model:

- Based on the results above I fail to reject the Null Hypothesis that the numeric values alone can predict the logerror.

- Using a Linear Regression model to predict the logerror may not be the best choice.

- Finding a way to include categorical variables in a model may help predict logerror.

### Model df KNeighbors Regressor

In [None]:
# create the object
neigh = KNeighborsRegressor(n_neighbors=2)

In [None]:
# fit/train the model using train 
neigh.fit(X_nums_train, y_nums_train)

In [None]:
yhat = neigh.predict(X_nums_train)
type(yhat)

In [None]:
r2 = neigh.score(X_nums_train, y_nums_train)

In [None]:
print('  {:.2%} of the variance in the logerror can be explained by variables.'.format(r2))

### Takeaways from KNRegressor Model

- This model explained more than 50% more of the variance in logerror than my LR model using exactly the same data.

- The subset of data I used was comprised of the numeric values only from the original dataframe.

- I think this model could do even better with outliers handled and some feature engineering.