In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import graphviz
from graphviz import Graph

import env
import wrangle_zillow
import Ainslie_wrangle_zillow
import os

# turn off pink boxes for demo
import warnings
warnings.filterwarnings("ignore")

In [2]:
# change display settings to show all columns
pd.set_option("display.max_columns", None)

In [3]:
# use a function to pull in zillow data
df = wrangle_zillow.wrangle_zillow()
df.shape

(50782, 24)

In [4]:
# use a function to split data for exploring and modeling
train, validate, test = wrangle_zillow.split_data(df)
train.head(3)

Unnamed: 0,bathrooms,bedrooms,area,counties,latitude,longitude,lot_area,regionidcity,regionidcounty,regionidzip,roomcnt,yearbuilt,structuretaxvalue,taxvalue,assessmentyear,landtaxvalue,taxamount,logerror,transactiondate,los_angeles,orange,ventura,age,bath_bed_ratio
12794,2.0,4,1352,los_angeles,34273883,-118492869,6925,12447.0,3101,96370,0.0,1955.0,148000,505000,2016,357000,6228,0.03,2017-02-27,1,0,0,62,0.5
76021,3.0,4,1980,los_angeles,34362339,-117653901,8763,21395.0,3101,97324,0.0,1981.0,243342,303236,2016,59894,3275,0.15,2017-09-13,1,0,0,36,0.75
16536,3.0,3,1484,los_angeles,34288283,-118376120,12470,12447.0,3101,96368,0.0,1990.0,177997,311590,2016,133593,3881,-0.03,2017-03-13,1,0,0,27,1.0


In [5]:
# use a function to scale data for modeling
scaler, train_scaled, validate_scaled, test_scaled = wrangle_zillow.min_max_scaler(train, validate, test)
train_scaled.shape, validate_scaled.shape, test_scaled.shape

((28437, 24), (12188, 24), (10157, 24))

In [6]:
train.head()

Unnamed: 0,bathrooms,bedrooms,area,counties,latitude,longitude,lot_area,regionidcity,regionidcounty,regionidzip,roomcnt,yearbuilt,structuretaxvalue,taxvalue,assessmentyear,landtaxvalue,taxamount,logerror,transactiondate,los_angeles,orange,ventura,age,bath_bed_ratio
12794,0.11,0.3,0.11,los_angeles,0.63,0.5,0.0,0.02,3101,96370,0.0,0.56,0.09,0.25,2016,0.19,0.11,0.58,2017-02-27,1.0,0.0,0.0,0.44,0.04
76021,0.22,0.3,0.17,los_angeles,0.69,0.95,0.0,0.05,3101,97324,0.0,0.75,0.15,0.15,2016,0.03,0.06,0.6,2017-09-13,1.0,0.0,0.0,0.25,0.08
16536,0.22,0.2,0.13,los_angeles,0.64,0.57,0.0,0.02,3101,96368,0.0,0.82,0.11,0.15,2016,0.07,0.07,0.57,2017-03-13,1.0,0.0,0.0,0.18,0.11
114,0.11,0.2,0.15,orange,0.37,0.83,0.0,0.06,1286,97068,0.5,0.62,0.03,0.04,2016,0.01,0.02,0.58,2017-01-03,0.0,1.0,0.0,0.38,0.06
48044,0.0,0.2,0.08,los_angeles,0.39,0.63,0.0,0.29,3101,96040,0.0,0.5,0.03,0.07,2016,0.05,0.04,0.58,2017-06-20,1.0,0.0,0.0,0.5,0.02


In [None]:
train_scaled.head()

In [None]:
train.info()

In [None]:
train.describe().T

In [None]:
# check bathroom value counts
train.bathrooms.value_counts().plot.bar()

In [None]:
# check bedroom value counts
train.bedrooms.value_counts().plot.bar()

In [None]:
# check binned area value counts
train.area.value_counts(bins=10).plot.bar()

In [None]:
# check binned logerror value counts
train.logerror.value_counts(bins=10).plot.bar()

In [None]:
# check binned logerror value counts
train.counties.value_counts().plot.bar()

In [None]:
train.columns

In [None]:
# use a heatmap to see if there are any obvious correlations
plt.figure(figsize=(12,10))
sns.heatmap(train.corr(), annot=True)

In [None]:
train.info()