In [1]:
import os
import pandas as pd
from scipy import stats
from pydataset import data
import numpy as np
import wrangle
import env

In [2]:
def get_connection(db, username=env.username, host=env.host, password=env.password):
    return f'mysql+pymysql://{username}:{password}@{host}/{db}'
    '''
    this function acts as a part of the function below to establish a connection
    with the sql server
    '''

In [3]:
def get_zillow_sfr_data():
    
    '''
    this function retrieves the zillow info from the sql server
    or calls up the csv if it's saved in place
    
    '''
    
    filename = "zillow_sfr.csv"

    if os.path.isfile(filename):
        return pd.read_csv(filename)
    else:
        # read the SQL query into a dataframe
        df = pd.read_sql('SELECT bedroomcnt, bathroomcnt, calculatedfinishedsquarefeet, taxvaluedollarcnt, yearbuilt, taxamount, fips FROM properties_2017 JOIN propertylandusetype USING (propertylandusetypeid) WHERE propertylandusedesc = "Single Family Residential"', get_connection ('zillow'))

        # Write that dataframe to disk for later. Called "caching" the data for later.
        df.to_csv(filename)

        # Return the dataframe to the calling code
        return df  

In [4]:
sfr_df = get_zillow_sfr_data()

In [5]:
sfr_df.head()

Unnamed: 0.1,Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips
0,0,0.0,0.0,,27516.0,,,6037.0
1,1,0.0,0.0,,10.0,,,6037.0
2,2,0.0,0.0,,10.0,,,6037.0
3,3,0.0,0.0,,2108.0,,174.21,6037.0
4,4,4.0,2.0,3633.0,296425.0,2005.0,6941.39,6037.0


In [6]:
train, validate, test = wrangle.clean_prep_zillow(sfr_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop([df.columns[0]], axis = 1, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns = {'bedroomcnt' : 'bedroom',


In [7]:
train.head()

Unnamed: 0,bedroom,bathroom,sqrft,tax_value,year_built,taxamount,fips
1766466,3.0,2.0,1642.0,231120.0,1960.0,2898.13,6037.0
1073889,3.0,2.0,1858.0,178499.0,1955.0,2204.12,6059.0
1223806,5.0,5.0,4433.0,913447.0,2001.0,10717.51,6037.0
1694410,2.0,1.0,888.0,54621.0,1953.0,606.26,6037.0
1436362,2.0,2.0,1966.0,760368.0,1925.0,8290.04,6037.0


In [7]:
sfr_df.isna().sum()

Unnamed: 0                         0
bedroomcnt                        11
bathroomcnt                       11
calculatedfinishedsquarefeet    8484
taxvaluedollarcnt                493
yearbuilt                       9337
taxamount                       4442
fips                               0
dtype: int64

In [8]:
sfr_clean = sfr_df.dropna()

In [9]:
sfr_clean.isna().sum()

Unnamed: 0                      0
bedroomcnt                      0
bathroomcnt                     0
calculatedfinishedsquarefeet    0
taxvaluedollarcnt               0
yearbuilt                       0
taxamount                       0
fips                            0
dtype: int64

In [10]:
sfr_clean.isna()

Unnamed: 0.1,Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips
4,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False
11,False,False,False,False,False,False,False,False
14,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
2152856,False,False,False,False,False,False,False,False
2152858,False,False,False,False,False,False,False,False
2152859,False,False,False,False,False,False,False,False
2152861,False,False,False,False,False,False,False,False


In [11]:
sfr_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2140235 entries, 4 to 2152862
Data columns (total 8 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   Unnamed: 0                    int64  
 1   bedroomcnt                    float64
 2   bathroomcnt                   float64
 3   calculatedfinishedsquarefeet  float64
 4   taxvaluedollarcnt             float64
 5   yearbuilt                     float64
 6   taxamount                     float64
 7   fips                          float64
dtypes: float64(7), int64(1)
memory usage: 147.0 MB


In [17]:
sfr_clean.rename(columns = {'bedroomcnt' : 'bedroom',
                       'bathroomcnt': 'bathroom',
                       'calculatedfinishedsquarefeet':'sqrft',
                       'taxvaluedollarcnt':'tax_value',
                       'yearbuilt':'year_built'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sfr_clean.rename(columns = {'bedroomcnt' : 'bedroom',


In [18]:
sfr_clean.head()

Unnamed: 0.1,Unnamed: 0,bedroom,bathroom,sqrft,tax_value,year_built,taxamount,fips
4,4,4.0,2.0,3633.0,296425.0,2005.0,6941.39,6037.0
6,6,3.0,4.0,1620.0,847770.0,2011.0,10244.94,6037.0
7,7,3.0,2.0,2077.0,646760.0,1926.0,7924.68,6037.0
11,11,0.0,0.0,1200.0,5328.0,1972.0,91.6,6037.0
14,14,0.0,0.0,171.0,6920.0,1973.0,255.17,6037.0


In [14]:
sfr_clean.drop_duplicates(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sfr_clean.drop_duplicates(inplace = True)
