In [5]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

from acquire import get_zillow_data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20394 entries, 0 to 20393
Data columns (total 78 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            20394 non-null  int64  
 1   parcelid                      20394 non-null  int64  
 2   airconditioningtypeid         6512 non-null   float64
 3   architecturalstyletypeid      49 non-null     float64
 4   basementsqft                  11 non-null     float64
 5   bathroomcnt                   20394 non-null  float64
 6   bedroomcnt                    20394 non-null  float64
 7   buildingclasstypeid           0 non-null      object 
 8   buildingqualitytypeid         12641 non-null  float64
 9   calculatedbathnbr             20310 non-null  float64
 10  decktypeid                    175 non-null    float64
 11  finishedfloor1squarefeet      1673 non-null   float64
 12  calculatedfinishedsquarefeet  20341 non-null  float64
 13  f

In [8]:
# Load the Zillow data via the acquire helper and preview the first rows.
df = get_zillow_data()
df.head()

Unnamed: 0.1,Unnamed: 0,id,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,...,parcelid.1,logerror,transactiondate,buildingclasstypeid.1,buildingclassdesc,storytypeid.1,storydesc,typeconstructiontypeid.1,typeconstructiondesc,parcelid.2
0,0,2061546,11289917,1.0,,,2.0,3.0,,6.0,...,11289917,-0.362001,2017-06-23,,,,,,,11289917
1,1,1834372,11705026,,,,1.0,2.0,,6.0,...,11705026,-0.146056,2017-06-30,,,,,,,11705026
2,2,1923117,14269464,,,,3.0,4.0,,,...,14269464,0.021085,2017-06-01,,,,,,,14269464
3,3,2121349,11389003,,,,2.0,3.0,,6.0,...,11389003,-0.325393,2017-06-01,,,,,,,11389003
4,4,2093710,11967869,,,,1.0,2.0,,5.0,...,11967869,-0.005566,2017-06-29,,,,,,,11967869


In [51]:
# Let's figure out how much data is missing where
def nulls_by_col(df):
    '''
    Summarize missing values for each column of df.

    Returns a DataFrame indexed by df's column names with two columns:
    'number_missing_rows'  - count of null entries in that column
    'percent_rows_missing' - that count divided by the number of rows in df
                             (a fraction, not a 0-100 percentage)
    '''
    total_rows = len(df)
    missing_counts = df.isnull().sum()
    summary = pd.DataFrame({
        'number_missing_rows': missing_counts,
        'percent_rows_missing': missing_counts / total_rows,
    })
    return summary

In [15]:
def prepare_zillow():
    '''
    Acquire the Zillow property values data and reduce it to a first-iteration
    (MVP) frame.

    Takes no arguments. Loads the data with get_zillow_data(), drops a fixed
    list of columns, then drops every row that still contains a NaN.

    Returns only the resulting DataFrame. No train/validate/test split and no
    X/y separation (on 'taxvaluedollarcnt') is performed here; callers that
    need them must split the returned frame themselves.
    '''
    # Features excluded from the MVP (first iteration)
    mvp_excluded_columns = [
        'Unnamed: 0',
        'calculatedbathnbr',
        'fips',
        'latitude',
        'longitude',
        'regionidcounty',
        'roomcnt',
        'yearbuilt',
        'assessmentyear',
        'propertycountylandusecode',
        'propertylandusetypeid',
        'parcelid.2',
        'parcelid.1',
    ]

    df = get_zillow_data()
    df = df.drop(columns=mvp_excluded_columns)

    # Drop rows with NaNs.
    # NOTE(review): the .info() output in this notebook lists columns such as
    # buildingclasstypeid with 0 non-null values. If those columns are still
    # present at this point, dropna() removes every row -- confirm against the
    # actual frame returned by get_zillow_data().
    df = df.dropna()

    return df


In [84]:
def clean_zillow(cached=True):
    '''
    Acquire the Zillow data and strip it down to the columns kept for analysis.

    Loads the data with get_zillow_data(), removes fully duplicated rows, then
    drops a fixed list of columns. Rows containing NaNs are NOT dropped, so the
    returned frame can still contain missing values.

    cached: accepted for backward compatibility but currently unused; it is
        not forwarded to get_zillow_data().
        NOTE(review): presumably intended to toggle reading from a cached
        csv -- confirm against the acquire module.

    Returns the cleaned DataFrame.
    Raises KeyError (from DataFrame.drop) if any listed column is missing.
    '''
    # Hard-coded drop list. It mixes duplicated columns (the '.1' / '.2'
    # suffixed names and 'Unnamed: 0') with columns the original author flagged
    # as having more than 50% nulls, plus other features left out of this pass.
    # NOTE(review): not every entry is above 50% null per the notebook's
    # .info() output (e.g. buildingqualitytypeid) -- confirm the intended rule.
    columns_to_drop = [
        'heatingorsystemtypeid', 'taxdelinquencyflag', 'taxdelinquencyyear',
        'yardbuildingsqft17', 'finishedsquarefeet50', 'finishedfloor1squarefeet',
        'fireplacecnt', 'threequarterbathnbr', 'pooltypeid7', 'poolcnt',
        'numberofstories', 'airconditioningdesc', 'garagetotalsqft',
        'garagecarcnt', 'regionidneighborhood', 'hashottuborspa', 'pooltypeid2',
        'poolsizesum', 'pooltypeid10', 'typeconstructiontypeid',
        'typeconstructiondesc', 'architecturalstyledesc', 'finishedsquarefeet6',
        'fireplaceflag', 'yardbuildingsqft26', 'finishedsquarefeet13',
        'storytypeid', 'storydesc', 'basementsqft', 'finishedsquarefeet15',
        'buildingclassdesc', 'architecturalstyletypeid', 'airconditioningtypeid',
        'buildingclasstypeid', 'buildingqualitytypeid', 'decktypeid',
        'architecturalstyletypeid.1', 'airconditioningtypeid.1',
        'heatingorsystemtypeid.1', 'propertylandusetypeid.1',
        'buildingclasstypeid.1', 'storytypeid.1', 'typeconstructiontypeid.1',
        'id.1', 'Unnamed: 0', 'calculatedbathnbr', 'fips', 'latitude',
        'longitude', 'regionidcounty', 'roomcnt', 'yearbuilt', 'assessmentyear',
        'propertycountylandusecode', 'propertylandusetypeid', 'parcelid.2',
        'parcelid.1',
    ]

    # Duplicate rows are removed BEFORE the column drop, so duplication is
    # judged on the full set of columns. No in-place mutation is used.
    df = (
        get_zillow_data()
        .drop_duplicates()
        .drop(columns=columns_to_drop)
    )
    return df

In [81]:
# Rebind df to the cleaned frame (duplicate rows and the hard-coded columns removed).
df = clean_zillow()

In [82]:
# Preview the first rows of the frame returned by clean_zillow().
df.head()

Unnamed: 0,id,parcelid,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,finishedsquarefeet12,fullbathcnt,lotsizesquarefeet,propertyzoningdesc,rawcensustractandblock,...,unitcnt,structuretaxvaluedollarcnt,taxvaluedollarcnt,landtaxvaluedollarcnt,taxamount,censustractandblock,heatingorsystemdesc,propertylandusedesc,logerror,transactiondate
0,2061546,11289917,2.0,3.0,1458.0,1458.0,2.0,8284.0,LRR6000*,60379010.0,...,1.0,108890.0,136104.0,27214.0,2319.9,60379010000000.0,Central,Single Family Residential,-0.362001,2017-06-23
1,1834372,11705026,1.0,2.0,1421.0,1421.0,1.0,6707.0,LAR1,60372320.0,...,1.0,11982.0,35606.0,23624.0,543.69,60372320000000.0,Central,Single Family Residential,-0.146056,2017-06-30
2,1923117,14269464,3.0,4.0,2541.0,2541.0,3.0,4975.0,,60590640.0,...,,434887.0,880456.0,445569.0,9819.72,60590640000000.0,,Single Family Residential,0.021085,2017-06-01
3,2121349,11389003,2.0,3.0,1650.0,1650.0,2.0,7300.0,LCR1YY,60377030.0,...,1.0,165000.0,614000.0,449000.0,7673.19,60377030000000.0,Central,Single Family Residential,-0.325393,2017-06-01
4,2093710,11967869,1.0,2.0,693.0,693.0,1.0,2908.0,LAR1,60371850.0,...,1.0,82416.0,274237.0,191821.0,3267.47,60371850000000.0,Floor/Wall,Single Family Residential,-0.005566,2017-06-29


In [83]:
# Per-column null summary for the cleaned frame, most-missing columns first.
nulls_by_column = nulls_by_col(df).sort_values(
    by="percent_rows_missing", ascending=False
)
nulls_by_column
                            

Unnamed: 0,number_missing_rows,percent_rows_missing
unitcnt,7642,0.374718
propertyzoningdesc,7613,0.373296
heatingorsystemdesc,7298,0.35785
lotsizesquarefeet,2263,0.110964
regionidcity,366,0.017946
finishedsquarefeet12,112,0.005492
fullbathcnt,84,0.004119
censustractandblock,78,0.003825
calculatedfinishedsquarefeet,53,0.002599
structuretaxvaluedollarcnt,33,0.001618
