In [108]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import env

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

from acquire import get_zillow_data

In [109]:
# Pull the zillow data through the project's acquire helper and preview the first rows.
df = get_zillow_data()
df.head()

Unnamed: 0.1,Unnamed: 0,id,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,...,parcelid.1,logerror,transactiondate,buildingclasstypeid.1,buildingclassdesc,storytypeid.1,storydesc,typeconstructiontypeid.1,typeconstructiondesc,parcelid.2
0,0,2061546,11289917,1.0,,,2.0,3.0,,6.0,...,11289917,-0.362001,2017-06-23,,,,,,,11289917
1,1,1834372,11705026,,,,1.0,2.0,,6.0,...,11705026,-0.146056,2017-06-30,,,,,,,11705026
2,2,1923117,14269464,,,,3.0,4.0,,,...,14269464,0.021085,2017-06-01,,,,,,,14269464
3,3,2121349,11389003,,,,2.0,3.0,,6.0,...,11389003,-0.325393,2017-06-01,,,,,,,11389003
4,4,2093710,11967869,,,,1.0,2.0,,5.0,...,11967869,-0.005566,2017-06-29,,,,,,,11967869


In [110]:
# Let's figure out how much data is missing where
def nulls_by_col(df):
    """Summarise missing values per column of ``df``.

    Returns a DataFrame indexed by the column names of ``df`` with two columns:

    * ``number_missing_rows``  -- count of null entries in that column
    * ``percent_rows_missing`` -- that count divided by the number of rows in
      ``df`` (a fraction between 0 and 1, not scaled to 100)
    """
    total_rows = len(df)
    missing_counts = df.isna().sum()
    return pd.DataFrame(
        {
            'number_missing_rows': missing_counts,
            'percent_rows_missing': missing_counts / total_rows,
        }
    )

In [111]:
# Rank the columns of the raw frame from most to least missing.
nulls_by_column = nulls_by_col(df).sort_values(by="percent_rows_missing", ascending=False)
nulls_by_column

Unnamed: 0,number_missing_rows,percent_rows_missing
buildingclassdesc,20394,1.000000
buildingclasstypeid.1,20394,1.000000
buildingclasstypeid,20394,1.000000
finishedsquarefeet15,20391,0.999853
storytypeid.1,20383,0.999461
...,...,...
fips,0,0.000000
bedroomcnt,0,0.000000
bathroomcnt,0,0.000000
parcelid,0,0.000000


In [115]:
def clean_zillow(cached=True):
    '''Load the zillow data with get_zillow_data() and trim it for analysis.

    Steps performed:
      1. read the data via get_zillow_data()
      2. remove exact duplicate rows
      3. drop a fixed list of columns (per the original author's note these
         are duplicated columns and columns with more than 50% nulls)

    NOTE(review): ``cached`` is accepted but never used in this body --
    get_zillow_data() is always called without arguments. Confirm whether the
    flag is meant to be forwarded to the acquire helper.

    Returns the trimmed DataFrame.
    '''
    # Columns removed from the acquired frame, in the order originally listed.
    unwanted_columns = [
        'heatingorsystemtypeid', 'taxdelinquencyflag', 'taxdelinquencyyear',
        'yardbuildingsqft17', 'finishedsquarefeet50', 'finishedfloor1squarefeet',
        'fireplacecnt', 'threequarterbathnbr', 'pooltypeid7', 'poolcnt',
        'numberofstories', 'airconditioningdesc', 'garagetotalsqft', 'garagecarcnt',
        'regionidneighborhood', 'hashottuborspa', 'pooltypeid2', 'poolsizesum',
        'pooltypeid10', 'typeconstructiontypeid', 'typeconstructiondesc',
        'architecturalstyledesc', 'finishedsquarefeet6', 'fireplaceflag',
        'yardbuildingsqft26', 'finishedsquarefeet13', 'storytypeid', 'storydesc',
        'basementsqft', 'finishedsquarefeet15', 'buildingclassdesc',
        'architecturalstyletypeid', 'airconditioningtypeid', 'buildingclasstypeid',
        'buildingqualitytypeid', 'decktypeid', 'architecturalstyletypeid.1',
        'airconditioningtypeid.1', 'heatingorsystemtypeid.1',
        'propertylandusetypeid.1', 'buildingclasstypeid.1', 'storytypeid.1',
        'typeconstructiontypeid.1', 'id.1', 'Unnamed: 0', 'calculatedbathnbr',
        'fips', 'latitude', 'longitude', 'regionidcounty', 'roomcnt', 'yearbuilt',
        'assessmentyear', 'propertycountylandusecode', 'propertylandusetypeid',
        'parcelid.2', 'parcelid.1',
    ]

    zillow = get_zillow_data()
    # De-duplicate rows on the acquired frame itself before trimming columns.
    zillow.drop_duplicates(inplace=True)
    return zillow.drop(columns=unwanted_columns)

In [116]:
# Rebind df to the cleaned frame (duplicates removed, unwanted columns dropped).
df = clean_zillow()

In [117]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,id,parcelid,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,finishedsquarefeet12,fullbathcnt,lotsizesquarefeet,propertyzoningdesc,rawcensustractandblock,...,unitcnt,structuretaxvaluedollarcnt,taxvaluedollarcnt,landtaxvaluedollarcnt,taxamount,censustractandblock,heatingorsystemdesc,propertylandusedesc,logerror,transactiondate
0,2061546,11289917,2.0,3.0,1458.0,1458.0,2.0,8284.0,LRR6000*,60379010.0,...,1.0,108890.0,136104.0,27214.0,2319.9,60379010000000.0,Central,Single Family Residential,-0.362001,2017-06-23
1,1834372,11705026,1.0,2.0,1421.0,1421.0,1.0,6707.0,LAR1,60372320.0,...,1.0,11982.0,35606.0,23624.0,543.69,60372320000000.0,Central,Single Family Residential,-0.146056,2017-06-30
2,1923117,14269464,3.0,4.0,2541.0,2541.0,3.0,4975.0,,60590640.0,...,,434887.0,880456.0,445569.0,9819.72,60590640000000.0,,Single Family Residential,0.021085,2017-06-01
3,2121349,11389003,2.0,3.0,1650.0,1650.0,2.0,7300.0,LCR1YY,60377030.0,...,1.0,165000.0,614000.0,449000.0,7673.19,60377030000000.0,Central,Single Family Residential,-0.325393,2017-06-01
4,2093710,11967869,1.0,2.0,693.0,693.0,1.0,2908.0,LAR1,60371850.0,...,1.0,82416.0,274237.0,191821.0,3267.47,60371850000000.0,Floor/Wall,Single Family Residential,-0.005566,2017-06-29


In [118]:
# Re-check the null counts on the cleaned frame, most-missing columns first.
nulls_by_column = nulls_by_col(df).sort_values(by="percent_rows_missing", ascending=False)
nulls_by_column

Unnamed: 0,number_missing_rows,percent_rows_missing
unitcnt,7642,0.374718
propertyzoningdesc,7613,0.373296
heatingorsystemdesc,7298,0.35785
lotsizesquarefeet,2263,0.110964
regionidcity,366,0.017946
finishedsquarefeet12,112,0.005492
fullbathcnt,84,0.004119
censustractandblock,78,0.003825
calculatedfinishedsquarefeet,53,0.002599
structuretaxvaluedollarcnt,33,0.001618


In [119]:
# Split the cleaned frame: hold out 20% as test, then take 30% of the
# remainder as validate; what is left is train. Same seed for both splits.
train_validate, test = train_test_split(df, test_size=.2, random_state=42)
train, validate = train_test_split(train_validate, test_size=.3, random_state=42)

In [121]:
# Clean the zillow data and split it into train / validate / test in one call.
def prep_zillow_data(test_size=.2, validate_size=.3, random_state=42):
    '''Clean the zillow data and split it into train, validate and test sets.

    The cleaned frame from clean_zillow() is split twice with
    train_test_split: first ``test_size`` of the rows are held out as the test
    set, then ``validate_size`` of the remaining rows become the validate set
    and the rest is the train set.

    Parameters
    ----------
    test_size : fraction of the cleaned rows held out as test (default .2)
    validate_size : fraction of the non-test rows used for validate (default .3)
    random_state : seed passed to both splits so results are repeatable
        (default 42)

    Returns
    -------
    (train, validate, test) : tuple of three DataFrames. Calling with no
    arguments reproduces the original hard-coded 20% / 30% / seed-42 split.
    '''
    df = clean_zillow()
    train_validate, test = train_test_split(
        df, test_size=test_size, random_state=random_state
    )
    train, validate = train_test_split(
        train_validate, test_size=validate_size, random_state=random_state
    )
    return train, validate, test

In [None]:
# Feature frame for the training split: every column except the 'logerror' target.
X_train = train.drop(columns=['logerror'])

# Disabled draft of the remaining X/y splits, kept for reference:
# X_validate = validate.drop(columns='logerror')
# X_test = test.drop(columns='logerror')
#
# y_train = train['logerror']
# y_validate = validate['logerror']
# y_test = test['logerror']
# train, validate, test, X_train, X_validate, X_test, y_train, y_validate, y_test

In [122]:
# Shell escape: stage prepare.ipynb for the next commit.
! git add 'prepare.ipynb'

In [123]:
# Shell escape: commit the staged changes with the message 'prepare'.
! git commit -m 'prepare'

[main 3eb32bb] prepare
 1 file changed, 182 insertions(+), 136 deletions(-)


In [124]:
# Shell escape: push the local commits to the configured remote.
! git push

Enumerating objects: 5, done.
Counting objects: 100% (5/5), done.
Delta compression using up to 4 threads
Compressing objects: 100% (3/3), done.
Writing objects: 100% (3/3), 1.34 KiB | 686.00 KiB/s, done.
Total 3 (delta 2), reused 0 (delta 0)
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
To github.com:Gabby-B-B/clustering-project.git
   c0cd2fd..3eb32bb  main -> main
