In [1]:
#Ignore warnings
import warnings
warnings.filterwarnings("ignore")

import math
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

import scipy.stats as stats

import matplotlib.pyplot as plt 
import seaborn as sns

import wrangle as wr

In [2]:
# Load the raw Zillow data via the project-local `wrangle` module and preview
# the first rows. How `acquire` obtains the data (file, cache, database) is
# defined in wrangle.py and is not visible in this notebook.
zillow = wr.acquire()
zillow.head()

Unnamed: 0,id,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,censustractandblock,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,propertylandusedesc,storydesc,typeconstructiondesc
0,1727539,14297519,,,,3.5,4.0,,,3.5,...,60590630000000.0,0.025595,2017-01-01,,,,,Single Family Residential,,
1,1387261,17052889,,,,1.0,2.0,,,1.0,...,61110010000000.0,0.055619,2017-01-01,,,,,Single Family Residential,,
2,11677,14186244,,,,2.0,3.0,,,2.0,...,60590220000000.0,0.005383,2017-01-01,,,,,Single Family Residential,,
3,2288172,12177905,,,,3.0,4.0,,8.0,3.0,...,60373000000000.0,-0.10341,2017-01-01,,,,Central,Single Family Residential,,
4,1970746,10887214,1.0,,,3.0,3.0,,8.0,3.0,...,60371240000000.0,0.00694,2017-01-01,Central,,,Central,Condominium,,


In [3]:
# Size of the raw frame as (rows, columns); baseline for the filters that follow.
zillow.shape

(77380, 68)

In [4]:
# Summary statistics, transposed so that each column of the frame is shown as a row.
# The `count` column doubles as a quick look at how sparse each field is.
zillow.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,77380.0,1495126.0,860905.7,349.0,752050.0,1497870.0,2240480.0,2982274.0
parcelid,77380.0,13007150.0,3481368.0,10711860.0,11538300.0,12531550.0,14211840.0,167689300.0
airconditioningtypeid,24953.0,1.813289,2.967894,1.0,1.0,1.0,1.0,13.0
architecturalstyletypeid,206.0,7.38835,2.734542,2.0,7.0,7.0,7.0,21.0
basementsqft,50.0,679.72,689.7035,38.0,273.0,515.0,796.5,3560.0
bathroomcnt,77380.0,2.299134,0.9966566,0.0,2.0,2.0,3.0,18.0
bedroomcnt,77380.0,3.053489,1.139103,0.0,2.0,3.0,4.0,16.0
buildingclasstypeid,15.0,3.933333,0.2581989,3.0,4.0,4.0,4.0,4.0
buildingqualitytypeid,49671.0,6.534638,1.721933,1.0,6.0,6.0,8.0,12.0
calculatedbathnbr,76771.0,2.316871,0.9797606,1.0,2.0,2.0,3.0,18.0


### Observation takeaways:
- Data must only include single-family homes.
- Outliers need to be dealt with.
- Nulls need to be dealt with.
- Only the columns needed to examine the target variable should be kept in the dataframe; everything else can be dropped.

In [5]:
# Keep only single-unit homes (filter logic lives in wrangle.py).
# This rebinds `zillow`, so the raw frame is no longer available after this cell.
# NOTE(review): the preview after this step still contains a "Condominium" row,
# so the filter is not strictly "Single Family Residential" — confirm this is intended.
zillow = wr.get_single_unit_homes(zillow)

In [6]:
# Re-check the shape to see how many rows the single-unit filter removed.
zillow.shape

(71693, 68)

In [7]:
# Preview the filtered frame.
zillow.head()

Unnamed: 0,id,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,censustractandblock,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,propertylandusedesc,storydesc,typeconstructiondesc
0,1727539,14297519,,,,3.5,4.0,,,3.5,...,60590630000000.0,0.025595,2017-01-01,,,,,Single Family Residential,,
1,1387261,17052889,,,,1.0,2.0,,,1.0,...,61110010000000.0,0.055619,2017-01-01,,,,,Single Family Residential,,
2,11677,14186244,,,,2.0,3.0,,,2.0,...,60590220000000.0,0.005383,2017-01-01,,,,,Single Family Residential,,
3,2288172,12177905,,,,3.0,4.0,,8.0,3.0,...,60373000000000.0,-0.10341,2017-01-01,,,,Central,Single Family Residential,,
4,1970746,10887214,1.0,,,3.0,3.0,,8.0,3.0,...,60371240000000.0,0.00694,2017-01-01,Central,,,Central,Condominium,,


Now to check for nulls

In [8]:
# Per-column null summary: number of rows missing a value and the fraction
# of rows missing (0.0-1.0) for each column.
wr.nulls_data(zillow)

Unnamed: 0,rows_missing,percent_missing
id,0,0.000000
parcelid,0,0.000000
airconditioningtypeid,48670,0.678867
architecturalstyletypeid,71487,0.997127
basementsqft,71646,0.999344
...,...,...
buildingclassdesc,71693,1.000000
heatingorsystemdesc,25133,0.350564
propertylandusedesc,0,0.000000
storydesc,71646,0.999344


In [9]:
# Row-wise null summary: for each count of missing columns, how many rows have
# that many columns missing and what fraction of all columns that represents.
wr.null_cols(zillow)

Unnamed: 0,cols_missing,rows,percent_missing
0,23,2,0.338235
1,24,13,0.352941
2,25,24,0.367647
3,26,65,0.382353
4,27,312,0.397059
5,28,451,0.411765
6,29,5146,0.426471
7,30,3233,0.441176
8,31,9166,0.455882
9,32,11679,0.470588


#### From here, rows with more than 33% missing values will be dropped.

In [10]:
# Drop sparsely populated data, then re-run the per-column null summary.
# The thresholds are defined inside wr.handle_missing_values and are not visible here.
# NOTE(review): the row-wise summary above starts at 23 of 68 columns missing (~34%),
# so presumably columns are dropped before the row threshold is applied — confirm
# the order and the actual thresholds against wrangle.py.
zillow = wr.handle_missing_values(zillow)
wr.nulls_data(zillow)

Unnamed: 0,rows_missing,percent_missing
id,0,0.0
parcelid,0,0.0
bathroomcnt,0,0.0
bedroomcnt,0,0.0
calculatedbathnbr,215,0.002999
calculatedfinishedsquarefeet,149,0.002078
finishedsquarefeet12,328,0.004575
fips,0,0.0
fullbathcnt,215,0.002999
latitude,0,0.0


#### Now, I will focus on the important columns for my expected observations:

In [11]:
# Narrow the frame to the columns needed for the planned analysis.
# Rebinds `zillow` again; the column list is chosen inside wrangle.py.
zillow = wr.import_observed_columns(zillow)

In [12]:
# Verify the remaining columns, their dtypes, and non-null counts.
zillow.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71687 entries, 0 to 77379
Data columns (total 8 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   parcelid                      71687 non-null  int64  
 1   logerror                      71687 non-null  float64
 2   bathroomcnt                   71687 non-null  float64
 3   bedroomcnt                    71687 non-null  float64
 4   calculatedfinishedsquarefeet  71538 non-null  float64
 5   fips                          71687 non-null  float64
 6   yearbuilt                     71504 non-null  float64
 7   propertylandusedesc           71687 non-null  object 
dtypes: float64(6), int64(1), object(1)
memory usage: 4.9+ MB


#### The Questions to be asked when using this altered dataset: 
- Is log error significantly different per number of bathrooms?
- Is log error significantly different per number of bedrooms?
- Is log error significantly different per average square feet in LA vs. Orange vs. Ventura?
- Is log error significantly different per age of a home? 
- Is log error significantly different depending on the type of property?

### Primary Takeaways:
- There are still nulls; these will be imputed after the data is split into train, validate, and test sets.
- The columns should be renamed.