In [7]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# wrangling
import pandas as pd
import numpy as np

# visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# exploring
import scipy.stats as stats
import pandas_profiling

# modeling
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

# 3D projection
from mpl_toolkits.mplot3d import Axes3D

# my modules
import acquire
import summarize
import prepare

# default pandas decimal number display format:
# width 20, thousands separator, two decimals.
# Set once here, after all imports, for the whole notebook.
pd.options.display.float_format = '{:20,.2f}'.format

### Acquire df

- The query brought in 77,381 rows and 72 columns.

In [8]:
# Load the Zillow data through the project's acquire module and bind it to df.
df = acquire.get_zillow_data()

In [9]:
# Show (rows, columns) of the acquired frame.
df.shape

(77381, 72)

### Summarize df

In [10]:
# Print the project's summary report for df; the output below includes the
# shape, the DataFrame info listing, and a missing-columns-per-row table.
summarize.df_summary(df)

--- Shape: (77381, 72)
--- Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77381 entries, 0 to 77380
Data columns (total 72 columns):
county                          77381 non-null object
tax_rate                        77375 non-null float64
id                              77381 non-null int64
parcelid                        77381 non-null int64
airconditioningtypeid           24953 non-null float64
airconditioningdesc             24953 non-null object
architecturalstyletypeid        206 non-null float64
architecturalstyledesc          206 non-null object
basementsqft                    50 non-null float64
bathroomcnt                     77381 non-null float64
bedroomcnt                      77381 non-null float64
buildingclasstypeid             15 non-null float64
buildingclassdesc               15 non-null object
buildingqualitytypeid           49672 non-null float64
calculatedbathnbr               76772 non-null float64
calculatedfinishedsquarefeet    77185 non-null float64


   num_cols_missing    pct_cols_missing  num_rows
0                23  31.944444444444443         2
1                24   33.33333333333333        13
2                25   34.72222222222222        24
3                26   36.11111111111111        65
4                27                37.5       316
5                28   38.88888888888889       455
6                29   40.27777777777778      5270
7                30   41.66666666666667      3455
8                31   43.05555555555556      9891
9                32   44.44444444444444     12578
10               33   45.83333333333333     14783
11               34   47.22222222222222     13326
12               35   48.61111111111111      5147
13               36                50.0      5776
14               37  51.388888888888886      3620
15               38   52.77777777777778      1926
16               39  54.166666666666664       285
17               40   55.55555555555556       230
18               41   56.94444444444444        29


- Here I use a function that takes in a dataframe of observations and attributes and returns a dataframe where each row is an attribute name, the first column is the number of rows with missing values for that attribute, and the second column is the share of total rows that have missing values for that attribute (shown as a fraction between 0 and 1, e.g. 0.68). Run the function and document takeaways from this on how you want to handle missing values.

In [12]:
# Missing-value count and share per column.
# Note: pct_rows_missing is displayed as a 0-1 fraction rounded to two decimals,
# so 1.00 can still mean a handful of non-null rows (e.g. basementsqft has 50).
summarize.nulls_by_col(df)

Unnamed: 0,num_rows_missing,pct_rows_missing
county,0,0.00
tax_rate,6,0.00
id,0,0.00
parcelid,0,0.00
airconditioningtypeid,52428,0.68
airconditioningdesc,52428,0.68
architecturalstyletypeid,77175,1.00
architecturalstyledesc,77175,1.00
basementsqft,77331,1.00
bathroomcnt,0,0.00


#### Takeaways from nulls in columns function

- I can see that there are columns with almost no data in them (their `pct_rows_missing` rounds to 1.00), and those I will certainly drop.


- There are others that are more than 50% NULL values, and I'm going to drop those as well. That is too high of a percentage of Nulls to make the data meaningful.

In [13]:
# NOTE(review): this call is repeated verbatim in the next code cell, after the
# markdown that describes the function; one of the two cells looks redundant.
summarize.nulls_by_row(df)

Unnamed: 0,num_cols_missing,pct_cols_missing,num_rows
0,23,31.944444444444443,2
1,24,33.33333333333333,13
2,25,34.72222222222222,24
3,26,36.11111111111111,65
4,27,37.5,316
5,28,38.88888888888889,455
6,29,40.27777777777778,5270
7,30,41.66666666666667,3455
8,31,43.05555555555556,9891
9,32,44.44444444444444,12578


- Write a function that takes in a dataframe and returns a dataframe with 3 columns: the number of columns missing, percent of columns missing, number of rows with n columns missing. Run the function and document takeaways from this on how you want to handle missing values.

In [14]:
# Count of rows grouped by how many columns they are missing.
# Note: pct_cols_missing is on a 0-100 scale (23 of 72 columns -> 31.94).
summarize.nulls_by_row(df)

Unnamed: 0,num_cols_missing,pct_cols_missing,num_rows
0,23,31.944444444444443,2
1,24,33.33333333333333,13
2,25,34.72222222222222,24
3,26,36.11111111111111,65
4,27,37.5,316
5,28,38.88888888888889,455
6,29,40.27777777777778,5270
7,30,41.66666666666667,3455
8,31,43.05555555555556,9891
9,32,44.44444444444444,12578


#### Takeaways from the nulls by row function

- For my first iteration of the pipeline, I'm going to drop any rows that have missing values. 


- After my first iteration, I will go back, restore the dropped rows, and possibly impute their missing values.

- The `prepare.handle_missing_values` function used below drops columns with fewer than 50% non-missing values and rows with fewer than 75% non-missing values.

In [15]:
# Rebinds df to the filtered result, so re-running this cell filters the
# already-filtered frame rather than the originally acquired one.
# NOTE(review): no thresholds are passed here; presumably the function's
# defaults are the 50% column / 75% row cutoffs - confirm in prepare.py.
df = prepare.handle_missing_values(df)

In [16]:
# Remaining null count per column after handle_missing_values.
df.isnull().sum()

county                              0
tax_rate                            6
id                                  0
parcelid                            0
bathroomcnt                         0
bedroomcnt                          0
buildingqualitytypeid           27633
calculatedbathnbr                 533
calculatedfinishedsquarefeet      120
finishedsquarefeet12             3556
fips                                0
state                               0
fullbathcnt                       533
heatingorsystemtypeid           27865
heatingorsystemdesc             27865
latitude                            0
longitude                           0
lotsizesquarefeet                8171
propertycountylandusecode           0
propertylandusetypeid               0
propertylandusedesc                 0
propertyzoningdesc              26987
rawcensustractandblock              0
regionidcity                     1460
regionidcounty                      0
regionidzip                        45
roomcnt     