In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from acquire_g import get_zillow_data
from prepare import acquire_and_prep_data, fips_labels

## Acquire data

In [5]:
# Load the Zillow data through the project-local `acquire_g.get_zillow_data` helper
# and preview the first five rows.
df = get_zillow_data()
df.head()

Unnamed: 0,id,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,id.1,parcelid.1,logerror,transactiondate,propertylandusetypeid.1,propertylandusedesc
0,1248,17305333,,,,0.0,0.0,,,,...,212.46,,,61110020000000.0,1248,11289917,-0.362001,2017-06-23,263,Mobile Home
1,1772,10838338,,,,2.0,4.0,,6.0,2.0,...,6089.82,,,60371280000000.0,1772,11705026,-0.146056,2017-06-30,261,Single Family Residential
2,2028,10901531,1.0,,,3.0,3.0,,8.0,3.0,...,6679.55,,,60371250000000.0,2028,14269464,0.021085,2017-06-01,261,Single Family Residential
3,3273,11262089,1.0,,,2.0,3.0,,8.0,2.0,...,3876.31,,,60379010000000.0,3273,11389003,-0.325393,2017-06-01,261,Single Family Residential
4,3429,11323134,,,,2.0,2.0,,6.0,2.0,...,4206.15,,,60379010000000.0,3429,11967869,-0.005566,2017-06-29,261,Single Family Residential


In [12]:
# Raw pull: 19667 rows with 65 columns.
# NOTE(review): the saved output of this cell shows 19299 rows and 9 columns, which matches
# the prepared frame rather than the raw one — apparently the cell was run out of order.
# Restart the kernel and run all cells so the comment and the output agree.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19299 entries, 0 to 19666
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sqft               19299 non-null  int64  
 1   bathroomcnt        19299 non-null  float64
 2   bedroomcnt         19299 non-null  int64  
 3   zipcode            19299 non-null  int64  
 4   taxamount          19299 non-null  float64
 5   taxvaluedollarcnt  19299 non-null  float64
 6   yearbuilt          19299 non-null  int64  
 7   tax_rate           19299 non-null  float64
 8   county             19299 non-null  object 
dtypes: float64(4), int64(4), object(1)
memory usage: 1.5+ MB


In [13]:
# (rows, columns) of the frame currently bound to `df`
df.shape

(19299, 9)

In [15]:
# Summary statistics for the numeric columns.
# NOTE(review): the recorded output has some suspicious extremes (zipcode max 399675,
# bedroomcnt max 25, tax_rate min 0.0 / max 15.51) — presumably outliers or bad records
# that should be handled before modeling; confirm against the data.
df.describe()

Unnamed: 0,sqft,bathroomcnt,bedroomcnt,zipcode,taxamount,taxvaluedollarcnt,yearbuilt,tax_rate
count,19299.0,19299.0,19299.0,19299.0,19299.0,19299.0,19299.0,19299.0
mean,1743.394062,2.210684,3.072646,96495.213742,5444.767672,444348.8,1962.409296,1.321326
std,938.396722,0.988779,1.009545,3800.436448,6941.795704,598080.9,22.921862,0.355193
min,60.0,0.0,0.0,95982.0,37.65,3257.0,1862.0,0.0
25%,1180.0,2.0,2.0,96150.0,2476.985,184092.0,1949.0,1.19
50%,1520.0,2.0,3.0,96339.0,3963.2,313848.0,1960.0,1.24
75%,2038.0,3.0,4.0,96533.0,6166.21,510435.0,1979.0,1.35
max,26345.0,20.0,25.0,399675.0,228999.21,19129820.0,2015.0,15.51


In [16]:
# The frame has object, float and int columns.
# NOTE(review): this repeats the earlier df.info() cell and could be removed.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19299 entries, 0 to 19666
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sqft               19299 non-null  int64  
 1   bathroomcnt        19299 non-null  float64
 2   bedroomcnt         19299 non-null  int64  
 3   zipcode            19299 non-null  int64  
 4   taxamount          19299 non-null  float64
 5   taxvaluedollarcnt  19299 non-null  float64
 6   yearbuilt          19299 non-null  int64  
 7   tax_rate           19299 non-null  float64
 8   county             19299 non-null  object 
dtypes: float64(4), int64(4), object(1)
memory usage: 1.5+ MB


In [None]:
# Narrow df to the 22 columns Brandon and I agreed on; it is trimmed further below for the MVP.
df = df[[
    'parcelid',
    'bathroomcnt',
    'bedroomcnt',
    'calculatedbathnbr',
    'calculatedfinishedsquarefeet',
    'finishedsquarefeet12',
    'fips',
    'fullbathcnt',
    'latitude',
    'longitude',
    'propertycountylandusecode',
    'propertylandusetypeid',
    'regionidcity',
    'regionidcounty',
    'regionidzip',
    'roomcnt',
    'yearbuilt',
    'structuretaxvaluedollarcnt',
    'taxvaluedollarcnt',
    'taxamount',
    'transactiondate',
    'propertylandusetypeid.1',
]]
# Transposed preview so every retained column is visible without horizontal truncation
df.head().T

In [None]:
# Drop two more columns that are not carried forward
df = df.drop(
    ['propertycountylandusecode', 'transactiondate'],
    axis='columns',
)

In [None]:
# Column dtypes and non-null counts after the column cuts above
df.info()

In [None]:
# Count of missing values per column
df.isna().sum()

In [None]:
# Count of missing values per column.
# NOTE(review): `isnull` is an alias of `isna`, so this repeats the df.isna().sum() cell.
df.isnull().sum()

In [None]:
# Number of distinct FIPS codes in the data (only 3)
df['fips'].nunique()

In [None]:
# These are the columns we will be using for our MVP
df = df[[
    'calculatedfinishedsquarefeet',
    'bathroomcnt',
    'bedroomcnt',
    'regionidzip',
    'fips',
    'taxamount',
    'taxvaluedollarcnt',
    'yearbuilt',
]]
# Preview the first rows only; a bare `df` would render the entire frame as cell output
df.head()

In [None]:
# Missing values per column in the MVP subset
df.isnull().sum()

In [None]:
# Drop every row that has at least one missing value, then re-count nulls
# to confirm that none remain.
df = df.dropna()
df.isnull().sum()

In [None]:
# Row count and dtypes after dropping rows with missing values
df.info()

### Changing data types

In [None]:
# Cast the whole-number columns to int with a single astype call.
# Doing this as one frame-level operation (instead of assigning column by column onto a
# frame that came from slicing/filtering) avoids pandas' SettingWithCopyWarning and
# yields a fresh frame. The preceding dropna() is what makes the int cast safe.
df = df.astype({
    'bedroomcnt': int,
    'calculatedfinishedsquarefeet': int,
    'regionidzip': int,
    'fips': int,
    'yearbuilt': int,
})

In [None]:
# Confirm the dtype changes took effect
df.info()

In [None]:
# Give the two long Zillow column names shorter labels
df = df.rename(
    {
        'calculatedfinishedsquarefeet': 'sqft',
        'regionidzip': 'zipcode',
    },
    axis='columns',
)
df.head()

In [None]:
# tax_rate = taxamount as a percentage of taxvaluedollarcnt, rounded to two decimals
df['tax_rate'] = (
    df['taxamount']
    .div(df['taxvaluedollarcnt'])
    .mul(100)
    .round(2)
)
df.head()

In [None]:
# Creating a function to change fips to the county name
def fips_labels(x):
    """Translate the FIPS code stored under ``x['fips']`` into a county name.

    ``x`` is anything indexable by the key ``'fips'`` (it is used with a
    DataFrame row via ``df.apply(..., axis=1)``). Returns the county name for
    6037, 6059 and 6111, and ``None`` for any other code.

    NOTE(review): this definition shadows the ``fips_labels`` imported from
    ``prepare`` at the top of the notebook.
    """
    code = x['fips']
    county_by_fips = (
        (6037, 'Los Angeles County'),
        (6059, 'Orange County'),
        (6111, 'Ventura County'),
    )
    for known_code, county_name in county_by_fips:
        if code == known_code:
            return county_name
    # Unrecognised code: no label
    return None

In [None]:
# Label every row with its county name by running fips_labels across the rows
df['county'] = df.apply(fips_labels, axis=1)
df.head()

In [None]:
# The county column replaces fips, so remove the numeric code
df = df.drop('fips', axis='columns')
df.head()

In [7]:
# Testing the acquire-and-prep function imported from the project-local `prepare` module.
# This rebinds `df`, replacing whatever frame the cells above had built.
df = acquire_and_prep_data()
df.head()

Unnamed: 0,sqft,bathroomcnt,bedroomcnt,zipcode,taxamount,taxvaluedollarcnt,yearbuilt,tax_rate,county
0,1000,0.0,0,97083,212.46,27400.0,2002,0.78,Ventura County
1,1604,2.0,4,96415,6089.82,498347.0,1950,1.22,Los Angeles County
2,2384,3.0,3,96452,6679.55,549917.0,1937,1.21,Los Angeles County
3,1574,2.0,3,97319,3876.31,235272.0,1990,1.65,Los Angeles County
4,1619,2.0,2,97329,4206.15,340000.0,1983,1.24,Los Angeles County


In [17]:
!git status

On branch master
Your branch is up to date with 'origin/master'.

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   acquire_prep_g.ipynb[m

no changes added to commit (use "git add" and/or "git commit -a")


In [21]:
! git add -A

In [22]:
! git commit -m "adding edits to prepare"

[master 2f4c316] adding edits to prepare
 1 file changed, 14 insertions(+), 15 deletions(-)


In [23]:
!git push

Enumerating objects: 5, done.
Counting objects: 100% (5/5), done.
Delta compression using up to 8 threads
Compressing objects: 100% (3/3), done.
Writing objects: 100% (3/3), 589 bytes | 589.00 KiB/s, done.
Total 3 (delta 2), reused 0 (delta 0)
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
To https://github.com/The-BGs/zillow_regression_project.git
   9d33c3b..2f4c316  master -> master


In [None]:
# git pull