# **Prepare Notebook:**
    Here we will be preparing the data. This includes addressing the nulls in the data, standardizing the text, standardizing the column names, etc.

# Imports 

In [1]:
import pandas as pd
import numpy as np
import acquire as a

In [2]:
# Load the raw Zillow extract, using the first CSV column as the row index.
RAW_CSV = "zillow.csv"
df = pd.read_csv(RAW_CSV, index_col=0)

# What is our data like?

In [3]:
# (rows, columns) of the raw frame before any cleaning.
df.shape

(2152863, 7)

In [4]:
# Summary statistics for each numeric column; `count` only tallies non-null values.
df.describe()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips
count,2152852.0,2152852.0,2144379.0,2152370.0,2143526.0,2148421.0,2152863.0
mean,3.287196,2.230688,1862.855,461896.2,1960.95,5634.866,6048.377
std,0.9547544,0.9992796,1222.125,699676.0,22.1622,8178.91,20.43329
min,0.0,0.0,1.0,1.0,1801.0,1.85,6037.0
25%,3.0,2.0,1257.0,188170.2,1949.0,2534.98,6037.0
50%,3.0,2.0,1623.0,327671.0,1958.0,4108.95,6037.0
75%,4.0,3.0,2208.0,534527.0,1976.0,6414.32,6059.0
max,25.0,32.0,952576.0,98428910.0,2016.0,1337756.0,6111.0


In [5]:
# Ask for the non-null counts explicitly: pandas leaves them out by default
# once a frame exceeds `display.max_info_rows`, and this one has ~2.15M rows.
df.info(show_counts=True) 

<class 'pandas.core.frame.DataFrame'>
Index: 2152863 entries, 0 to 2152862
Data columns (total 7 columns):
 #   Column                        Non-Null Count    Dtype  
---  ------                        --------------    -----  
 0   bedroomcnt                    2152852 non-null  float64
 1   bathroomcnt                   2152852 non-null  float64
 2   calculatedfinishedsquarefeet  2144379 non-null  float64
 3   taxvaluedollarcnt             2152370 non-null  float64
 4   yearbuilt                     2143526 non-null  float64
 5   taxamount                     2148421 non-null  float64
 6   fips                          2152863 non-null  float64
dtypes: float64(7)
memory usage: 131.4 MB


### **Key takeaways:**
* Our data is composed of 2,152,863 rows and 7 columns
    * `bedroomcnt` : The total number of bedrooms in a house.
    * `bathroomcnt` : The total number of bathrooms in a house.
    * `calculatedfinishedsquarefeet` : Total finished square feet of the house.
    * `taxvaluedollarcnt` : The assessed value, an estimate of how much the property is worth.
    * `yearbuilt` : The year that the house was built.
    * `taxamount` : The property tax amount assessed for the year.
    * `fips` : The FIPS code that identifies the county and state.
* There are null values that need to be addressed. We can see this from the inconsistencies in the Non-Null Count column.
* All the columns are floats, which is good because it means we don't need to fix any object columns.


In [6]:
# Peek at the first five rows of the raw data.
df.head()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips
0,0.0,0.0,,27516.0,,,6037.0
1,0.0,0.0,,10.0,,,6037.0
2,0.0,0.0,,10.0,,,6037.0
3,0.0,0.0,,2108.0,,174.21,6037.0
4,4.0,2.0,3633.0,296425.0,2005.0,6941.39,6037.0


# Addressing the null values

In [7]:
# Percentage of missing values per column (the mean of a boolean mask is its share of True).
df.isna().mean() * 100

bedroomcnt                      0.000511
bathroomcnt                     0.000511
calculatedfinishedsquarefeet    0.394080
taxvaluedollarcnt               0.022900
yearbuilt                       0.433702
taxamount                       0.206330
fips                            0.000000
dtype: float64

    Since less than 1% of the values in any column are missing, we will use `dropna()` to remove the rows that contain null values.

In [8]:
# Drop every row that contains at least one null. Rebinding `df` (instead of
# `inplace=True`) avoids mutating the object that earlier cells displayed.
df = df.dropna()

# Fail loudly if any null survived: the later integer casts cannot handle NaN.
assert not df.isna().any().any(), "null values remain after dropna()"

In [9]:
# Re-check the non-null counts now that rows with missing values have been removed.
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 2140235 entries, 4 to 2152862
Data columns (total 7 columns):
 #   Column                        Non-Null Count    Dtype  
---  ------                        --------------    -----  
 0   bedroomcnt                    2140235 non-null  float64
 1   bathroomcnt                   2140235 non-null  float64
 2   calculatedfinishedsquarefeet  2140235 non-null  float64
 3   taxvaluedollarcnt             2140235 non-null  float64
 4   yearbuilt                     2140235 non-null  float64
 5   taxamount                     2140235 non-null  float64
 6   fips                          2140235 non-null  float64
dtypes: float64(7)
memory usage: 130.6 MB


    We can see the Non-Null Count is consistent across the board. This means we have successfully dropped all the null values.

In [10]:
# Preview the first 15 remaining rows; the index keeps its original labels, so gaps are expected.
df.head(15)

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips
4,4.0,2.0,3633.0,296425.0,2005.0,6941.39,6037.0
6,3.0,4.0,1620.0,847770.0,2011.0,10244.94,6037.0
7,3.0,2.0,2077.0,646760.0,1926.0,7924.68,6037.0
11,0.0,0.0,1200.0,5328.0,1972.0,91.6,6037.0
14,0.0,0.0,171.0,6920.0,1973.0,255.17,6037.0
15,0.0,0.0,203.0,14166.0,1960.0,163.79,6037.0
18,3.0,1.0,1244.0,169471.0,1950.0,2532.88,6037.0
19,3.0,2.0,1300.0,233266.0,1950.0,3110.99,6037.0
20,3.0,2.0,1222.0,290492.0,1951.0,3870.25,6037.0
21,4.0,4.0,4144.0,1303522.0,2016.0,14820.1,6037.0


# Now we are going to pretty up the column names and make them more Pythonic.

In [11]:
# List the current column names before renaming them.
df.columns

Index(['bedroomcnt', 'bathroomcnt', 'calculatedfinishedsquarefeet',
       'taxvaluedollarcnt', 'yearbuilt', 'taxamount', 'fips'],
      dtype='object')

In [12]:
# Mapping from each raw Zillow column name to its more readable replacement.
# `fips` has no entry, so a rename driven by this mapping leaves it unchanged.
column_names = dict(
    bedroomcnt="bedroom_count",
    bathroomcnt="bathroom_count",
    calculatedfinishedsquarefeet="squarefoot",
    taxvaluedollarcnt="assessed_tax_value",
    yearbuilt="year_built",
    taxamount="tax_amount",
)

In [13]:
# Apply the mapping and rebind `df` rather than mutating the frame in place.
df = df.rename(columns=column_names)

# Every target name must now exist; a miss means the source schema differs
# from what the mapping expects.
missing = set(column_names.values()) - set(df.columns)
assert not missing, f"columns not renamed: {sorted(missing)}"

In [15]:
# Confirm the new column names on a small sample.
df.head()

Unnamed: 0,bedroom_count,bathroom_count,squarefoot,assessed_tax_value,year_built,tax_amount,fips
4,4.0,2.0,3633.0,296425.0,2005.0,6941.39,6037.0
6,3.0,4.0,1620.0,847770.0,2011.0,10244.94,6037.0
7,3.0,2.0,2077.0,646760.0,1926.0,7924.68,6037.0
11,0.0,0.0,1200.0,5328.0,1972.0,91.6,6037.0
14,0.0,0.0,171.0,6920.0,1973.0,255.17,6037.0


# Fixing the dtypes of floats that don't need decimal places.

In [16]:
# Years are whole numbers, so store them as integers instead of floats.
df["year_built"] = df["year_built"].astype(int)

In [17]:
# FIPS codes are identifiers rather than measurements; store them as integers.
df["fips"] = df["fips"].astype(int)

In [19]:
# Persist the cleaned frame; the row index is not written to the file.
PREPARED_CSV = "prepared_zillow.csv"
df.to_csv(PREPARED_CSV, index=False)