# Acquisition, Prep, and Initial Exploration

In [1]:
import pandas as pd
import numpy as np
from acquire import acquire_data
import math

In [2]:
df = acquire_data()
df.shape

(52442, 62)

## Missing Values

- Handle missing values

- backup/explain your decisions

- Prep.py: Write function for reproducibility.

- Run function in final notebook to complete the task.

In [3]:
def missing_value_percentage(series):
    return series.isna().sum() / series.size
    

In [4]:
df.apply(missing_value_percentage, axis = 0)

parcelid                    0.000000
id                          0.000000
airconditioningtypeid       0.739941
architecturalstyletypeid    0.998665
basementsqft                0.999104
                              ...   
taxdelinquencyyear          0.960356
censustractandblock         0.002345
id.1                        0.000000
logerror                    0.000000
transactiondate             0.000000
Length: 62, dtype: float64

In [5]:
def drop_useless_columns(df, percentage):
    #check the percentage of missing value of each column
    s = df.apply(missing_value_percentage, axis = 0)
    # drop columns whose missing values are more than  percentage
    df = df.drop(columns = s[s > percentage].index.tolist()) 
    return df
    

In [6]:
df = drop_useless_columns(df, .3)
df.shape

(52442, 29)

In [7]:
def drop_duplicated_observation(df):
    df = df.sort_values('transactiondate', ascending=False)
    #keep the latest transaction date
    return df.drop_duplicates(subset='parcelid', keep='last')

In [8]:
df.shape

(52442, 29)

In [9]:
df = drop_duplicated_observation(df)
df.shape

(52320, 29)

In [10]:
#drop nonuseful columns based on domin knowledge
def drop_ineffecitve_columns(df):
    ineffecitve_columns = ['id','id.1','calculatedbathnbr','finishedsquarefeet12',
                          'fullbathcnt','roomcnt','assessmentyear','censustractandblock',
                          'propertylandusetypeid','rawcensustractandblock','propertycountylandusecode',
                          'transactiondate','parcelid','regionidcounty']
    df = df.drop(columns=ineffecitve_columns)
    return df


In [11]:
df = drop_ineffecitve_columns(df)
df.shape

(52320, 15)

In [12]:
df.isna().sum()

bathroomcnt                        0
bedroomcnt                         0
calculatedfinishedsquarefeet      81
fips                               0
latitude                           0
longitude                          0
lotsizesquarefeet                366
regionidcity                    1036
regionidzip                       26
yearbuilt                        114
structuretaxvaluedollarcnt        82
taxvaluedollarcnt                  1
landtaxvaluedollarcnt              1
taxamount                          4
logerror                           0
dtype: int64

In [13]:
#drop null observations
def drop_null(df):
    return df.dropna()

In [14]:
df =drop_null(df)
df.shape

(50799, 15)

In [15]:
(df.taxvaluedollarcnt*10**df.logerror).sort_values()

6404     8.013270e+00
35632    1.163762e+01
44268    3.959205e+02
31628    5.809805e+02
43581    7.284792e+02
             ...     
44642    9.211423e+08
29023    1.212699e+09
51691    1.422160e+09
6536     1.761314e+09
392      1.213057e+12
Length: 50799, dtype: float64

## Create new features
- yearbuilt to age
- tax_rate = taxamount / taxvaluedollarcnt
- drop structuretaxvaluedollarcnt, taxvaluedollarcnt,	landtaxvaluedollarcnt,	taxamount

In [16]:
def create_new_features(df):
    df['age']=(2017 - df.yearbuilt)
    df['tax_rate']= df.taxamount/df.taxvaluedollarcnt*100
    df['estimate'] = df.taxvaluedollarcnt*10**df.logerror
    df = df.drop(columns=['yearbuilt',
                      'structuretaxvaluedollarcnt',  
                      'landtaxvaluedollarcnt',
                      'taxamount'])
    df = df.rename(columns={'calculatedfinishedsquarefeet':'house_size', 
                            'lotsizesquarefeet':'lotsize',
                           'taxvaluedollarcnt': 'actual_value'})
    return df

In [17]:
df = create_new_features(df)
df.head()

Unnamed: 0,bathroomcnt,bedroomcnt,house_size,fips,latitude,longitude,lotsize,regionidcity,regionidzip,actual_value,logerror,age,tax_rate,estimate
52440,2.0,3.0,1762.0,6037.0,33937685.0,-117996709.0,6347.0,14634.0,96171.0,522000.0,0.007204,62.0,1.210182,530731.0
52439,1.0,3.0,1032.0,6037.0,34040895.0,-118038169.0,5074.0,36502.0,96480.0,49546.0,0.037129,63.0,1.768922,53968.13
52438,2.0,4.0,1612.0,6111.0,34300140.0,-118706327.0,12105.0,27110.0,97116.0,67205.0,0.013209,53.0,1.647913,69280.37
52437,2.0,2.0,1286.0,6037.0,34245368.0,-118282383.0,47405.0,12447.0,96284.0,354621.0,0.020615,77.0,1.262878,371860.4
52311,4.0,4.0,2440.0,6037.0,34009367.0,-118430958.0,5553.0,12447.0,96047.0,1550000.0,-0.056152,80.0,1.203548,1362007.0


FIPS:

- 6037: Los Angeles County
- 6059: Orange County
- 6111: Ventura County

In [18]:
def creat_dummy_var(df):
    county_df = pd.get_dummies(df.fips)
    county_df.columns = ['LA', 'Orange', 'Ventura']
    # concatenate the dataframe with the 3 county columns to the original dataframe
    df= pd.concat([df, county_df], axis =1)
    # drop regionidcounty and fips columns
    df = df.drop(columns = ['fips'])
    return df
    

In [19]:
df=creat_dummy_var(df)
df.head()

Unnamed: 0,bathroomcnt,bedroomcnt,house_size,latitude,longitude,lotsize,regionidcity,regionidzip,actual_value,logerror,age,tax_rate,estimate,LA,Orange,Ventura
52440,2.0,3.0,1762.0,33937685.0,-117996709.0,6347.0,14634.0,96171.0,522000.0,0.007204,62.0,1.210182,530731.0,1,0,0
52439,1.0,3.0,1032.0,34040895.0,-118038169.0,5074.0,36502.0,96480.0,49546.0,0.037129,63.0,1.768922,53968.13,1,0,0
52438,2.0,4.0,1612.0,34300140.0,-118706327.0,12105.0,27110.0,97116.0,67205.0,0.013209,53.0,1.647913,69280.37,0,0,1
52437,2.0,2.0,1286.0,34245368.0,-118282383.0,47405.0,12447.0,96284.0,354621.0,0.020615,77.0,1.262878,371860.4,1,0,0
52311,4.0,4.0,2440.0,34009367.0,-118430958.0,5553.0,12447.0,96047.0,1550000.0,-0.056152,80.0,1.203548,1362007.0,1,0,0


## Data Types

- Prep.py: Write a function that takes in a dataframe and a list of column names (ones that are numeric and don't represent numbers) and returns the dataframe with the datatypes of those columns changed to a non-numeric type.

- In your notebook, use this function to appropriately transform any numeric columns that should not be treated as numbers.

- Prep.py: Do the same, but changing objects or categories to numeric types.

In [20]:
df.dtypes

bathroomcnt     float64
bedroomcnt      float64
house_size      float64
latitude        float64
longitude       float64
lotsize         float64
regionidcity    float64
regionidzip     float64
actual_value    float64
logerror        float64
age             float64
tax_rate        float64
estimate        float64
LA                uint8
Orange            uint8
Ventura           uint8
dtype: object

In [21]:
def regassign_dtypes(df):
    df[['regionidcity','regionidzip']] = df[['regionidcity','regionidzip']].astype('object')
    return df

In [22]:
df = regassign_dtypes(df)
df.dtypes

bathroomcnt     float64
bedroomcnt      float64
house_size      float64
latitude        float64
longitude       float64
lotsize         float64
regionidcity     object
regionidzip      object
actual_value    float64
logerror        float64
age             float64
tax_rate        float64
estimate        float64
LA                uint8
Orange            uint8
Ventura           uint8
dtype: object

## Outliers

- Prep.py: You can use what you did in exercises and adapt, enhance or improve if you find the time and need.

- Prep.py: Write a function that accepts a series (i.e. one column from a data frame) and summarizes how many outliers are in the series. This function should accept a second parameter that determines how outliers are detected, with the ability to detect outliers in 3 ways: IQR, standard deviations (z-score), percentiles).

- Run the function in your final notebook to identify/demonstrat columns where you should handle the outliers.

- Prep.py: Write a function that accepts the zillow data frame and removes the outliers. You should make a decision and document how you will remove outliers.

- Run the function in your final notebook.

- Is there erroneous data you have found that you need to remove or repair? If so, take action.

- Are there outliers you want to "squeeze in" to a max value? (e.g. all bathrooms > 6 => bathrooms = 6). If so, make those changes.

In [23]:
def identify_outliers(s):
    '''
    Given a series and a cutoff value, k, returns the upper outliers for the
    series.

    The values returned will be either 0 (if the point is not an outlier), or 
    a null value which will be used to drop later.
    '''
    q1, q3 = s.quantile([.25, .75])
    iqr = q3 - q1
    upper_bound = q3 + 3 * iqr
    lower_bound = q1 - 3*iqr
    return s.apply(lambda x: x if (x<upper_bound)&(x>lower_bound) else np.NaN)



In [24]:
def drop_outliers(df):
    for col in (df.select_dtypes(include=['float64'])
                .drop(columns = ['latitude', 'longitude'])
                .columns):
        df[col] = identify_outliers(df[col])
    df = df.dropna()
    return df

In [25]:
df =drop_outliers(df)
df.isna().sum()

bathroomcnt     0
bedroomcnt      0
house_size      0
latitude        0
longitude       0
lotsize         0
regionidcity    0
regionidzip     0
actual_value    0
logerror        0
age             0
tax_rate        0
estimate        0
LA              0
Orange          0
Ventura         0
dtype: int64

In [26]:
df.shape

(41741, 16)

In [27]:
df.estimate.plot.hist()

<matplotlib.axes._subplots.AxesSubplot at 0x114f19e90>

## Other

- Be sure to not forget about the common tasks for this phase of the pipeline (e.g. summarize data, plotting distributions of individual variables).

- Documentation: markdown, docstrings in functions, and comments in code.

In [28]:
def prepare_data():
    df = acquire_data()
    df = drop_useless_columns(df, .3)
    df = drop_duplicated_observation(df)
    df = drop_ineffecitve_columns(df)
    df = drop_null(df)
    df = create_new_features(df)
    df = creat_dummy_var(df)
    df = regassign_dtypes(df)
    df = drop_outliers(df)
    return df
    
    

In [29]:
df = prepare_data()
df.shape

(41741, 16)