In [1]:
import acquire
import prepare
import wrangle
import explore
import pandas as pd
import numpy as np
import seaborn as sns
pd.options.display.float_format = '{:20,.2f}'.format
import matplotlib.pyplot as plt
from math import sqrt
from scipy import stats
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from statsmodels.formula.api import ols
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.feature_selection import f_regression, SelectKBest, RFE 
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.cluster import KMeans

##### Train, Validate, Test Data Frames Cleaned (Outliers Removed)

In [2]:
# Pull the Zillow data and run it through the wrangle module's cleaning pipeline
# (outliers removed), unpacking the result into train / validate / test frames.
train, validate, test = wrangle.clean_zillow(wrangle.get_zillow_data()) 
# Show the (rows, columns) shape of each split.
train.shape, validate.shape, test.shape

((43332, 19), (18572, 19), (15476, 19))

In [3]:
# Show every column when a DataFrame is displayed (no truncation of wide frames)
pd.set_option('display.max_columns', None)
# Peek at the first row of the training split
train.head(1)

Unnamed: 0,bathroomcnt,calculatedfinishedsquarefeet,latitude,longitude,rawcensustractandblock,roomcnt,unitcnt,assessmentyear,censustractandblock,logerror,heatingorsystemdesc,propertylandusedesc,county,age,taxrate,acres,structure_dollar_per_sqft,land_dollar_per_sqft,bed_bath_ratio
14505,1,936,34.07,-117.76,60374023,0,1,2016,60374023033004.0,-0.04,Floor/Wall,Single Family Residential,Los Angeles,93,0.02,0.09,9.48,2.55,2.0


In [4]:
# Summarize the training split: column names, non-null counts and dtypes
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43332 entries, 14505 to 54422
Data columns (total 19 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   bathroomcnt                   43332 non-null  int64  
 1   calculatedfinishedsquarefeet  43332 non-null  int64  
 2   latitude                      43332 non-null  float64
 3   longitude                     43332 non-null  float64
 4   rawcensustractandblock        43332 non-null  int64  
 5   roomcnt                       43332 non-null  int64  
 6   unitcnt                       43332 non-null  int64  
 7   assessmentyear                43332 non-null  int64  
 8   censustractandblock           43332 non-null  float64
 9   logerror                      43332 non-null  float64
 10  heatingorsystemdesc           43332 non-null  object 
 11  propertylandusedesc           43332 non-null  object 
 12  county                        43332 non-null  object 
 1

#### Split and Scale Data for Clustering and Modeling

In [5]:
def label_county(row):
    """Translate the FIPS code stored in ``row['fips']`` into a county name.

    Parameters
    ----------
    row : mapping-like (for example a DataFrame row)
        Must support ``row['fips']``.

    Returns
    -------
    str or None
        'Los Angeles' for 6037, 'Orange' for 6059, 'Ventura' for 6111,
        and None for any other value.
    """
    fips_to_county = (
        (6037, 'Los Angeles'),
        (6059, 'Orange'),
        (6111, 'Ventura'),
    )
    for fips_code, county_name in fips_to_county:
        if row['fips'] == fips_code:
            return county_name
    return None
    
###########################################################

def create_features(df):
    """Add the engineered columns to ``df`` in place and return the same frame.

    Columns created (in this order):
      * age                        - 2017 minus yearbuilt
      * taxrate                    - taxamount / taxvaluedollarcnt
      * acres                      - lotsizesquarefeet / 43560
      * structure_dollar_per_sqft  - structuretaxvaluedollarcnt / calculatedfinishedsquarefeet
      * land_dollar_per_sqft       - landtaxvaluedollarcnt / lotsizesquarefeet
      * bed_bath_ratio             - bedroomcnt / bathroomcnt

    Parameters
    ----------
    df : pandas.DataFrame
        Must already contain every source column listed above.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the new columns appended.
    """
    reference_year = 2017
    sqft_per_acre = 43560

    df['age'] = reference_year - df['yearbuilt']
    df['taxrate'] = df['taxamount'].div(df['taxvaluedollarcnt'])
    df['acres'] = df['lotsizesquarefeet'].div(sqft_per_acre)
    df['structure_dollar_per_sqft'] = df['structuretaxvaluedollarcnt'].div(
        df['calculatedfinishedsquarefeet'])
    df['land_dollar_per_sqft'] = df['landtaxvaluedollarcnt'].div(df['lotsizesquarefeet'])
    df['bed_bath_ratio'] = df['bedroomcnt'].div(df['bathroomcnt'])
    return df

###########################################################

def remove_outliers(df):
    '''
    Remove outliers in bed, bath, zip, square feet, acres & tax rate.

    A row is kept only when ALL of the following hold:
      * 0 < bathroomcnt <= 7
      * 0 < bedroomcnt <= 7
      * regionidzip < 100000
      * acres < 10
      * calculatedfinishedsquarefeet < 7000
      * taxrate < .05

    Rows with a missing value in any of these columns are dropped as well,
    because a comparison against NaN evaluates to False.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows that pass every check; ``df`` itself
        is left untouched.
    '''
    # Build the mask from df only (the previous version read the global `train`
    # for the bathroom check) and actually return the filtered rows.
    keep = (
        (df.bathroomcnt > 0) & (df.bathroomcnt <= 7)
        & (df.bedroomcnt > 0) & (df.bedroomcnt <= 7)
        & (df.regionidzip < 100000)
        & (df.acres < 10)
        & (df.calculatedfinishedsquarefeet < 7000)
        & (df.taxrate < .05)
    )
    # .copy() so later column assignments do not act on a view of df
    return df[keep].copy()

###########################################################

def col_to_drop_post_feature_creation(df):
    """Return a copy of ``df`` without the raw columns that the engineered
    features were built from (plus regionidzip).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column listed below.

    Returns
    -------
    pandas.DataFrame
        New frame with those columns removed; ``df`` is not modified.
    """
    superseded = (
        'bedroomcnt',
        'taxamount',
        'taxvaluedollarcnt',
        'structuretaxvaluedollarcnt',
        'landtaxvaluedollarcnt',
        'lotsizesquarefeet',
        "regionidzip",
        "yearbuilt",
    )
    return df.drop(columns=list(superseded))

###########################################################

def cat_columns(df):
    """Cast the three descriptive text columns of ``df`` to the pandas
    "category" dtype and return the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain heatingorsystemdesc, propertylandusedesc and county.

    Returns
    -------
    pandas.DataFrame
        The frame that was passed in (the assignment is made on ``df`` itself).
    """
    categorical = ["heatingorsystemdesc", "propertylandusedesc", "county"]
    as_category = df[categorical].astype("category")
    df[categorical] = as_category
    return df

###########################################################

def modify_columns(df):
    '''
    Label counties, drop duplicated or unnecessary columns, rescale the
    coordinates, then chain the feature-creation, outlier-removal,
    column-pruning and categorical-casting helpers.

    The county label, the column drop, the heating fill and the
    latitude/longitude rescale are applied to ``df`` in place. The value
    returned is whatever the last helper in the chain returns.

    NOTE(review): ``cat_columns`` is called here with a single frame, but a
    three-argument ``cat_columns`` is also defined further down this cell;
    whichever definition ran last is the one this call resolves to.
    '''
    unneeded = ['id','pid','id.1',"propertylandusetypeid", "heatingorsystemtypeid",'fips',"propertyzoningdesc","calculatedbathnbr"]

    # County must be derived before 'fips' is dropped below.
    df['county'] = df.apply(label_county, axis=1)
    df.drop(columns=unneeded, inplace=True)
    df['heatingorsystemdesc'] = df['heatingorsystemdesc'].fillna("None")
    for coordinate in ('latitude', 'longitude'):
        df[coordinate] = df[coordinate] / 1000000

    # processing() is deliberately not called here: it must run after NaNs have been addressed.
    pipeline = (
        create_features,
        remove_outliers,
        col_to_drop_post_feature_creation,
        cat_columns,
    )
    for step in pipeline:
        df = step(df)
    return df

###########################################################

def split(df, target_var):
    """Split ``df`` into train, validate and test frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split.
    target_var : str
        Accepted for signature compatibility; it is not used by the split.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, validate, test)`` holding roughly 56%, 24% and 20% of the
        rows. Previously the function fell off the end and returned None.
    """
    # split df into train_validate (80%) and test (20%)
    train_validate, test = train_test_split(df, test_size=.20, random_state=13)
    # split train_validate into train (70% of 80% = 56%) and validate (30% of 80% = 24%)
    train, validate = train_test_split(train_validate, test_size=.3, random_state=13)
    return train, validate, test
    
def label_county(row):
    """Translate the FIPS code stored in ``row['fips']`` into a county name.

    Parameters
    ----------
    row : mapping-like (for example a DataFrame row)
        Must support ``row['fips']``.

    Returns
    -------
    str or None
        'Los Angeles' for 6037, 'Orange' for 6059, 'Ventura' for 6111,
        and None for any other value.
    """
    fips_to_county = (
        (6037, 'Los Angeles'),
        (6059, 'Orange'),
        (6111, 'Ventura'),
    )
    for fips_code, county_name in fips_to_county:
        if row['fips'] == fips_code:
            return county_name
    return None
    
###########################################################

def create_features(df):
    """Add the engineered columns to ``df`` in place and return the same frame.

    Columns created (in this order):
      * age                        - 2017 minus yearbuilt
      * taxrate                    - taxamount / taxvaluedollarcnt
      * acres                      - lotsizesquarefeet / 43560
      * structure_dollar_per_sqft  - structuretaxvaluedollarcnt / calculatedfinishedsquarefeet
      * land_dollar_per_sqft       - landtaxvaluedollarcnt / lotsizesquarefeet
      * bed_bath_ratio             - bedroomcnt / bathroomcnt

    Parameters
    ----------
    df : pandas.DataFrame
        Must already contain every source column listed above.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the new columns appended.
    """
    reference_year = 2017
    sqft_per_acre = 43560

    df['age'] = reference_year - df['yearbuilt']
    df['taxrate'] = df['taxamount'].div(df['taxvaluedollarcnt'])
    df['acres'] = df['lotsizesquarefeet'].div(sqft_per_acre)
    df['structure_dollar_per_sqft'] = df['structuretaxvaluedollarcnt'].div(
        df['calculatedfinishedsquarefeet'])
    df['land_dollar_per_sqft'] = df['landtaxvaluedollarcnt'].div(df['lotsizesquarefeet'])
    df['bed_bath_ratio'] = df['bedroomcnt'].div(df['bathroomcnt'])
    return df

###########################################################

def remove_outliers(df):
    '''
    Remove outliers in bed, bath, zip, square feet, acres & tax rate.

    A row is kept only when ALL of the following hold:
      * 0 < bathroomcnt <= 7
      * 0 < bedroomcnt <= 7
      * regionidzip < 100000
      * acres < 10
      * calculatedfinishedsquarefeet < 7000
      * taxrate < .05

    Rows with a missing value in any of these columns are dropped as well,
    because a comparison against NaN evaluates to False.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows that pass every check; ``df`` itself
        is left untouched.
    '''
    # Build the mask from df only (the previous version read the global `train`
    # for the bathroom check) and actually return the filtered rows.
    keep = (
        (df.bathroomcnt > 0) & (df.bathroomcnt <= 7)
        & (df.bedroomcnt > 0) & (df.bedroomcnt <= 7)
        & (df.regionidzip < 100000)
        & (df.acres < 10)
        & (df.calculatedfinishedsquarefeet < 7000)
        & (df.taxrate < .05)
    )
    # .copy() so later column assignments do not act on a view of df
    return df[keep].copy()

###########################################################

def col_to_drop_post_feature_creation(df):
    """Return a copy of ``df`` without the raw columns that the engineered
    features were built from (plus regionidzip).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column listed below.

    Returns
    -------
    pandas.DataFrame
        New frame with those columns removed; ``df`` is not modified.
    """
    superseded = (
        'bedroomcnt',
        'taxamount',
        'taxvaluedollarcnt',
        'structuretaxvaluedollarcnt',
        'landtaxvaluedollarcnt',
        'lotsizesquarefeet',
        "regionidzip",
        "yearbuilt",
    )
    return df.drop(columns=list(superseded))

###########################################################

def modify_columns(df):
    '''
    Label counties, drop duplicated or unnecessary columns, rescale the
    coordinates, then chain the feature-creation, outlier-removal and
    column-pruning helpers.

    The county label, the column drop, the heating fill and the
    latitude/longitude rescale are applied to ``df`` in place. The value
    returned is whatever the last helper in the chain returns.
    '''
    unneeded = ['id','pid','id.1',"propertylandusetypeid", "heatingorsystemtypeid",'fips',"propertyzoningdesc","calculatedbathnbr"]

    # County must be derived before 'fips' is dropped below.
    df['county'] = df.apply(label_county, axis=1)
    df.drop(columns=unneeded, inplace=True)
    df['heatingorsystemdesc'] = df['heatingorsystemdesc'].fillna("None")
    for coordinate in ('latitude', 'longitude'):
        df[coordinate] = df[coordinate] / 1000000

    pipeline = (
        create_features,
        remove_outliers,
        col_to_drop_post_feature_creation,
    )
    for step in pipeline:
        df = step(df)
    return df

###########################################################

def split(df):
    """Split ``df`` into train, validate and test frames.

    The first cut holds out 20% of the rows as test; the remaining 80% is cut
    again 70/30 into train (56% overall) and validate (24% overall). Both cuts
    use random_state=13.

    Returns
    -------
    tuple
        ``(train, validate, test)``
    """
    seed = 13
    remainder, test = train_test_split(df, test_size=.20, random_state=seed)
    train, validate = train_test_split(remainder, test_size=.3, random_state=seed)
    return (train, validate, test)
    
###########################################################

def clean_data(train, validate, test):
    """Fill nulls in selected columns with the median computed on ``train``.

    The median is taken from the training split only and then applied to all
    three splits, so nothing is learned from validate or test.

    Parameters
    ----------
    train, validate, test : pandas.DataFrame
        Each must contain every column listed in ``cols``. The frames are
        updated in place (columns are reassigned on the passed objects).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, validate, test)`` - the same objects that were passed in.
    """
    # Columns whose nulls are replaced by the train median.
    # (Duplicate entries from the earlier list were removed; refilling a column
    # that has already been filled changes nothing.)
    cols = [
        "structuretaxvaluedollarcnt",
        "taxamount",
        "taxvaluedollarcnt",
        "landtaxvaluedollarcnt",
        "finishedsquarefeet12",
        "calculatedfinishedsquarefeet",
        "fullbathcnt",
        "lotsizesquarefeet",
        "unitcnt",
        "regionidcity",
        "buildingqualitytypeid",
        "regionidzip",
        "yearbuilt",
        "censustractandblock"
    ]
    for col in cols:
        median = train[col].median()
        for frame in (train, validate, test):
            # Assign the filled column back instead of calling
            # frame[col].fillna(..., inplace=True): the chained in-place form
            # acts on a temporary Series and does not update the frame when
            # pandas Copy-on-Write is active.
            frame[col] = frame[col].fillna(median)
    return train, validate, test

###########################################################

def processing(train, validate, test):
    """Cast the whole-number columns of each split to ``int``.

    Parameters
    ----------
    train, validate, test : pandas.DataFrame
        Each must contain every column in ``integer_columns`` with no missing
        values in them. The frames are updated in place.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, validate, test)`` - the same objects that were passed in.
    """
    integer_columns = [
        "yearbuilt", "calculatedfinishedsquarefeet", "regionidzip",
        "bathroomcnt", "bedroomcnt", "lotsizesquarefeet", "rawcensustractandblock",
        "roomcnt", "unitcnt", "assessmentyear",
    ]
    for frame in (train, validate, test):
        frame[integer_columns] = frame[integer_columns].astype('int')
    return train, validate, test

###########################################################

def remove_columns(train, validate, test, cols_to_remove):
    """Drop ``cols_to_remove`` from each of the three splits.

    Parameters
    ----------
    train, validate, test : pandas.DataFrame
        Frames to trim; none of them is modified.
    cols_to_remove : list
        Column labels to drop; each must be present in all three frames.

    Returns
    -------
    tuple of pandas.DataFrame
        New ``(train, validate, test)`` frames without those columns.
    """
    trimmed = [frame.drop(columns=cols_to_remove) for frame in (train, validate, test)]
    return tuple(trimmed)

###########################################################

def handle_missing_values(train, validate, test, prop_required_column = .5, prop_required_row = .75):
    """Drop sparse columns, then sparse rows, from the three splits.

    Which columns survive is decided on ``train`` alone and then applied to
    ``validate`` and ``test``, so all three frames keep the same columns.
    (Previously each split chose its own columns, which could leave the splits
    with mismatched schemas.)

    Parameters
    ----------
    train, validate, test : pandas.DataFrame
        Frames to clean; they are modified in place.
    prop_required_column : float, default .5
        A column is kept when at least this share of the train rows is non-null.
    prop_required_row : float, default .75
        A row is kept when at least this share of its columns is non-null.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, validate, test)`` - the same objects that were passed in.
    """
    # Column decision: fitted on train only.
    column_threshold = int(round(prop_required_column*len(train.index),0))
    train.dropna(axis=1, thresh=column_threshold, inplace=True)
    surviving = set(train.columns)
    for frame in (validate, test):
        dropped = [col for col in frame.columns if col not in surviving]
        frame.drop(columns=dropped, inplace=True)

    # Row decision: each split is screened against its own column count.
    for frame in (train, validate, test):
        row_threshold = int(round(prop_required_row*len(frame.columns),0))
        frame.dropna(axis=0, thresh=row_threshold, inplace=True)
    return train, validate, test

###########################################################

def x_train(train, validate, test, target_var):
    """Separate every split into a feature frame and a target frame.

    Parameters
    ----------
    train, validate, test : pandas.DataFrame
        Splits that each contain the ``target_var`` column.
    target_var : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_validate, y_validate, X_test, y_test)`` where
        each X drops the target column and each y is a one-column DataFrame
        holding only the target.
    """
    pieces = []
    for frame in (train, validate, test):
        features = frame.drop(columns=[target_var])
        target = frame[[target_var]]
        pieces.extend((features, target))
    return tuple(pieces)

###########################################################

def col_to_drop_post_processing(train, validate, test):
    """Drop the raw columns that the engineered features replace (plus
    regionidzip) from each split.

    Parameters
    ----------
    train, validate, test : pandas.DataFrame
        Each must contain every column listed below; none is modified.

    Returns
    -------
    tuple of pandas.DataFrame
        New ``(train, validate, test)`` frames without those columns.
    """
    superseded = [
        'bedroomcnt',
        'taxamount',
        'taxvaluedollarcnt',
        'structuretaxvaluedollarcnt',
        'landtaxvaluedollarcnt',
        'yearbuilt',
        'lotsizesquarefeet',
        'regionidzip',
    ]
    pruned = [frame.drop(columns=superseded) for frame in (train, validate, test)]
    return tuple(pruned)

###########################################################

def clean_zillow(df):
    """Run the cleaning pipeline on ``df`` and return feature/target splits.

    Steps, in order: modify_columns, split, clean_data (null filling),
    remove_columns, handle_missing_values, processing (integer casts),
    col_to_drop_post_processing, and finally x_train with 'logerror' as the
    target.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_validate, y_validate, X_test, y_test)``
    """
    # The return value of modify_columns is discarded, so only the changes it
    # makes to df in place carry forward; anything done on the frame it
    # returns (as opposed to on df itself) is not reflected below.
    modify_columns(df)
    train, validate, test = split(df)
    train, validate, test = clean_data(train, validate, test)
    # These columns must still be present at this point: clean_data above fills some of them.
    train, validate, test = remove_columns(train, validate, test, cols_to_remove=['buildingqualitytypeid','finishedsquarefeet12','fullbathcnt', 'regionidcounty',"regionidcity",'tdate', 'parcelid', 'propertycountylandusecode'])
    train, validate, test = handle_missing_values(train, validate, test)
    # Integer casts come after the null handling above.
    train, validate, test = processing(train, validate, test) 
    train, validate, test = col_to_drop_post_processing(train, validate, test)
    # Separate the 'logerror' target from the features in every split.
    X_train, y_train, X_validate, y_validate, X_test, y_test = x_train(train, validate, test, 'logerror')
    return X_train, y_train, X_validate, y_validate, X_test, y_test  

###########################################################

def cat_columns(X_train, X_validate, X_test):
    """Cast the three descriptive text columns of every split to the pandas
    "category" dtype.

    Parameters
    ----------
    X_train, X_validate, X_test : pandas.DataFrame
        Each must contain heatingorsystemdesc, propertylandusedesc and county.
        The assignment is made on the passed frames themselves.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_validate, X_test)`` - the same objects that were passed in.
    """
    categorical = ["heatingorsystemdesc", "propertylandusedesc", "county"]
    for frame in (X_train, X_validate, X_test):
        frame[categorical] = frame[categorical].astype("category")
    return X_train, X_validate, X_test

###########################################################

def scale_min_max(X_train, X_validate, X_test):
    """Min-max scale the three feature frames to the range (0, 1).

    The scaler is fitted on ``X_train`` only (its per-column min and max) and
    the same fitted scaler then transforms all three frames.

    Parameters
    ----------
    X_train, X_validate, X_test : pandas.DataFrame
        Feature frames to scale.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train_scaled, X_validate_scaled, X_test_scaled)``, each carrying
        the column labels and index values of its input.
    """
    scaler = MinMaxScaler(copy=True, feature_range=(0,1))
    scaler.fit(X_train)

    def _as_scaled_frame(features):
        # transform() returns a bare array, so re-attach the labels and index
        scaled = pd.DataFrame(scaler.transform(features),
                              columns=features.columns.values)
        return scaled.set_index([features.index.values])

    X_train_scaled = _as_scaled_frame(X_train)
    X_validate_scaled = _as_scaled_frame(X_validate)
    X_test_scaled = _as_scaled_frame(X_test)
    return X_train_scaled, X_validate_scaled, X_test_scaled

###########################################################

def model_zillow(X_train, X_validate, X_test):
    """Prepare the feature frames for modeling: cast the descriptive columns
    to categories, encode every non-numeric column as integer codes, then
    min-max scale.

    The scaler needs an all-numeric input, so non-numeric columns
    (heatingorsystemdesc, propertylandusedesc, county) are replaced by integer
    codes before scaling. The code assigned to each label is taken from
    ``X_train`` and reused for validate and test so that a label maps to the
    same number in every split. A label that does not occur in ``X_train``
    (or a missing value) is encoded as -1 and can therefore fall outside
    (0, 1) after scaling.

    Parameters
    ----------
    X_train, X_validate, X_test : pandas.DataFrame
        Feature frames as returned by clean_zillow.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train_scaled, X_validate_scaled, X_test_scaled)``
    """
    X_train, X_validate, X_test = cat_columns(X_train, X_validate, X_test)

    # Encode on copies so the caller's frames keep their readable labels.
    encoded = [frame.copy() for frame in (X_train, X_validate, X_test)]
    for col in X_train.select_dtypes(exclude='number').columns:
        levels = pd.Categorical(X_train[col]).categories
        for frame in encoded:
            frame[col] = pd.Categorical(frame[col], categories=levels).codes

    X_train_scaled, X_validate_scaled, X_test_scaled = scale_min_max(*encoded)
    return X_train_scaled, X_validate_scaled, X_test_scaled

In [6]:
# Load the Zillow data via the wrangle module for the notebook-local pipeline below
df = wrangle.get_zillow_data()

In [7]:
# Run the clean_zillow defined in this notebook (not wrangle.clean_zillow) to get X/y splits
X_train, y_train, X_validate, y_validate, X_test, y_test  = clean_zillow(df)
# Cast the categorical columns and min-max scale the feature frames
X_train_scaled, X_validate_scaled, X_test_scaled = model_zillow(X_train, X_validate, X_test)

TypeError: cat_columns() missing 2 required positional arguments: 'X_validate' and 'X_test'

In [None]:
# Summarize the scaled training features: column names, non-null counts and dtypes
X_train_scaled.info()

***

**Create New Features**  (def create_features(df))


age: 2017 - year built   


taxrate: taxamount/taxvaluedollarcnt fields (total, land & structure). We can then remove taxamount and 
          taxvaluedollarcnt, and will keep taxrate, structuretaxvaluedollarcnt, and landtaxvaluedollarcnt
          
          
acres: lotsizesquarefeet/43560   



structure_dollar_per_sqft: structure tax value/finished square feet  


land_dollar_per_sqft: land tax value/lot size square feet


bed_bath_ratio: bedroomcnt/bathroomcnt   

***

***

**Remove Outliers** (remove_outliers())

1. Remove extremes in bedrooms and baths: we will keep homes with between 1 and 7 baths and between 0 and 7 
    bedrooms.

2. there is an error in zip, so we will remove those whose zips are invalid numbers (> 99999).

3. remove square feet > 7000 for now

4. remove lot size (acres) > 10 for now

5. remove tax rate > 5% for now.

***

***

**Drop Columns**

- For now, I will focus on the most difficult and diverse county, LA county. I'll add the others in after I see what I can find.

- I'm not sure where I will use bins and where I will use actual values, so for now I think I'll go with bins and see what happens.

- I will remove the following variables:

    parcelid: can tie back to parcels later

    bedroomcnt: info captured in bed_bath_ratio + bathroomcnt

    taxamount, taxvaluedollarcnt, structuretaxvaluedollarcnt, landtaxvaluedollarcnt: info captured in tax_bin + 
    structure_dollar_per_sqft + land_dollar_per_sqft + acres + calculatedfinishedsquarefeet

    yearbuilt: info captured in age

    lotsizesquarefeet: info captured in acres

    regionidcity: using boolean of whether in city of LA or not

    regionidzip: not using at this time

    LA, Orange, Ventura: will look at LA county only right now.


***

In [None]:
# train, validate, test = remove_outliers(train, validate, test)
# train.shape, validate.shape, test.shape

In [None]:
# #Convert categorical to cat.codes (Outliers)
# train_c, validate_c, test_c = wrangle.catcode_zillow(train, validate, test)
# # Scale train (Outliers)
# X_train_scaled, X_validate_scaled, X_test_scaled = wrangle.scale_df(train_c, validate_c, test_c)
# X_train_scaled.shape

#### Clustering Train (Target Variable = Log Error) W/Outliers

In [None]:
# X = train_scaled[['age', 'bmi', 'children', 'smoker', 'charges']]
# kmeans = KMeans(n_clusters=5)
# kmeans.fit(X)
# train['cluster'] = kmeans.labels_

#### Basic Visualizations (Train)

In [None]:
# # How bedroomcount affects the relationship between squarefeet and logerror

# sns.scatterplot(x='calculatedfinishedsquarefeet', y='logerror',
#                data=train)
# plt.title("Visualizing the relationship between logerror and squarefeet")
# plt.show()

In [None]:
# # How tax value affects logerror

# sns.scatterplot(x='taxvaluedollarcnt', y='logerror',
#                data=train)
# plt.title("Visualizing the relationship between logerror and Assessed Tax Value")
# plt.show()

#### New Dataframes based on County w/ Outliers Removed

In [None]:
# # Split in to train df per county, remove outliers using IQR (6)
# #df = wrangle.get_zillow_data()
# train, validate, test = wrangle.clean_zillow(wrangle.get_zillow_data()) 
# la_train_df, vc_train_df, oc_train_df = explore.counties_no_outliers(train)
# la_train_df.shape, vc_train_df.shape, oc_train_df.shape

In [None]:
# # LA County

# #Convert categorical to cat.codes (Outliers) - Only using Train
# la_train_c, validate_c, test_c = wrangle.catcode_zillow(la_train_df, validate, test)
# # Scale train (Outliers)
# la_train_scaled = wrangle.county_scaler(la_train_c)
# la_train_scaled.shape

In [None]:
# # Ventura County

# #Convert categorical to cat.codes (Outliers) - Only using Train
# la_train_c, validate_c, test_c = wrangle.catcode_zillow(la_train_df, validate, test)
# # Scale train (Outliers)
# la_train_scaled = wrangle.county_scaler(la_train_c)
# la_train_scaled.shape

In [None]:
# def county_catcode(train):
    

In [None]:
# def county_scaler(train):
#     X_train = train
#     # Scale data
#     scaler = MinMaxScaler(copy=True).fit(X_train)
#     X_train_scaled = scaler.transform(X_train)
#     X_train_scaled = pd.DataFrame(X_train_scaled, columns = X_train.columns.values).set_index([X_train.index.values])
#     return X_train_scaled

In [None]:
# la_train_scaled = county_scaler(la_train_c)
# la_train_scaled.shape

In [None]:
# la_train_scaled.head()

In [None]:
# Orange County


In [None]:
# la_X_train_scaled.head()

#### Basic Visualizations w/Outliers Removed (Per County)

In [None]:
# # How bedroomcount affects the relationship between squarefeet and logerror (La County)

# sns.scatterplot(x='calculatedfinishedsquarefeet', y='logerror',
#                data=la_train_df)
# plt.title("Visualizing the relationship between logerror and squarefeet in LA County")
# plt.show()

In [None]:
# # How bedroomcount affects the relationship between squarefeet and logerror (Ventura County)

# sns.scatterplot(x='calculatedfinishedsquarefeet', y='logerror',
#                data=vc_train_df)
# plt.title("Visualizing the relationship between logerror and squarefeet in Ventura County")
# plt.show()

In [None]:
# # How bedroomcount affects the relationship between squarefeet and logerror (Orange County)

# sns.scatterplot(x='calculatedfinishedsquarefeet', y='logerror',
#                data=oc_train_df)
# plt.title("Visualizing the relationship between logerror and squarefeet in Orange County")
# plt.show()

In [None]:
# # How tax value affects logerror in LA County

# sns.scatterplot(x='taxvaluedollarcnt', y='logerror',
#                data=la_train_df)
# plt.title("Visualizing the relationship between logerror and Assessed Tax Value in LA County")
# plt.show()

In [None]:
# # How tax value affects logerror in Ventura County

# sns.scatterplot(x='taxvaluedollarcnt', y='logerror',
#                data=vc_train_df)
# plt.title("Visualizing the relationship between logerror and Assessed Tax Value in Ventura County")
# plt.show()

In [None]:
# # How tax value affects logerror in Orange County

# sns.scatterplot(x='taxvaluedollarcnt', y='logerror',
#                data=oc_train_df)
# plt.title("Visualizing the relationship between logerror and Assessed Tax Value in Orange County")
# plt.show()