In [1]:
import acquire
import prepare
import wrangle
import explore
import pandas as pd
import numpy as np
import seaborn as sns
pd.options.display.float_format = '{:20,.2f}'.format
import matplotlib.pyplot as plt
from math import sqrt
from scipy import stats
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from statsmodels.formula.api import ols
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.feature_selection import f_regression, SelectKBest, RFE 
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.cluster import KMeans

In [2]:
#df = wrangle.get_zillow_data()
train, validate, test = wrangle.clean_zillow(wrangle.get_zillow_data()) 
train.shape, validate.shape, test.shape

((43332, 21), (18572, 21), (15476, 21))

In [3]:
# Set option to see all colums in dataframe
pd.set_option('display.max_columns', None)
train.head(1)

Unnamed: 0,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,latitude,longitude,lotsizesquarefeet,rawcensustractandblock,regionidzip,roomcnt,unitcnt,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,censustractandblock,logerror,heatingorsystemdesc,propertylandusedesc,county
14505,1,2,936,34.07,-117.76,4057,60374023,96508,0,1,1924,8877.0,19208.0,2016,10331.0,413.73,60374023033004.0,-0.04,Floor/Wall,Single Family Residential,Los Angeles


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43332 entries, 14505 to 54422
Data columns (total 21 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   bathroomcnt                   43332 non-null  int64   
 1   bedroomcnt                    43332 non-null  int64   
 2   calculatedfinishedsquarefeet  43332 non-null  int64   
 3   latitude                      43332 non-null  float64 
 4   longitude                     43332 non-null  float64 
 5   lotsizesquarefeet             43332 non-null  int64   
 6   rawcensustractandblock        43332 non-null  int64   
 7   regionidzip                   43332 non-null  category
 8   roomcnt                       43332 non-null  int64   
 9   unitcnt                       43332 non-null  int64   
 10  yearbuilt                     43332 non-null  category
 11  structuretaxvaluedollarcnt    43332 non-null  float64 
 12  taxvaluedollarcnt             43332 non-nu

In [6]:
def scale_zillow(train, validate, test):
      '''
    This function take categorical variables and splits them in to cat.codes for modeling
    '''
    ############################################################################################
    ############################################################################################
    train["county"] = train["county"].cat.codes
    validate["county"] = validate["county"].cat.codes
    test["county"] = test["county"].cat.codes
    ############################################################################################
    train["heatingorsystemdesc"] = train["heatingorsystemdesc"].cat.codes
    validate["heatingorsystemdesc"] = validate["heatingorsystemdesc"].cat.codes
    test["heatingorsystemdesc"] = test["heatingorsystemdesc"].cat.codes
    ############################################################################################
    train["propertylandusedesc"] = train["propertylandusedesc"].cat.codes
    validate["propertylandusedesc"] = validate["propertylandusedesc"].cat.codes
    test["propertylandusedesc"] = test["propertylandusedesc"].cat.codes
    ############################################################################################
    train["regionidzip"] = train["regionidzip"].astype('category')
    train["regionidzip"] = train["regionidzip"].cat.codes # Not a Number
    validate["regionidzip"] = validate["regionidzip"].astype('category')
    validate["regionidzip"] = validate["regionidzip"].cat.codes # Not a Number
    test["regionidzip"] = test["regionidzip"].astype('category')
    test["regionidzip"] = test["regionidzip"].cat.codes # Not a Number
    ############################################################################################
    train["yearbuilt"] = train["yearbuilt"].astype('category')
    train["yearbuilt"] = train["yearbuilt"].cat.codes # Not a Number   
    validate["yearbuilt"] = validate["yearbuilt"].astype('category')
    validate["yearbuilt"] = validate["yearbuilt"].cat.codes # Not a Number 
    test["yearbuilt"] = test["yearbuilt"].astype('category')
    test["yearbuilt"] = test["yearbuilt"].cat.codes # Not a Number  
    
    return train_s, validate_s, test_s

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 7)

#### Clustering Train (Target Variable = Log Error)

In [None]:
X = train_scaled[['age', 'bmi', 'children', 'smoker', 'charges']]
kmeans = KMeans(n_clusters=5)
kmeans.fit(X)
train['cluster'] = kmeans.labels_

#### Basic Visualizations (Train)

In [None]:
# How bedroomcount affects the relationship between squarefeet and logerror

sns.scatterplot(x='calculatedfinishedsquarefeet', y='logerror',
               data=train)
plt.title("Visualizing the relationship between logerror and squarefeet")
plt.show()

In [None]:
# How tax value affects logerror

sns.scatterplot(x='taxvaluedollarcnt', y='logerror',
               data=train)
plt.title("Visualizing the relationship between logerror and Assessed Tax Value")
plt.show()

#### New Dataframes based on County w/ Outliers Removed

In [None]:
# Split in to train df per county, remove outliers using IQR (6)

la_train_df, vc_train_df, oc_train_df = explore.counties_no_outliers(train)
la_train_df.shape, vc_train_df.shape, oc_train_df.shape

#### Basic Visualizations w/Outliers Removed (Per County)

In [None]:
# How bedroomcount affects the relationship between squarefeet and logerror (La County)

sns.scatterplot(x='calculatedfinishedsquarefeet', y='logerror',
               data=la_train_df)
plt.title("Visualizing the relationship between logerror and squarefeet in LA County")
plt.show()

In [None]:
# How bedroomcount affects the relationship between squarefeet and logerror (Ventura County)

sns.scatterplot(x='calculatedfinishedsquarefeet', y='logerror',
               data=vc_train_df)
plt.title("Visualizing the relationship between logerror and squarefeet in Ventura County")
plt.show()

In [None]:
# How bedroomcount affects the relationship between squarefeet and logerror (Orange County)

sns.scatterplot(x='calculatedfinishedsquarefeet', y='logerror',
               data=oc_train_df)
plt.title("Visualizing the relationship between logerror and squarefeet in Orange County")
plt.show()

In [None]:
# How tax value affects logerror in LA County

sns.scatterplot(x='taxvaluedollarcnt', y='logerror',
               data=la_train_df)
plt.title("Visualizing the relationship between logerror and Assessed Tax Value in LA County")
plt.show()

In [None]:
# How tax value affects logerror in Ventura County

sns.scatterplot(x='taxvaluedollarcnt', y='logerror',
               data=vc_train_df)
plt.title("Visualizing the relationship between logerror and Assessed Tax Value in Ventura County")
plt.show()

In [None]:
# How tax value affects logerror in Orange County

sns.scatterplot(x='taxvaluedollarcnt', y='logerror',
               data=oc_train_df)
plt.title("Visualizing the relationship between logerror and Assessed Tax Value in Orange County")
plt.show()