# Modeling (Clustering Unit)
## Corey Solitaire
#### 10.15.2020

In [1]:
import acquire
import prepare
import wrangle
import pandas as pd
import numpy as np
import seaborn as sns
pd.options.display.float_format = '{:20,.2f}'.format
import matplotlib.pyplot as plt
from math import sqrt
from scipy import stats
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from statsmodels.formula.api import ols
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.feature_selection import f_regression, SelectKBest, RFE 
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures

In [2]:
# Acquire the Zillow data and pass it through wrangle.clean_zillow; the
# result is unpacked into the train / validate / test frames.
train, validate, test = wrangle.clean_zillow(
    wrangle.get_zillow_data()
)

# Shape of each frame, in train / validate / test order.
tuple(frame.shape for frame in (train, validate, test))

((43332, 25), (18572, 25), (15476, 25))

In [3]:
# Preview the first rows of the `train` frame.
train.head()

Unnamed: 0,parcelid,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,latitude,longitude,lotsizesquarefeet,propertycountylandusecode,rawcensustractandblock,regionidcity,...,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,censustractandblock,logerror,tdate,heatingorsystemdesc,propertylandusedesc,county
14505,12883459,1,2,936,34069666.0,-117755930.0,4057,0100,60374023,20008,...,19208.0,2016,10331.0,413.73,60374023033004.0,-0.04,2017-03-05,Floor/Wall,Single Family Residential,Los Angeles
69608,12457381,1,2,861,33847368.0,-118180236.0,7013,0100,60375715,46298,...,305308.0,2016,244248.0,3778.39,60375715021000.0,0.0,2017-08-24,Central,Single Family Residential,Los Angeles
51965,13928906,2,5,2072,33781263.0,-118092369.0,6100,122,60591100,54352,...,98101.0,2016,28002.0,1608.16,60591100072013.0,0.34,2017-06-29,,Single Family Residential,Orange
30435,11709424,2,4,3154,34009306.0,-118325111.0,4298,0200,60372343,12447,...,294531.0,2016,119595.0,3682.72,60372343001000.0,0.1,2017-04-27,,"Duplex (2 Units, Any Combination)",Los Angeles
76615,10790864,3,3,1579,34183400.0,-118611000.0,122753,010C,60371351,12447,...,400174.0,2016,219606.0,4902.28,60371351141001.0,-0.02,2017-09-15,Central,Condominium,Los Angeles


In [4]:
# Summarize `train`: index range, column names, non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43332 entries, 14505 to 54422
Data columns (total 25 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   parcelid                      43332 non-null  int64  
 1   bathroomcnt                   43332 non-null  int64  
 2   bedroomcnt                    43332 non-null  int64  
 3   calculatedfinishedsquarefeet  43332 non-null  int64  
 4   latitude                      43332 non-null  float64
 5   longitude                     43332 non-null  float64
 6   lotsizesquarefeet             43332 non-null  int64  
 7   propertycountylandusecode     43332 non-null  object 
 8   rawcensustractandblock        43332 non-null  int64  
 9   regionidcity                  43332 non-null  int64  
 10  regionidzip                   43332 non-null  int64  
 11  roomcnt                       43332 non-null  int64  
 12  unitcnt                       43332 non-null  int64  
 1

In [5]:
# def post_selection_processing(train, validate, test):
#     train ["yearbuilt"] = train["yearbuilt"].astype('int')
#     validate ["yearbuilt"] = validate["yearbuilt"].astype('int')
#     test ["yearbuilt"] = test["yearbuilt"].astype('int')
    
#     train["squarefeet"] = train["squarefeet"].astype('int')
#     validate["squarefeet"] = validate["squarefeet"].astype('int')
#     test["squarefeet"] = test["squarefeet"].astype('int')
    
#     return train, validate, test 
    

In [6]:
# New DataFrames based on county
#
# Each subset is taken as an explicit copy so that later column assignments
# on a county frame write to an independent DataFrame rather than to a
# boolean-mask selection of `train` (which triggers SettingWithCopyWarning
# and leaves it ambiguous whether `train` is affected).

# LA County
la_df = train[train.county == 'Los Angeles'].copy()

# Ventura County
vc_df = train[train.county == 'Ventura'].copy()

# Orange County
oc_df = train[train.county == 'Orange'].copy()

# Only the last expression of a cell is displayed, so all three shapes are
# reported together here.
la_df.shape, vc_df.shape, oc_df.shape

((28462, 25), (3452, 25), (11418, 25))

In [7]:
# Cast the zip-code and assessment-year columns of the LA County frame to int.
#
# astype() with a column -> dtype mapping returns a new DataFrame, so la_df is
# rebound to the result instead of assigning a block of columns into a frame
# that was produced by filtering `train`. Re-running the cell is idempotent.

cols = ["regionidzip", 'assessmentyear']
la_df = la_df.astype({col: 'int' for col in cols})


In [8]:
# Summarize `la_df` (index range, columns, non-null counts, dtypes) after the dtype cast above.
la_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28462 entries, 14505 to 2572
Data columns (total 25 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   parcelid                      28462 non-null  int64  
 1   bathroomcnt                   28462 non-null  int64  
 2   bedroomcnt                    28462 non-null  int64  
 3   calculatedfinishedsquarefeet  28462 non-null  int64  
 4   latitude                      28462 non-null  float64
 5   longitude                     28462 non-null  float64
 6   lotsizesquarefeet             28462 non-null  int64  
 7   propertycountylandusecode     28462 non-null  object 
 8   rawcensustractandblock        28462 non-null  int64  
 9   regionidcity                  28462 non-null  int64  
 10  regionidzip                   28462 non-null  int64  
 11  roomcnt                       28462 non-null  int64  
 12  unitcnt                       28462 non-null  int64  
 13

In [9]:
# Frequency of each census tract/block value among the LA County rows.
# The values are rendered with thousands separators and two decimals because
# of the notebook-wide pd.options.display.float_format setting.
la_df["censustractandblock"].value_counts()

60,376,026,002,005.50    74
60,371,371,031,000.00    29
60,372,766,032,001.00    28
60,371,393,022,000.00    24
60,379,203,391,054.00    21
                         ..
60,375,016,004,008.00     1
60,371,082,021,000.00     1
60,375,331,051,012.00     1
60,374,803,041,007.00     1
60,371,092,001,004.00     1
Name: censustractandblock, Length: 18511, dtype: int64