In [1]:
import acquire
import prepare
import wrangle
import pandas as pd
import numpy as np
import seaborn as sns
pd.options.display.float_format = '{:20,.2f}'.format
import matplotlib.pyplot as plt
from math import sqrt
from scipy import stats
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from statsmodels.formula.api import ols
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.feature_selection import f_regression, SelectKBest, RFE 
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.cluster import KMeans

In [2]:
# Acquire the Zillow data and hand it to wrangle.clean_zillow, whose result is
# unpacked into the train / validate / test frames used by the rest of the notebook.
# (Both helpers live in the project-local `wrangle` module, which is not shown here.)
train, validate, test = wrangle.clean_zillow(wrangle.get_zillow_data()) 
# Show the shape of each split as a quick sanity check.
train.shape, validate.shape, test.shape

((43332, 21), (18572, 21), (15476, 21))

In [3]:
# What happens if we eliminate outliers from train
def outliers(train, k=6):
    """Drop rows of ``train`` that contain an IQR outlier in any numeric column.

    A value is an outlier when it lies outside ``[Q1 - k*IQR, Q3 + k*IQR]``
    for its column. The four categorical/identifier columns are excluded from
    the check and re-attached (as the last columns) to the surviving rows.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the columns ``regionidzip``, ``county``,
        ``propertylandusedesc`` and ``heatingorsystemdesc``.
    k : float, default 6
        Fence multiplier applied to the IQR (the textbook value is 1.5).

    Returns
    -------
    pandas.DataFrame
        The surviving rows with their original index; the non-categorical
        columns come first, followed by the four categorical columns.
    """
    cat_cols = ["regionidzip", "county", "propertylandusedesc", "heatingorsystemdesc"]
    cat_df = train[cat_cols]
    rest = train.drop(columns=cat_cols)

    # The fences must be computed from the same numeric-only frame that is
    # compared against them. Taking quantiles of the full frame raises on
    # string columns in pandas >= 2.0 and, when a dropped column is numeric,
    # yields fence labels that no longer match the compared columns.
    numeric = rest.select_dtypes(include="number")
    q1 = numeric.quantile(0.25)
    q3 = numeric.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - k * iqr
    upper = q3 + k * iqr

    # NOTE: a column whose IQR is 0 has lower == upper, so every row that
    # differs from that single value is treated as an outlier.
    keep = ~((numeric < lower) | (numeric > upper)).any(axis=1)

    # One boolean mask filters both halves, so the rows stay aligned.
    return pd.concat([rest[keep], cat_df[keep]], axis=1)

In [4]:
# Replace `train` with its outlier-filtered version.
# NOTE: this overwrites the original split, so the cell is not idempotent —
# re-running it filters the already-filtered frame again. Every later cell
# that reads `train` sees the filtered data.
train = outliers(train)
# Show how many rows survived the filter.
train.shape

(26371, 21)

In [5]:
def la_county(train, k=6):
    """Return the Los Angeles rows of ``train`` with IQR outliers removed.

    Rows where ``train.county == 'Los Angeles'`` are selected; any of those
    rows holding a numeric value outside ``[Q1 - k*IQR, Q3 + k*IQR]`` for its
    column is dropped. Quartiles are computed from the Los Angeles rows only.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the columns ``regionidzip``, ``county``,
        ``propertylandusedesc`` and ``heatingorsystemdesc``.
    k : float, default 6
        Fence multiplier applied to the IQR.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with their original index; the four categorical
        columns are placed last.
    """
    # NOTE(review): la_county, vc_county and oc_county differ only in the
    # county label; consider one function that takes the label as a parameter.
    cat_cols = ["regionidzip", "county", "propertylandusedesc", "heatingorsystemdesc"]

    # Only `train` is used: the function no longer reads the notebook-level
    # `validate` / `test` frames, which it never needed.
    county_rows = train[train.county == 'Los Angeles']
    cat_df = county_rows[cat_cols]
    rest = county_rows.drop(columns=cat_cols)

    # Fences come from the numeric-only frame that is compared against them;
    # quantiles of a frame with string columns raise in pandas >= 2.0.
    numeric = rest.select_dtypes(include="number")
    q1 = numeric.quantile(0.25)
    q3 = numeric.quantile(0.75)
    iqr = q3 - q1

    # NOTE: a column with IQR == 0 flags every row that differs from its
    # single quartile value.
    keep = ~((numeric < (q1 - k * iqr)) | (numeric > (q3 + k * iqr))).any(axis=1)
    return pd.concat([rest[keep], cat_df[keep]], axis=1)

In [6]:
# Los Angeles subset with county-level outliers removed.
# `train` was already outlier-filtered by the earlier `train = outliers(train)`
# cell, so this is a second, county-specific pass.
la_trial = la_county(train)
# Show how many Los Angeles rows remain.
la_trial.shape

(21200, 21)

In [7]:
def vc_county(train, k=6):
    """Return the Ventura rows of ``train`` with IQR outliers removed.

    Rows where ``train.county == 'Ventura'`` are selected; any of those rows
    holding a numeric value outside ``[Q1 - k*IQR, Q3 + k*IQR]`` for its
    column is dropped. Quartiles are computed from the Ventura rows only.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the columns ``regionidzip``, ``county``,
        ``propertylandusedesc`` and ``heatingorsystemdesc``.
    k : float, default 6
        Fence multiplier applied to the IQR.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with their original index; the four categorical
        columns are placed last.
    """
    # NOTE(review): la_county, vc_county and oc_county differ only in the
    # county label; consider one function that takes the label as a parameter.
    cat_cols = ["regionidzip", "county", "propertylandusedesc", "heatingorsystemdesc"]

    # Only `train` is used: the function no longer reads the notebook-level
    # `validate` / `test` frames, which it never needed.
    county_rows = train[train.county == 'Ventura']
    cat_df = county_rows[cat_cols]
    rest = county_rows.drop(columns=cat_cols)

    # Fences come from the numeric-only frame that is compared against them;
    # quantiles of a frame with string columns raise in pandas >= 2.0.
    numeric = rest.select_dtypes(include="number")
    q1 = numeric.quantile(0.25)
    q3 = numeric.quantile(0.75)
    iqr = q3 - q1

    # NOTE: a column with IQR == 0 flags every row that differs from its
    # single quartile value.
    keep = ~((numeric < (q1 - k * iqr)) | (numeric > (q3 + k * iqr))).any(axis=1)
    return pd.concat([rest[keep], cat_df[keep]], axis=1)

In [8]:
# Ventura subset with county-level outliers removed (second pass on the
# already-filtered `train`).
# NOTE(review): the recorded outputs show only a few dozen Ventura rows at this
# point — confirm the county labelling and whether the global filter is too aggressive.
vc_trial = vc_county(train)
# Show how many Ventura rows remain.
vc_trial.shape

(32, 21)

In [9]:
def oc_county(train, k=6):
    """Return the Orange rows of ``train`` with IQR outliers removed.

    Rows where ``train.county == 'Orange'`` are selected; any of those rows
    holding a numeric value outside ``[Q1 - k*IQR, Q3 + k*IQR]`` for its
    column is dropped. Quartiles are computed from the Orange rows only.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the columns ``regionidzip``, ``county``,
        ``propertylandusedesc`` and ``heatingorsystemdesc``.
    k : float, default 6
        Fence multiplier applied to the IQR.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with their original index; the four categorical
        columns are placed last.
    """
    # NOTE(review): la_county, vc_county and oc_county differ only in the
    # county label; consider one function that takes the label as a parameter.
    cat_cols = ["regionidzip", "county", "propertylandusedesc", "heatingorsystemdesc"]

    # Only `train` is used: the function no longer reads the notebook-level
    # `validate` / `test` frames, which it never needed.
    county_rows = train[train.county == 'Orange']
    cat_df = county_rows[cat_cols]
    rest = county_rows.drop(columns=cat_cols)

    # Fences come from the numeric-only frame that is compared against them;
    # quantiles of a frame with string columns raise in pandas >= 2.0.
    numeric = rest.select_dtypes(include="number")
    q1 = numeric.quantile(0.25)
    q3 = numeric.quantile(0.75)
    iqr = q3 - q1

    # NOTE: a column with IQR == 0 flags every row that differs from its
    # single quartile value.
    keep = ~((numeric < (q1 - k * iqr)) | (numeric > (q3 + k * iqr))).any(axis=1)
    return pd.concat([rest[keep], cat_df[keep]], axis=1)

In [10]:
# Orange subset with county-level outliers removed (second pass on the
# already-filtered `train`).
oc_trial = oc_county(train)
# Show how many Orange rows remain.
oc_trial.shape

(4592, 21)

#### New DataFrames Based on County

In [11]:
# LA County
# Split `train` (as it stands at this point in the notebook) into one frame per county.
la_df = train[train.county == 'Los Angeles']   # LA County
vc_df = train[train.county == 'Ventura']       # Ventura County
oc_df = train[train.county == 'Orange']        # Orange County

# Only a cell's last expression is displayed, so all three shapes are shown
# once here rather than as separate mid-cell expressions that print nothing.
la_df.shape, vc_df.shape, oc_df.shape

((21600, 21), (38, 21), (4733, 21))

#### Remove Outliers Using IQR, Then Recombine the DataFrame

In [12]:
# NOTE(review): both names are already imported in the first cell, and `stats`
# is not used in this cell; these re-imports are redundant.
from scipy import stats
import numpy as np

# The categorical columns must be set aside before outlier elimination,
# because the IQR fences are only meaningful for numeric columns.

# Keep a copy of the categorical columns so they can be re-attached afterwards.
cat_df = la_df[["regionidzip","county","propertylandusedesc","heatingorsystemdesc"]].copy()
# Remove the categorical variables for the outlier examination.
# NOTE: this rebinds `la_df`; it is rebound again at the end of the cell.
la_df = la_df.drop(columns=["regionidzip","county","propertylandusedesc","heatingorsystemdesc"])


# Drop outliers using a fence multiplier of 6 instead of the usual 1.5.
# Author's note: multipliers of 1.5 and 3 removed too many values, so it was left at 6,
# which still removed 5,000 properties.
# NOTE(review): the recorded outputs show 21,600 -> 21,200 rows for this cell, so the
# 5,000 figure presumably comes from a run on unfiltered data — confirm.

Q1 = la_df.quantile(0.25)
Q3 = la_df.quantile(0.75)
IQR = Q3 - Q1

# Keep only rows with no value outside [Q1 - 6*IQR, Q3 + 6*IQR] in any column.
la_df_out = la_df[~((la_df < (Q1 - 6 * IQR)) |(la_df > (Q3 + 6 * IQR))).any(axis=1)]
# Mid-cell expression: not displayed, since only the last expression of a cell is shown.
la_df_out.shape

# Recombine the filtered numeric frame with the categorical columns; the reindex
# restricts the result to the rows that survived the filter.
la_df = pd.concat([la_df_out, cat_df], axis=1).reindex(la_df_out.index)
la_df.shape

(21200, 21)