## Exploratory Data Analysis (Zillow Dataframe)
### Corey Solitaire
#### 10.13.2020

In [1]:
import acquire
import prepare
import wrangle_zillow
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from math import sqrt
from scipy import stats
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from statsmodels.formula.api import ols
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.feature_selection import f_regression, SelectKBest, RFE 
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures

In [2]:
# Acquire the Zillow data and pass it straight through the wrangle pipeline,
# which returns five objects unpacked below.
# NOTE(review): cached=False presumably forces a fresh pull instead of reading
# a local cache -- confirm in acquire.get_zillow_data.
(
    df,
    X_train_explore,
    X_train_scaled,
    X_validate_scaled,
    X_test_scaled,
) = wrangle_zillow.wrangle_zillow(
    acquire.get_zillow_data(cached=False)
)

In [3]:
# Sanity check: (rows, columns) of the full frame and of the scaled train/test splits.
df.shape, X_train_scaled.shape, X_test_scaled.shape

((43778, 34), (24515, 34), (8756, 34))

In [4]:
def get_upper_outliers(s, k):
    '''
    Measure how far each value of a series lies above its upper IQR fence.

    The fence is ``q3 + k * iqr``, where q1 and q3 are the 25th and 75th
    percentiles of ``s`` and ``iqr = q3 - q1``.

    Parameters
    ----------
    s : pandas.Series
        Numeric series to inspect.
    k : float
        Multiplier applied to the IQR when building the upper fence.

    Returns
    -------
    pandas.Series
        Same index as ``s``. A value is 0 when the observation is at or below
        the fence, otherwise it is the distance above the fence. Missing
        values stay missing.
    '''
    q1, q3 = s.quantile([.25, .75])
    iqr = q3 - q1
    upper_bound = q3 + k * iqr
    # Vectorized form of max(x - upper_bound, 0) for every element; avoids a
    # Python-level lambda call per row and always yields a float result.
    return (s - upper_bound).clip(lower=0)

In [5]:
def add_upper_outlier_columns(df, k):
    '''
    Add a ``<col>_outliers`` column for every numeric column in the dataframe.

    Each new column holds the result of ``get_upper_outliers`` for its source
    column, using ``k`` as the IQR multiplier.

    Note: ``df`` is modified in place and is also returned, so the caller's
    dataframe gains the new columns.

    Columns whose name already ends in ``_outliers`` are skipped, so calling
    this function again on the same dataframe refreshes the existing outlier
    columns instead of stacking ``_outliers_outliers`` columns on top of them.
    '''
    suffix = '_outliers'
    # select_dtypes returns a new frame, so this list is a snapshot taken
    # before any columns are added below.
    source_cols = [col for col in df.select_dtypes('number')
                   if not str(col).endswith(suffix)]
    for col in source_cols:
        df[col + suffix] = get_upper_outliers(df[col], k)
    return df

In [6]:
# Attach the *_outliers columns to X_train_explore (the function mutates the
# frame in place and returns it, so the cell output shows the widened frame).
add_upper_outlier_columns(X_train_explore, k=1.5)

Unnamed: 0,parcelid,propertylandusetypeid,heatingorsystemtypeid,bathroomcnt,bedroomcnt,buildingqualitytypeid,calculatedbathnbr,calculatedfinishedsquarefeet,finishedsquarefeet12,fips,...,taxvaluedollarcnt_outliers,assessmentyear_outliers,landtaxvaluedollarcnt_outliers,taxamount_outliers,censustractandblock_outliers,id_outliers,logerror_outliers,tdate_outliers,heatingorsystemdesc_outliers,propertylandusedesc_outliers
73915,10768234,261.0,2.0,2.0,3.0,6.0,2.0,1313.0,1313.0,0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,0
48981,12071067,266.0,2.0,2.0,2.0,8.0,2.0,1046.0,1046.0,0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,0
29282,11905813,261.0,7.0,1.0,2.0,4.0,1.0,1166.0,1166.0,0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,0
51818,12226280,261.0,7.0,2.0,3.0,4.0,2.0,735.0,735.0,0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,0
19625,12626656,261.0,2.0,2.0,5.0,6.0,2.0,1948.0,1948.0,0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73581,12904203,261.0,7.0,1.0,3.0,4.0,1.0,1220.0,1220.0,0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,0
19092,12758672,261.0,7.0,1.0,3.0,4.0,1.0,1145.0,1145.0,0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,0
8915,11475333,266.0,2.0,2.0,2.0,8.0,2.0,1252.0,1252.0,0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,0
34132,10965278,266.0,2.0,3.0,3.0,8.0,3.0,1429.0,1429.0,0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,0


In [7]:
# Print summary statistics for each outlier column, restricted to the rows
# that actually lie above the upper fence (value > 0).
# add_upper_outlier_columns modifies df in place, so df keeps the new columns.
add_upper_outlier_columns(df, k=1.5)
outlier_cols = [name for name in df.columns if name.endswith('_outliers')]
for col in outlier_cols:
    print('~~~\n' + col)
    is_outlier = df[col] > 0
    data = df.loc[is_outlier, col]
    print(data.describe())

~~~
parcelid_outliers
count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: parcelid_outliers, dtype: float64
~~~
propertylandusetypeid_outliers
count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: propertylandusetypeid_outliers, dtype: float64
~~~
heatingorsystemtypeid_outliers
count    39.0
mean      5.5
std       0.0
min       5.5
25%       5.5
50%       5.5
75%       5.5
max       5.5
Name: heatingorsystemtypeid_outliers, dtype: float64
~~~
bathroomcnt_outliers
count    1058.000000
mean        0.982042
std         0.848037
min         0.500000
25%         0.500000
50%         0.500000
75%         1.500000
max         6.500000
Name: bathroomcnt_outliers, dtype: float64
~~~
bedroomcnt_outliers
count    25.000000
mean      1.440000
std       0.768115
min       1.000000
25%       1.000000
50%       1.000000
75%       2.000000
max       4.000000
Name: bedroomcnt_outliers, dty

In [8]:
# Restore X_train_explore by dropping the *_outliers columns that were
# attached to it by add_upper_outlier_columns.
# The column list is built from X_train_explore itself (not df), so the drop
# cannot fail on a label that only exists in df, and `columns=` is used because
# pandas >= 2.0 no longer accepts `axis` as a positional argument to drop().
# NOTE: df is not touched here and keeps any *_outliers columns it was given.
X_train_explore.drop(
    columns=[c for c in X_train_explore if c.endswith('_outliers')],
    inplace=True,
)

In [9]:
# Verify the drop: the column count should be back to what it was before the
# outlier columns were added.
X_train_explore.shape

(24515, 34)