In [1]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
import pandas as pd
from scipy import stats
from pydataset import data
import numpy as np
import env
import matplotlib.pyplot as plt
import os
import prepare
import acquire
import seaborn as sns
import explore
from math import sqrt

In [2]:
df = acquire.get_zillow_sfr_data()

In [3]:
train, val, test = prepare.clean_prep_zillow(df)

In [4]:
train.head()

Unnamed: 0,bedroom,bathroom,sqrft,tax_value,year_built,fips
682835,2.0,1.0,1362.0,282575.0,1922.0,6037.0
1897297,2.0,1.0,1265.0,91774.0,1938.0,6037.0
948519,4.0,2.0,1924.0,457410.0,1975.0,6037.0
1549960,1.0,2.5,1680.0,519967.0,1996.0,6059.0
13498,3.0,2.0,1418.0,460000.0,1938.0,6037.0


In [5]:
x_train, y_train, x_val, y_val, x_test, y_test = prepare.modeling_split(train, val, test, 'tax_value')

In [6]:
x_train.head()

Unnamed: 0,bedroom,bathroom,sqrft,year_built,fips
682835,2.0,1.0,1362.0,1922.0,6037.0
1897297,2.0,1.0,1265.0,1938.0,6037.0
948519,4.0,2.0,1924.0,1975.0,6037.0
1549960,1.0,2.5,1680.0,1996.0,6059.0
13498,3.0,2.0,1418.0,1938.0,6037.0


In [7]:
def dummy_scale(df, train_df):
    
    '''
    this function generates scaled sqrft and creates dummy
    columns for modeling
    '''
    df['year_bin'] = pd.qcut(df['year_built'], 10, labels = ['a','b','c','d','e','f',
                                                          'g','h','i','j'])
    
    df = pd.get_dummies(df, columns = ['bedroom','bathroom','fips','year_bin'], 
                             drop_first = [True, True])   
    
    mm_scaler = MinMaxScaler()
    mm_scaler.fit(train_df[['sqrft']])
    sqft_scaled = mm_scaler.transform(df[['sqrft']])
    df['sqft_scaled'] = sqft_scaled
    df.drop(columns = ['sqrft','year_built'], inplace = True)
    
    return df

In [8]:
df = dummy_scale(x_train, x_train)

In [11]:
df.drop(columns = ['sqrft'], inplace = True)

In [12]:
df.head()

Unnamed: 0,year_built,bedroom_1.0,bedroom_2.0,bedroom_3.0,bedroom_4.0,bedroom_5.0,bedroom_6.0,bedroom_7.0,bathroom_0.5,bathroom_1.0,...,year_bin_b,year_bin_c,year_bin_d,year_bin_e,year_bin_f,year_bin_g,year_bin_h,year_bin_i,year_bin_j,sqft_scaled
682835,1922.0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0.176639
1897297,1938.0,0,1,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0.164049
948519,1975.0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0.249578
1549960,1996.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0.21791
13498,1938.0,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0.183907


In [None]:
df = dummies_maker()

In [None]:
x_train2 = dummy_scale(x_train,x_train)
x_train2.head()

In [13]:
x_train2.isna().sum()

sqrft                0
sqft_scaled     459961
sqrft                0
bedroom_1.0          0
bedroom_2.0          0
bedroom_3.0          0
bedroom_4.0          0
bedroom_5.0          0
bedroom_6.0          0
bedroom_7.0          0
bathroom_0.5         0
bathroom_1.0         0
bathroom_1.5         0
bathroom_2.0         0
bathroom_2.5         0
bathroom_3.0         0
bathroom_3.5         0
bathroom_4.0         0
bathroom_4.5         0
bathroom_5.0         0
bathroom_5.5         0
bathroom_6.0         0
bathroom_6.5         0
bathroom_7.0         0
fips_6059.0          0
fips_6111.0          0
year_bin_b           0
year_bin_c           0
year_bin_d           0
year_bin_e           0
year_bin_f           0
year_bin_g           0
year_bin_h           0
year_bin_i           0
year_bin_j           0
dtype: int64

In [9]:
x_train.nunique()

bedroom           8
bathroom         15
sqrft          7171
fips              3
year_bin         10
sqft_scaled    6954
dtype: int64

In [10]:
x_train.head().T

Unnamed: 0,682835,1897297,948519,1549960,13498
bedroom,2.0,2.0,4.0,1.0,3.0
bathroom,1.0,1.0,2.0,2.5,2.0
sqrft,1362.0,1265.0,1924.0,1680.0,1418.0
fips,6037.0,6037.0,6037.0,6059.0,6037.0
year_bin,a,b,h,j,b
sqft_scaled,0.261389,,0.186892,,0.251785


In [11]:
x_train.columns

Index(['bedroom', 'bathroom', 'sqrft', 'fips', 'year_bin', 'sqft_scaled'], dtype='object')