In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import wrangle
import prep
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

wrangle.py functions loaded successfully
acquire.py functions loaded successfully


In [2]:
df = prep.prep_zillow_data()
df.isna().sum()

bathroomcnt                     0
bedroomcnt                      0
calculatedbathnbr               0
calculatedfinishedsquarefeet    0
fips                            0
fullbathcnt                     0
heatingorsystemtypeid           0
latitude                        0
longitude                       0
lotsizesquarefeet               0
propertylandusetypeid           0
rawcensustractandblock          0
regionidcity                    0
regionidcounty                  0
regionidzip                     0
roomcnt                         0
unitcnt                         0
yearbuilt                       0
structuretaxvaluedollarcnt      0
taxvaluedollarcnt               0
assessmentyear                  0
landtaxvaluedollarcnt           0
taxamount                       0
propertylandusetypeid           0
dtype: int64

In [3]:
X_train, y_train, X_validate, y_validate, X_test, y_test = wrangle.train_validate_test(df)

In [4]:
X_train_scaled, X_validate_scaled, X_test_scaled = wrangle.min_max_scale(X_train, X_validate, X_test)

In [5]:
X_train_scaled

Unnamed: 0,bathroomcnt,bedroomcnt,calculatedbathnbr,calculatedfinishedsquarefeet,fullbathcnt
9313,0.25,0.166667,0.166667,0.173552,0.111111
1308,0.20,0.250000,0.111111,0.256174,0.111111
11475,0.20,0.333333,0.111111,0.358643,0.111111
16038,0.30,0.250000,0.222222,0.376875,0.222222
1198,0.10,0.083333,0.000000,0.156935,0.000000
...,...,...,...,...,...
2465,0.30,0.333333,0.222222,0.520194,0.222222
6773,0.20,0.166667,0.111111,0.150012,0.111111
5750,0.20,0.250000,0.111111,0.304408,0.111111
17625,0.30,0.333333,0.222222,0.538195,0.222222


In [6]:
from sklearn.feature_selection import SelectKBest, f_regression

In [7]:
f_selector = SelectKBest(f_regression, k=5)

In [8]:
f_selector = f_selector.fit(X_train_scaled, y_train.taxvaluedollarcnt)

In [9]:
X_train_reduced = f_selector.transform(X_train_scaled)
print(X_train.shape)
print(X_train_reduced.shape)

(11746, 5)
(11746, 5)


In [10]:
f_support = f_selector.get_support()
f_support

array([ True,  True,  True,  True,  True])

In [11]:
f_feature = X_train_scaled.iloc[:, f_support].columns.tolist()
f_feature

['bathroomcnt',
 'bedroomcnt',
 'calculatedbathnbr',
 'calculatedfinishedsquarefeet',
 'fullbathcnt']

In [12]:
X_reduced_scaled = X_train_scaled.iloc[:, f_support]
X_reduced_scaled.head()

Unnamed: 0,bathroomcnt,bedroomcnt,calculatedbathnbr,calculatedfinishedsquarefeet,fullbathcnt
9313,0.25,0.166667,0.166667,0.173552,0.111111
1308,0.2,0.25,0.111111,0.256174,0.111111
11475,0.2,0.333333,0.111111,0.358643,0.111111
16038,0.3,0.25,0.222222,0.376875,0.222222
1198,0.1,0.083333,0.0,0.156935,0.0


In [13]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

In [14]:
lm = LinearRegression()

rfe = RFE(lm, 5)

X_rfe = rfe.fit_transform(X_train,y_train) 

In [15]:
mask = rfe.support_ 
rfe_features = X_train.columns[mask]
print(f'selected {len(rfe_features)} features:', ', '.join(rfe_features))

selected 5 features: bathroomcnt, bedroomcnt, calculatedbathnbr, calculatedfinishedsquarefeet, fullbathcnt


In [16]:
lm.fit(X_rfe, y_train)

LinearRegression()

In [17]:
print("Linear Model:", lm)

print("intercept: ", lm.intercept_)

print("features: ", rfe_features)

print("coefficients: ", lm.coef_)

Linear Model: LinearRegression()
intercept:  [44807.1713353]
features:  Index(['bathroomcnt', 'bedroomcnt', 'calculatedbathnbr',
       'calculatedfinishedsquarefeet', 'fullbathcnt'],
      dtype='object')
coefficients:  [[ 129714.6420099  -108982.63989217  -26781.71316692     339.57650261
   -31021.79859825]]


In [18]:
y_train['yhat_lm'] = lm.predict(X_rfe)

y_train.head()

Unnamed: 0,taxvaluedollarcnt,yhat_lm
9313,222641.0,379365.097212
1308,261391.0,340484.380834
11475,427992.0,382273.708102
16038,176043.0,589994.021946
1198,503743.0,340520.634249


In [19]:
RMSE_lm = np.sqrt(mean_squared_error(y_train.taxvaluedollarcnt, y_train.yhat_lm))
RMSE_lm

387621.87590380665

In [20]:
r2_lm = lm.score(X_rfe, y_train.taxvaluedollarcnt)

r2_lm

0.29686157496714216

In [21]:
output = "{} = {:.4} + {:.2} * {} + {:.3} * {}".format(
    y_train.columns[0],
    lm.intercept_[0],
    lm.coef_[0][0],
    rfe_features[0],
    lm.coef_[0][1],
    rfe_features[1],
)
output

'taxvaluedollarcnt = 4.481e+04 + 1.3e+05 * bathroomcnt + -1.09e+05 * bedroomcnt'

In [22]:
X_rfe = rfe.fit_transform(X_validate,y_validate)
mask = rfe.support_ 
rfe_features = X_validate.columns[mask]
print(f'selected {len(rfe_features)} features:', ', '.join(rfe_features))

selected 5 features: bathroomcnt, bedroomcnt, calculatedbathnbr, calculatedfinishedsquarefeet, fullbathcnt


In [23]:
y_validate['yhat_lm'] = lm.predict(X_rfe).round(1)

y_validate.head()

Unnamed: 0,taxvaluedollarcnt,yhat_lm
7557,294232.0,170096.1
111,203113.0,570356.3
7759,592933.0,236595.4
10968,785799.0,1078459.2
14557,381637.0,286152.1


In [24]:
RMSE_lm = np.sqrt(mean_squared_error(y_validate.taxvaluedollarcnt, y_validate.yhat_lm))
RMSE_lm

396767.3522806633

In [25]:
r2_lm = lm.score(X_rfe, y_validate.taxvaluedollarcnt)

r2_lm

0.28081357670979523

In [26]:
output = "{} = {:.4} + {:.2} * {} + {:.3} * {}".format(
    y_train.columns[0],
    lm.intercept_[0],
    lm.coef_[0][0],
    rfe_features[0],
    lm.coef_[0][1],
    rfe_features[1],
)
output

'taxvaluedollarcnt = 4.481e+04 + 1.3e+05 * bathroomcnt + -1.09e+05 * bedroomcnt'

In [27]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=4)
X_poly = poly.fit_transform(X_rfe)

poly.get_feature_names()

['1',
 'x0',
 'x1',
 'x2',
 'x3',
 'x4',
 'x0^2',
 'x0 x1',
 'x0 x2',
 'x0 x3',
 'x0 x4',
 'x1^2',
 'x1 x2',
 'x1 x3',
 'x1 x4',
 'x2^2',
 'x2 x3',
 'x2 x4',
 'x3^2',
 'x3 x4',
 'x4^2',
 'x0^3',
 'x0^2 x1',
 'x0^2 x2',
 'x0^2 x3',
 'x0^2 x4',
 'x0 x1^2',
 'x0 x1 x2',
 'x0 x1 x3',
 'x0 x1 x4',
 'x0 x2^2',
 'x0 x2 x3',
 'x0 x2 x4',
 'x0 x3^2',
 'x0 x3 x4',
 'x0 x4^2',
 'x1^3',
 'x1^2 x2',
 'x1^2 x3',
 'x1^2 x4',
 'x1 x2^2',
 'x1 x2 x3',
 'x1 x2 x4',
 'x1 x3^2',
 'x1 x3 x4',
 'x1 x4^2',
 'x2^3',
 'x2^2 x3',
 'x2^2 x4',
 'x2 x3^2',
 'x2 x3 x4',
 'x2 x4^2',
 'x3^3',
 'x3^2 x4',
 'x3 x4^2',
 'x4^3',
 'x0^4',
 'x0^3 x1',
 'x0^3 x2',
 'x0^3 x3',
 'x0^3 x4',
 'x0^2 x1^2',
 'x0^2 x1 x2',
 'x0^2 x1 x3',
 'x0^2 x1 x4',
 'x0^2 x2^2',
 'x0^2 x2 x3',
 'x0^2 x2 x4',
 'x0^2 x3^2',
 'x0^2 x3 x4',
 'x0^2 x4^2',
 'x0 x1^3',
 'x0 x1^2 x2',
 'x0 x1^2 x3',
 'x0 x1^2 x4',
 'x0 x1 x2^2',
 'x0 x1 x2 x3',
 'x0 x1 x2 x4',
 'x0 x1 x3^2',
 'x0 x1 x3 x4',
 'x0 x1 x4^2',
 'x0 x2^3',
 'x0 x2^2 x3',
 'x0 x2^2 x4',
 'x0

In [28]:
lm_poly = LinearRegression()
lm_poly.fit(X_poly, y_train.taxvaluedollarcnt)
y_train['yhat_poly'] = lm_poly.predict(X_poly)

ValueError: Found input variables with inconsistent numbers of samples: [5035, 11746]

In [None]:
RMSE_poly = np.sqrt(mean_squared_error(y_train.taxvaluedollarcnt, y_train.yhat_poly))
RMSE_poly

In [None]:
y_train.head()

In [None]:
y_train['yhat_baseline'] = df['taxvaluedollarcnt'].mean()

# compute the RMSE
RMSE_bl = np.sqrt(mean_squared_error(y_train.taxvaluedollarcnt, y_train.yhat_baseline))
print(RMSE_bl)

# no need to compute R-2 because it will be a 0! But we will demonstrate here:
evs = explained_variance_score(y_train.taxvaluedollarcnt, y_train.yhat_baseline)
print(evs)


In [None]:
y_train.head()

In [None]:
plt.figure(figsize=(9, 9))

plt.scatter(y_train.taxvaluedollarcnt, y_train.yhat_lm, label='Linear Regression Model Predictions', marker='o')
plt.scatter(y_train.taxvaluedollarcnt, y_train.yhat_poly, label='Polynomial Regression Model Predictions', marker='o')
plt.scatter(y_train.taxvaluedollarcnt, y_train.yhat_baseline, label='Baseline Predicitions', marker='o')

In [None]:
X_rfe = rfe.fit_transform(X_validate,y_validate) 

In [None]:
poly = PolynomialFeatures(degree=4)
X_poly = poly.fit_transform(X_rfe)

poly.get_feature_names()

In [None]:
lm_poly = LinearRegression()
lm_poly.fit(X_poly, y_validate.taxvaluedollarcnt)
y_validate['yhat_poly'] = lm_poly.predict(X_poly).round(1)

In [None]:
RMSE_poly = np.sqrt(mean_squared_error(y_validate.taxvaluedollarcnt, y_validate.yhat_poly))
RMSE_poly

In [None]:
y_validate.head()

In [None]:
plt.scatter(y_validate.taxvaluedollarcnt, y_validate.yhat_poly, label='Polynomial Regression Model Predictions', marker='o')
plt.scatter(y_train.taxvaluedollarcnt, y_train.yhat_baseline, label='Baseline Predicitions', marker='o')