In [1]:
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import warnings
warnings.filterwarnings('ignore')

from env import get_db_url
import wrangle as wr
import explore as ex
import model as mo
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, LassoLars
from sklearn.preprocessing import PolynomialFeatures
α = .05

In [2]:
df = wr.get_zillow_data()
df.head()

Unnamed: 0,bedrooms,bathrooms,sq_ft,tax_value,year_built,lot_size,LA,orange,ventura
0,4,3.5,3100,1023282,1998,4506,0,1,0
1,2,1.0,1465,464000,1967,12647,0,0,1
2,3,2.0,1243,564778,1962,8432,0,1,0
3,4,3.0,2376,145143,1970,13038,1,0,0
4,4,3.0,2962,773303,1950,63000,1,0,0


In [3]:
# Creating Train, Validate, and Test sets
train, validate, test = mo.train_test_validate_split(df)

train	 n = 27767
test	 n = 9918
validate n = 11901


In [4]:
# Scaling the data
train_scaled, validate_scaled, test_scaled = mo.MM_scale_zillow(train, validate, test)

In [5]:
train_scaled.shape, validate_scaled.shape, test_scaled.shape

((27767, 9), (9918, 9), (11901, 9))

In [6]:
#Splitting Train Set:
X_train = train_scaled.drop(columns = ['tax_value'])
y_train = train.drop(columns = X_train)

#Splitting Validate Set:
X_validate = validate_scaled.drop(columns = ['tax_value'])
y_validate = validate.drop(columns = X_validate)

#Splitting Test Set:
X_test = test_scaled.drop(columns = ['tax_value'])
y_test = test.drop(columns = X_test)

In [7]:
#Adding mean baseline value to y_train:
y_train['baseline_mean'] = y_train.tax_value.mean()
#Adding mean baseline value to y_validate:
y_validate['baseline_mean'] = y_validate.tax_value.mean()

In [8]:
# Creating an empty list for baseline results:
results = []

# Creating a dictionary of baseline mean values:
baseline_mean = {
    'model':'baseline_mean',
    'RMSE_train': mean_squared_error(y_train['tax_value'], y_train['baseline_mean']) ** 0.5,
    'RMSE_validate': mean_squared_error(y_validate['tax_value'], y_validate['baseline_mean'])** 0.5
}
#Appending baseline mean to results list:
results.append(baseline_mean)

#Creating a DataFrame from the list of result dictionaries:
results = pd.DataFrame(results)
results

Unnamed: 0,model,RMSE_train,RMSE_validate
0,baseline_mean,389831.755763,382873.659124


In [9]:
# Creating an empty list for baseline results:
results = []

# Creating a dictionary of baseline mean values:
baseline_mean = {
    'model':'baseline_mean',
    'RMSE_train': mean_squared_error(y_train['tax_value'], y_train['baseline_mean']) ** 0.5,
    'RMSE_validate': mean_squared_error(y_validate['tax_value'], y_validate['baseline_mean'])** 0.5
}
#Appending baseline mean to results list:
results.append(baseline_mean)

#Creating a DataFrame from the list of result dictionaries:
results = pd.DataFrame(results)
results

Unnamed: 0,model,RMSE_train,RMSE_validate
0,baseline_mean,389831.755763,382873.659124


# OLS Model

In [10]:
#Creating an empty list for the error calculations:
error = []

# Creating the Model Object:
lm = LinearRegression(normalize=True)

# Fitting the model to the value in the training set:
lm.fit(X_train, y_train.tax_value)

# Using OLS to make predictions on training set:
y_train['OLS_pred'] = lm.predict(X_train)

# Calculating the RMSE for train:
rmse_train = mean_squared_error(y_train.tax_value, y_train.OLS_pred)**(0.5)

# Using OLS to make predictions on validate set:
y_validate['OLS_pred'] = lm.predict(X_validate)

# Calculating the RMSE for validate:
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.OLS_pred)**(0.5)

#Creating a dictionary of ols_regression stats:
ols_regression = {
    'model':'ols_regression',
    'RMSE_train': mean_squared_error(y_train['tax_value'], y_train['OLS_pred'])**(0.5),
    'RMSE_validate': mean_squared_error(y_validate['tax_value'], y_validate['OLS_pred'])**(0.5)
}

#Adding the ols_regression stats to the empty list:
error.append(ols_regression)
#Converting the list to a DataFrame:
error = pd.DataFrame(error)

#Concatenating the RMSE DataFrames, ignoring index so it auto-updates in the final DataFrame:
results = pd.concat([results, error], ignore_index = True)
results

Unnamed: 0,model,RMSE_train,RMSE_validate
0,baseline_mean,389831.755763,382873.659124
1,ols_regression,310510.727308,304374.085637


In [11]:
## Lasso Lars

In [12]:
#Creating an empty list for the error calculations:
error = []

# Creating the Model Object:
lars = LassoLars(alpha=1)

# Fitting the model object to the training set:
lars.fit(X_train, y_train.tax_value)

# Using the LassoLars model to make predictions on the training set:
y_train['lasso_pred'] = lars.predict(X_train)

# Calculating the RMSE for the training set:
rmse_train = mean_squared_error(y_train.tax_value, y_train.lasso_pred) ** (0.5)

# Using the LassoLars model to make predictions on the validate set:
y_validate['lasso_pred'] = lars.predict(X_validate)

# Calculating the RMSE for the validate set:
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.lasso_pred) ** (0.5)

#Creating a dictionary of LassoLars stats:
lasso_lars = {
    'model':'LassoLars',
    'RMSE_train': mean_squared_error(y_train['tax_value'], y_train['lasso_pred'])**(0.5),
    'RMSE_validate': mean_squared_error(y_validate['tax_value'], y_validate['lasso_pred'])**(0.5)
}


#Adding the lasso_lars stats to the empty list:
error.append(lasso_lars)
#Converting the list to a DataFrame:
error = pd.DataFrame(error)

#Concatenating the RMSE DataFrames, ignoring index so it auto-updates in the final DataFrame:
results = pd.concat([results, error], ignore_index = True)
results

Unnamed: 0,model,RMSE_train,RMSE_validate
0,baseline_mean,389831.755763,382873.659124
1,ols_regression,310510.727308,304374.085637
2,LassoLars,310511.337222,304375.596423


## Polynomial Regression

### Features

In [13]:
# make the polynomial features to get a new set of features
pf = PolynomialFeatures(degree = 4)

# fit and transform X_train_scaled
X_train4 = pf.fit_transform(X_train)

# transform X_validate_scaled & X_test_scaled
X_validate4 = pf.transform(X_validate)
X_test4 = pf.transform(X_test)

In [14]:
#Creating an empty list for the error calculations:
error = []

# create the model object
lm2 = LinearRegression(normalize=True)

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
lm2.fit(X_train4, y_train.tax_value)

# predict train
y_train['poly_pred'] = lm2.predict(X_train4)

# evaluate: rmse
rmse_train = mean_squared_error(y_train.tax_value, y_train.poly_pred) ** 0.5

# predict validate
y_validate['poly_pred'] = lm2.predict(X_validate4)

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.poly_pred) ** 0.5

#Creating a dictionary of polynomial stats:
polynomial_regressor = {
    'model':'poly_regressor',
    'RMSE_train': mean_squared_error(y_train['tax_value'], y_train['poly_pred']) ** (0.5),
    'RMSE_validate': mean_squared_error(y_validate['tax_value'], y_validate['poly_pred']) ** (0.5)
}

#Adding the tweedie_regressor stats to the empty list:
error.append(polynomial_regressor)
#Converting the list to a DataFrame:
error = pd.DataFrame(error)

#Concatenating the RMSE DataFrames, ignoring index so it auto-updates in the final DataFrame:
results = pd.concat([results, error], ignore_index = True)
results

Unnamed: 0,model,RMSE_train,RMSE_validate
0,baseline_mean,389831.755763,382873.659124
1,ols_regression,310510.727308,304374.085637
2,LassoLars,310511.337222,304375.596423
3,poly_regressor,298114.181354,296797.160662


# Polynomial Model: Test

In [15]:
#Creating an empty list for the error calculations:
error = []

# create the model object
lm2 = LinearRegression(normalize=True)

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
lm2.fit(X_train4, y_train.tax_value)

# predict train
y_train['poly_pred'] = lm2.predict(X_train4)

# evaluate: rmse
rmse_train = mean_squared_error(y_train.tax_value, y_train.poly_pred) ** 0.5

# predict test
y_test['poly_pred'] = lm2.predict(X_test4)

# evaluate: rmse
rmse_validate = mean_squared_error(y_test.tax_value, y_test.poly_pred) ** 0.5

#Creating a dictionary of polynomial stats:
polynomial_regressor = {
    'model':'poly_regressor',
    'RMSE_train': mean_squared_error(y_train['tax_value'], y_train['poly_pred']) ** (0.5),
    'RMSE_test': mean_squared_error(y_test['tax_value'], y_test['poly_pred']) ** (0.5)
}

#Adding the tweedie_regressor stats to the empty list:
error.append(polynomial_regressor)
#Converting the list to a DataFrame:
error = pd.DataFrame(error)

#Concatenating the RMSE DataFrames, ignoring index so it auto-updates in the final DataFrame:
results = pd.concat([results, error], ignore_index = True)
results

Unnamed: 0,model,RMSE_train,RMSE_validate,RMSE_test
0,baseline_mean,389831.755763,382873.659124,
1,ols_regression,310510.727308,304374.085637,
2,LassoLars,310511.337222,304375.596423,
3,poly_regressor,298114.181354,296797.160662,
4,poly_regressor,298114.181354,,304171.807861


In [16]:
test_performance = (results.RMSE_validate[0] - results.RMSE_test[4]) / results.RMSE_validate[0]
print(f"The model beats the baseline by {round((test_performance * 100),2)}%.")

The model beats the baseline by 20.56%.
