In [14]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import env
import numpy as np
import scipy.stats as stats
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, RobustScaler
import warnings 
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, LassoLars
from sklearn.preprocessing import PolynomialFeatures
warnings.filterwarnings("ignore")
from pydataset import data
import wrangle as wg
import explore as ex
pd.set_option('display.max_rows', 100)
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
np.random.seed(77)
pd.set_option('display.max_columns', None)
random_state = 77

In [2]:
df = wg.acquire()

In [3]:
df = wg.prepare(df)

In [4]:
train, validate, test = wg.train_validate_test_split(df)

train	 n = 28967
test	 n = 10346
validate n = 12415


In [5]:
train_scaled, validate_scaled, test_scaled = wg.MM_scale_zillow(train, validate, test)

In [6]:
def add_clusters(train_scaled, validate_scaled, test_scaled):
    '''
    This function takes in the train, validate, and test samples from the zillow dataset.
    It then performs clustering on various combinations of features in the train sample, 
    Those clusters are then given useful names where appropriate, and added
    as categorical features to the dataset.
    The train, validate, and test df's are returned, in that order.
    '''
    
    # cluster_BedBath

    # identify features
    features = ['bedroomcnt', 'bathroomcnt']
    # create the df to cluster on 
    x = train_scaled[features]
    # create and fit the KMeans object
    kmeans = KMeans(n_clusters=3, random_state=random_state)
    kmeans.fit(x)

    # create cluster labels for each of the samples and add as an additional column
    for sample in [train_scaled, validate_scaled, test_scaled]:
        x = sample[features]
        sample['cluster_BedBath'] = kmeans.predict(x)
        sample['cluster_BedBath'] = sample.cluster_BedBath.map({1:'low', 0:'mid', 2:'high'})

    # repeat the process for each of the desired feature combinations on which to cluster

    # cluster_BedBathSqft

    features = ['bedroomcnt', 'bathroomcnt', 'sq_ft']
    x = train_scaled[features]
    kmeans = KMeans(n_clusters=3, random_state=random_state)
    kmeans.fit(x)

    for sample in [train_scaled, validate_scaled, test_scaled]:
        x = sample[features]
        sample['cluster_BedBathSqft'] = kmeans.predict(x)
        sample['cluster_BedBathSqft'] = sample.cluster_BedBathSqft.map({1:'low', 0:'mid', 2:'high'})

    
    # cluster_BedBathTaxvaluepersqft
    features = ['bedroomcnt', 'bathroomcnt', 'taxval_sqft']
    x = train_scaled[features]
    kmeans = KMeans(n_clusters=3, random_state=random_state)
    kmeans.fit(x)

    for sample in [train_scaled, validate_scaled, test_scaled]:
        x = sample[features]
        sample['cluster_BedBathTaxvaluepersqft'] = kmeans.predict(x)
        sample['cluster_BedBathTaxvaluepersqft'] = sample.cluster_BedBathTaxvaluepersqft.astype(str)
    
        
    # cluster_LatLong
    features = ['latitude', 'longitude']
    x = train_scaled[features]
    kmeans = KMeans(n_clusters=4, random_state=random_state)
    kmeans.fit(x)

    for sample in [train_scaled, validate_scaled, test_scaled]:
        x = sample[features]
        sample['cluster_LatLong'] = kmeans.predict(x)
        sample['cluster_LatLong'] = sample.cluster_LatLong.map({0:'east', 1:'central', 2:'west', 3:'north'})


    return train_scaled, validate_scaled, test_scaled



In [7]:
train_scaled, validate_scaled, test_scaled = add_clusters(train_scaled, validate_scaled, test_scaled)

In [8]:
#First we'll encode our features, to make them usable in the final model
train_scaled,validate_scaled,test_scaled = wg.dummies(train_scaled,validate_scaled,test_scaled)



In [9]:
train_scaled.head()

Unnamed: 0,bathroomcnt,bedroomcnt,sq_ft,county,fireplacecnt,fullbathcnt,garagecarcnt,garagetotalsqft,hashottuborspa,latitude,longitude,lotsizesquarefeet,poolcnt,structuretaxvaluedollarcnt,taxvaluedollarcnt,taxdelinquencyflag,logerror,propertylandusedesc,hot_tub_or_spa,age,taxval_sqft,cluster_BedBath,cluster_BedBathSqft,cluster_BedBathTaxvaluepersqft,cluster_LatLong,cluster_BedBath_low,cluster_BedBath_mid,cluster_BedBathSqft_low,cluster_BedBathSqft_mid,cluster_BedBathTaxvaluepersqft_1,cluster_BedBathTaxvaluepersqft_2,cluster_LatLong_east,cluster_LatLong_north,cluster_LatLong_west
7677,0.4,0.363636,0.162172,LA County,0.0,0.4,0.0,0.0,0.0,0.55162,0.461458,0.000749,0.0,0.053904,0.036677,1,0.003152,Single Family Residential,0,0.065217,0.067361,low,low,1,central,1,0,1,0,1,0,0,0,0
12608,0.1,0.272727,0.07898,LA County,0.0,0.1,0.0,0.0,0.0,0.777917,0.444821,0.001746,0.0,0.039519,0.015908,0,0.004674,Single Family Residential,0,0.369565,0.05696,mid,mid,0,central,0,1,0,1,0,0,0,0,0
44211,0.1,0.272727,0.055461,LA County,0.0,0.1,0.0,0.0,0.0,0.580396,0.497229,0.00093,0.0,0.02999,0.017907,0,-0.037723,Single Family Residential,0,0.485507,0.088089,mid,mid,0,central,0,1,0,1,0,0,0,0,0
44400,0.1,0.272727,0.07781,LA County,0.0,0.1,0.0,0.0,0.0,0.343542,0.687752,0.000698,0.0,0.015154,0.020441,0,-0.014083,Single Family Residential,0,0.463768,0.074392,mid,mid,0,east,0,1,0,1,0,0,1,0,0
30257,0.2,0.363636,0.108348,Orange County,0.0,0.2,0.1,0.134436,0.0,0.30008,0.798239,0.001056,0.0,0.02164,0.017938,0,-0.096859,Single Family Residential,0,0.456522,0.047919,high,high,2,west,0,0,0,0,0,1,0,0,1


In [10]:
features = ['cluster_LatLong_west','cluster_LatLong_north','cluster_LatLong_east','cluster_BedBathTaxvaluepersqft_2','cluster_BedBathTaxvaluepersqft_1','cluster_BedBathSqft_mid','cluster_BedBathSqft_low','cluster_BedBath_mid','cluster_BedBath_low']

#Splitting Train Set:
X_train = train_scaled[features]
y_train = train[['logerror']]

#Splitting Validate Set:
X_validate = validate_scaled[features]
y_validate = validate[['logerror']]
#Splitting Test Set:
X_test = test_scaled[features]
y_test = test[['logerror']]

## Baseline

In [11]:
#Adding mean baseline value to y_train:
y_train['baseline_mean'] = y_train.logerror.mean()
#Adding mean baseline value to y_validate:
y_validate['baseline_mean'] = y_validate.logerror.mean()

## RMSE

In [15]:
# Creating an empty list for baseline results:
results = []

# Creating a dictionary of baseline mean values:
baseline_mean = {
    'model':'baseline_mean',
    'RMSE_train': mean_squared_error(y_train['logerror'], y_train['baseline_mean']) ** 0.5,
    'RMSE_validate': mean_squared_error(y_validate['logerror'], y_validate['baseline_mean'])** 0.5
}
#Appending baseline mean to results list:
results.append(baseline_mean)

#Creating a DataFrame from the list of result dictionaries:
results = pd.DataFrame(results)
results

Unnamed: 0,model,RMSE_train,RMSE_validate
0,baseline_mean,0.177254,0.182948


In [16]:
## OLS

In [17]:
#Creating an empty list for the error calculations:
error = []

# Creating the Model Object:
lm = LinearRegression(normalize=True)

# Fitting the model to the value in the training set:
lm.fit(X_train, y_train.logerror)

# Using OLS to make predictions on training set:
y_train['OLS_pred'] = lm.predict(X_train)

# Calculating the RMSE for train:
rmse_train = mean_squared_error(y_train.logerror, y_train.OLS_pred)**(0.5)

# Using OLS to make predictions on validate set:
y_validate['OLS_pred'] = lm.predict(X_validate)

# Calculating the RMSE for validate:
rmse_validate = mean_squared_error(y_validate.logerror, y_validate.OLS_pred)**(0.5)

#Creating a dictionary of ols_regression stats:
ols_regression = {
    'model':'ols_regression',
    'RMSE_train': mean_squared_error(y_train['logerror'], y_train['OLS_pred'])**(0.5),
    'RMSE_validate': mean_squared_error(y_validate['logerror'], y_validate['OLS_pred'])**(0.5)
}

#Adding the ols_regression stats to the empty list:
error.append(ols_regression)
#Converting the list to a DataFrame:
error = pd.DataFrame(error)

#Concatenating the RMSE DataFrames, ignoring index so it auto-updates in the final DataFrame:
results = pd.concat([results, error], ignore_index = True)
results

Unnamed: 0,model,RMSE_train,RMSE_validate
0,baseline_mean,0.177254,0.182948
1,ols_regression,0.177005,0.182767


## polynomial

In [18]:
# make the polynomial features to get a new set of features
pf = PolynomialFeatures(degree = 3)

# fit and transform X_train_scaled
X_train3 = pf.fit_transform(X_train)

# transform X_validate_scaled & X_test_scaled
X_validate3 = pf.transform(X_validate)
X_test3 = pf.transform(X_test)

In [19]:
#Creating an empty list for the error calculations:
error = []

# create the model object
lm2 = LinearRegression(normalize=True)

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
lm2.fit(X_train3, y_train.logerror)

# predict train
y_train['poly_pred'] = lm2.predict(X_train3)

# evaluate: rmse
rmse_train = mean_squared_error(y_train.logerror, y_train.poly_pred) ** 0.5

# predict validate
y_validate['poly_pred'] = lm2.predict(X_validate3)

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.logerror, y_validate.poly_pred) ** 0.5

#Creating a dictionary of polynomial stats:
polynomial_regressor = {
    'model':'poly_regressor',
    'RMSE_train': mean_squared_error(y_train['logerror'], y_train['poly_pred']) ** (0.5),
    'RMSE_validate': mean_squared_error(y_validate['logerror'], y_validate['poly_pred']) ** (0.5)
}

#Adding the tweedie_regressor stats to the empty list:
error.append(polynomial_regressor)
#Converting the list to a DataFrame:
error = pd.DataFrame(error)

#Concatenating the RMSE DataFrames, ignoring index so it auto-updates in the final DataFrame:
results = pd.concat([results, error], ignore_index = True)
results

Unnamed: 0,model,RMSE_train,RMSE_validate
0,baseline_mean,0.177254,0.182948
1,ols_regression,0.177005,0.182767
2,poly_regressor,0.176954,0.18279


## Lasso Lars

In [20]:
#Creating an empty list for the error calculations:
error = []

# Creating the Model Object:
lars = LassoLars(alpha=1)

# Fitting the model object to the training set:
lars.fit(X_train, y_train.logerror)

# Using the LassoLars model to make predictions on the training set:
y_train['lasso_pred'] = lars.predict(X_train)

# Calculating the RMSE for the training set:
rmse_train = mean_squared_error(y_train.logerror, y_train.lasso_pred) ** (0.5)

# Using the LassoLars model to make predictions on the validate set:
y_validate['lasso_pred'] = lars.predict(X_validate)

# Calculating the RMSE for the validate set:
rmse_validate = mean_squared_error(y_validate.logerror, y_validate.lasso_pred) ** (0.5)

#Creating a dictionary of LassoLars stats:
lasso_lars = {
    'model':'LassoLars',
    'RMSE_train': mean_squared_error(y_train['logerror'], y_train['lasso_pred'])**(0.5),
    'RMSE_validate': mean_squared_error(y_validate['logerror'], y_validate['lasso_pred'])**(0.5)
}


#Adding the lasso_lars stats to the empty list:
error.append(lasso_lars)
#Converting the list to a DataFrame:
error = pd.DataFrame(error)

#Concatenating the RMSE DataFrames, ignoring index so it auto-updates in the final DataFrame:
results = pd.concat([results, error], ignore_index = True)
results

Unnamed: 0,model,RMSE_train,RMSE_validate
0,baseline_mean,0.177254,0.182948
1,ols_regression,0.177005,0.182767
2,poly_regressor,0.176954,0.18279
3,LassoLars,0.177254,0.182949


## the best model is the test model: LL

In [21]:
#Creating an empty list for the error calculations:
error = []

# Creating the Model Object:
lars = LassoLars(alpha=1)

# Fitting the model object to the training set:
lars.fit(X_train, y_train.logerror)

# Using the LassoLars model to make predictions on the training set:
y_train['lasso_pred'] = lars.predict(X_train)

# Calculating the RMSE for the training set:
rmse_train = mean_squared_error(y_train.logerror, y_train.lasso_pred) ** (0.5)

# Using the LassoLars model to make predictions on the validate set:
y_test['lasso_pred'] = lars.predict(X_test)

# Calculating the RMSE for the validate set:
rmse_validate = mean_squared_error(y_test.logerror, y_test.lasso_pred) ** (0.5)

#Creating a dictionary of LassoLars stats:
lasso_lars = {
    'model':'LassoLars',
    'RMSE_train': mean_squared_error(y_train['logerror'], y_train['lasso_pred'])**(0.5),
    'RMSE_test': mean_squared_error(y_test['logerror'], y_test['lasso_pred'])**(0.5)
}


#Adding the lasso_lars stats to the empty list:
error.append(lasso_lars)
#Converting the list to a DataFrame:
error = pd.DataFrame(error)

#Concatenating the RMSE DataFrames, ignoring index so it auto-updates in the final DataFrame:
results = pd.concat([results, error], ignore_index = True)
results

Unnamed: 0,model,RMSE_train,RMSE_validate,RMSE_test
0,baseline_mean,0.177254,0.182948,
1,ols_regression,0.177005,0.182767,
2,poly_regressor,0.176954,0.18279,
3,LassoLars,0.177254,0.182949,
4,LassoLars,0.177254,,0.157919


In [22]:
test_performance = (results.RMSE_validate[0] - results.RMSE_test[4]) / results.RMSE_validate[0]
print(f"The model beats the baseline by {round((test_performance * 100),2)}%.")

The model beats the baseline by 13.68%.
