In [1]:
import wrangle
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer
from IPython.display import display_html 
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor

In [2]:
# Acquire the zillow data, run the project's prep step, then drop outliers.
# NOTE(review): what each wrangle helper does internally is not visible here.
df = wrangle.remove_outliers(
    wrangle.prep_zillow(
        wrangle.get_zillow()
    )
)


In [3]:
# Partition the prepared data into train / validate / test sets.
# NOTE(review): split proportions and any random seed live in wrangle.my_split -- not visible here.
train, validate, test = wrangle.my_split(df)

In [4]:
# Preview the first rows of the training split
train.head()

Unnamed: 0,parcelid,bathrooms,bedrooms,sqft,county,latitude,garagesqft,longitude,lotsize,tract,...,structuretaxvalue,landtaxvalue,taxamount,logerror,age,abserror,dollarspersqft,Los Angeles,Orange,Ventura
404,17104278,1.5,3.0,2287.0,Ventura,34.246512,624.0,-119.074377,20140.0,52,...,344314.0,639441.0,10544.42,0.036208,60.0,0.036208,430.150853,0,0,1
21579,17108515,2.0,3.0,1727.0,Ventura,34.269965,470.0,-119.165052,6283.0,14,...,317359.0,170881.0,5133.44,0.028381,45.0,0.028381,282.709902,0,0,1
10592,12084446,1.0,2.0,1507.0,Los Angeles,34.129645,0.0,-118.238843,4166.0,3021,...,83877.0,335512.0,4645.87,0.02293,95.0,0.02293,278.293962,1,0,0
46716,11444738,1.0,2.0,875.0,Los Angeles,33.864036,0.0,-118.358191,6416.0,6206,...,27436.0,154415.0,2646.11,-0.051681,67.0,0.051681,207.829714,1,0,0
12777,10876057,2.0,4.0,2255.0,Los Angeles,34.214165,0.0,-118.390276,8117.0,1218,...,150000.0,250000.0,4961.66,-0.169398,81.0,0.169398,177.383592,1,0,0


In [5]:
# List the columns available in the training split
train.columns

Index(['parcelid', 'bathrooms', 'bedrooms', 'sqft', 'county', 'latitude',
       'garagesqft', 'longitude', 'lotsize', 'tract', 'regionidzip',
       'structuretaxvalue', 'landtaxvalue', 'taxamount', 'logerror', 'age',
       'abserror', 'dollarspersqft', 'Los Angeles', 'Orange', 'Ventura'],
      dtype='object')

In [12]:
def predict_baseline(train):
    '''
    Score two constant baselines for zillow logerror: always predicting the
    training mean, and always predicting the training median.

    Accepts:  train - a dataframe containing a 'logerror' column
    Displays: an inline HTML table captioned "RESULTS" with one row per baseline
              (columns model, RMSE_train, RMSE_validate, R2; the last two are 'N/A')
    Returns:  the list for the last row built, i.e.
              ['Median', <RMSE formatted to 4 decimals>, 'N/A', 'N/A']
              NOTE(review): only that single row is returned, not the whole
              table -- confirm whether callers want the table instead.
    '''
    actual = train['logerror']

    # (label, constant prediction) pairs, in the order the rows are displayed
    baselines = (('Mean', actual.mean()), ('Median', actual.median()))

    results = pd.DataFrame(columns = ['model', 'RMSE_train', 'RMSE_validate', 'R2'])
    for label, constant in baselines:
        # broadcast the constant so it lines up with every observed logerror
        predicted = pd.Series(constant, index=actual.index)
        rmse_train = mean_squared_error(actual, predicted) ** 0.5

        # validate RMSE and R2 are not computed for the baselines
        newresult = [label, '{:,.4f}'.format(rmse_train), 'N/A', 'N/A']
        results.loc[len(results)] = newresult

    # render the table inline with a caption
    styled = results.style
    styled = styled.set_table_attributes("style='display:inline; margin-right:100px;'")
    styled = styled.set_caption("RESULTS")
    display_html(styled._repr_html_(), raw=True)

    return newresult

In [10]:
def LRmodel(model, X_train, y_train, X_2, y_2):
    '''
    Fit a model on the training data, predict on the training data and on a
    second dataset, and package the evaluation metrics as one results row.

    Accepts: model   - an unfitted estimator exposing fit() and predict()
             X_train - features used to fit the model
             y_train - target used to fit the model
             X_2     - features of a second dataset (validate or test) to predict on
             y_2     - target of that second dataset, used to score the predictions
    Returns: a list of [model, RMSE on train (string, 4 decimals),
             RMSE on the second dataset (string, 4 decimals),
             R2 on the second dataset (rounded to 4 decimals)]
    '''
    def _rmse(actual, predicted):
        # root mean squared error between observed and predicted values
        return mean_squared_error(actual, predicted) ** 0.5

    # fit once, then predict on both datasets
    model.fit(X_train, y_train)
    train_pred = model.predict(X_train)
    second_pred = model.predict(X_2)

    train_score = _rmse(y_train, train_pred)
    second_score = _rmse(y_2, second_pred)

    # the model object itself is the first entry; the fitted instance is what gets returned
    return [
        model,
        '{:,.4f}'.format(train_score),
        '{:,.4f}'.format(second_score),
        round(r2_score(y_2, second_pred), 4),
    ]

In [11]:
# Show the mean / median baseline RMSE table for the training split
predict_baseline(train)

Unnamed: 0,model,RMSE_train,RMSE_validate,R2
0,Mean,0.1637,,
1,Median,0.164,,
