3 models: 
    1. cluster with logerr
    2. cluster with estimate
    3. cluster with the independetn variable(square footage)
    4. cluster with one more independent variable(long lati)

In [5]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

from preprocessing import split_data
from acquire import acquire_data
from prepare import prepare_data

import warnings
warnings.filterwarnings('ignore')

# 1. Build and evaluate different models

In [88]:
df = prepare_data()
train, validate, test = split_data(df)

In [89]:
def build_model(train, feature_clustered, k):   
    # create cluster object, train and fit
    X = train[feature_clustered]
    kmeans = KMeans(k)
    kmeans.fit(X)

    #encode cluster into different colomns
    train['cluster'] = kmeans.predict(train[feature_clustered])
    cluster_df = pd.get_dummies(train.cluster)
    # concatenate the dataframe with the  one hot encoded cluster  to the original dataframe
    train= pd.concat([train, cluster_df], axis =1)
    # drop cluster columns
    train = train.drop(columns = ['cluster'])

    #
    X =train.drop(columns = ['regionidcity', 'regionidzip', 'actual_value', 'logerror','estimated'])\
    .columns
    Y =['estimated']

    # Initialize the Linear Regression Object 
    lm = LinearRegression()

    rfe = RFE(lm, 9)

    # Transforming data using RFE
    X_rfe = rfe.fit_transform(train[X],train[Y])  
    mask = rfe.support_

    # select the column names of the features that were selected and convert them to a list for future use. 
    rfe_features = train[X].columns[mask]

    lm = LinearRegression()
    lm.fit(X_rfe,train[Y])
    train['prediction'] = lm.predict(X_rfe)

    # select the column names of the features that were selected and convert them to a list for future use. 
    mask = rfe.support_
    rfe_features = train[X].columns[mask]
    rfe_features

    RMSE_train = np.sqrt(mean_squared_error(train.estimated, train.prediction))
    return RMSE_train, kmeans, lm, rfe_features

In [92]:
def validate_model(validate,feature_clustered,kmeans,lm,rfe_features):   
    validate['cluster'] = kmeans.predict(validate[feature_clustered])
    cluster_df = pd.get_dummies(validate.cluster)
    # concatenate the dataframe with the  one hot encoded cluster  to the original dataframe
    validate= pd.concat([validate, cluster_df], axis =1)
    # drop cluster column
    validate = validate.drop(columns = ['cluster'])
    validate['prediction'] = lm.predict(validate[rfe_features])

    RMSE_vali = np.sqrt(mean_squared_error(validate.estimated, validate.prediction))
    return RMSE_vali

In [101]:
def test_model(test,feature_clustered,kmeans,lm,rfe_features):   
    test['cluster'] = kmeans.predict(test[feature_clustered])
    cluster_df = pd.get_dummies(test.cluster)
    # concatenate the dataframe with the  one hot encoded cluster  to the original dataframe
    test= pd.concat([test, cluster_df], axis =1)
    # drop cluster column
    test = test.drop(columns = ['cluster'])
    test['prediction'] = lm.predict(test[rfe_features])

    RMSE_test = np.sqrt(mean_squared_error(test.estimated, test.prediction))
    return RMSE_test

### 1.1 Clustering logerror and build linear model

In [97]:
feature_clustered = ['logerror'] 
k =8
RMSE_train, kmeans, lm, rfe_features = build_model(train.copy(), feature_clustered, k)
print(f'RMSE_train = {RMSE_train:.2f}')

RMSE_vali = validate_model(validate.copy(),feature_clustered,kmeans,lm,rfe_features)
print(f'RMSE_validate = {RMSE_vali:.2f}')

RMSE_train = 263075.56
RMSE_validate = 259176.27


### 1.2 Clustering estimate and build linear model

In [98]:
feature_clustered = ['estimated'] 
k =6
RMSE_train, kmeans, lm, rfe_features = build_model(train.copy(), feature_clustered, k)
print(f'RMSE_train = {RMSE_train:.2f}')

RMSE_vali = validate_model(validate.copy(),feature_clustered,kmeans,lm,rfe_features)
print(f'RMSE_validate = {RMSE_vali:.2f}')

RMSE_train = 69268.63
RMSE_validate = 70070.64


### 1.3 Clustering house_size and build linear model

In [99]:
feature_clustered = ['house_size'] 
k =4
RMSE_train, kmeans, lm, rfe_features = build_model(train.copy(), feature_clustered, k)
print(f'RMSE_train = {RMSE_train:.2f}')

RMSE_vali = validate_model(validate.copy(),feature_clustered,kmeans,lm,rfe_features)
print(f'RMSE_validate = {RMSE_vali:.2f}')

RMSE_train = 254765.77
RMSE_validate = 251387.29


### 1.4 Clustering longitude and latitude and build linear model

In [100]:
feature_clustered = ['latitude','longitude'] 
k =6
RMSE_train, kmeans, lm, rfe_features = build_model(train.copy(), feature_clustered, k)
print(f'RMSE_train = {RMSE_train:.2f}')

RMSE_vali = validate_model(validate.copy(),feature_clustered,kmeans,lm,rfe_features)
print(f'RMSE_validate = {RMSE_vali:.2f}')

RMSE_train = 263967.58
RMSE_validate = 259553.44


***From above we could see the best performance model is by cluserimg estimated value***

# 2. Apply best performace model to the data frame

In [103]:
test_model(test,feature_clustered,kmeans,lm,rfe_features)
feature_clustered = ['estimated'] 
k =6
RMSE_train, kmeans, lm, rfe_features = build_model(train.copy(), feature_clustered, k)
print(f'RMSE_train = {RMSE_train:.2f}')

RMSE_test = test_model(test,feature_clustered,kmeans,lm,rfe_features)
print(f'RMSE_test = {RMSE_test:.2f}')

RMSE_train = 69266.30
RMSE_test = 67739.17
