#### Imports

In [1]:
import wrangle
import explore
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

pd.options.display.float_format = '{:20,.2f}'.format

from math import sqrt
from scipy import stats

import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from statsmodels.formula.api import ols
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.feature_selection import f_regression, SelectKBest, RFE 
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.cluster import KMeans

#### Import LA Dataframe For Explore

In [8]:
# Pull the Zillow data through the wrangle helper
df = wrangle.get_zillow_data(cached=True)
# clean_zillow_data hands back three frames, unpacked here as LA, Ventura and Orange
df_la, df_v, df_o = wrangle.clean_zillow_data(df)
# NOTE(review): the split below is run on `df`, not on one of the county frames above — confirm intended
X_train, X_validate, X_test, X_train_explore, y_train, y_validate, y_test = wrangle.train_valid_test(df)
# Scale the three feature splits with the wrangle min-max helper
X_train_scaled, X_validate_scaled, X_test_scaled = wrangle.scale_min_max(X_train, X_validate, X_test)
# Display the (rows, columns) of each split
X_train.shape, X_validate.shape, X_test.shape

((28511, 16), (12219, 16), (10183, 16))

In [9]:
# Display the (rows, columns) of each per-county frame
df_la.shape, df_v.shape, df_o.shape

((32855, 16), (4350, 16), (13708, 16))

In [5]:
# Pull the Zillow data through the wrangle helper
df = wrangle.get_zillow_data(cached=True)
# clean_zillow_data returns three per-county frames (LA, Ventura, Orange);
# unpack all three so df_v is the Ventura frame rather than the whole tuple
df_la, df_v, df_o = wrangle.clean_zillow_data(df)
# NOTE(review): the split below is run on `df`, not on a county frame — confirm intended
X_train, X_validate, X_test, X_train_explore, y_train, y_validate, y_test = wrangle.train_valid_test(df)
# Scale the three feature splits with the wrangle min-max helper
X_train_scaled, X_validate_scaled, X_test_scaled = wrangle.scale_min_max(X_train, X_validate, X_test)
# Display the (rows, columns) of each split
X_train.shape, X_validate.shape, X_test.shape

((28511, 16), (12219, 16), (10183, 16))

In [None]:

def get_counties(df, fips_to_county=None):
    '''
    Add one indicator column per county and drop the raw county id columns.

    Parameters
    ----------
    df : DataFrame with `fips` and `regionidcounty` columns.
    fips_to_county : optional dict mapping a numeric fips code to the name of
        the indicator column to create. Defaults to the three counties this
        notebook works with (6037 -> LA, 6059 -> Orange, 6111 -> Ventura).

    Returns
    -------
    A new DataFrame: the original columns minus `regionidcounty` and `fips`,
    followed by one indicator column per county, in the order of the mapping.

    Raises
    ------
    ValueError if `fips` contains a (non-missing) code that is not in the mapping.
    '''
    if fips_to_county is None:
        fips_to_county = {6037: 'LA', 6059: 'Orange', 6111: 'Ventura'}

    # Name each row's county explicitly instead of renaming the dummy columns
    # by position, which only works when all three codes are present.
    codes = pd.to_numeric(df.fips)
    names = codes.map(fips_to_county.get)

    unknown = codes[names.isna() & codes.notna()].unique()
    if len(unknown):
        raise ValueError(f'unrecognized fips code(s): {sorted(unknown)}')

    # create dummy vars of county name; reindex so every county column exists
    # (all zeros) even when the frame holds no rows for that county
    county_df = pd.get_dummies(names)
    county_df = county_df.reindex(columns=list(fips_to_county.values()), fill_value=0)

    # concatenate the county columns to the original dataframe
    df_dummies = pd.concat([df, county_df], axis=1)
    # drop regionidcounty and fips columns
    return df_dummies.drop(columns=['regionidcounty', 'fips'])

###########################################################

def create_features(df, reference_year=2017):
    '''
    Add the engineered columns used for exploration and modeling.

    The columns are written onto `df` itself (the caller's frame is modified)
    and the same frame is returned.

    Parameters
    ----------
    df : DataFrame with yearbuilt, taxamount, taxvaluedollarcnt,
        lotsizesquarefeet, structuretaxvaluedollarcnt,
        calculatedfinishedsquarefeet, landtaxvaluedollarcnt, bedroomcnt
        and bathroomcnt columns.
    reference_year : year that `age` is measured from (default 2017).

    Returns
    -------
    `df` with age, taxrate, acres, structure_dollar_per_sqft,
    land_dollar_per_sqft and bed_bath_ratio added.
    '''
    # age of the home in years
    df['age'] = reference_year - df.yearbuilt
    # create taxrate variable
    df['taxrate'] = df.taxamount / df.taxvaluedollarcnt
    # create acres variable (43,560 square feet per acre)
    df['acres'] = df.lotsizesquarefeet / 43560
    # dollar per square foot-structure
    df['structure_dollar_per_sqft'] = df.structuretaxvaluedollarcnt / df.calculatedfinishedsquarefeet
    # dollar per square foot-land
    df['land_dollar_per_sqft'] = df.landtaxvaluedollarcnt / df.lotsizesquarefeet
    # ratio of beds to baths, rounded to 2 decimals; round() returns a new
    # Series, so the rounded values have to be assigned
    df['bed_bath_ratio'] = (df.bedroomcnt / df.bathroomcnt).round(decimals=2)
    return df

###########################################################

def remove_outliers(df):
    '''
    remove outliers in bed, bath, zip, square feet, acres & tax rate

    Keeps only the rows where every one of these holds:
      0 < bathroomcnt <= 7, 1 < bedroomcnt <= 7, regionidzip < 100000,
      acres < 10, calculatedfinishedsquarefeet < 7000, taxrate < .05

    Returns a new DataFrame; the frame passed in is left untouched.
    '''
    keep = ((df.bathroomcnt <= 7) & (df.bedroomcnt <= 7) &
            (df.regionidzip < 100000) &
            (df.bathroomcnt > 0) &
            (df.bedroomcnt > 1) &
            (df.acres < 10) &
            (df.calculatedfinishedsquarefeet < 7000) &
            (df.taxrate < .05)
           )
    # Boolean indexing builds a new frame, so the result has to be returned;
    # copy() keeps later column assignments from warning about a slice.
    return df[keep].copy()

###########################################################

def col_to_drop_post_feature_creation(df):
    '''
    Return a copy of df without the raw columns that the engineered
    features were built from.
    '''
    raw_inputs = (
        'bedroomcnt',
        'taxamount',
        'taxvaluedollarcnt',
        'structuretaxvaluedollarcnt',
        'landtaxvaluedollarcnt',
        'lotsizesquarefeet',
        "regionidzip",
        "yearbuilt",
    )
    return df.drop(list(raw_inputs), axis=1)

###########################################################

def county_df(df):
    '''
    Split df into one frame per county using the LA / Ventura / Orange
    indicator columns.

    Returns a 3-tuple in the order (LA, Ventura, Orange).
    '''
    in_la = df['LA'] == 1
    in_ventura = df['Ventura'] == 1
    in_orange = df['Orange'] == 1
    return df[in_la], df[in_ventura], df[in_orange]

###########################################################

def clean_zillow_data(df):
    '''
    Clean the raw Zillow frame and split it by county.

    Drops rows with missing values and divides latitude/longitude by 1,000,000
    (both are done on the frame that was passed in), then runs get_counties,
    create_features, remove_outliers and col_to_drop_post_feature_creation in
    that order, and finally splits the result with county_df.

    Returns the three frames produced by county_df: df_la, df_v, df_o.
    '''
    # NOTE(review): the next three statements modify the caller's frame; calling this
    # function twice on the same frame would divide latitude/longitude twice.
    df.dropna(inplace=True)
    df.latitude = df.latitude / 1000000
    df.longitude = df.longitude / 1000000
    # From here on `df` is rebound to whatever each helper returns
    df = get_counties(df)
    df = create_features(df)
    # remove_outliers filters on acres and taxrate, so it must run after create_features
    df = remove_outliers(df)
    df = col_to_drop_post_feature_creation(df)
    # county_df filters on the LA / Ventura / Orange columns added by get_counties
    df_la, df_v, df_o = county_df(df)
    return df_la, df_v, df_o

###########################################################

def train_valid_test(df):
    '''
    Split df into train / validate / test and separate the logerror target.

    20% of the rows are held out as test; 30% of the remainder becomes
    validate and the rest is train. Both splits use random_state 123.

    Returns, in order: X_train, X_validate, X_test, X_train_explore,
    y_train, y_validate, y_test. X_train_explore is the train split with the
    target still attached; each y_* is a single-column DataFrame.
    '''
    target = ['logerror']

    remainder, test = train_test_split(df, test_size = .2, random_state = 123)
    train, validate = train_test_split(remainder, test_size = .3, random_state = 123)
    splits = (train, validate, test)

    # Features: every column except the target
    X_train, X_validate, X_test = (part.drop(columns=target) for part in splits)
    # Target: kept as a one-column frame
    y_train, y_validate, y_test = (part[target] for part in splits)

    # train itself is returned as X_train_explore so the target stays available for exploration
    return X_train, X_validate, X_test, train, y_train, y_validate, y_test

###########################################################

def scale_min_max(X_train, X_validate, X_test):
    '''
    Min-max scale the three feature splits to the range (0, 1).

    The scaler is fit on X_train only and then applied to all three splits.
    Each result is a DataFrame carrying the column labels and index values
    of the split it came from.

    Returns X_train_scaled, X_validate_scaled, X_test_scaled.
    '''
    scaler = MinMaxScaler(copy=True, feature_range=(0,1))
    scaler.fit(X_train)

    def as_scaled_frame(features):
        # transform() returns a bare array, so re-attach the column labels and index values
        scaled = pd.DataFrame(scaler.transform(features), columns=features.columns.values)
        return scaled.set_index([features.index.values])

    X_train_scaled = as_scaled_frame(X_train)
    X_validate_scaled = as_scaled_frame(X_validate)
    X_test_scaled = as_scaled_frame(X_test)
    return X_train_scaled, X_validate_scaled, X_test_scaled

###########################################################

In [None]:
# Pull the Zillow data through the wrangle helper
df = wrangle.get_zillow_data(cached=True)
# clean_zillow_data returns three per-county frames (LA, Ventura, Orange);
# unpack all three so df_la is the LA frame rather than the whole tuple
df_la, df_v, df_o = wrangle.clean_zillow_data(df)
# NOTE(review): the split below is run on `df`, not on df_la — confirm intended
X_train, X_validate, X_test, X_train_explore, y_train, y_validate, y_test = wrangle.train_valid_test(df)
# Scale the three feature splits with the wrangle min-max helper
X_train_scaled, X_validate_scaled, X_test_scaled = wrangle.scale_min_max(X_train, X_validate, X_test)

In [None]:
# Display the (rows, columns) of the scaled training features
X_train_scaled.shape


In [None]:
def train_valid_test(df):
    '''
    Notebook-local splitter: train / validate / test plus the logerror target.

    20% of the rows are held out as test; 30% of the remainder becomes
    validate and the rest is train. Both splits use random_state 123.

    Returns, in order: X_train, X_validate, X_test, X_train_explore,
    y_train, y_validate, y_test. X_train_explore is the train split with the
    target still attached; each y_* is a single-column DataFrame.
    '''
    target = ['logerror']

    remainder, test = train_test_split(df, test_size = .2, random_state = 123)
    train, validate = train_test_split(remainder, test_size = .3, random_state = 123)
    splits = (train, validate, test)

    # Features: every column except the target
    X_train, X_validate, X_test = (part.drop(columns=target) for part in splits)
    # Target: kept as a one-column frame
    y_train, y_validate, y_test = (part[target] for part in splits)

    # train itself is returned as X_train_explore so the target stays available for exploration
    return X_train, X_validate, X_test, train, y_train, y_validate, y_test

In [None]:
# Split with the train_valid_test defined in this notebook (not the wrangle version)
X_train, X_validate, X_test, X_train_explore, y_train, y_validate, y_test = train_valid_test(df)

In [None]:
# `train` only exists inside train_valid_test; at notebook scope the training
# split (target included) is returned as X_train_explore
X_train_explore.head()

In [None]:
# Call in Dataframe (cached=False here, unlike the earlier cells that pass cached=True)
df = wrangle.get_zillow_data(cached=False)
# Clean Data with Outliers Removed
# NOTE(review): clean_zillow and model_zillow are not among the wrangle functions
# sketched in this notebook (clean_zillow_data / scale_min_max) — confirm they still exist
X_train, y_train, X_validate, y_validate, X_test, y_test = wrangle.clean_zillow(df) 
# Clean Data With Outliers Scaled
X_train_scaled, X_validate_scaled, X_test_scaled = wrangle.model_zillow(X_train, X_validate, X_test)

In [None]:
# Test Split: display the (rows, columns) of every feature and target split
X_train.shape, y_train.shape, X_validate.shape, y_validate.shape, X_test.shape, y_test.shape

In [None]:
# Test Scale: display the (rows, columns) of every scaled feature split
X_train_scaled.shape, X_validate_scaled.shape, X_test_scaled.shape

In [None]:
# Preview the first rows of the unscaled training features
X_train.head()

In [None]:
# Preview the first rows of the scaled training features
X_train_scaled.head()

### Exploration:

**Target = Logerror** 


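- A number that represents a ratio derived from two distributions: the real sale-price distribution of homes and the price distribution produced by Zillow's existing model. In the Zillow data it is defined as logerror = log(Zestimate) − log(SalePrice), i.e. the log of the ratio of the predicted price to the actual price.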


#### New Dataframes Per County

In [None]:
# LA County: restrict every split (raw and scaled) to the rows flagged LA == 1
X_train_LA = X_train.loc[X_train['LA'] == 1]
X_train_scaled_LA = X_train_scaled.loc[X_train_scaled['LA'] == 1]

X_validate_LA = X_validate.loc[X_validate['LA'] == 1]
X_validate_scaled_LA = X_validate_scaled.loc[X_validate_scaled['LA'] == 1]

X_test_LA = X_test.loc[X_test['LA'] == 1]
X_test_scaled_LA = X_test_scaled.loc[X_test_scaled['LA'] == 1]

# Display the (rows, columns) of each LA split
X_train_LA.shape, X_validate_LA.shape, X_test_LA.shape

In [None]:
# Ventura County: restrict every split (raw and scaled) to the rows flagged Ventura == 1
X_train_V = X_train.loc[X_train['Ventura'] == 1]
X_train_scaled_V = X_train_scaled.loc[X_train_scaled['Ventura'] == 1]

X_validate_V = X_validate.loc[X_validate['Ventura'] == 1]
X_validate_scaled_V = X_validate_scaled.loc[X_validate_scaled['Ventura'] == 1]

X_test_V = X_test.loc[X_test['Ventura'] == 1]
X_test_scaled_V = X_test_scaled.loc[X_test_scaled['Ventura'] == 1]

# Display the (rows, columns) of each Ventura split
X_train_V.shape, X_validate_V.shape, X_test_V.shape

In [None]:
# Orange County: restrict every split (raw and scaled) to the rows flagged Orange == 1
X_train_O = X_train.loc[X_train['Orange'] == 1]
X_train_scaled_O = X_train_scaled.loc[X_train_scaled['Orange'] == 1]

X_validate_O = X_validate.loc[X_validate['Orange'] == 1]
X_validate_scaled_O = X_validate_scaled.loc[X_validate_scaled['Orange'] == 1]

X_test_O = X_test.loc[X_test['Orange'] == 1]
X_test_scaled_O = X_test_scaled.loc[X_test_scaled['Orange'] == 1]

# Display the (rows, columns) of each Orange split
X_train_O.shape, X_validate_O.shape, X_test_O.shape

#### LA County

In [None]:
# Preview the first rows of the scaled LA training features
X_train_scaled_LA.head()

#### Initial Thoughts:

- From my initial investigation in the regression project, I know that room count has a large effect on tax rate and housing price. I was unable to create a derived feature last time, so I want to test the effect of this feature now.

- I want to examine how useful our created feature, bed_bath_ratio, is in predicting logerror in LA County. I chose LA County because it has the largest number of data points. I want to cluster on bed_bath_ratio, bathroomcnt, and calculatedfinishedsquarefeet.

## (Room Clusters)

#### Step 1. Elbow Plot

In [None]:
# #Reasign for formula to work correctly
# X_train_scaled = X_train_scaled_LA.copy()

# Columns to cluster on; this list is reused by the later clustering cells
cluster_vars = ['bathroomcnt', 'bed_bath_ratio', 'calculatedfinishedsquarefeet']
# Elbow plot on the scaled LA training features to pick k
explore.elbow_plot(X_train_scaled_LA, cluster_vars)

#### Takeaway:

- Looks like 3 is the optimal K for this cluster

***

#### Step 2. Create Clusters

#### 2a. Train Cluster

In [None]:
# Cluster the LA training split with k=3; returns the cluster labels (column 'room_cluster')
# and a kmeans object whose cluster_centers_ are read in the centroid cell below
LA_train_clusters, kmeans = explore.run_kmeans(X_train_LA, X_train_scaled_LA, k=3, cluster_vars=cluster_vars, cluster_col_name = 'room_cluster')

In [None]:
 # Display the cluster assignments for the LA training split
 LA_train_clusters

In [None]:
# Count the rows assigned to each cluster; the clusters do not look evenly sized
LA_train_clusters.room_cluster.value_counts()

#### Get Centroids

In [None]:
# One centroid column per clustering variable, e.g. 'centroid_bathroomcnt'
centroid_col_names = [f'centroid_{var}' for var in cluster_vars]

# Build the centroid table; the row position of each centroid becomes its room_cluster id
LA_centroids = pd.DataFrame(kmeans.cluster_centers_, columns=centroid_col_names)
LA_centroids = LA_centroids.reset_index().rename(columns={'index': 'room_cluster'})

In [None]:
# Display the centroid table (one row per room_cluster)
LA_centroids

#### Append cluster id onto X_train & X_train_scaled, then join with the centroids dataframe.


In [None]:
# concatenate cluster id on LA_X_Train
# (column-wise concat aligns on the index — presumably LA_train_clusters shares X_train_LA's index; verify)
X_train_LA_cluster = pd.concat([X_train_LA, LA_train_clusters], axis=1)

In [None]:
# Preview the LA training features with the cluster id attached
X_train_LA_cluster.head()

In [None]:
# join on clusterid to get centroids
# merge() returns a fresh 0..n-1 index, so the original row index is put back afterwards
# (assumes the left join keeps one output row per input row, in order — holds if room_cluster is unique in LA_centroids)
X_train_LA_cluster_centroid = X_train_LA_cluster.merge(LA_centroids, how='left', on='room_cluster').set_index(X_train_LA_cluster.index)

In [None]:
# Preview the LA training features with cluster id and centroid columns
X_train_LA_cluster_centroid.head()

#### Clusters and Centroids on Train DF

In [None]:
# Visualize bathroom count against logerror, colored by room cluster.
# y_train covers every county while the x values are LA-only, so the target is
# looked up by the LA frame's index to give x and y the same rows and length.
# (assumes the row index is unique — TODO confirm)
plt.scatter(X_train_LA_cluster_centroid.bathroomcnt,
            y_train.loc[X_train_LA_cluster_centroid.index, 'logerror'],
            c=X_train_LA_cluster_centroid.room_cluster)
plt.xlabel('bathroomcnt')
plt.ylabel('logerror')
plt.show()

#### 2b. Validate Cluster

In [None]:
# Cluster the LA validate split with k=3.
# NOTE(review): this rebinds `kmeans` (the train-cluster cell binds the same name), and the column
# name here is 'room_clusters' while the train cell uses 'room_cluster' — confirm both are intended.
LA_validate_clusters, kmeans = explore.run_kmeans(X_validate_LA, X_validate_scaled_LA, k=3, cluster_vars=cluster_vars, cluster_col_name = 'room_clusters')

In [None]:
# Display the cluster assignments for the LA validate split
LA_validate_clusters

#### 2c. Test Cluster

In [None]:
# Cluster the LA test split with k=3.
# NOTE(review): this rebinds `kmeans` again, and the column name here is 'room_clusters'
# while the train cell uses 'room_cluster' — confirm both are intended.
LA_test_clusters, kmeans = explore.run_kmeans(X_test_LA, X_test_scaled_LA, k=3, cluster_vars=cluster_vars, cluster_col_name = 'room_clusters')

In [None]:
# Display the cluster assignments for the LA test split
LA_test_clusters