#### Imports

In [1]:
import wrangle
import explore
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

pd.options.display.float_format = '{:20,.2f}'.format

from math import sqrt
from scipy import stats

import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from statsmodels.formula.api import ols
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.feature_selection import f_regression, SelectKBest, RFE 
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.cluster import KMeans

#### Import LA Dataframe For Explore

In [2]:
# Load the Zillow data through the project loader (cached=True is passed through),
# then clean it into the three frames returned by clean_zillow_data.
df = wrangle.get_zillow_data(cached=True)
df_la, df_v, df_o = wrangle.clean_zillow_data(df)

# Split and scale the df_la frame; split_scale returns ten objects, unpacked one per line.
# NOTE(review): the X_train_explore preview saved in this notebook shows rows with
# Orange=1 and LA=0 -- confirm that df_la really holds the LA County records
# (check the return order of wrangle.clean_zillow_data).
(
    X_train,
    X_validate,
    X_test,
    X_train_explore,
    y_train,
    y_validate,
    y_test,
    X_train_scaled,
    X_validate_scaled,
    X_test_scaled,
) = wrangle.split_scale(df_la)

In [9]:
# (rows, columns) of each split, in the order: X_train, X_validate, X_test, y_train.
tuple(split.shape for split in (X_train, X_validate, X_test, y_train))

((7676, 15), (3290, 15), (2742, 15), (7676, 1))

# Exploration:

**Target = Logerror** 


- Logerror is a single number that compares two prices for the same home: Zillow's existing model estimate (the Zestimate) and the real sale price. It is the log of their ratio, i.e. $\text{logerror} = \log(\text{Zestimate}) - \log(\text{SalePrice})$.

In [10]:
# Preview the first five rows of the exploration frame (features plus logerror).
X_train_explore.head(n=5)

Unnamed: 0,parcelid,logerror,bathroomcnt,calculatedfinishedsquarefeet,latitude,longitude,regionidcity,LA,Orange,Ventura,age,taxrate,acres,structure_dollar_per_sqft,land_dollar_per_sqft,bed_bath_ratio
32602,13848297,-0.05,1.0,854.0,33.72,-117.88,47568.0,0,1,0,68.0,0.02,0.14,33.1,3.72,2.0
22416,14412168,0.01,2.5,1426.0,33.53,-117.69,25459.0,0,1,0,30.0,0.01,0.14,121.09,82.43,1.2
40548,14496973,-0.02,2.5,1660.0,33.65,-117.59,15554.0,0,1,0,30.0,0.01,0.1,101.68,39.01,1.6
4720,13905733,0.12,3.0,2542.0,33.83,-118.02,10608.0,0,1,0,9.0,0.01,0.08,180.23,79.89,1.33
41509,14183536,0.02,3.0,2230.0,33.88,-117.84,21412.0,0,1,0,46.0,0.01,0.2,78.75,45.72,1.0


#### Initial Thoughts:

- From my initial investigation in the regression project, I know that room count has a large effect on tax rate and housing price. I was unable to create a derived feature last time, so I want to test the effect of this feature now.

- I want to examine how useful our created feature `bed_bath_ratio` is in predicting logerror in LA County. I chose LA County because it has the largest number of data points. I want to cluster on `bed_bath_ratio`, `bathroomcnt`, and `calculatedfinishedsquarefeet`.

## Cluster One: Room Clusters

#### Step 1. Elbow Plot

In [None]:
# Features used to build the room clusters.
cluster_vars = ['bathroomcnt', 'bed_bath_ratio', 'calculatedfinishedsquarefeet']

# Plot the elbow curve on the scaled train split returned by wrangle.split_scale.
# The former `X_train_scaled_LA` name is never defined in this notebook, so the
# cell failed with a NameError on a fresh kernel.
explore.elbow_plot(X_train_scaled, cluster_vars)

#### Takeaway:

- Looks like 3 is the optimal K for this cluster

***

#### Step 2. Create Clusters

#### 2a. Train Cluster

In [None]:
# Cluster the train split with k=3 (chosen from the elbow plot).
# Uses the X_train / X_train_scaled frames produced by wrangle.split_scale; the
# `*_LA` names previously used here are never defined in this notebook.
# `kmeans` is kept because later cells read its cluster_centers_.
LA_train_clusters, kmeans = explore.run_kmeans(
    X_train,
    X_train_scaled,
    k=3,
    cluster_vars=cluster_vars,
    cluster_col_name='room_cluster',
)

In [None]:
 LA_train_clusters

In [None]:
# Count how many train rows fall in each room cluster; the sizes do not look even.
LA_train_clusters['room_cluster'].value_counts()

#### Get Centroids

In [None]:
# One centroid column per clustering feature, e.g. 'centroid_bathroomcnt'.
centroid_col_names = ['centroid_' + var for var in cluster_vars]

# Build a frame with one row per cluster: the row position of each center in
# kmeans.cluster_centers_ becomes the 'room_cluster' id column so the frame can
# be joined back onto the per-row cluster assignments.
# (The bare `centroid_col_names` expression that used to sit mid-cell was
# removed: only a cell's last expression is displayed, so it did nothing.)
LA_centroids = (
    pd.DataFrame(kmeans.cluster_centers_, columns=centroid_col_names)
    .reset_index()
    .rename(columns={'index': 'room_cluster'})
)

In [None]:
# Display the centroid table (one row per room_cluster id).
LA_centroids

#### Append cluster id onto X_train & X_train_scaled, then join with the centroids dataframe.


In [None]:
# Attach the cluster id column to the unscaled train features (column-wise concat,
# aligned on the row index). Uses X_train from wrangle.split_scale; the
# `X_train_LA` name previously used here is never defined in this notebook.
X_train_LA_cluster = pd.concat([X_train, LA_train_clusters], axis=1)

In [None]:
# Preview the first five train rows with their cluster id attached.
X_train_LA_cluster.head(n=5)

In [None]:
# Left-join the centroid coordinates onto every train row via its cluster id.
# A column-on-column merge discards the left frame's index, so the original
# row index is put back afterwards.
X_train_LA_cluster_centroid = pd.merge(
    X_train_LA_cluster,
    LA_centroids,
    how='left',
    on='room_cluster',
)
X_train_LA_cluster_centroid = X_train_LA_cluster_centroid.set_index(X_train_LA_cluster.index)

In [None]:
# Preview the first five train rows with cluster id and centroid columns.
X_train_LA_cluster_centroid.head(n=5)

#### Clusters and Centroids on Train DF

In [None]:
# Scatter bathroom count against the target, colored by room cluster id.
# Axis labels and a title are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(
    X_train_LA_cluster_centroid.bathroomcnt,
    y_train.logerror,
    c=X_train_LA_cluster_centroid.room_cluster,
)
ax.set(
    xlabel='bathroomcnt',
    ylabel='logerror',
    title='Train: logerror vs. bathroomcnt, colored by room_cluster',
)
plt.show()

#### 2b. Validate Cluster

In [None]:
# Assign validate rows to the clusters learned on TRAIN instead of calling
# explore.run_kmeans again. The original call used undefined `*_LA` names,
# rebound `kmeans` to a new model built from validate data (so its cluster ids
# need not correspond to the train clusters), and named the column
# 'room_clusters' instead of 'room_cluster'.
# NOTE(review): assumes `kmeans` was fit on the scaled `cluster_vars` columns and
# that X_validate_scaled is a DataFrame sharing X_validate's row order -- confirm
# against explore.run_kmeans and wrangle.split_scale.
LA_validate_clusters = pd.DataFrame(
    kmeans.predict(X_validate_scaled[cluster_vars]),
    columns=['room_cluster'],
    index=X_validate.index,
)

In [None]:
# Display the cluster id assigned to each validate row.
LA_validate_clusters

#### 2c. Test Cluster

In [None]:
# Assign test rows to the clusters learned on TRAIN instead of calling
# explore.run_kmeans again. The original call used undefined `*_LA` names,
# rebound `kmeans` to a new model built from test data (so its cluster ids
# need not correspond to the train clusters), and named the column
# 'room_clusters' instead of 'room_cluster'.
# NOTE(review): assumes `kmeans` is the train-fitted model, fit on the scaled
# `cluster_vars` columns, and that X_test_scaled is a DataFrame sharing X_test's
# row order -- confirm against explore.run_kmeans and wrangle.split_scale.
LA_test_clusters = pd.DataFrame(
    kmeans.predict(X_test_scaled[cluster_vars]),
    columns=['room_cluster'],
    index=X_test.index,
)

In [None]:
# Display the cluster id assigned to each test row.
LA_test_clusters