In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

# Zillow

Data provided by Zillow & Kaggle (see [here](https://www.kaggle.com/pratyushakar/zillow-zestimate#properties_2017.csv))

In [2]:
# Google Sheets CSV export that holds the Zillow data loaded below
data_url = (
    "https://docs.google.com/spreadsheets/d/"
    "198EG3tckqzD1uOKSYxAY62i5v_0LIZQMgzaIae6u1vo"
    "/export?format=csv"
)

<IPython.core.display.Javascript object>

Load all the usual suspects and some new ones including: `AgglomerativeClustering`, `DBSCAN`, `dendrogram`.

In [3]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering, DBSCAN

from scipy.cluster.hierarchy import dendrogram
from scipy.spatial.distance import pdist, squareform

import matplotlib.pyplot as plt

%matplotlib inline

<IPython.core.display.Javascript object>

Function that will also be used in your exercise to produce a dendrogram from our `AgglomerativeClustering` object.

In [4]:
def plot_dendrogram(model, **kwargs):
    """
    Plot a dendrogram for a fitted scikit-learn hierarchical clustering model.

    Adapted from the scikit-learn example at:
    https://github.com/scikit-learn/scikit-learn/blob/70cf4a676caa2d2dad2e3f6e4478d64bcb0506f7/examples/cluster/plot_hierarchical_clustering_dendrogram.py

    Parameters:
        model (sklearn.cluster.AgglomerativeClustering): a fitted hierarchical
            clustering model. It must expose `children_`. If it also exposes
            `distances_` (merge distances) and/or `n_leaves_`, those are used;
            otherwise evenly spaced placeholder distances are drawn and the
            number of leaves is taken to be `len(children_) + 1`.
        **kwargs: forwarded unchanged to `scipy.cluster.hierarchy.dendrogram`.

    Output: a dendrogram based on the model passed in the parameters.

    Returns: None
    """
    # One row per merge; each row holds the ids of the two nodes that were joined
    children = np.asarray(model.children_)
    n_merges = children.shape[0]
    n_leaves = getattr(model, "n_leaves_", n_merges + 1)

    # Prefer the real merge distances when the model recorded them; otherwise
    # fall back to a uniform ramp so the tree shape can still be plotted.
    distances = getattr(model, "distances_", None)
    if distances is None:
        distances = np.arange(n_merges)

    # Number of original observations under each merged node. Ids below
    # n_leaves are single samples; id n_leaves + i is the node created by merge i.
    counts = np.zeros(n_merges)
    for merge_idx, merged_pair in enumerate(children):
        size = 0
        for child in merged_pair:
            if child < n_leaves:
                size += 1
            else:
                size += counts[child - n_leaves]
        counts[merge_idx] = size

    # Linkage matrix layout expected by scipy: [child_a, child_b, distance, count]
    linkage_matrix = np.column_stack([children, distances, counts]).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)

<IPython.core.display.Javascript object>

Read data and do some inspection & cleaning.

In [5]:
# Read the CSV at data_url into a DataFrame, then preview the first five rows
zillow = pd.read_csv(filepath_or_buffer=data_url)
zillow.head(n=5)

Unnamed: 0,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,17291058,,,8516.0,9.5,6,,,9.5,66.0,...,2.0,,12956457,26879210,2016,13922753,283062.46,,,61110070000000.0
1,17214945,,,296.0,2.5,3,,,2.5,66.0,...,2.0,,321000,1074000,2016,753000,11525.74,,,61110050000000.0
2,17060678,,,1146.0,6.5,4,,,6.5,66.0,...,2.0,,1804157,2275709,2016,471552,24503.28,,,61110010000000.0
3,17284901,,,2322.0,1.5,6,,,1.5,66.0,...,3.0,,4481348,7138171,2016,2656823,75722.34,,,61110070000000.0
4,17277746,,,182.0,3.5,4,,,3.5,66.0,...,2.0,,254934,420023,2016,165089,4427.28,,,61110060000000.0


<IPython.core.display.Javascript object>

Drop columns that have more than 20% of their values missing.  How many columns does this remove?

In [6]:
# Column count before any columns are removed
zillow.shape[1]

58

<IPython.core.display.Javascript object>

In [7]:
# Fraction of missing values in each column
perc_na = zillow.isnull().mean()
# True for every column whose missing share exceeds 20%
cols_over_20 = perc_na.gt(0.2)
# How many columns are flagged for removal
cols_over_20.sum()

24

<IPython.core.display.Javascript object>

In [8]:
# Keep only the columns that were NOT flagged as more than 20% missing
zillow = zillow.loc[:, ~cols_over_20]

<IPython.core.display.Javascript object>

Drop all NAs from the dataframe.  How many rows does this remove?

In [9]:
# Row counts on either side of discarding every record that has a missing value
pre_dropna_rowcount = len(zillow)
zillow = zillow.dropna(axis=0, how="any")
post_dropna_rowcount = len(zillow)

# Number of rows removed by dropna()
n_dropped = pre_dropna_rowcount - post_dropna_rowcount
n_dropped

2575

<IPython.core.display.Javascript object>

For the sake of time & plotting, downsample to 100 random records in the `zillow` dataframe.  Use a random seed of `42` to obtain consistent results.

In [10]:
# A full pairwise distance matrix would be 12425 x 12425 (154,380,625 elements).
# Keep 100 reproducibly sampled rows so the demo runs quickly and the
# dendrogram stays readable.
zillow = zillow.sample(n=100, random_state=42)

<IPython.core.display.Javascript object>

Drop a number of columns because they are IDs, have 0 variance, or are collinear (this is based on my understanding rather than on analysis, so it might be wrong).

In [11]:
# fmt: off
# Id-style columns: not very useful for clustering, though we might want them later
id_cols = [
    "parcelid", "pooltypeid7", "propertycountylandusecode",
    "propertylandusetypeid", "regionidcity", "regionidcounty", "regionidzip",
    "latitude", "longitude", "fips",
    "rawcensustractandblock", "censustractandblock",
]

# Columns believed to duplicate information held elsewhere.
# This is a judgment call without real-estate expertise, so some may be bad assumptions.
dup_cols = [
    "calculatedbathnbr", "finishedsquarefeet50", "finishedsquarefeet12",
    "finishedfloor1squarefeet", "structuretaxvaluedollarcnt",
    "taxvaluedollarcnt", "landtaxvaluedollarcnt",
]
# fmt: on

# The random sample (seed 42) had 0 variance in these 3 columns
no_var_cols = ["poolcnt", "assessmentyear", "garagecarcnt"]

# Remove all three groups in a single pass; the remaining columns keep their order
zillow_sub = zillow.drop(columns=[*id_cols, *dup_cols, *no_var_cols])

zillow_sub.head()

Unnamed: 0,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,fireplacecnt,fullbathcnt,garagetotalsqft,lotsizesquarefeet,poolsizesum,roomcnt,yearbuilt,numberofstories,taxamount
11325,4.0,5,3705,2.0,4.0,948.0,59349.0,684.0,10,2001.0,2.0,15108.92
1766,3.0,5,3642,2.0,3.0,471.0,7720.0,640.0,10,1998.0,2.0,8078.32
14591,3.0,3,2779,2.0,3.0,659.0,11610.0,420.0,8,1988.0,2.0,7340.82
14939,3.0,3,2902,2.0,3.0,774.0,43560.0,525.0,7,1977.0,2.0,11795.98
12049,3.0,4,2295,2.0,3.0,487.0,8539.0,396.0,8,1979.0,2.0,4799.22


<IPython.core.display.Javascript object>

## Hierarchical Clustering

Links:
* [Great resource here explaining an example in depth](http://www.econ.upf.edu/~michael/stanford/maeb7.pdf)
* [StatQuest video](https://www.youtube.com/watch?v=7xHsRkOdVwo): admittedly not his best work and not his best "bams"

Prep data for clustering

In [12]:
# Fit the scaler on the clustering features, then transform those same rows
scaler = StandardScaler()
scaled = scaler.fit(zillow_sub).transform(zillow_sub)
# Re-wrap the scaled array so row and column labels match zillow_sub
scaled_df = pd.DataFrame(
    data=scaled, columns=zillow_sub.columns, index=zillow_sub.index
)

<IPython.core.display.Javascript object>

Calculate the distance matrix using Manhattan (`cityblock`) distance

In [13]:
# Pairwise cityblock (Manhattan) distances between scaled rows, expanded from
# pdist's condensed form into a full square matrix
dist_mat_arr = squareform(pdist(scaled_df, "cityblock"))
# Label both axes with the row index of scaled_df
row_labels = scaled_df.index
dist_mat_df = pd.DataFrame(data=dist_mat_arr, columns=row_labels, index=row_labels)

<IPython.core.display.Javascript object>

In [14]:
dist_mat_df

Unnamed: 0,11325,1766,14591,14939,12049,6551,12445,1176,7006,3980,...,9060,1351,12753,13125,2452,2869,5163,4317,9151,2153
11325,0.000000,6.133614,11.766457,10.582870,12.620758,11.965275,18.006859,13.333124,8.562665,14.476535,...,20.630178,9.790534,26.104946,9.301654,12.015474,12.130775,13.533670,6.607718,15.480510,12.665146
1766,6.133614,0.000000,7.206605,9.195021,6.643523,8.879718,12.548180,8.440920,4.179217,13.219310,...,14.780729,8.704136,20.121271,7.841647,7.172835,13.973720,9.711865,12.578106,9.396477,13.392515
14591,11.766457,7.206605,0.000000,3.821257,3.291152,7.390463,10.273153,4.930565,4.097653,6.021876,...,10.024376,11.653249,15.861848,4.992300,3.858931,13.626939,5.619495,14.657371,6.150904,11.554795
14939,10.582870,9.195021,3.821257,0.000000,5.868001,8.169643,9.933381,7.686508,6.331359,5.990283,...,10.047309,14.669616,15.522076,5.894452,5.613768,11.798544,8.163590,13.473784,8.589485,10.371208
12049,12.620758,6.643523,3.291152,5.868001,0.000000,7.462491,7.848753,4.581145,4.058093,7.554487,...,9.926469,11.550685,15.468014,3.733907,3.156289,12.969733,5.894096,15.511672,3.002238,14.845947
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2869,12.130775,13.973720,13.626939,11.798544,12.969733,9.813104,10.691083,12.312113,13.566640,9.077658,...,12.623064,16.597099,16.438619,9.235827,10.328084,0.000000,11.346066,9.519347,13.215090,11.763209
5163,13.533670,9.711865,5.619495,8.163590,5.894096,2.597164,5.633843,4.582456,6.200728,6.508253,...,7.821917,14.593754,13.659389,7.427109,6.293740,11.346066,0.000000,12.342102,6.029323,8.951851
4317,6.607718,12.578106,14.657371,13.473784,15.511672,10.773707,16.815291,16.224038,13.890431,13.284967,...,19.438610,15.337460,24.913377,12.192569,14.906389,9.519347,12.342102,0.000000,18.371425,10.790932
9151,15.480510,9.396477,6.150904,8.589485,3.002238,7.597718,5.032000,2.147386,6.917845,7.872385,...,7.161786,14.552923,12.488058,6.455391,3.551826,13.215090,6.029323,18.371425,0.000000,12.505046


<IPython.core.display.Javascript object>

Perform hierarchical clustering with the distance matrix and [`sklearn.cluster.AgglomerativeClustering`](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html).

In [15]:
# Complete-linkage agglomerative clustering into 10 clusters, fed the
# precomputed distance matrix instead of raw features.
# NOTE(review): newer scikit-learn releases rename `affinity` to `metric` --
# confirm against the installed version before upgrading.
clst = AgglomerativeClustering(
    linkage="complete", affinity="precomputed", n_clusters=10
).fit(dist_mat_df)
clst

AgglomerativeClustering(affinity='precomputed', compute_full_tree='auto',
                        connectivity=None, distance_threshold=None,
                        linkage='complete', memory=None, n_clusters=10)

<IPython.core.display.Javascript object>

Use the `plot_dendrogram()` helper function to plot the hierarchical clusters.

In [16]:
# plt.figure(figsize=(20, 10))
# plot_dendrogram(clst)

<IPython.core.display.Javascript object>

Assign the cluster labels to a column in our original dataframe.

In [17]:
# Store the labels of the model currently bound to `clst` next to the unscaled features
zillow_sub["label"] = clst.labels_
# Number of rows assigned to each label
zillow_sub["label"].value_counts()

2    30
6    19
3    19
0    12
4     9
1     5
7     3
9     1
8     1
5     1
Name: label, dtype: int64

<IPython.core.display.Javascript object>

Interpret the clusters

In [18]:
# Per-cluster mean of every feature, shaded so high and low values stand out
clst_avgs = zillow_sub.groupby(by="label").agg("mean")
clst_avgs.style.background_gradient()

Unnamed: 0_level_0,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,fireplacecnt,fullbathcnt,garagetotalsqft,lotsizesquarefeet,poolsizesum,roomcnt,yearbuilt,numberofstories,taxamount
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,4.5,4.083333,4594.0,2.0,4.083333,875.666667,58822.083333,595.333333,9.166667,1995.833333,1.5,15082.486667
1,3.3,3.4,3345.8,1.2,2.8,722.6,69203.8,688.0,7.8,1980.6,1.0,14605.3
2,3.533333,4.566667,3282.633333,2.0,3.266667,633.9,14750.533333,527.6,8.866667,1990.466667,1.966667,8883.807333
3,2.5,3.684211,2261.157895,1.473684,2.105263,536.631579,10386.684211,460.842105,7.368421,1976.0,2.0,4945.589474
4,3.0,4.333333,2732.444444,1.666667,2.666667,603.666667,17618.777778,534.888889,8.222222,1986.333333,1.0,7846.44
5,6.5,5.0,10188.0,4.0,6.0,2204.0,81022.0,800.0,12.0,2004.0,2.0,72020.94
6,2.236842,3.421053,1918.157895,1.052632,2.0,457.315789,14017.684211,501.157895,7.0,1968.157895,1.0,4895.765263
7,5.166667,5.333333,5123.666667,3.0,4.666667,1368.333333,164657.0,746.666667,11.0,1992.333333,2.0,16223.74
8,5.0,3.0,4361.0,4.0,5.0,864.0,230432.0,800.0,7.0,1987.0,2.0,24757.64
9,8.0,7.0,15279.0,4.0,8.0,1523.0,76666.0,600.0,21.0,1994.0,2.0,40354.84


<IPython.core.display.Javascript object>

## DBSCAN Clustering

Links:
* [A cool demo animation](https://www.youtube.com/watch?v=h53WMIImUuc&feature=youtu.be&t=2)
* [Video to try and slow down and discuss the process shown in the animation](https://drive.google.com/file/d/1qXsccjEWXYUi1SNJQWRCNrySegjWjnPN/view?usp=sharing)

Perform clustering with [`sklearn.cluster.DBSCAN`](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html).

In [19]:
# DBSCAN on the precomputed distance matrix; eps and min_samples stay at their defaults
clst = DBSCAN(metric="precomputed").fit(dist_mat_df)
clst

DBSCAN(algorithm='auto', eps=0.5, leaf_size=30, metric='precomputed',
       metric_params=None, min_samples=5, n_jobs=None, p=None)

<IPython.core.display.Javascript object>

In [20]:
# DBSCAN directly on the scaled features, with Euclidean distance computed by the model
clst = DBSCAN(min_samples=5, metric="euclidean", eps=1.5).fit(scaled_df)
clst

DBSCAN(algorithm='auto', eps=1.5, leaf_size=30, metric='euclidean',
       metric_params=None, min_samples=5, n_jobs=None, p=None)

<IPython.core.display.Javascript object>

In [21]:
# Size of each DBSCAN label, most frequent first
pd.Series(data=clst.labels_).value_counts(sort=True, ascending=False)

-1    64
 0    17
 1    14
 2     5
dtype: int64

<IPython.core.display.Javascript object>

In [22]:
# Write the labels of the model currently bound to `clst` into the "label" column,
# overwriting whatever labels were stored there before
zillow_sub["label"] = clst.labels_
# Number of rows assigned to each label
zillow_sub["label"].value_counts()

-1    64
 0    17
 1    14
 2     5
Name: label, dtype: int64

<IPython.core.display.Javascript object>

In [23]:
# Mean of every feature within each label group, rendered with a color gradient
clst_avgs = zillow_sub.groupby(by="label").agg("mean")
clst_avgs.style.background_gradient()

Unnamed: 0_level_0,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,fireplacecnt,fullbathcnt,garagetotalsqft,lotsizesquarefeet,poolsizesum,roomcnt,yearbuilt,numberofstories,taxamount
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
-1,3.734375,4.15625,3735.578125,2.046875,3.375,750.296875,39553.078125,562.546875,8.734375,1988.53125,1.640625,11917.768438
0,2.294118,3.529412,1902.941176,1.0,2.058824,460.823529,14505.588235,507.352941,7.117647,1968.588235,1.0,4691.009412
1,2.5,4.071429,2259.714286,1.142857,2.142857,520.428571,8736.0,479.071429,7.714286,1976.142857,2.0,5135.968571
2,3.1,5.0,3294.2,2.0,3.0,602.6,19689.4,521.8,9.4,1989.4,2.0,9910.2


<IPython.core.display.Javascript object>

Assign the labels to the dataframe

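Perform clustering with DBSCAN using the distance matrix (with the same `eps` and `min_samples`). Note that the results will only match if the distance matrix was computed with the same metric DBSCAN used above (Euclidean).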

Confirm you get the same results

Interpret the clusters

Change the values of `eps` and `min_samples` and repeat the analysis.