In [30]:
import pandas as pd
import plotly.graph_objs as go
import plotly.offline as offline
import numpy.random as random
from plotly.graph_objs import Scatter, Layout
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
offline.init_notebook_mode(connected=True)

**Clustering Buildings**
Functions to cluster buildings using DBSCAN, KMeans, and hierarchical (Ward) agglomerative clustering.
Always run the cell that defines the parameters and reads the CSV file first. Then run the cell of each algorithm you want to use.

Define Parameters, Load and Define Data


In [31]:
# Parameters shared by every algorithm cell below.
csvFileName = "CSVCoordinates/ArroyoGrande_Buildings_Coordinates_original.csv"
numClusters = 20 #used for KMeans and AggClustering
field_names = ['X', 'Y']

# The CSV is read without a header row; its two columns are named X and Y.
buildings = pd.read_csv(csvFileName, header=None, names=field_names)

# Create a trace for building points.
# .values is used instead of .as_matrix(), which was removed in pandas 1.0.
trace = go.Scatter(
    x = buildings['X'].values,
    y = buildings['Y'].values,
    name = 'Buildings',
    mode = 'markers',
    marker = dict(
        size = 6,
        color = 'rgba(42, 147, 227, .50)',
    )
)

def colorClusterTrace(numClusters, clusterPoints):
    """
    Builds one scatter trace per cluster so each cluster is drawn in its own shade.

    In: numClusters is the number of traces to build
        clusterPoints[i][0] / clusterPoints[i][1] are the x / y values of cluster i
    Out: list of numClusters traces; trace i is named 'Cluster i Buildings'
    """
    traces = []
    shade = 25  # grey level of the current cluster; advances by 25 per cluster
    for clusterId in range(numClusters):
        xValues = clusterPoints[clusterId][0]
        yValues = clusterPoints[clusterId][1]
        rgba = 'rgba(' + ','.join([str(shade)] * 3) + ', 0.75)'
        traces.append(go.Scatter(
            x = xValues,
            y = yValues,
            name = 'Cluster ' + str(clusterId) + ' Buildings',
            mode = 'markers',
            marker = dict(size = 6, color = rgba),
        ))
        # Wrap around once the grey level leaves the 0-255 range
        shade += 25
        if shade > 255:
            shade -= 255
    return traces

def clusterCenterTrace(clusterCenters, algorithmName):
    """
    Builds the scatter trace that marks the cluster centers in red.

    In: clusterCenters[0] is the list of center x values, clusterCenters[1] the y values
        algorithmName is used as the trace name (shown in the legend)
    Out: a single trace whose markers are larger (size 12) than the building markers
    """
    centerXs = clusterCenters[0]
    centerYs = clusterCenters[1]
    redMarker = dict(size = 12, color = 'rgba(255, 0, 0, 0.75)')
    return go.Scatter(
        x = centerXs,
        y = centerYs,
        name = algorithmName,
        mode = 'markers',
        marker = redMarker,
    )

def plot(colorTrace, centerTrace):
    """
    Displays the colored cluster traces together with the cluster-center trace.

    In: colorTrace is a list of traces, one per cluster
        centerTrace is a single trace holding the cluster centers
    Out: nothing is returned; the figure is shown with iplot

    Reads the notebook-level csvFileName (city name in the title) and
    numClusters (cluster count in the title).
    NOTE: this definition shadows the `plot` imported from plotly.offline.
    """
    figureData = list(colorTrace)
    figureData.append(centerTrace)

    # The city name is the part of the file name before the first underscore.
    # The last path component is used so the CSV may sit at any directory depth
    # (including the working directory).
    cityName = csvFileName.split("/")[-1].split("_")[0]

    iplot({
            "data": figureData,
            'layout': {'title': cityName + ' Building Cluster Coordinates with ' + str(numClusters) + ' Clusters'}
            },
        )

def createClusterCenters(labels):
    """
    Groups the buildings by cluster label and averages each group into a center.

    Reads the notebook-level `buildings` DataFrame; row i of `buildings` is
    paired with labels[i].

    In: labels is a list of cluster ids generated by KMeans, DBScan or AggClustering.
        labels[i] is the cluster id of buildings[i]. Negative labels are skipped
        (DBSCAN marks noise points with -1).
    Out: [centersX, centersY] where centersX / centersY are the x / y coordinates of
         the center of every non-empty cluster
         clusterPoints is a list with one entry per cluster id, used for the trace:
         clusterPoints[i][0] is a list of x values in cluster i and
         clusterPoints[i][1] is a list of y values in cluster i
         points is a list with one entry per cluster id where points[i] holds the
         coordinate rows of the buildings within cluster i
    """
    # Convert once; .as_matrix() was removed in pandas 1.0 and was previously
    # re-evaluated for every building.
    coords = buildings.values

    # One bucket per cluster id (0..max label), not one per building.
    clusterCount = max(max(labels) + 1, 0) if len(labels) > 0 else 0
    points = [[] for _ in range(clusterCount)]

    #append each building to points based on its label
    for row, label in zip(coords, labels):
        if label >= 0:
            points[label].append(row)

    clusterPoints = []
    centersX = []
    centersY = []

    #find cluster center by averaging all points in each cluster
    for members in points:
        xs = [member[0] for member in members]
        ys = [member[1] for member in members]
        clusterPoints.append([xs, ys])
        # Empty clusters have no center (and would divide by zero)
        if members:
            centersX.append(sum(xs) / len(xs))
            centersY.append(sum(ys) / len(ys))

    return [centersX, centersY], clusterPoints, points

def euclideanDistance(a, b):
    """
    Returns the straight-line distance between two 2D points.

    In: a and b are indexable, with x at [0] and y at [1]
    """
    deltaX = a[0] - b[0]
    deltaY = a[1] - b[1]
    return (deltaX**2 + deltaY**2)**0.5

def distBetweenCenters(centers):
    """
    Builds the full pairwise distance table between cluster centers.

    In: list of tuples (x, y) of each cluster center
    Out: 2d list where result[i][j] is the distance between center i and center j
    """
    return [
        [euclideanDistance(source, target) for target in centers]
        for source in centers
    ]

def minDistBetweenClusters(points):
    """
    Computes the smallest point-to-point distance between every pair of clusters.

    In: points (computed in createClusterCenters), where points[i] is a list of tuples (x, y) of points in cluster i
    Out: 2d array where array[i][j] is the min distance between cluster i and cluster j,
         or None when either cluster is empty (same convention as maxDistBetweenClusters)
    """
    distances = []
    for clusterA in points:
        row = []
        for clusterB in points:
            pairDistances = [euclideanDistance(a, b) for a in clusterA for b in clusterB]
            # min() raises ValueError on an empty sequence, so empty clusters map to None
            row.append(min(pairDistances) if pairDistances else None)
        distances.append(row)
    return distances

def maxDistBetweenClusters(points):
    """
    Computes the largest point-to-point distance between every pair of clusters.

    In: points (computed in createClusterCenters), where points[i] is a list of arrays [x, y] of points in cluster i
    Out: 2d array where array[i][j] is the max distance between cluster i and cluster j,
         or None when either cluster is empty
    """
    distances = []
    for clusterA in points:
        row = []
        for clusterB in points:
            pairDistances = [euclideanDistance(a, b) for a in clusterA for b in clusterB]
            # An empty cluster yields no pairs, so there is no distance to report
            row.append(max(pairDistances) if pairDistances else None)
        distances.append(row)
    return distances

def testMinDist():
    points = [[(0, 0), (0, 1), (1, 0), (1, 1)], [(2, 1), (2, 2), (2, 3), (2, 2)], [(4, 3), (3, 4), (4, 3)]]
    return minDistBetweenClusters(points)



KMeans Clustering

In [32]:
# Fit KMeans on the building coordinates.
# random_state is fixed so re-running the notebook reproduces the same clusters;
# .values replaces .as_matrix(), which was removed in pandas 1.0.
kmeans = KMeans(n_clusters=numClusters, init = 'k-means++', random_state=0).fit(buildings.values)

# Split the fitted centers into separate x and y lists for plotting
kxs = [center[0] for center in kmeans.cluster_centers_]
kys = [center[1] for center in kmeans.cluster_centers_]

# Only clusterPoints is used below; the plotted centers come from kmeans.cluster_centers_
clusterCenters, clusterPoints, points = createClusterCenters(kmeans.labels_)

#Trace for color coding the clusters
kMeansTrace = colorClusterTrace(len(kmeans.cluster_centers_), clusterPoints)

#Create a trace for the kmeans cluster centers with slightly larger dots than the buildings
kTrace = clusterCenterTrace([kxs, kys], 'KMeans Clusters')

#Plot result
plot(kMeansTrace, kTrace)

DBSCAN Clustering

In [33]:
# eps is the neighbourhood radius (in the same units as the CSV coordinates) and
# min_samples the number of neighbours needed to form a core point. They are
# passed by keyword because current scikit-learn no longer accepts them positionally.
# .values replaces .as_matrix(), which was removed in pandas 1.0.
db = DBSCAN(eps=0.0004, min_samples=10).fit(buildings.values)

# Cluster ids run from 0 to the largest label; noise points are labelled -1,
# so dbNumClusters is 0 when no cluster was found.
dbPointsLen = max(db.labels_)
dbNumClusters = dbPointsLen + 1

clusterCenters, clusterPoints, points = createClusterCenters(db.labels_)

#Trace for color coding the clusters
dbTrace = colorClusterTrace(dbNumClusters, clusterPoints)

#Create a trace for the DBScan values with slightly larger dots than the buildings
dbCentersTrace = clusterCenterTrace(clusterCenters, 'DB Clusters')

#Plot result: doesn't use color coding trace because DBScan only uses small subset of buildings.
#All buildings are drawn in one color with the cluster centers on top.
dbFigureData = [trace, dbCentersTrace]

#Gets name of city from the name of the csv file (last path component, text before the first underscore)
cityName = csvFileName.split("/")[-1].split("_")[0]

#Plot
iplot({
        "data": dbFigureData,
        'layout': {'title': cityName + ' Building Cluster Coordinates with ' + str(dbNumClusters) + ' Clusters'}
        },
        )

Agglomerative Clustering

In [34]:
# Fit agglomerative clustering on the building coordinates.
# .values replaces .as_matrix(), which was removed in pandas 1.0.
agg = AgglomerativeClustering(n_clusters = numClusters).fit(buildings.values)

# Labels run from 0 to the largest cluster id
aggNumClusters = max(agg.labels_) + 1

# This algorithm exposes no fitted centers here, so they are averaged from the labelled buildings
clusterCenters, clusterPoints, points = createClusterCenters(agg.labels_)

#Trace for color coding the clusters
aggTrace = colorClusterTrace(aggNumClusters, clusterPoints)

#Trace of cluster centers
aggCentersTrace = clusterCenterTrace(clusterCenters, 'Agg Clusters')

#Plot result
plot(aggTrace, aggCentersTrace)