In [36]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
import plotly.offline as offline
from plotly.graph_objs import Scatter, Layout
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
offline.init_notebook_mode(connected=True)
#apply code Sarafina wrote to Hector's dataset
#run source sink code on those clusters

In [30]:
c = "YorkeysKnobFiltered/YK_Coordinates.csv"
probs = "YorkeysKnobFiltered/YN_Probs.csv"
numClusters = 3
field_names = ['X', 'Y']
coords = pd.read_csv(c, header=None, names=field_names)
transitions = pd.read_csv(probs, header=None)

In [27]:
# Create a trace for building points
trace = go.Scatter(
    x = coords['X'].as_matrix(),
    y = coords['Y'].as_matrix(),
    name = 'Site',
    mode = 'markers',
    marker = dict(
        size = 6,
        color = 'rgba(42, 147, 227, .50)',
    )
)

In [28]:
agg = AgglomerativeClustering(n_clusters = numClusters).fit(coords.as_matrix())
aggPoints = []

for x in range(0, numClusters):
    aggPoints.append([])

for x in range(0, len(coords)):
    aggPoints[agg.labels_[x]].append(coords.as_matrix()[x])

centersX = []
centersY = []

clusterLabels = agg.labels_
citiesInCluster = [0] * len(clusterLabels)
for x in range(0, len(clusterLabels)):
    citiesInCluster[clusterLabels[x]] += 1

for x in range(0, numClusters):
    sumX = 0
    sumY = 0
    for y in range(0, len(aggPoints[x])):
        sumX += aggPoints[x][y][0]
        sumY += aggPoints[x][y][1]
    centersX.append(sumX / len(aggPoints[x]))
    centersY.append(sumY / len(aggPoints[x]))

aggCentersTrace = go.Scatter(
    x = centersX,
    y = centersY,
    name = 'Agg Clusters',
    mode = 'markers',
    marker = dict(
        size = 12,
        color = 'rgba(255, 0, 0, 0.75)'
    )
)

#Plot result
points = [trace]
points.append(aggCentersTrace)
#Gets name of city from the name of the csv file
cityName = c.split("/")[1].split("_")[0]
#Plot
iplot({
        "data": points,
        'layout': {'title': cityName + ' Landscape Cluster Coordinates with ' + str(numClusters) + ' Clusters'}
        },
        )

In [54]:
d = {"clusters":clusterLabels, "site":np.arange(0, 30, 1)}
sites_w_clusters = pd.DataFrame(data=d).sort_values("clusters")
cluster0 = sites_w_clusters.loc[sites_w_clusters['clusters'] == 0]
cluster1 = sites_w_clusters.loc[sites_w_clusters['clusters'] == 1]
cluster2 = sites_w_clusters.loc[sites_w_clusters['clusters'] == 2]

In [112]:
#my code
#I timed each of these (calculating in transitions and out transitions separately vs all together). 
#the get_transitions_freq function is faster than the combo of in_transitions_freq and out_transitions_freq
def within_transitions(matrix, community):
    ixgrid = np.ix_(community, community)[0]
    m = matrix[ixgrid]
    s = m.sum()
    return s

def out_transitions(matrix, community):
    within = within_transitions(matrix, community)

    #start = time.time()
    ixgrid = np.ix_(community)[0]
    m = matrix[ixgrid, :]
    out_transitions = m.sum()
    #end = time.time()
    return max(0, out_transitions - within)
    
def in_transitions(matrix, community):
    #floating point errors possible, could give negative values close to 0
    within = within_transitions(matrix, community)
    
    ixgrid = np.ix_(community)[0]
    m_in = matrix[:, ixgrid]
    in_transitions = m_in.sum()
    return max(0, in_transitions -  within)

def ratio(matrix, community):
    o = out_transitions(matrix, community)
    i = in_transitions(matrix, community)
    if o == 0 and i == 0:
        return 0.5
    elif o == 0 and i != 0:
        #guarentees that we have a sink
        return 2
    elif i == 0 and o != 0:
        return 0
    #avoid division by 0
    else:
        return i/o

def classify(ratio, bounds):
    '''bounds = [upper bound for source, lower bound for sink], between 0 and 1'''
    if ratio <= bounds[0]:
        return "sink"
    elif ratio => bounds[1]:
        return "source"
    else:
        return "manager"

def get_transition_freq(matrix, community):
    '''matrix is a numpy matrix describing the transition matrix for a graph. communities is a list of nodes'''
    #A_{ij}: represents frequency of transition from node i to node j

    #sums entries of square matrix that represents all transitions within the community and from community outward
    #np.ix_ allows easier subsetting by creating n-d meshgrid for the matrix
    ixgrid = np.ix_(community)
    m = matrix[ixgrid, :]
    comm_out_transitions = m.sum()
    
    m_in = matrix[:, ixgrid]
    comm_in_transitions = m_in.sum()
    
    #sums entries of square matrix that represents all transitions within the community only
    ixgrid_c = np.ix_(community, community)
    m_c = matrix[ixgrid_c]
    within_transition = m_c.sum()
    
    #subtract total community transitions from within community transitions
    out_transition = comm_out_transitions - within_transition
    in_transition = comm_in_transitions - within_transition
    
    ratio = in_transition/out_transition
    return (in_transition, out_transition, ratio)

In [121]:
t = transitions.as_matrix()
import scipy
scipy.linalg.eig(t, b=None, right=False, left=True)

(array([ 0.99672944+0.j,  0.98357717+0.j,  1.00000000+0.j,  1.00000000+0.j,
         0.98590684+0.j,  0.99217230+0.j,  0.99160091+0.j,  0.98698229+0.j,
         0.98752336+0.j,  0.99124726+0.j,  0.98824020+0.j,  0.98607431+0.j,
         0.98667274+0.j,  0.98825855+0.j,  0.99128479+0.j,  0.99113757+0.j,
         0.99091361+0.j,  0.98863700+0.j,  0.98927230+0.j,  0.98990800+0.j,
         0.98962516+0.j,  0.98984165+0.j,  0.98953427+0.j,  0.98975365+0.j,
         0.98976837+0.j,  0.98938555+0.j,  0.98953140+0.j,  0.98989507+0.j,
         0.98970577+0.j,  0.98979077+0.j]),
 array([[ -1.86878767e-01,  -9.83883993e-02,   2.32399914e-01,
          -2.39784441e-13,   3.48567438e-01,   1.00912142e-01,
           4.07946730e-01,  -1.05214603e-02,  -2.59087484e-01,
           8.39303735e-02,  -8.78597935e-02,  -5.07990654e-14,
          -3.72272803e-14,   1.09970783e-13,   2.31077193e-15,
          -1.04469793e-13,  -7.86567705e-14,   1.01334333e-13,
           2.93892817e-01,   2.48157300e-02,  

In [102]:
0.99099 + 0.005465+0.001020+0.002209+0.990099+0.002835+0.000645+0.004436+0.990099

2.987798

In [111]:
within = within_transitions(transitions.as_matrix(), list(cluster2.iloc[:,1]))

ixgrid = np.ix_(list(cluster2.iloc[:,1]))[0]
m_in = transitions.as_matrix()[:, ixgrid]
np.sum(m_in) - within

-0.0084686315107895815

In [123]:
x=[ -1.74326697e-01,  -7.48281558e-02,   2.20252022e-01,
  -2.50414616e-13,  -1.28041980e-01,   1.89179001e-01,
  -1.18374282e-01,   3.75409695e-01,  -6.64806538e-04,
  -1.94256225e-01,   4.37307252e-02,   8.81198383e-14,
   3.69758929e-14,   2.83976638e-14,  -1.22927280e-13,
  -2.33067804e-13,  -3.68512457e-14,  -1.98856825e-13,
   2.83756043e-01,  -3.20978967e-01,   3.28454377e-01,
  -4.10565360e-02,   2.52630126e-01,  -3.43363793e-01,
  -3.63235671e-01,   2.27331432e-13,   1.75717779e-12,
   2.07672102e-14,  -6.83607719e-13,  -5.65291244e-13]
x/np.sum(x)

array([  2.65276372e+00,   1.13867480e+00,  -3.35161844e+00,
         3.81060858e-12,   1.94844005e+00,  -2.87877416e+00,
         1.80132479e+00,  -5.71268334e+00,   1.01164922e-02,
         2.95603527e+00,  -6.65459067e-01,  -1.34093695e-12,
        -5.62669452e-13,  -4.32132849e-13,   1.87060866e-12,
         3.54663872e-12,   5.60772671e-13,   3.02604351e-12,
        -4.31797165e+00,   4.88440022e+00,  -4.99815501e+00,
         6.24765403e-01,  -3.84432243e+00,   5.22503453e+00,
         5.52742882e+00,  -3.45934722e-12,  -2.67393209e-11,
        -3.16018732e-13,   1.04025934e-11,   8.60214831e-12])