In [3]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
import plotly.offline as offline
from plotly.graph_objs import Scatter, Layout
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
offline.init_notebook_mode(connected=True)
from discreteMarkovChain import markovChain
#apply code Sarafina wrote to Hector's dataset
#run source sink code on those clusters

In [4]:
c = "coordinates.csv"#"YorkeysKnobFiltered/YK_Coordinates.csv"
probs = "nPartiteAdjMatrix.csv"#"YorkeysKnobFiltered/YN_Probs.csv"
numClusters = 3
field_names = ['X', 'Y', "w"]
coords = pd.read_csv(c, header=None, names=field_names)
coords = coords.loc[:,["X","Y"]]
transitions = pd.read_csv(probs, header=None)

In [5]:
# Create a trace for building points
trace = go.Scatter(
    x = coords['X'].as_matrix(),
    y = coords['Y'].as_matrix(),
    name = 'Site',
    mode = 'markers',
    marker = dict(
        size = 6,
        color = 'rgba(42, 147, 227, .50)',
    )
)

In [6]:
agg = AgglomerativeClustering(n_clusters = numClusters).fit(coords.as_matrix())
aggPoints = []

for x in range(0, numClusters):
    aggPoints.append([])

for x in range(0, len(coords)):
    aggPoints[agg.labels_[x]].append(coords.as_matrix()[x])

centersX = []
centersY = []

clusterLabels = agg.labels_
citiesInCluster = [0] * len(clusterLabels)
for x in range(0, len(clusterLabels)):
    citiesInCluster[clusterLabels[x]] += 1

for x in range(0, numClusters):
    sumX = 0
    sumY = 0
    for y in range(0, len(aggPoints[x])):
        sumX += aggPoints[x][y][0]
        sumY += aggPoints[x][y][1]
    centersX.append(sumX / len(aggPoints[x]))
    centersY.append(sumY / len(aggPoints[x]))

aggCentersTrace = go.Scatter(
    x = centersX,
    y = centersY,
    name = 'Agg Clusters',
    mode = 'markers',
    marker = dict(
        size = 12,
        color = 'rgba(255, 0, 0, 0.75)'
    )
)

#Plot result
points = [trace]
points.append(aggCentersTrace)
#Gets name of city from the name of the csv file
cityName = "Testing"#$c.split("/")[1].split("_")[0]
#Plot
iplot({
        "data": points,
        'layout': {'title': cityName + ' Landscape Cluster Coordinates with ' + str(numClusters) + ' Clusters'}
        },
        )

In [7]:
d = {"clusters":clusterLabels, "site":np.arange(0, len(clusterLabels), 1)}
sites_w_clusters = pd.DataFrame(data=d).sort_values("clusters")
cluster0 = sites_w_clusters.loc[sites_w_clusters['clusters'] == 0]
cluster1 = sites_w_clusters.loc[sites_w_clusters['clusters'] == 1]
cluster2 = sites_w_clusters.loc[sites_w_clusters['clusters'] == 2]
cluster0

Unnamed: 0,clusters,site
9,0,9
17,0,17
16,0,16
15,0,15
14,0,14
13,0,13
10,0,10
18,0,18
19,0,19
8,0,8


In [8]:
#my code
#I timed each of these (calculating in transitions and out transitions separately vs all together). 
#the get_transitions_freq function is faster than the combo of in_transitions_freq and out_transitions_freq
def within_transitions(matrix, community):
    ixgrid = np.ix_(community, community)[0]
    m = matrix[ixgrid]
    s = m.sum()
    return s

def out_transitions(matrix, community):
    within = within_transitions(matrix, community)

    #start = time.time()
    ixgrid = np.ix_(community)[0]
    m = matrix[ixgrid, :]
    out_transitions = m.sum()
    #end = time.time()
    return max(0, out_transitions - within)
    
def in_transitions(matrix, community):
    #floating point errors possible, could give negative values close to 0
    within = within_transitions(matrix, community)
    
    ixgrid = np.ix_(community)[0]
    m_in = matrix[:, ixgrid]
    in_transitions = m_in.sum()
    return max(0, in_transitions -  within)

def ratio(matrix, community):
    o = out_transitions(matrix, community)
    i = in_transitions(matrix, community)
    if o == 0 and i == 0:
        return 0.5
    elif o == 0 and i != 0:
        #guarentees that we have a sink
        return 2
    elif i == 0 and o != 0:
        return 0
    #avoid division by 0
    else:
        return i/o

def classify(ratio, bounds):
    '''bounds = [upper bound for source, lower bound for sink], between 0 and 1'''
    if ratio <= bounds[0]:
        return "sink"
    elif ratio >= bounds[1]:
        return "source"
    else:
        return "manager"

def get_transition_freq(matrix, community):
    '''matrix is a numpy matrix describing the transition matrix for a graph. communities is a list of nodes'''
    #A_{ij}: represents frequency of transition from node i to node j

    #sums entries of square matrix that represents all transitions within the community and from community outward
    #np.ix_ allows easier subsetting by creating n-d meshgrid for the matrix
    ixgrid = np.ix_(community)
    m = matrix[ixgrid, :]
    comm_out_transitions = m.sum()
    
    m_in = matrix[:, ixgrid]
    comm_in_transitions = m_in.sum()
    
    #sums entries of square matrix that represents all transitions within the community only
    ixgrid_c = np.ix_(community, community)
    m_c = matrix[ixgrid_c]
    within_transition = m_c.sum()
    
    #subtract total community transitions from within community transitions
    out_transition = comm_out_transitions - within_transition
    in_transition = comm_in_transitions - within_transition
    
    ratio = in_transition/out_transition
    return (in_transition, out_transition, ratio)

In [9]:
within_transitions(transitions.as_matrix(), list(cluster0.iloc[:, 1]))
ratio(transitions.as_matrix(), list(cluster0.iloc[:, 1]))

2

In [10]:
testing = transitions.as_matrix()
t = markovChain(testing)
t.computePi()
print(t.pi)
#can add steady states for nodes to find probability of within transition
#P(transition into cluster|outside)*P(outside)
#P(outside) by summing steady state for all nodes not in cluster
#P(transition into cluster|outside) summing up entries of columns corresponding to cluster


[ 0.0705219   0.032752    0.04749825  0.029796    0.03281641  0.03034134
  0.07335767  0.03895854  0.03576786  0.06833721  0.04236353  0.03254765
  0.02265339  0.0316026   0.10369766  0.0384987   0.04795309  0.09865545
  0.06062606  0.06125468]


In [12]:
transitions

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.0,0.049167,0.0,0.02975,0.046005,0.026021,0.608701,0.068366,0.012443,0.0,0.029543,0.006768,0.018713,0.026893,0.0,0.010557,0.0261,0.0248,0.007894,0.008278
1,0.348101,0.0,0.257629,0.0,0.0,0.0,0.127992,0.047532,0.0,0.064871,0.023467,0.0,0.01501,0.021817,0.055442,0.0,0.019311,0.018827,0.0,0.0
2,0.0,0.073221,0.0,0.043131,0.046706,0.042675,0.184411,0.178823,0.031576,0.0,0.081147,0.016693,0.046437,0.074495,0.0,0.024829,0.059277,0.059045,0.018374,0.019161
3,0.25879,0.0,0.186454,0.0,0.0,0.0,0.090605,0.118542,0.0,0.103778,0.035387,0.0,0.020553,0.030564,0.089958,0.0,0.034303,0.031067,0.0,0.0
4,0.349416,0.0,0.176293,0.0,0.0,0.0,0.120841,0.081749,0.0,0.081077,0.028018,0.0,0.016907,0.02473,0.070497,0.0,0.026246,0.024224,0.0,0.0
5,0.227112,0.0,0.185106,0.0,0.0,0.0,0.080001,0.1387,0.0,0.111433,0.037847,0.0,0.021414,0.032298,0.09595,0.0,0.03692,0.033217,0.0,0.0
6,0.23143,0.185579,0.034844,0.106925,0.163326,0.094092,0.0,0.0,0.04477,0.011052,0.0,0.02389,0.0,0.0,0.009516,0.037538,0.0,0.0,0.027834,0.029204
7,0.030523,0.080929,0.039677,0.164276,0.129748,0.191563,0.0,0.0,0.109826,0.020965,0.0,0.038335,0.0,0.0,0.016741,0.077411,0.0,0.0,0.04827,0.051737
8,0.091136,0.0,0.114936,0.0,0.0,0.0,0.031943,0.066731,0.0,0.246612,0.076518,0.0,0.027649,0.051911,0.168494,0.0,0.064002,0.060068,0.0,0.0
9,0.0,0.010583,0.0,0.013779,0.012329,0.014746,0.033575,0.054235,0.038888,0.0,0.226413,0.027602,0.058661,0.128479,0.0,0.041337,0.102458,0.159651,0.038197,0.039065
