In [49]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
import plotly.offline as offline
from plotly.graph_objs import Scatter, Layout
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
offline.init_notebook_mode(connected=True)
from discreteMarkovChain import markovChain
#apply code Sarafina wrote to Hector's dataset
#run source sink code on those clusters

In [35]:
c = "coordinates.csv"#"YorkeysKnobFiltered/YK_Coordinates.csv"
probs = "nPartiteAdjMatrix.csv"#"YorkeysKnobFiltered/YN_Probs.csv"
numClusters = 3
field_names = ['X', 'Y', "w"]
coords = pd.read_csv(c, header=None, names=field_names)
coords = coords.loc[:,["X","Y"]]
transitions = pd.read_csv(probs, header=None)

In [36]:
# Create a trace for building points
trace = go.Scatter(
    x = coords['X'].as_matrix(),
    y = coords['Y'].as_matrix(),
    name = 'Site',
    mode = 'markers',
    marker = dict(
        size = 6,
        color = 'rgba(42, 147, 227, .50)',
    )
)

In [40]:
agg = AgglomerativeClustering(n_clusters = numClusters).fit(coords.as_matrix())
aggPoints = []

for x in range(0, numClusters):
    aggPoints.append([])

for x in range(0, len(coords)):
    aggPoints[agg.labels_[x]].append(coords.as_matrix()[x])

centersX = []
centersY = []

clusterLabels = agg.labels_
citiesInCluster = [0] * len(clusterLabels)
for x in range(0, len(clusterLabels)):
    citiesInCluster[clusterLabels[x]] += 1

for x in range(0, numClusters):
    sumX = 0
    sumY = 0
    for y in range(0, len(aggPoints[x])):
        sumX += aggPoints[x][y][0]
        sumY += aggPoints[x][y][1]
    centersX.append(sumX / len(aggPoints[x]))
    centersY.append(sumY / len(aggPoints[x]))

aggCentersTrace = go.Scatter(
    x = centersX,
    y = centersY,
    name = 'Agg Clusters',
    mode = 'markers',
    marker = dict(
        size = 12,
        color = 'rgba(255, 0, 0, 0.75)'
    )
)

#Plot result
points = [trace]
points.append(aggCentersTrace)
#Gets name of city from the name of the csv file
cityName = "Testing"#$c.split("/")[1].split("_")[0]
#Plot
iplot({
        "data": points,
        'layout': {'title': cityName + ' Landscape Cluster Coordinates with ' + str(numClusters) + ' Clusters'}
        },
        )

In [53]:
d = {"clusters":clusterLabels, "site":np.arange(0, len(clusterLabels), 1)}
sites_w_clusters = pd.DataFrame(data=d).sort_values("clusters")
cluster0 = sites_w_clusters.loc[sites_w_clusters['clusters'] == 0]
cluster1 = sites_w_clusters.loc[sites_w_clusters['clusters'] == 1]
cluster2 = sites_w_clusters.loc[sites_w_clusters['clusters'] == 2]
cluster0

Unnamed: 0,clusters,site
9,0,9
17,0,17
16,0,16
15,0,15
14,0,14
13,0,13
10,0,10
18,0,18
19,0,19
8,0,8


In [44]:
#my code
#I timed each of these (calculating in transitions and out transitions separately vs all together). 
#the get_transitions_freq function is faster than the combo of in_transitions_freq and out_transitions_freq
def within_transitions(matrix, community):
    ixgrid = np.ix_(community, community)[0]
    m = matrix[ixgrid]
    s = m.sum()
    return s

def out_transitions(matrix, community):
    within = within_transitions(matrix, community)

    #start = time.time()
    ixgrid = np.ix_(community)[0]
    m = matrix[ixgrid, :]
    out_transitions = m.sum()
    #end = time.time()
    return max(0, out_transitions - within)
    
def in_transitions(matrix, community):
    #floating point errors possible, could give negative values close to 0
    within = within_transitions(matrix, community)
    
    ixgrid = np.ix_(community)[0]
    m_in = matrix[:, ixgrid]
    in_transitions = m_in.sum()
    return max(0, in_transitions -  within)

def ratio(matrix, community):
    o = out_transitions(matrix, community)
    i = in_transitions(matrix, community)
    if o == 0 and i == 0:
        return 0.5
    elif o == 0 and i != 0:
        #guarentees that we have a sink
        return 2
    elif i == 0 and o != 0:
        return 0
    #avoid division by 0
    else:
        return i/o

def classify(ratio, bounds):
    '''bounds = [upper bound for source, lower bound for sink], between 0 and 1'''
    if ratio <= bounds[0]:
        return "sink"
    elif ratio >= bounds[1]:
        return "source"
    else:
        return "manager"

def get_transition_freq(matrix, community):
    '''matrix is a numpy matrix describing the transition matrix for a graph. communities is a list of nodes'''
    #A_{ij}: represents frequency of transition from node i to node j

    #sums entries of square matrix that represents all transitions within the community and from community outward
    #np.ix_ allows easier subsetting by creating n-d meshgrid for the matrix
    ixgrid = np.ix_(community)
    m = matrix[ixgrid, :]
    comm_out_transitions = m.sum()
    
    m_in = matrix[:, ixgrid]
    comm_in_transitions = m_in.sum()
    
    #sums entries of square matrix that represents all transitions within the community only
    ixgrid_c = np.ix_(community, community)
    m_c = matrix[ixgrid_c]
    within_transition = m_c.sum()
    
    #subtract total community transitions from within community transitions
    out_transition = comm_out_transitions - within_transition
    in_transition = comm_in_transitions - within_transition
    
    ratio = in_transition/out_transition
    return (in_transition, out_transition, ratio)

In [68]:
within_transitions(transitions.as_matrix(), list(cluster0.iloc[:, 1]))
ratio(transitions.as_matrix(), list(cluster0.iloc[:, 1]))

2

In [52]:
testing = transitions.as_matrix()
t = markovChain(testing)
t.computePi()
print(t.pi)

[ 0.0705219   0.032752    0.04749825  0.029796    0.03281641  0.03034134
  0.07335767  0.03895854  0.03576786  0.06833721  0.04236353  0.03254765
  0.02265339  0.0316026   0.10369766  0.0384987   0.04795309  0.09865545
  0.06062606  0.06125468]
