In [11]:
import folium
import pandas as pd
import numpy as np
import webbrowser
from scipy import spatial
import networkx as nx
import matplotlib.pyplot as plt
from sklearn import decomposition
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
import os
import shutil
from haversine import haversine, Unit

In [12]:
import csv

In [33]:
class GWR:
    """Incrementally grown network of units used to cluster geographic points.

    Each unit (graph node) carries a ``vector`` (a prototype position) and an
    accumulated ``error``; each edge carries an ``age``.  Clusters are the
    connected components of the resulting graph.  The training loop follows
    the numbered steps of a growing-neural-gas style algorithm, with an extra
    distance-triggered insertion rule (see ``fit_network``).

    The class only relies on networkx calls that exist in both the 1.x and
    the 2.x/3.x API (``edges_iter``/``nodes_iter``/``Graph.node`` were removed
    from networkx), so it runs unchanged on either.
    """

    def __init__(self, input_data):
        """Store the observations to cluster.

        :param input_data: indexable 2-D array of observations; the first two
            columns are passed to ``haversine`` as the point coordinates
            (presumably latitude, longitude - confirm against the data file).
        """
        self.network = None
        self.data = input_data
        self.units_created = 0
        # Per-step / per-pass statistics of the most recent fit_network() call.
        self.training_log = {}
        plt.style.use('ggplot')

    def dis(self, u1, u2):
        """Return the haversine distance between the first two components of
        ``u1`` and ``u2`` (haversine's default unit, kilometres per its docs)."""
        return haversine(u1[0:2], u2[0:2])

    def _node_attrs(self, unit):
        """Return the live attribute dict of ``unit``.

        networkx >= 2.0 exposes it as ``G.nodes[unit]``; networkx 1.x only has
        ``G.node[unit]`` (there ``G.nodes`` is a plain method, so subscripting
        it raises ``TypeError``, which triggers the fallback).
        """
        graph = self.network
        try:
            return graph.nodes[unit]
        except TypeError:
            return graph.node[unit]

    def find_nearest_units(self, observation):
        """Return all unit ids ordered from nearest to farthest from
        ``observation`` (distance measured with ``self.dis``)."""
        distances = [(unit, self.dis(attrs['vector'], observation))
                     for unit, attrs in self.network.nodes(data=True)]
        distances.sort(key=lambda pair: pair[1])
        return [unit for unit, _ in distances]

    def prune_connections(self, a_max):
        """Remove edges older than ``a_max`` and then isolated units.

        At least two units are always kept so that a nearest and a
        second-nearest unit can still be found.
        """
        graph = self.network
        # Collect first, remove afterwards: removing while iterating a live
        # networkx view raises "dictionary changed size during iteration".
        stale_edges = [(u, v) for u, v, attrs in graph.edges(data=True)
                       if attrs['age'] > a_max]
        graph.remove_edges_from(stale_edges)
        for unit in list(graph.nodes()):
            if graph.degree(unit) == 0 and graph.number_of_nodes() > 2:
                graph.remove_node(unit)

    def fit_network(self, e_b, e_n, a_max, l, a, d, r_min, passes=1, plot_evolution=False):
        """Grow the network over ``self.data``.

        :param e_b: fraction by which the nearest unit moves towards an observation.
        :param e_n: fraction used for the update applied to the nearest unit's neighbours.
        :param a_max: edges whose age exceeds this value are removed.
        :param l: a new unit is inserted every ``l`` observations of a pass.
        :param a: factor applied to the nearest unit's error when a unit is inserted.
        :param d: factor applied to every unit's error after each observation.
        :param r_min: threshold used both to connect the two nearest units
            (distance <= r_min) and to trigger an insertion (see step 7).
        :param passes: number of sweeps over the data.
        :param plot_evolution: if true, save a plot before every insertion to
            ``./visualization/GNG<n>.png`` (the directory must already exist).
        :return: None.  Statistics are stored in ``self.training_log``.
        :raises IndexError: if ``self.data`` has fewer than two rows.
        """
        # logging variables
        accumulated_local_error = []
        global_error = []
        network_order = []
        network_size = []
        total_units = []

        # 0. start with two units placed on the first two observations
        self.units_created = 0
        self.network = nx.Graph()
        graph = self.network
        for seed_vector in (self.data[0], self.data[1]):
            graph.add_node(self.units_created, vector=seed_vector, error=0)
            self.units_created += 1

        # 1. iterate through the data
        sequence = 0
        for p in range(passes):
            print('   Pass #%d' % (p + 1))
            steps = 0
            for observation in self.data:
                # 2. find the nearest unit s_1 and the second nearest unit s_2
                nearest_units = self.find_nearest_units(observation)
                s_1, s_2 = nearest_units[0], nearest_units[1]
                winner = self._node_attrs(s_1)

                # 3. increment the age of all edges emanating from s_1
                for u, v in list(graph.edges(nbunch=[s_1])):
                    graph[u][v]['age'] += 1

                # 4. add the squared distance between the observation and s_1
                #    to the error of s_1
                dist = self.dis(observation, winner['vector']) ** 2
                winner['error'] += dist

                # 5. move s_1 towards the observation by the fraction e_b and
                #    shift its direct topological neighbours
                winner['vector'] = np.add(
                    winner['vector'],
                    e_b * np.subtract(observation, winner['vector']))
                # NOTE(review): the neighbour shift is computed from the
                # (already moved) s_1 vector, not from each neighbour's own
                # vector as in the textbook rule e_n * (x - w_n).  Kept as is
                # to preserve existing results - confirm the intent.
                update_w_s_n = e_n * np.subtract(observation, winner['vector'])
                for neighbor in graph.neighbors(s_1):
                    neighbor_attrs = self._node_attrs(neighbor)
                    neighbor_attrs['vector'] = np.add(neighbor_attrs['vector'], update_w_s_n)

                # 6. connect s_1 and s_2 (or reset the age of their edge to
                #    zero) unless they are further apart than r_min
                if self.dis(winner['vector'], self._node_attrs(s_2)['vector']) <= r_min:
                    graph.add_edge(s_1, s_2, age=0)

                # 7. insert a new unit when the observation is far from s_1,
                #    or on every l-th observation
                # NOTE(review): `dist` is the *squared* distance here while
                # step 6 compares the plain distance against the same r_min -
                # confirm which is intended.
                steps += 1
                if dist >= r_min or steps % l == 0:
                    if plot_evolution:
                        self.plot_network('./visualization/GNG' + str(sequence) + '.png')
                    sequence += 1
                    # 8. the new unit r sits halfway between s_1 and the
                    #    observation.  A failure here is allowed to propagate:
                    #    swallowing it would insert an undefined/stale vector.
                    w_r = 0.5 * np.add(winner['vector'], observation)
                    r = self.units_created
                    self.units_created += 1
                    winner['error'] *= a
                    graph.add_node(r, vector=w_r, error=winner['error'])
                    graph.add_edge(r, s_1, age=0)

                # 9. remove edges with an age larger than a_max; units left
                #    without any edge are removed as well
                self.prune_connections(a_max)

                # 10. log the network state, then decay every error by d
                accumulated_local_error.append(
                    sum(attrs['error'] for _, attrs in graph.nodes(data=True)))
                network_order.append(graph.order())
                network_size.append(graph.size())
                total_units.append(self.units_created)
                for _, attrs in graph.nodes(data=True):
                    attrs['error'] *= d
            global_error.append(self.compute_global_error())

        self.training_log = {
            'accumulated_local_error': accumulated_local_error,
            'global_error': global_error,
            'network_order': network_order,
            'network_size': network_size,
            'total_units': total_units,
        }

    def plot_network(self, file_path):
        """Scatter the first two data columns, draw the network on top using
        the first two components of every unit vector, and save the figure to
        ``file_path``."""
        plt.clf()
        plt.scatter(self.data[:, 0], self.data[:, 1])
        node_pos = {unit: (attrs['vector'][0], attrs['vector'][1])
                    for unit, attrs in self.network.nodes(data=True)}
        nx.draw(self.network, pos=node_pos)
        plt.draw()
        plt.savefig(file_path)

    def get_nodes(self, map1):
        """Add one marker per unit to the folium map ``map1`` and return the
        list of unit vectors.

        Units of the same connected component share an icon colour; colours
        are reused cyclically when there are more components than colours.
        """
        colors = ['darkblue', 'darkgreen', 'cadetblue',
                  'pink', 'lightblue', 'lightgreen', 'gray', 'black', 'lightgray']
        nodes = []
        for col, component in enumerate(nx.connected_components(self.network)):
            # folium.Marker takes its colour through an Icon, not through
            # radius/color keyword arguments.
            icon_color = colors[col % len(colors)]
            for unit in component:
                vector = self._node_attrs(unit)['vector']
                nodes.append(vector)
                folium.Marker(location=[vector[0], vector[1]],
                              icon=folium.Icon(color=icon_color)).add_to(map1)
        return nodes

    def number_of_clusters(self):
        """Return the number of connected components of the network."""
        return nx.number_connected_components(self.network)

    def cluster_data(self):
        """Assign every observation to the cluster of its nearest unit.

        Prints the connected components, then returns a dict mapping the
        integer cluster index (position of the component in
        ``nx.connected_components``) to the list of observations assigned to
        it.  Clusters that attract no observation are absent from the dict.
        """
        unit_to_cluster = {}
        print('connected nodes are as follows:')
        for cluster, component in enumerate(nx.connected_components(self.network)):
            print(component)
            for unit in component:
                unit_to_cluster[unit] = cluster
        clustered_data = {}
        for observation in self.data:
            nearest = self.find_nearest_units(observation)[0]
            clustered_data.setdefault(unit_to_cluster[nearest], []).append(observation)
        return clustered_data

    def compute_global_error(self):
        """Return the sum over all observations of the squared distance to
        their nearest unit."""
        global_error = 0
        for observation in self.data:
            s_1 = self.find_nearest_units(observation)[0]
            global_error += self.dis(observation, self._node_attrs(s_1)['vector']) ** 2
        return global_error

In [100]:
class Test_algorithm:
    """Notebook driver: loads a trajectory file, clusters it with ``GWR`` and
    renders / exports the result."""

    def __init__(self, method_type='GNG', data_path='test_data.txt'):
        """Load the trajectory points.

        :param method_type: accepted for backward compatibility; not used here.
        :param data_path: comma-separated file with seven numeric columns per
            row (the source comment says it comes from Geolife Trajectories).
        """
        data_arr = np.genfromtxt(data_path, delimiter=',', skip_header=0)
        data_df = pd.DataFrame(data=data_arr, columns=['lan', 'long', 'alt', '*', 'time', '**', '***'])
        # Keep the two coordinate columns plus altitude; only the first two
        # are used for plotting and for the haversine distance in GWR.
        self.df = data_df[['lan', 'long', 'alt']]

    def _map_center(self):
        """Return ``[lan, long]`` of the second row, used to centre new maps."""
        return self.df[['lan', 'long']].iloc[1].tolist()

    def data_visualization(self, map1=None):
        """Add one circle marker per data point to ``map1`` (a new folium map
        is created when ``map1`` is None) and return the map."""
        if map1 is None:
            map1 = folium.Map(location=self._map_center(), tiles='cartodbpositron', zoom_start=16)
        for lat, lon in zip(self.df['lan'], self.df['long']):
            folium.CircleMarker(location=[lat, lon], radius=5).add_to(map1)
        return map1

    def plot_clusters(self, number_of_clusters, clustered_data, map1):
        """Draw the observations of clusters ``0 .. number_of_clusters - 1``
        on ``map1``, one colour per cluster, and return the map.

        Colours are reused cyclically when there are more clusters than
        colours (previously this raised ``IndexError``).
        """
        colors = ['red', 'blue', 'green', 'purple', 'orange', 'darkred', 'beige', 'darkblue', 'darkgreen', 'cadetblue',
                  'pink', 'lightblue', 'lightgreen', 'gray', 'black', 'lightgray']
        for i in range(number_of_clusters):
            if i not in clustered_data:
                continue
            cluster_color = colors[i % len(colors)]
            for row in np.array(clustered_data[i]):
                folium.CircleMarker(location=[row[0], row[1]], radius=5, color=cluster_color).add_to(map1)
        return map1

    def save_to_csv(self, filename, clustered_data):
        """Write one ``first_coord,second_coord,cluster`` row per observation.

        Every key of ``clustered_data`` is exported.  Cluster labels need not
        be contiguous (a cluster that attracts no observation has no key), so
        iterating ``range(len(clustered_data))`` would silently drop the
        clusters with the highest labels.
        """
        with open(filename, 'w', newline='') as file:
            writer = csv.writer(file)
            for label in sorted(clustered_data):
                # int() keeps the label column free of a trailing ".0" when
                # the keys are numpy floats.
                label_text = str(int(label))
                for row in np.array(clustered_data[label]):
                    writer.writerow([str(row[0]), str(row[1]), label_text])

    def all_points_normal(self, method_type='GWR'):
        """Cluster all points with ``GWR``, display the result on a folium map
        and save the assignment to ``GNG_result.csv``.

        Any ``method_type`` other than ``'GWR'`` is a no-op.  ``display`` is
        the IPython builtin, so this method is meant to run inside a notebook.
        """
        if method_type != 'GWR':
            return

        gwr = GWR(self.df.values)
        gwr.fit_network(e_b=0.5, e_n=0.01, a_max=1, l=2, a=0.5, d=0.5, r_min=0.0001, passes=1, plot_evolution=False)
        n_clusters = gwr.number_of_clusters()
        print('Found %d cluster centers.' % n_clusters)
        clusters = gwr.cluster_data()
        map1 = folium.Map(location=self._map_center(), tiles='cartodbpositron', zoom_start=12)
        map1 = self.plot_clusters(n_clusters, clusters, map1)

        # plot the units, one icon colour per connected component
        colors = ['darkblue', 'darkgreen', 'cadetblue',
                  'pink', 'lightblue', 'lightgreen', 'gray', 'black', 'lightgray']
        # nodes(data=True) works on every networkx version, unlike Graph.node
        vectors = {unit: attrs['vector'] for unit, attrs in gwr.network.nodes(data=True)}
        for col, component in enumerate(nx.connected_components(gwr.network)):
            icon_color = colors[col % len(colors)]
            for unit in component:
                vector = vectors[unit]
                folium.Marker(location=[vector[0], vector[1]],
                              icon=folium.Icon(color=icon_color)).add_to(map1)
        display(map1)
        # save the mapping result to csv file
        self.save_to_csv('GNG_result.csv', clusters)



In [101]:
# Build the experiment driver; its constructor reads 'test_data.txt' from the
# current working directory by default.
test = Test_algorithm()
# Optional: preview the raw points on a map before clustering.
# map1 = test.data_visualization()
# display(map1)

In [102]:
# Run the full pipeline: fit the GWR network, display the clustered map and
# write the cluster assignment to 'GNG_result.csv'.
test.all_points_normal('GWR')

   Pass #1
Found 4 cluster centers.
connected nodes are as follows:
{1, 2}
{8, 9, 3, 13}
{10, 11, 4, 12}
{5, 6, 7}


<Figure size 432x288 with 0 Axes>