In [None]:
import pandas as pd
import numpy as np
import math
import scipy.stats as stats
import networkx as nx

In [None]:
# Load the raw table as a float array. Forcing a float dtype matters: an
# all-integer CSV would otherwise produce an integer array, and the log values
# assigned below would be silently truncated to integers.
data = pd.read_csv('data.csv').to_numpy(dtype=float)

# Transform the income dimension to a logarithmic scale.
# NOTE(review): `income_index` is not defined in the cells visible here --
# confirm it is set before this cell runs on a fresh kernel.
# NOTE(review): np.log yields -inf / NaN for zero or negative values --
# confirm the income column is strictly positive.
data[:, income_index] = np.log(data[:, income_index])

# Normalize each dimension by dividing by its standard deviation.
# Constant columns (std == 0) are left unscaled to avoid division by zero.
# `data` itself is left untouched so later cells that read it see the same
# (log-transformed, unscaled) values as before.
column_std = data.std(axis=0)
column_std[column_std == 0] = 1.0
data_normalized = data / column_std

# Perform PCA, and project onto the top N dimensions so that they explain 50% of the variance.
# NOTE(review): `perform_pca` is not defined in the cells visible here --
# confirm it is defined/imported earlier and does not rescale the data itself.
new_data = perform_pca(data_normalized, variance_threshold=0.5)

In [None]:
def create_network(data, weights, window_size):
    """Build a graph that links data points through per-dimension marker chains.

    Every row of ``data`` becomes a node keyed by its row index, with the row
    stored in the ``features`` node attribute.  For each dimension, a chain of
    marker nodes is laid out on a regular grid with spacing ``window_size``:
    consecutive markers are joined by an edge, and every data point is joined
    to the marker whose grid index is ``floor(value / window_size)``.  All
    edges created for dimension ``dim`` carry ``weight=weights[dim]``.

    Marker nodes are keyed by the string ``'<dim>,<k>'`` where ``k`` is the
    integer grid index, and carry the attributes ``marker=True``, ``dim`` and
    ``value`` (``k * window_size``).  Keying by the grid index instead of a
    rounded value guarantees that two distinct markers never share a key.

    The marker chain of each dimension always includes grid index 0 and runs
    from the lowest occupied grid index (or 0) up to one past the highest
    occupied grid index (or 0), so negative values are supported.

    Parameters
    ----------
    data : array-like, 2-D
        One row per data point, one column per dimension.
    weights : indexable
        ``weights[dim]`` is the edge weight used for dimension ``dim``.
    window_size : number
        Spacing between consecutive markers; must be positive.

    Returns
    -------
    networkx.Graph
        Graph containing the data-point nodes and the marker nodes.

    Raises
    ------
    ValueError
        If ``window_size`` is not positive, ``data`` is not 2-D, or a
        dimension contains NaN / infinite values.
    """
    # `not x > 0` (rather than `x <= 0`) also rejects NaN.
    if not window_size > 0:
        raise ValueError(f'window_size must be positive, got {window_size!r}')

    data = np.asarray(data)
    if data.ndim != 2:
        raise ValueError(f'data must be 2-dimensional, got {data.ndim} dimension(s)')

    # First add each data point as a node.
    G = nx.Graph()
    for i, point in enumerate(data):
        G.add_node(i, features=point)

    n_points, n_dims = data.shape
    if n_points == 0:
        # No points means no value range to place markers on.
        return G

    # For each dimension, add a chain of markers and hook the points onto it.
    for dim in range(n_dims):
        weight = weights[dim]

        values = np.asarray(data[:, dim], dtype=float)
        if not np.all(np.isfinite(values)):
            raise ValueError(f'dimension {dim} contains NaN or infinite values')

        # Grid index of the marker each point attaches to.  The same array is
        # used to size the marker chain, so every point is guaranteed to find
        # its marker.
        bins = np.floor(values / window_size).astype(int)
        lowest = min(0, int(bins.min()))
        highest = max(0, int(bins.max())) + 1

        # Add each marker in the dimension as a node, chained to its
        # predecessor, so that data points close to it can connect to it.
        labels = {k: f'{dim},{k}' for k in range(lowest, highest + 1)}
        previous = None
        for k in range(lowest, highest + 1):
            label = labels[k]
            G.add_node(label, marker=True, dim=dim, value=k * window_size)
            if previous is not None:
                G.add_edge(previous, label, weight=weight)
            previous = label

        # Now connect each data point to the marker of its grid cell.
        for i, k in enumerate(bins):
            G.add_edge(i, labels[int(k)], weight=weight)

    return G