In [None]:
"""
Task; extract handcrafted nodes features for each graph and nodes

Try different aggregations to get edge features

Save numpy arrays with features

Write code to train logistic regression, random forest and gradient boosted trees on them

Random search on hyperparameters to get the best result; Look at features importance for each classifier

Compute MSE; MAE

"""

In [10]:
import sys
sys.path.append("../")  # To import utils

from utils import *
import numpy as np
import networkx as nx
from tqdm import tqdm
import pickle

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
def simple_handcrafted_features(graph):
    features = {}
    features["degree"] = nx.degree(graph)
    features["clustering_coeff"] = nx.clustering(graph)
    # Generalization of eigenvector centrality
    features["katz_centrality"] = nx.katz_centrality(graph)
    # About n. shortest paths between node
    features["betweenness_centrality"] = nx.betweenness_centrality(graph)
    # About all closed walks around each nodes
    features["harmonic_centrality"] = nx.harmonic_centrality(graph)
    features["closeness_centrality"] = nx.closeness_centrality(graph)
    features["pagerank"] = nx.pagerank(graph)
    features["avg_neighbours_degree"] = nx.average_neighbor_degree(graph)
    # features["eccentricity"] = nx.eccentricity(graph)
    return features


In [48]:
def features_dict_to_vec(graph, features):
    feature_header =  list(features.keys())

    features_out = {}
    for node in graph.nodes():
        row = []
        for feature in feature_header:
            row.append(features[feature][node])
        features_out[node] = np.array(row)
    return features_out, feature_header

In [49]:
def prepare_target(graph):
    nodes = []
    targets = []
    edges = graph.edges()
    for e in edges:
        duration_avg = edges[e]["duration_avg"]
        nodes.append(e)
        targets.append(duration_avg)

    return np.array(nodes), np.array(targets)
        

In [50]:
all_cities = list_cities()
graphs = [load_graph(c, "bus") for c in all_cities]


In [None]:
model = xgboost.XGBRegressor()

In [52]:
with open("simple_features/paris.pkl", "rb") as f:
    paris_pkl = pickle.load(f)

with open("targets/paris.pkl", "rb") as f:
    paris_targets = pickle.load(f)


with open("simple_features/detroit.pkl", "rb") as f:
    detroit_pkl = pickle.load(f)

with open("targets/detroit.pkl", "rb") as f:
    detroit_targets = pickle.load(f)

In [56]:
def preprocess(features, edges):
    res = []

    for i, j in edges:
        v1 = features[i]
        v2 = features[j]
        v = v1 + v2 / 2
        res.append(v)

    return res

In [58]:
paris_features = paris_pkl["features"]
paris_edges = paris_targets["edges"]
paris_edge_features = preprocess(paris_features, paris_edges)

detroit_features = detroit_pkl["features"]
detroit_edges = detroit_targets["edges"]
detroit_edge_features = preprocess(detroit_features, detroit_edges)


In [77]:
model = xgboost.XGBRegressor(n_estimators=10_000)

X = paris_edge_features
y = paris_targets["target"]

model.fit(X, y)

In [78]:
preds = model.predict(X)

In [79]:
((preds - y) ** 2).mean()

97.12271595856805