In [9]:
import numpy as np
import pandas as pd
import networkx as nx
from k_means_constrained import KMeansConstrained

from participant import load_participants

In [10]:
# Location of the participants JSON file (a relative path, so it is resolved
# against the kernel's current working directory).
data_path = "data/datathon_participants_final.json"
# Load every participant record. The result is used below as an indexable,
# iterable collection of objects exposing attributes such as `id`, `name`,
# `interests`, `availability`, etc.
participants = load_participants(data_path)

In [11]:
# Calculate weight of two interest vectors
def calc_weights(v1: list, v2: list) -> float:
    """Return the mean element-wise maximum of two vectors, scaled by 1/5.

    For each position ``i`` in ``v1`` the larger of ``v1[i]`` and ``v2[i]`` is
    taken; the sum of those maxima is divided by ``5 * len(v1)``.
    NOTE(review): the constant 5 is presumably the maximum rating on the
    interest scale -- confirm against the data.

    Args:
        v1: First vector; its length determines how many positions are compared.
        v2: Second vector; must have at least ``len(v1)`` elements (extra
            trailing elements are ignored).

    Returns:
        The scaled weight, or ``0.0`` when ``v1`` is empty.

    Raises:
        IndexError: If ``v2`` is shorter than ``v1``.
    """
    n = len(v1)
    if n == 0:
        # Nothing to compare; avoid dividing by zero.
        return 0.0
    if len(v2) < n:
        # Same exception type that indexing v2[i] would have produced.
        raise IndexError("v2 must have at least as many elements as v1")
    # zip stops after len(v1) pairs because v2 is at least that long.
    d = sum(max(a, b) for a, b in zip(v1, v2))
    return d / (5 * n)

In [15]:
### MAKE GRAPH WITH WEIGHTS DEPENDING ON FEATURES ##############################

def _count_matches(items_a, items_b) -> int:
    """Count the (a, b) pairs, one item from each collection, that are equal."""
    return sum(1 for a in items_a for b in items_b if a == b)


def make_graph(
    role_mult: float,
    interests_mult: float,
    year_mult: float,
    friend_mult: float,
    challenges_mult: float,
    languages_mult: float,
    objective_mult: float,
    availability_mult: float,
    edge_threshold: float,
    num_nodes: int,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Build node and edge tables describing pairwise participant affinity.

    Reads the module-level ``participants`` collection. For every pair among
    the first ``num_nodes`` participants a per-feature score is computed, the
    scores are combined with the given multipliers, and the totals are
    normalised by the largest total before thresholding.

    Args:
        role_mult: Multiplier for the role score (1 when preferred roles
            differ, 0 when they are the same).
        interests_mult: Multiplier for the shared-interests score (matching
            interests divided by the longer of the two interest lists).
        year_mult: Multiplier for the year-of-study score,
            ``(5 - |year difference|) / 5``.
        friend_mult: Multiplier for the friend score (0, 1 or 2 depending on
            how many of the two list the other in ``friend_registration``).
        challenges_mult: Multiplier for the shared-challenges score
            (matching challenges divided by 3).
        languages_mult: Multiplier for the language score (1 if the two share
            at least one preferred language, else 0).
        objective_mult: Multiplier for the dot product of the two
            ``objective`` vectors.
        availability_mult: Multiplier for the availability score (slots where
            both are available, divided by 5).
        edge_threshold: Only edges whose normalised weight is strictly greater
            than this value are kept.
        num_nodes: Number of leading participants to pair up. The ``nodes``
            table always lists every participant regardless of this value.

    Returns:
        ``(nodes, edges)`` where ``nodes`` has columns ``id`` and ``label`` and
        ``edges`` has columns ``id1``, ``id2`` and ``weight``. ``edges`` is
        empty when fewer than two participants are paired.

    Raises:
        IndexError: If a pair index reaches past the end of ``participants``.
    """
    # Node table: one row per participant.
    nodes = pd.DataFrame(data=[[p.id, p.name] for p in participants],
                         columns=['id', 'label'])

    # Collect rows in a plain list and build the DataFrame once at the end;
    # appending to a DataFrame row by row copies data on every insertion.
    edge_rows = []

    for i in range(num_nodes):
        for j in range(i + 1, num_nodes):

            p1 = participants[i]
            p2 = participants[j]

            # year_of_study
            year_weight = (5 - abs(p1.year_of_study - p2.year_of_study)) / 5

            # interests: matches normalised by the longer list (min 1 to
            # avoid dividing by zero when both lists are empty)
            interests_weight = _count_matches(p1.interests, p2.interests) / max(
                len(p1.interests), len(p2.interests), 1
            )

            # preferred_role: different roles are rewarded
            role_weight = 1 if p1.preferred_role != p2.preferred_role else 0

            # friend_registration: one point per direction of the friendship
            friend_weight = 0
            if p1.id in p2.friend_registration:
                friend_weight += 1
            if p2.id in p1.friend_registration:
                friend_weight += 1

            # interest_in_challenges
            challenges_weight = _count_matches(
                p1.interest_in_challenges, p2.interest_in_challenges
            ) / 3

            # preferred_languages: binary, at least one language in common
            languages_weight = (
                1 if _count_matches(p1.preferred_languages, p2.preferred_languages) else 0
            )

            # objective
            objective_weight = np.dot(np.array(p1.objective), np.array(p2.objective))

            # availability: slots in which both participants are available
            availability_weight = sum(
                1
                for slot, is_free in p1.availability.items()
                if is_free and p2.availability[slot]
            ) / 5

            # Weighted sum of all feature scores
            total_weight = (
                year_mult * year_weight
                + interests_mult * interests_weight
                + role_mult * role_weight
                + friend_mult * friend_weight
                + challenges_mult * challenges_weight
                + languages_mult * languages_weight
                + objective_mult * objective_weight
                + availability_mult * availability_weight
            )

            edge_rows.append((p1.id, p2.id, total_weight))

    edges = pd.DataFrame(edge_rows, columns=['id1', 'id2', 'weight'])

    # No pairs at all (num_nodes < 2): nothing to normalise or filter.
    if edges.empty:
        return nodes, edges

    # Normalize edges and remove small ones
    max_weight = edges['weight'].max()
    if max_weight != 0.0:
        edges['weight'] = edges['weight'] / max_weight
    edges = edges[edges['weight'] > edge_threshold]

    return nodes, edges

In [16]:
### CLUSTERING FUNCTION TO MAKE TEAMS ##########################################

def make_teams(
    nodes: pd.DataFrame,
    edges: pd.DataFrame,
    n_clusters,
    size_min: int = 2,
    size_max: int = 4,
    random_state: int = 0,
) -> np.ndarray:
    """Assign every node to a size-constrained cluster (team).

    Each edge weight is inverted (``1 / weight``), a graph is built from the
    nodes and the inverted edges, and the rows of its adjacency matrix are fed
    to ``KMeansConstrained`` as feature vectors.

    Args:
        nodes: Table with an ``id`` column; one graph node is created per id.
        edges: Table with ``id1``, ``id2`` and ``weight`` columns. Weights are
            inverted, so they must be non-zero.
        n_clusters: Number of teams to form.
        size_min: Minimum number of members per team.
        size_max: Maximum number of members per team.
        random_state: Seed passed to the clustering for reproducibility.

    Returns:
        The labels produced by ``KMeansConstrained.fit_predict``, one per row
        of the adjacency matrix.
        NOTE(review): rows are presumably in the order the ids appear in
        ``nodes["id"]`` (nodes are added before edges) -- confirm against the
        NetworkX ``to_numpy_array`` ordering rules.
    """
    # Create inverse edges
    inv_edges = edges.copy()
    inv_edges['weight'] = 1 / edges['weight']

    # Create graph with inverse edges; nodes first so that participants
    # without any surviving edge are still present in the graph.
    inv_graph = nx.Graph()
    inv_graph.add_nodes_from(nodes["id"])
    inv_graph.add_edges_from(
        zip(
            inv_edges["id1"],
            inv_edges["id2"],
            inv_edges[["weight"]].to_dict(orient="records"),
        )
    )

    # Clustering parameters
    clf = KMeansConstrained(
        n_clusters=n_clusters,
        size_min=size_min,
        size_max=size_max,
        random_state=random_state,
    )

    # Get adjacency matrix from graph
    adj_matrix = nx.to_numpy_array(inv_graph)

    # Calc clusters
    clusters = clf.fit_predict(adj_matrix)
    return clusters

In [18]:
### PIPELINE ###################################################################

# Number of participants to pair up and cluster.
# NOTE(review): presumably equal to len(participants) -- confirm.
NUM_NODES = 924
# Target average team size; the number of teams is NUM_NODES divided by this.
AVG_TEAM_SIZE = 3.2

# Build the affinity graph with every feature weighted equally.
nodes, edges = make_graph(
    role_mult=0.5,
    interests_mult=0.5,
    year_mult=0.5,
    friend_mult=0.5,
    challenges_mult=0.5,
    languages_mult=0.5,
    objective_mult=0.5,
    availability_mult=0.5,
    edge_threshold=0.2,
    num_nodes=NUM_NODES,
)

# Cluster the participants into teams.
clusters = make_teams(
    nodes=nodes,
    edges=edges,
    n_clusters=round(NUM_NODES / AVG_TEAM_SIZE),
)

# Attach each participant's team label to the node table.
nodes['clusters'] = clusters

In [19]:
# Write the node table (including the team assignment column) to disk,
# omitting the DataFrame index.
nodes.to_csv(path_or_buf='results.csv', index=False)