In [42]:
import numpy as np
import pandas as pd
from random import shuffle

# Decomposition
!pip install umap-learn
import umap
import sklearn.decomposition as skld
import sklearn.manifold as sklm

# Clustering
import sklearn.cluster as sklc
import scipy.cluster.hierarchy as sch

# Visualization
from matplotlib import pyplot as plt
%matplotlib inline

In [3]:
# Mount Google Drive inside the Colab runtime so the project files can be
# reached through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')
# Base folder (note the trailing slash): data file names are appended to it
# by plain string concatenation.
path = "/content/drive/MyDrive/Q7/TAED2/TechLoan/TDD/"

Mounted at /content/drive


In [17]:
def hieragglo(data, transform=None, linkage="ward", criteria=None, parameter=None, dendro=False, fig_dim=(20, 8)):
    """
    Hierarchical agglomerative clustering with an optional dendrogram plot.

    The function works in two modes:

    * Exploration ('criteria' or 'parameter' left as None): the linkage matrix
      is computed and, if 'dendro' is True, the dendrogram is drawn. Nothing is
      returned.
    * Clustering ('criteria' and 'parameter' both given): the tree is cut and
      the cluster assigned to each entry is returned.

    > data (no default): dataframe/array of entries (rows) and their features (columns).
    > transform (default None): transformation applied to the data before clustering.
    Only "umap" (UMAP embedding with default settings) or None (use the data as is)
    are accepted.
    > linkage (default ward): string defining the type of linkage used for the linkage
    matrix ("single", "average", "complete", "ward").
    > criteria (default None): if criteria is "n_clusters", 'parameter' is the number of
    clusters to return. If criteria is "height", 'parameter' is the height at which the
    dendrogram is cut. If criteria is None no clusters are returned; use this option to
    explore the data ('parameter' is ignored).
    > parameter (default None): value associated with 'criteria' (see above).
    > dendro (default False): boolean telling whether the dendrogram should be drawn.
    When drawn with criteria "height", a dashed red line marks the cut.
    > fig_dim (default (20,8)): tuple (width, height) with the size of the dendrogram
    figure (ignored if the dendrogram is not drawn).

    Returns None in exploration mode or when an argument is invalid (an
    "ERROR: ..." message is printed in that case). In clustering mode it returns
    the output of scipy's cut_tree unchanged: an integer array with one row per
    entry and a single column, i.e. shape (n_entries, 1).
    """
    if not isinstance(linkage, str) or linkage not in ("single", "average", "complete", "ward"):
        print("ERROR: argument 'linkage' can only have values 'single', 'average', 'complete', 'ward'")
        return None

    if transform is not None and transform != "umap":
        print("ERROR: argument 'transform' can only have values 'umap'")
        return None

    # An unknown criteria used to reach the final `return clusters` with the
    # variable unbound (UnboundLocalError); reject it up front, before any
    # expensive embedding/linkage work is done.
    if criteria is not None and criteria not in ("n_clusters", "height"):
        print("ERROR: argument 'criteria' can only have values 'n_clusters', 'height' or None")
        return None

    do_clustering = criteria is not None and parameter is not None

    x = umap.UMAP().fit_transform(data) if transform == "umap" else data
    linkage_mat = sch.linkage(x, method=linkage)

    if dendro:         # plotting the dendrogram
        plt.figure(figsize=fig_dim)
        sch.dendrogram(linkage_mat)
        plt.title(f"Dendrogram (linkage = {linkage})")
        if do_clustering and criteria == "height":
            plt.axhline(y=parameter, color='r', linestyle='--')
        plt.show()

    if not do_clustering:
        return None

    # computing the clusters
    if criteria == "n_clusters":
        return sch.cut_tree(linkage_mat, n_clusters=parameter)
    return sch.cut_tree(linkage_mat, height=parameter)

In [18]:
# Load the committer-level dataset from the mounted project folder.
df = pd.read_csv(path + "committer-level_dataframe.csv")

# The users have carried out the previous exploration to determine which squad (cluster) represents each experience level
clust2XP = ["senior", "experienced", "newbie"]  # levels ordered according to the cluster that represents them ('senior' is represented by cluster 0)
devsXsquad = 10

In [59]:
def parameter_error(df, devsXsquad, clustToXP):
    """
    Validate the arguments of the squad-building functions.

    Checks are performed in order and stop at the first failure, printing an
    "ERROR: ..." message that describes it.

    > df: must be a pandas DataFrame.
    > devsXsquad: must be an int (booleans are rejected) greater than or equal to 3.
    > clustToXP: must be an iterable with exactly three elements containing
    'senior', 'experienced' and 'newbie' (in any order).

    Returns True if some argument is invalid, False otherwise.
    """
    required_levels = ("senior", "experienced", "newbie")

    if not isinstance(df, pd.DataFrame):
        print("ERROR: Parameter 'df' needs to be a pandas dataframe.")
        return True

    # bool is a subclass of int, so it has to be excluded explicitly.
    if isinstance(devsXsquad, bool) or not isinstance(devsXsquad, int) or devsXsquad < 3:
        print("ERROR: Check parameter 'devsXsquad'.")
        return True

    # Validate the argument that was actually passed in (the previous version
    # inspected a notebook-level global and ignored this parameter). A
    # non-iterable value is treated as invalid instead of raising TypeError.
    try:
        levels = list(clustToXP)
    except TypeError:
        levels = []
    if len(levels) != 3 or not all(lvl in levels for lvl in required_levels):
        print("ERROR: Check parameter 'clustToXP'.")
        return True

    return False

def print_squads(squads):
    """
    Print the squads in a readable, indented layout.

    > squads: iterable of dictionaries mapping an experience level to the
    collection of developer names (strings) assigned to that level.

    For each squad a "Squad #<index>:" header is printed, followed by every
    level (indented 5 spaces) and its developers (indented 10 spaces).
    """
    level_indent = " " * 5
    dev_indent = " " * 10
    for number, squad in enumerate(squads):
        print(f"Squad #{number}:")
        for level, members in squad.items():
            print(level_indent + f"{level}:")
            for member in members:
                print(dev_indent + member)

def proportional_autosquad(df, devsXsquad, clustToXP, bool_print=False, seed=None):
    """
    Build squads whose mix of experience levels mirrors the whole population.

    Developers are embedded with UMAP, split into three clusters with
    complete-linkage hierarchical clustering, and each cluster is mapped to an
    experience level through 'clustToXP'. Every squad then receives, for each
    level, a number of developers proportional to the size of that level.

    > df: pandas DataFrame with a 'COMMITTER' column (developer names); every
    other column is used as a feature for the embedding.
    > devsXsquad: number of developers per squad (int >= 3).
    > clustToXP: sequence with the three levels 'senior', 'experienced' and
    'newbie', where position i holds the level represented by cluster i.
    > bool_print (default False): if True, the squads are printed with print_squads.
    > seed (default None): seed used for both the UMAP embedding and the shuffle
    of the developers. With None the result changes from run to run; pass an
    integer to make it reproducible.

    Returns a list of squads, each a dictionary {level: array of developer
    names}. Only complete squads are returned, so developers that do not fit
    in a full squad are left out. Returns None if the arguments are invalid
    (parameter_error prints the reason).
    """
    if parameter_error(df, devsXsquad, clustToXP):
        return None
    if len(df) == 0:
        return []

    # Use the levels received as argument, not a notebook-level global.
    levels = list(clustToXP)

    # Clusterize developers using the techniques we found to give the best results
    devs = np.array(df["COMMITTER"])
    features = df.loc[:, df.columns != 'COMMITTER']
    embedded = umap.UMAP(random_state=seed).fit_transform(features)
    clust = hieragglo(embedded, linkage="complete", criteria="n_clusters", parameter=len(levels))
    if clust is None:
        return None
    # hieragglo returns one label per row as an (n, 1) array; flatten it so the
    # comparisons below yield plain 1-D boolean masks.
    labels = np.asarray(clust).ravel()

    # Group the developers by level, in random order inside each group.
    # (Boolean indexing copies, so 'devs' itself is left untouched.)
    rng = np.random.default_rng(seed)
    grouped_devs = {lvl: rng.permutation(devs[labels == i]) for i, lvl in enumerate(levels)}

    # Developers of each level per squad, proportional to the level's share.
    prop = {lvl: round(len(grouped_devs[lvl]) / len(devs) * devsXsquad) for lvl in levels}
    # Independent rounding can leave the total one above or one below the
    # requested squad size; absorb the difference in the largest level.
    excess = sum(prop.values()) - devsXsquad
    if excess:
        prop[max(prop, key=prop.get)] -= excess

    # Number of complete squads that can be filled. Levels with a quota of 0
    # impose no limit (and would otherwise cause a division by zero).
    fillable = [len(grouped_devs[lvl]) // prop[lvl] for lvl in levels if prop[lvl] > 0]
    nsquads = min(fillable) if fillable else 0

    squads = [
        {lvl: grouped_devs[lvl][k * prop[lvl]:(k + 1) * prop[lvl]] for lvl in levels}
        for k in range(nsquads)
    ]

    # Print the squads
    if bool_print:
        print_squads(squads)

    return squads

In [60]:
# Build squads of 10 developers and print them; the returned list is assigned
# to `_` because only the printed listing is needed in this cell.
_ = proportional_autosquad(df, 10, ["senior","experienced","newbie"], bool_print=True)

Squad #0:
     senior:
          David Graham
          Christopher Oliver
     experienced:
          Ignacio J. Ortega
          Outerthought - Steven Noels
          coheigea
          Dean Jackson
          Siying Dong
          henrib <>
     newbie:
          Andrew Kornev
          Francesco Furfari
Squad #1:
     senior:
          Vincent Hardy
          Stephan Michels
     experienced:
          Michael Wechner
          Luca Morandini
          Andrei Dulvac
          Luis Bernardo
          dave
          Karl Lehenbauer
     newbie:
          Robert Leland
          Steven M. Cohen
Squad #2:
     senior:
          Oleg Kalnichevski
          Ashutosh Chauhan
     experienced:
          Julius Davies
          Jim Jagielski
          Clay Leeds
          Vaibhav Gumashta
          Javier Puerto
          Eric N. Hanson
     newbie:
          Stefano Lenzi
          Marc Slemko
Squad #3:
     senior:
          Phil Steitz
          Carsten Ziegeler
     experienced:
        