In [1]:
#importing libraries
import pandas as pd
from sklearn.neighbors import KDTree

In [2]:
# Load the vote records from the CSV export (path is relative to the
# notebook's working directory) and preview the first rows.
VOTES_CSV = "votes_skills_anon.csv"
df = pd.read_csv(VOTES_CSV)
df.head()

Unnamed: 0,SUID,univid,response
0,538,5c9e7a6834cda4f3677e662b,Internationality
1,2285,5c9e373c34cda4f3677e14ef,Practical relevance / professionalization
2,2285,5c9e7d8334cda4f3677e6b4f,Internationality
3,2285,5e84b8a01ead1a77be3bd3a8,Soft skills and digital literacy
4,2285,5c9e471734cda4f3677e2ed7,Internationality


#### Each row corresponds to one vote, where:
#### SUID: the unique ID of the voter
#### univid: the unique ID of the institution
#### response: the skill the voter selected

In [4]:
# Count how many votes each institution received for each skill:
# one output row per (univid, response) pair, with the count in `value`.
vote_groups = df.groupby(["univid", "response"])
skills = vote_groups.size().reset_index(name="value")
skills.head()

Unnamed: 0,univid,response,value
0,5c9e345f34cda4f3677e1047,Academic excellence,10
1,5c9e345f34cda4f3677e1047,Internationality,24
2,5c9e345f34cda4f3677e1047,Practical relevance / professionalization,10
3,5c9e345f34cda4f3677e1047,Soft skills and digital literacy,7
4,5c9e345f34cda4f3677e1047,Specialization,18


In [5]:
# Reshape the long count table into a wide one: one row per institution,
# one column per skill, each cell holding that pair's vote count.
univSkills = skills.pivot_table(
    index=["univid"],
    columns="response",
    values="value",
    aggfunc="mean",  # pandas' default, spelled out explicitly
)
univSkills.head()

response,Academic excellence,Internationality,Practical relevance / professionalization,Soft skills and digital literacy,Specialization
univid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5c9e345f34cda4f3677e1047,10.0,24.0,10.0,7.0,18.0
5c9e349434cda4f3677e109f,5.0,10.0,12.0,9.0,15.0
5c9e34a834cda4f3677e10bd,8.0,10.0,8.0,6.0,8.0
5c9e34ae34cda4f3677e10c7,1.0,17.0,17.0,14.0,17.0
5c9e34d534cda4f3677e1107,10.0,11.0,15.0,10.0,24.0


In [6]:
# Sanity check: total number of votes per institution (sum across the skill columns).
votes_per_univ = univSkills.sum(axis="columns")
votes_per_univ.head()

univid
5c9e345f34cda4f3677e1047    69.0
5c9e349434cda4f3677e109f    51.0
5c9e34a834cda4f3677e10bd    40.0
5c9e34ae34cda4f3677e10c7    66.0
5c9e34d534cda4f3677e1107    70.0
dtype: float64

In [7]:
# Convert vote counts into each institution's share of votes per skill.
# The results are fractions between 0 and 1 (not 0-100 percentages).
# Any missing (NaN) cell is replaced with 0 after the division.
# The result is chained instead of using fillna(inplace=True), so the cell
# builds one new frame and rebinds `univSkills` exactly once.
row_totals = univSkills.sum(axis=1)
univSkills = univSkills.div(row_totals, axis=0).fillna(0)
univSkills.head()

response,Academic excellence,Internationality,Practical relevance / professionalization,Soft skills and digital literacy,Specialization
univid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5c9e345f34cda4f3677e1047,0.144928,0.347826,0.144928,0.101449,0.26087
5c9e349434cda4f3677e109f,0.098039,0.196078,0.235294,0.176471,0.294118
5c9e34a834cda4f3677e10bd,0.2,0.25,0.2,0.15,0.2
5c9e34ae34cda4f3677e10c7,0.015152,0.257576,0.257576,0.212121,0.257576
5c9e34d534cda4f3677e1107,0.142857,0.157143,0.214286,0.142857,0.342857


In [10]:
# Build a KD-tree over the rows of univSkills (one point per institution,
# one dimension per skill column) so that nearest-neighbour lookups can be
# answered from the tree. No metric is passed, so scikit-learn's default
# distance is used.
tree = KDTree(univSkills)

In [15]:
# Query the tree with every institution's row, asking for the k closest rows,
# then display the matches for the first institution only.
# The query points are the same rows the tree was built from, so each row's
# closest match is normally itself; k is therefore one more than the number
# of neighbours wanted.
N_NEIGHBOURS = 3
dist, ind = tree.query(univSkills, k=N_NEIGHBOURS + 1)
univSkills.iloc[ind[0]]

response,Academic excellence,Internationality,Practical relevance / professionalization,Soft skills and digital literacy,Specialization
univid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5c9e345f34cda4f3677e1047,0.144928,0.347826,0.144928,0.101449,0.26087
5c9e519734cda4f3677e4003,0.159091,0.340909,0.136364,0.113636,0.25
5c9e3c6a34cda4f3677e1d67,0.157895,0.342105,0.131579,0.131579,0.236842
5c9e895334cda4f3677e7141,0.126126,0.333333,0.18018,0.117117,0.243243
