In [1]:
import requests
import sys
import pandas as pd
import numpy as np
from scipy import sparse
from io import StringIO
import networkscience as ns
from matplotlib import pyplot as plt
import seaborn as sns
# CSV of finalised KA1 mobilities started in 2018, hosted on data.europa.eu.
DATA_URL = 'https://data.europa.eu/euodp/data/storage/f/2020-08-11T140550/Finalised%20mobilities%20started%20in%202018%20-%20KA1.csv'

# Fail fast on a hung connection or an HTTP error instead of handing an
# error page to the CSV parser below.
reply = requests.get(DATA_URL, timeout=60)
reply.raise_for_status()
reply.encoding = 'utf-8'
csv_data = reply.text

# The file is semicolon-separated with a header row; low_memory=False makes
# pandas read each column in one pass instead of chunk by chunk.
df = pd.read_csv(StringIO(csv_data), sep=';', low_memory=False, header=0)

# Print every column with its position, for reference in the cells below.
for i, name in enumerate(df.columns):
    print(i, name)

0 Project Reference
1 Academic Year
2 Mobility Start Month
3 Mobility End Month
4 Mobility Duration
5 Activity (mob)
6 Field of Education
7 Participant Nationality
8 Education Level
9 Participant Gender
10 Participant Profile
11 Special Needs
12 Fewer Opportunities
13 Participant Age
14 Sending Country Code
15 Sending City
16 Sending Organization
17 Receiving Country Code
18 Receiving City
19 Receiving Organization
20 Participants


In [2]:
# Keep only learner mobilities whose activity mentions 'Student' but not
# 'traineeship'. Columns are addressed by name rather than by position so the
# cell does not silently pick the wrong column if the CSV layout changes, and
# na=False makes a missing activity count as "no match" instead of producing
# a NaN mask that cannot be used for boolean indexing.
activity = df['Activity (mob)']
is_learner = df['Participant Profile'] == 'Learner'
is_student = activity.str.contains('Student', na=False)
is_traineeship = activity.str.contains('traineeship', na=False)

# Open question: should we consider only academic year 2018/2019, or also
# 2017/2018? The numbers change a lot from year to year -- is that expected?
# (Optional extra filter: df['Academic Year'].str.contains('19', na=False))

KEPT_COLUMNS = [
    'Sending Organization',
    'Receiving Organization',
    'Participants',
    'Activity (mob)',
    'Sending Country Code',
    'Field of Education',
]
# .copy() detaches the result from df so the column assignments below write
# to an independent frame (no SettingWithCopyWarning).
mobility_df = df.loc[is_learner & is_student & ~is_traineeship, KEPT_COLUMNS].copy()
print((mobility_df['Field of Education'].nunique()))

# Upper-case organisation names so that differently-cased spellings of the
# same name compare equal.
mobility_df['Sending Organization'] = mobility_df['Sending Organization'].str.upper()
mobility_df['Receiving Organization'] = mobility_df['Receiving Organization'].str.upper()

141


In [3]:
# The activity column was only needed for the filtering step. Dropping it
# without inplace=True and with errors='ignore' keeps this cell re-runnable:
# a second execution is a no-op instead of a KeyError.
mobility_df = mobility_df.drop(columns='Activity (mob)', errors='ignore')
display(mobility_df)

Unnamed: 0,Sending Organization,Receiving Organization,Participants,Sending Country Code,Field of Education
174,VORARLBERGER LANDESKONSERVATORIUM,LUNDS UNIVERSITET,1,AT,Music and performing arts
195,NATIONAL MINING UNIVERSITY,MONTANUNIVERSITAT LEOBEN,1,UA,Environmental protection technology
196,NATIONAL MINING UNIVERSITY,MONTANUNIVERSITAT LEOBEN,1,UA,Electronics and automation
197,NATIONAL MINING UNIVERSITY,MONTANUNIVERSITAT LEOBEN,1,UA,Mechanics and metal trades
198,MONTANUNIVERSITAT LEOBEN,THE NATIONAL MINERAL RESOURCES UNIVERSITY (THE...,1,AT,Mining and extraction
...,...,...,...,...,...
667190,UNIVERSITY COLLEGE LONDON,FEDERAL STATE AUTONOMOUS EDUCATIONAL INSTITUTI...,1,UK,Political sciences and civics
667191,UNIVERSITY COLLEGE LONDON,FEDERAL STATE AUTONOMOUS EDUCATIONAL INSTITUTI...,1,UK,Political sciences and civics
667192,FEDERAL STATE AUTONOMOUS EDUCATIONAL INSTITUTI...,UNIVERSITY COLLEGE LONDON,1,RU,Economics
667193,UNIVERSITY COLLEGE LONDON,THE HEBREW UNIVERSITY OF JERUSALEM,1,UK,"Arts and humanities, inter-disciplinary progra..."


In [4]:
# Qualifier fragments stripped from the field names so that variants of the
# same field collapse onto one label. Order matters: the comma-prefixed
# ', not elsewhere classified' must be removed before the bare
# ' not elsewhere classified', otherwise a dangling comma is left behind.
_FIELD_QUALIFIERS = (
    ', not further defined',
    ', inter-disciplinary programmes',
    ', not elsewhere classified',
    ' not elsewhere classified',
    ', not elsewhere classifed',  # (sic) misspelt variant, matched literally
    'Inter-disciplinary programmes and qualifications involving ',
)

field_of_education = mobility_df['Field of Education']
for qualifier in _FIELD_QUALIFIERS:
    # regex=False: the fragments are literal text. Being explicit avoids
    # depending on the pandas default, which differs between 1.x and 2.x.
    field_of_education = field_of_education.str.replace(qualifier, '', regex=False)

# Lower-case and trim so that case/whitespace differences do not create
# spurious distinct fields.
mobility_df['Field of Education'] = field_of_education.str.lower().str.strip()

In [5]:
# Number of distinct fields left after normalisation, then their sorted names.
field_labels = mobility_df['Field of Education']
print(field_labels.nunique())
distinct_fields = field_labels.unique()
print(np.sort(distinct_fields))

106
['accounting and taxation' 'agriculture'
 'agriculture, forestry, fisheries and veterinary'
 'architecture and construction' 'architecture and town planning' 'arts'
 'arts and humanities' 'audio-visual techniques and media production'
 'biochemistry' 'biological and related sciences' 'biology'
 'building and civil engineering' 'business and administration'
 'business, administration and law'
 'care of the elderly and of disabled adults'
 'chemical engineering and processes' 'chemistry'
 'child care and youth services' 'community sanitation' 'computer use'
 'crop and livestock production'
 'database and network design and administration' 'dental studies'
 'domestic services' 'earth sciences' 'economics' 'education'
 'education science' 'electricity and energy' 'electronics and automation'
 'engineering and engineering trades'
 'engineering, manufacturing and construction' 'environment'
 'environmental protection technology' 'environmental sciences'
 'fashion, interior and industrial

In [6]:
# Every organisation that appears as a sender or as a receiver is a node.
nodes = np.array(list(
    set(mobility_df['Sending Organization']) | set(mobility_df['Receiving Organization'])
))
# Map each organisation name to its integer position in `nodes`.
node_position = {name: i for i, name in enumerate(nodes)}
nodes_df = pd.DataFrame({'Nodes': nodes})
mobility_df['SendId'] = np.array([node_position[org] for org in mobility_df['Sending Organization']])
mobility_df['RecId'] = np.array([node_position[org] for org in mobility_df['Receiving Organization']])

# Adjacency matrix
# Sum the numeric column(s) over every (sender, receiver) pair.
# numeric_only=True is required on pandas >= 2.0: without it the string
# columns are concatenated too, `val` gets several columns and its flattened
# length no longer matches `row`/`col`.
mobility_serie = mobility_df.groupby(['SendId', 'RecId']).sum(numeric_only=True)
# Rows are indexed by the receiver id and columns by the sender id.
row = np.array(mobility_serie.index.get_level_values(1).tolist())
col = np.array(mobility_serie.index.get_level_values(0).tolist())
val = mobility_serie.values
adj_matrix_crs = sparse.csr_matrix((val.flatten(), (row, col)), shape=(nodes.size, nodes.size))
print(ns.fraction_in_giant(adj_matrix_crs))

0.9936995471549518


In [7]:
# For each field of education, build the network restricted to that field's
# edges (over the full node set) and record ns.fraction_in_giant for it.
dd = {}
for field in mobility_df['Field of Education'].unique():
    temp_df = mobility_df.loc[mobility_df['Field of Education'] == field]
    # numeric_only=True: on pandas >= 2.0 a bare .sum() would also
    # concatenate the string columns and break the flatten() below.
    temp_serie = temp_df.groupby(['SendId', 'RecId']).sum(numeric_only=True)
    # Rows are indexed by the receiver id and columns by the sender id.
    row = np.array(temp_serie.index.get_level_values(1).tolist())
    col = np.array(temp_serie.index.get_level_values(0).tolist())
    val = temp_serie.values
    temp_adj = sparse.csr_matrix((val.flatten(), (row, col)), shape=(nodes.size, nodes.size))
    dd[field] = ns.fraction_in_giant(temp_adj)

field_df = pd.DataFrame.from_dict(dd, orient='index', columns=['percentage'])
print('Top', 10, 'for', 'percentage')
# Note: this changes the display option for the rest of the notebook.
pd.set_option('display.max_rows', 10)
field_df = field_df.sort_values('percentage', ascending=False)
display(field_df.head(10))

Top 10 for percentage


Unnamed: 0,percentage
business and administration,0.403032
engineering and engineering trades,0.290805
management and administration,0.285095
economics,0.281355
information and communication technologies (icts),0.252018
languages,0.235479
political sciences and civics,0.208309
literature and linguistics,0.183698
law,0.179563
arts,0.1772


TODO: Add an iterative version of this analysis, checking whether a combination of two or more fields of education can bring the fraction of nodes in the giant component close to the value obtained with all the edges (99.37%).
We can also work the other way around, iteratively deleting the edges of one field at a time.

In [8]:
import itertools

fields = mobility_df['Field of Education'].unique()
# Every combination of 1 to 4 distinct fields, gathered in one flat list.
# NOTE: the whole list is held in memory only so that it can be counted.
subsets = list(
    itertools.chain.from_iterable(
        itertools.combinations(fields, size) for size in range(1, 5)
    )
)
print(len(subsets))

5166281


In [9]:
def fields_percentage_in_giant(df, n_fields, removing=False):
    '''Rank every combination of n_fields fields of education by the value of
    ns.fraction_in_giant on the network built from the selected edges.

    For each combination the edges are either restricted to those fields
    (removing=False) or to everything except those fields (removing=True).
    The adjacency matrix always spans the full node set, i.e. it relies on the
    notebook-level `nodes` array for its shape.

    Args:
        df (DataFrame): the mobility DataFrame; must contain the columns
            'Field of Education', 'SendId' and 'RecId' plus the numeric
            column holding the edge weight
        n_fields (int): number of fields in each combination
        removing (bool): if true we consider the network obtained by removing
            the n_fields fields instead of keeping only them
    Returns:
        a DataFrame indexed by the field combinations (tuples) with a single
        'percentage' column, sorted descending when keeping fields and
        ascending when removing them
    '''
    # Loop-invariant lookups, hoisted out of the combination loop.
    field_column = df['Field of Education']
    n_nodes = nodes.size

    dd = {}
    for subset in itertools.combinations(field_column.unique(), n_fields):
        in_subset = field_column.isin(subset)
        temp_df = df.loc[~in_subset] if removing else df.loc[in_subset]
        # numeric_only=True: on pandas >= 2.0 a bare .sum() would also
        # concatenate the string columns, so the flattened values would no
        # longer line up with row/col.
        temp_serie = temp_df.groupby(['SendId', 'RecId']).sum(numeric_only=True)
        # Rows are indexed by the receiver id and columns by the sender id.
        row = np.array(temp_serie.index.get_level_values(1).tolist())
        col = np.array(temp_serie.index.get_level_values(0).tolist())
        val = temp_serie.values
        temp_adj = sparse.csr_matrix((val.flatten(), (row, col)), shape=(n_nodes, n_nodes))
        dd[subset] = ns.fraction_in_giant(temp_adj)

    field_df = pd.DataFrame.from_dict(dd, orient='index', columns=['percentage'])
    # Keeping fields: largest values first. Removing fields: smallest values
    # first, i.e. the removals that shrink the giant component the most.
    field_df = field_df.sort_values('percentage', ascending=removing)
    return field_df

In [10]:
# Giant-component fraction for every pair of fields, largest first.
field_df = fields_percentage_in_giant(mobility_df, n_fields=2)
print('Percentages considering 2 fields of education:')
print('Top 10 for percentage')
top_rows = field_df.head(10)
display(top_rows)

Percentages considering 2 fields of education:
Top 10 for percentage


Unnamed: 0,percentage
"(business and administration, engineering and engineering trades)",0.48927
"(business and administration, languages)",0.473322
"(business and administration, arts)",0.47214
"(music and performing arts, business and administration)",0.464068
"(business and administration, information and communication technologies (icts))",0.460918
"(business and administration, economics)",0.457177
"(business and administration, literature and linguistics)",0.453633
"(business and administration, fine arts)",0.453633
"(business and administration, fashion, interior and industrial design)",0.450876
"(business and administration, management and administration)",0.449104


In [11]:
# Giant-component fraction for every triple of fields, largest first.
field_df = fields_percentage_in_giant(mobility_df, n_fields=3)
print('Percentages considering 3 fields of education:')
print('Top 10 for percentage')
top_rows = field_df.head(10)
display(top_rows)

Percentages considering 3 fields of education:
Top 10 for percentage


Unnamed: 0,percentage
"(business and administration, arts, engineering and engineering trades)",0.552865
"(business and administration, languages, engineering and engineering trades)",0.550108
"(music and performing arts, business and administration, engineering and engineering trades)",0.547746
"(business and administration, engineering and engineering trades, fine arts)",0.53672
"(business and administration, languages, arts)",0.535342
"(business and administration, economics, engineering and engineering trades)",0.532782
"(business and administration, engineering and engineering trades, literature and linguistics)",0.532585
"(business and administration, engineering and engineering trades, fashion, interior and industrial design)",0.531798
"(music and performing arts, business and administration, languages)",0.53101
"(business and administration, information and communication technologies (icts), engineering and engineering trades)",0.526285


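Even with up to 3 fields of education we can't get close to the giant component of the full network: the best combination reaches only about 55% of the nodes, versus 99.37% with all the edges.
Let's try removing fields instead.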

In [12]:
# With removing=True the result is sorted ascending, so the first rows are
# the removals that leave the smallest giant component.
field_df = fields_percentage_in_giant(mobility_df, 1, removing=True)
print('Percentages removing 1 field of education:')
# The rows shown are the lowest values, so the label says so.
print('Lowest 10 for percentage')
display(field_df.head(10))

Percentages removing 1 field of education:
Top 10 for percentage


Unnamed: 0,percentage
"(music and performing arts,)",0.954912
"(business and administration,)",0.963576
"(engineering and engineering trades,)",0.974011
"(languages,)",0.983461
"(economics,)",0.983658
"(information and communication technologies (icts),)",0.983658
"(nursing and midwifery,)",0.98484
"(management and administration,)",0.98543
"(agriculture,)",0.986415
"(political sciences and civics,)",0.987202


In [13]:
# With removing=True the result is sorted ascending, so the first rows are
# the pairs whose removal leaves the smallest giant component.
field_df = fields_percentage_in_giant(mobility_df, 2, removing=True)
print('Percentages removing 2 fields of education:')
# The rows shown are the lowest values, so the label says so.
print('Lowest 10 for percentage')
display(field_df.head(10))

Percentages removing 2 fields of education:
Top 10 for percentage


Unnamed: 0,percentage
"(music and performing arts, business and administration)",0.924591
"(music and performing arts, engineering and engineering trades)",0.935027
"(business and administration, engineering and engineering trades)",0.942115
"(music and performing arts, languages)",0.944477
"(music and performing arts, arts)",0.944871
"(music and performing arts, economics)",0.944871
"(music and performing arts, information and communication technologies (icts))",0.944871
"(music and performing arts, nursing and midwifery)",0.946052
"(music and performing arts, management and administration)",0.946643
"(music and performing arts, agriculture)",0.947627


Interestingly enough, music and performing arts is the field whose removal separates the most nodes from the giant component, even though on its own it doesn't even appear in the top 10 for percentage of nodes in the giant component.\
I suspect this is due to the presence of conservatories and academies that offer only one field of education. Let's see.

In [14]:
# Rename the edge columns to source / target / weight before handing the edge
# list to ns.keep_giant. Plain assignment instead of inplace=True avoids
# mutating a frame that earlier cells displayed.
mobility_df = mobility_df.rename(columns={
    'Sending Organization': 'source',
    'Receiving Organization': 'target',
    'Participants': 'weight',
})

# Network without the 'music and performing arts' edges.
nomusic_df = mobility_df.loc[mobility_df['Field of Education'] != 'music and performing arts']
# numeric_only=True: on pandas >= 2.0 a bare .sum() would also concatenate
# the string columns and break the flatten() below.
nomusic_serie = nomusic_df.groupby(['SendId', 'RecId']).sum(numeric_only=True)
# Rows are indexed by the receiver id and columns by the sender id.
row = np.array(nomusic_serie.index.get_level_values(1).tolist())
col = np.array(nomusic_serie.index.get_level_values(0).tolist())
val = nomusic_serie.values
nomusic_adj = sparse.csr_matrix((val.flatten(), (row, col)), shape=(nodes.size, nodes.size))
new_adj, giant_edges, giant_nodes = ns.keep_giant(ns.find_components(nomusic_adj, nodes_df), nomusic_df)

In [15]:
# Double check: share of all nodes that remain in the giant component once
# the music and performing arts edges are removed.
giant_share = len(giant_nodes) / len(nodes)
print(giant_share)
# Organisations that are no longer part of the giant component.
music_art_institutes = set(nodes).difference(giant_nodes.Nodes)
print(sorted(music_art_institutes)[:10])

0.9549123843276236
['ACCADEMIA NAZIONALE D\'ARTE DRAMMATICA "SILVIO D\'AMICO"', 'AKADEMI FÖR LEDARSKAP OCH TEOLOGI', 'AKADEMIA MUZYCZNA IM. GRAZYNY I KIEJSTUTA BACEWICZOW W LODZI', 'AKADEMIA MUZYCZNA IM. KAROLA SZYMANOWSKIEGO', 'AKADEMIA MUZYCZNA IMIENIA FELIKSA NOWOWIEJSKIEGO W BYDGOSZCZY', 'AKADEMIA MUZYCZNA IMIENIA KAROLA LIPINSKIEGO WE WROCLAWIU', 'AKADEMIA MUZYCZNA W KRAKOWIE', 'AKADEMIA SZTUK TEATRALNYCH IM. STANISLAWA WYSPIANSKIEGO W KRAKOWIE', 'AKADEMIA TEATRALNA IM. ALEKSANDRA ZELWEROWICZA W WARSZAWIE', 'ANTON BRUCKNER PRIVATUNIVERSITAT']


To get better percentages inside/outside the giant component, we could hand-pick the fields of education instead of exhaustively testing every combination (or get access to infinite computational power).