In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('IGEM_teams_2004-2024.csv')
data = data[[year <= 2020 and year >= 2007 for year in data['Year']]]

In [3]:
all_awards = [awards.split(', ') for awards in data['Awards'].dropna()]

all_awards = [
    award
    for awards in all_awards
    for award in awards]

[all_awards.remove('-') for item in all_awards if item == '-']
all_awards = pd.Series(all_awards)
unique_awards =  pd.Series(all_awards.unique())
len(unique_awards.unique())

299

In [4]:
def score_award(award_name: str) -> float:
        if 'Grand Prize' in award_name and '(' not in award_name: return 10
        elif 'Grand Prize' in award_name: return 5
        if 'Best' in award_name and 'Project' in award_name: return 5
        elif 'Best' in award_name: return 2
        return 1

def score_medal(medal: str) -> float:
    match medal:
        case 'Gold': return 1
        case 'Silver': return 0.5
        case 'Bronze': return 0.25
        case _ : return 0

def score_team(medal, awards):
    score = 0
    if type(awards) is not float:
        for award in awards.split(', '):
            score += score_award(awards)
    score += score_medal(medal)
    return score

In [5]:
data['Absolute Score'] = [score_team(data.iloc[index]['Medal'], data.iloc[index]['Awards']) for index in range(len(data))]

In [6]:
data = data[data['Section'] == 'Undergrad']

In [7]:
from sklearn.preprocessing import minmax_scale

for year in data['Year'].sort_values().unique():
    data.loc[data['Year'] == year, 'Scaled Score'] = minmax_scale(data.loc[data['Year'] == year]['Absolute Score']).tolist()

'''
undergrad_data = data[data['Section'] == 'Undergrad']
undergrad_data['Score'] = minmax_scale(undergrad_data['Score'])
'''

"\nundergrad_data = data[data['Section'] == 'Undergrad']\nundergrad_data['Score'] = minmax_scale(undergrad_data['Score'])\n"

In [8]:
tracks_dict = pd.read_csv('tracks.csv', index_col=0)['category'].to_dict()

def assign_supertrack(track):
    supertrack = None
    try: supertrack = tracks_dict[str(track)]
    except:
        supertrack = 'Other'
    return supertrack

data['Supertrack'] = [assign_supertrack(track) for track in data['Track']]

In [9]:
tracks_list = list(set([tracks_dict[key] for key in tracks_dict.keys()]))
tracks_list

['High School',
 'Food, Agriculture & Energy',
 'Hardware',
 'Basic Research',
 'Entrepreneurship',
 'Medicine',
 'Applications Research',
 'Environment',
 'Manufacturing',
 'Other',
 'Software, Measurement, & Modeling',
 'Art, Design & Cosmetics',
 'Open/Community Research']

In [10]:
for track in tracks_list:
    data[track] = [1 if team_track == track else 0 for team_track in data['Supertrack']]

In [11]:
data

Unnamed: 0,Year,Team Name,Wiki,Region,Location,Institution,Section,Application,Project Title,Track,...,Basic Research,Entrepreneurship,Medicine,Applications Research,Environment,Manufacturing,Other,"Software, Measurement, & Modeling","Art, Design & Cosmetics",Open/Community Research
0,2012,Buenos Aires,http://2012.igem.org/Team:Buenos_Aires,latin-america,Argentina,Universidad de Buenos Aires,Undergrad,,Synthetic ecology,Foundational Advance,...,1,0,0,0,0,0,0,0,0,0
2,2007,Melbourne,http://2007.igem.org/Melbourne,asia,Australia,University of Melbourne,Undergrad,,COLIFORMING,The formation of complex scaffolds of extracel...,...,0,0,0,0,0,0,1,0,0,0
3,2008,Melbourne,http://2008.igem.org/Team:Melbourne,asia,Australia,University of Melbourne,Undergrad,,,-,...,0,0,0,0,0,0,1,0,0,0
4,2009,UQ-Australia,http://2009.igem.org/Team:UQ-Australia,asia,Australia,University of Queensland,Undergrad,,Mercury sequestration using a multicomponent o...,Environment,...,0,0,0,0,1,0,0,0,0,0
5,2009,Victoria Australia,http://2009.igem.org/Team:Victoria_Australia,asia,Australia,Royal Melbourne Institute of Technology Univer...,Undergrad,,An environmentally sustainable biological ligh...,Manufacturing,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3086,2020,William and Mary,https://2020.igem.org/Team:William_and_Mary,north-america,United States,-,Undergrad,accepted,-,Therapeutics,...,0,0,1,0,0,0,0,0,0,0
3100,2020,XJTU-China,https://2020.igem.org/Team:XJTU-China,asia,China,-,Undergrad,accepted,-,Environment,...,0,0,0,0,1,0,0,0,0,0
3101,2020,XMU-China,https://2020.igem.org/Team:XMU-China,asia,China,-,Undergrad,accepted,-,"Food & Nutrition, Best New Basic Part",...,0,0,0,0,0,0,1,0,0,0
3103,2020,ZJU-China,https://2020.igem.org/Team:ZJU-China,asia,China,-,Undergrad,accepted,-,Diagnostics,...,0,0,1,0,0,0,0,0,0,0


In [12]:
from sklearn.feature_selection import r_regression

In [13]:
corrs = r_regression(data[tracks_list], data['Scaled Score'])

In [14]:
corrs = pd.DataFrame(corrs, index=tracks_list)

In [15]:
corrs.sort_values(0)

Unnamed: 0,0
Entrepreneurship,-0.027516
Medicine,-0.021729
"Art, Design & Cosmetics",-0.014166
Hardware,-0.01254
Manufacturing,-0.004744
High School,0.0
"Food, Agriculture & Energy",0.001613
"Software, Measurement, & Modeling",0.003846
Environment,0.004591
Applications Research,0.007005


In [17]:
import numpy as np

means = pd.DataFrame([np.mean(data.loc[data['Supertrack'] == track]['Scaled Score']) for track in tracks_list], index=tracks_list)

In [19]:
means.sort_values(0)

Unnamed: 0,0
Entrepreneurship,0.00784
"Art, Design & Cosmetics",0.010625
Hardware,0.02588
Medicine,0.042406
Manufacturing,0.045858
"Food, Agriculture & Energy",0.04837
Environment,0.04892
"Software, Measurement, & Modeling",0.049238
Applications Research,0.049969
Other,0.051697


In [132]:
winners = [261, 985, 997, 1404, 1489, 1601, 1603, 1637, 1770,1837, 1930, 2184, 2461, 3078]
data.loc[winners]['Year'].value_counts().sort_index()

Year
2007    1
2008    1
2009    1
2010    1
2011    1
2012    1
2013    1
2014    1
2015    1
2016    1
2017    1
2018    1
2019    1
2020    1
Name: count, dtype: int64

In [135]:
data.loc[winners]['Supertrack'].value_counts()

Supertrack
Basic Research                       4
Other                                2
Food, Agriculture & Energy           2
Applications Research                2
Environment                          2
Medicine                             1
Software, Measurement, & Modeling    1
Name: count, dtype: int64