In [1]:
import pandas
import matplotlib.pyplot as plt
import seaborn

In [2]:
%matplotlib inline

In [3]:
# NOTE(review): no style name is passed here, so this presumably leaves the
# active plotting style unchanged — confirm whether a named style such as
# 'darkgrid' or 'whitegrid' was intended.
seaborn.set_style()

In [4]:
def count_people(df):
    """Replace the comma-separated ``people`` column with a head count.

    Each non-null entry of ``df['people']`` is split on ``','`` and replaced
    by the number of resulting items; null entries become ``0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``people`` column holding comma-separated strings
        (or nulls).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``people`` overwritten by integer
        counts.
    """

    def _n_listed(entry):
        # A null cell means nobody is listed for that row.
        if pandas.isnull(entry):
            return 0
        return len(entry.split(','))

    # Assign the whole column at once. Writing to the rows yielded by
    # ``DataFrame.iterrows()`` is not guaranteed to reach ``df`` (the rows
    # may be copies), in which case the counts would silently be lost.
    df['people'] = df['people'].map(_n_listed).astype(int)

    return df

## Demographics

### Field of research

In [5]:
# Load the ANZSRC research-group table; the first row supplies the column
# names and 'utf-8-sig' drops a leading byte-order mark if the file has one.
research_field_df = pandas.read_csv('../data/anzsrc_research_groups.csv', header=0, encoding='utf-8-sig')
# Convert the 'people' column from comma-separated entries to per-row counts.
research_field_df = count_people(research_field_df)

In [6]:
research_divisions_gdf = research_field_df.groupby('research_division')

In [7]:
# Total head count per research division, largest first. 'people' is selected
# before aggregating so only that column is summed (rather than summing every
# column and discarding the rest), and the sort is chained instead of using
# ``inplace=True`` so no existing object is mutated.
research_divisions_totals = (
    research_divisions_gdf['people']
    .sum()
    .sort_values(ascending=False)
)
print(research_divisions_totals)

research_division
biological sciences                           28
medical and health sciences                   27
psychology and cognitive sciences             10
engineering                                    8
studies in human society                       5
history and archaeology                        4
earth sciences                                 4
mathematical sciences                          4
information and computing sciences             4
environmental sciences                         4
language, communication and culture            3
physical sciences                              2
built environment and design                   2
chemical sciences                              2
education                                      2
commerce, management, tourism and services     2
agricultural and veterinary sciences           1
economics                                      1
law and legal studies                          0
philosophy and religious studies               0
st

In [8]:
# Research groups within the biological sciences division, most people first.
in_biological_sciences = research_field_df['research_division'] == 'biological sciences'
research_field_df[in_biological_sciences].sort_values('people', ascending=False)

Unnamed: 0,research_group,research_division,people
33,ecology,biological sciences,11
35,genetics,biological sciences,7
39,zoology,biological sciences,5
32,biochemistry and cell biology,biological sciences,3
34,evolutionary biology,biological sciences,2
36,microbiology,biological sciences,0
37,physiology,biological sciences,0
38,plant biology,biological sciences,0
40,other biological sciences,biological sciences,0


In [9]:
# Research groups within the medical and health sciences division, most
# people first.
in_medical_health_sciences = research_field_df['research_division'] == 'medical and health sciences'
research_field_df[in_medical_health_sciences].sort_values('people', ascending=False)

Unnamed: 0,research_group,research_division,people
89,neurosciences,medical and health sciences,8
86,human movement and sports science,medical and health sciences,4
92,oncology and carcinogenesis,medical and health sciences,4
93,ophthalmology and optometry,medical and health sciences,3
97,public health and health services,medical and health sciences,3
87,immunology,medical and health sciences,2
83,clinical sciences,medical and health sciences,1
88,medical microbiology,medical and health sciences,1
95,pharmacology and pharmaceutical sciences,medical and health sciences,1
81,medical biochemistry and metabolomics,medical and health sciences,0


### Career stage

In [10]:
people_df = pandas.read_csv('../data/people.csv', header=0, encoding='utf-8-sig')

In [11]:
career_stage_gdf = people_df.groupby('career stage')

In [12]:
# Overall head count first, then one line per career stage.
print('Total people:', people_df.shape[0])
print('\n')
for career_stage, career_stage_df in career_stage_gdf:
    # Each group is the sub-frame of rows sharing this career stage.
    n_in_stage = career_stage_df.shape[0]
    print(career_stage, n_in_stage)

Total people: 111


early-career 6
honours 1
masters 5
mid-career 13
phd 70
postdoc 11
research assistant 5


## Tools

### Programming languages

TODO: Add a category for "none" - for some disciplines, hardly anyone uses programming languages.

In [13]:
programming_language_df = pandas.read_csv('../data/derived/people_programming_languages.csv', header=0, encoding='utf-8-sig')

In [14]:
# Count how many rows mention each value of the 'tool' column.
print(programming_language_df['tool'].value_counts())

# Disabled bar-chart view of the same counts.
#programming_language_df['tool'].value_counts().plot(kind='bar')
#plt.ylabel('frequency')
#plt.show()

R                37
Python           24
MATLAB           21
Unix Shell       14
C / C++           6
HTML              3
Javascript        3
Fortran           3
Visual Basic      1
Clojure           1
AppleScript       1
Make              1
Windows Shell     1
Perl              1
Name: tool, dtype: int64


### General data science tools

In [15]:
general_datasci_df = pandas.read_csv('../data/derived/people_general_datasci_tools.csv', header=0, encoding='utf-8-sig')

In [16]:
general_datasci_purpose_gdf = general_datasci_df.groupby('purpose')

In [17]:
# One tool-frequency table per purpose, each under a '# <purpose>' heading
# and followed by blank lines as a separator.
for purpose, purpose_df in general_datasci_purpose_gdf:
    print('#', purpose)
    purpose_tool_counts = purpose_df['tool'].value_counts()
    print(purpose_tool_counts)

    # Disabled bar-chart view of the same counts.
    #purpose_df['tool'].value_counts().plot(kind='bar')
    #plt.title(purpose)
    #plt.ylabel('frequency')
    #plt.show()

    print('\n')

# databases
Access    2
MySQL     2
SQLite    1
Name: tool, dtype: int64


# file manipulation
NCO    4
Name: tool, dtype: int64


# general
GraphPad Prism    14
Visio              4
Shiny              3
ggplot             3
Mathematica        3
matplotlib         2
JMP                2
dplyr              2
Gephi              2
xarray             1
Plotly             1
Tableau            1
Origin             1
gnuplot            1
SigmaPlot          1
SM                 1
D3.js              1
Gnumetric          1
Igor Pro           1
Name: tool, dtype: int64


# qualitative data
NVivo         8
Leximancer    1
Diogenes      1
Name: tool, dtype: int64


# spatial data
ArcGIS          15
Google Earth     4
QGIS             3
IDL              2
TileMill         1
cartopy          1
CARTO            1
SAGA             1
Name: tool, dtype: int64


# spreadsheets
Excel            46
Minitab           5
Google Sheets     2
Name: tool, dtype: int64


# statistical modelling
netlogo        2
La

In [30]:
general_datasci_df['cost'].value_counts()

pay                               148
free                               41
free with advanced pay options      2
Name: cost, dtype: int64

In [33]:
general_datasci_df['source code'].value_counts()

closed    158
open       27
Name: source code, dtype: int64

In [32]:
general_datasci_df['user interface'].value_counts()

graphical                 159
command line               27
graphical,command line      5
Name: user interface, dtype: int64

### Discipline specific tools

In [22]:
discipline_datasci_df = pandas.read_csv('../data/derived/people_discipline_datasci_tools.csv', header=0, encoding='utf-8-sig')

In [25]:
# NOTE: despite "purpose" in the variable name, the grouping key here is the
# 'discipline' column.
discipline_datasci_purpose_gdf = discipline_datasci_df.groupby('discipline')

In [26]:
# Print a tool-frequency table for every discipline group, each under a
# '# <discipline>' heading.
for discipline, discipline_df in discipline_datasci_purpose_gdf:
    print('#', discipline)
    print(discipline_df['tool'].value_counts())
    
    # Disabled bar-chart view of the same counts, written against this loop's
    # own variables.
    #discipline_df['tool'].value_counts().plot(kind='bar')
    #plt.title(discipline)
    #plt.ylabel('frequency')
    #plt.show()

    print('\n')

# astronomy
Swarp         1
S2PLOT        1
Sextractor    1
TOPCAT        1
IRAF          1
Name: tool, dtype: int64


# bioinformatics
Bioconductor    1
Name: tool, dtype: int64


# biological image processing
Quantity One    1
Name: tool, dtype: int64


# biomedical
XLSTAT-Biomed    1
Name: tool, dtype: int64


# chemistry
GDIS               1
FINDSYM            1
moldraw            1
RDKit              1
Aspen Plus         1
DS Visualizer      1
OpenEye toolkit    1
ChemOffice         1
Jmol               1
Autodock           1
Name: tool, dtype: int64


# climate
CDO     3
iris    1
NCL     1
Name: tool, dtype: int64


# flow cytometry
FACSDIVA    2
FlowJo      1
Name: tool, dtype: int64


# genomics
IGV            2
VMD            2
Artemis        2
Karyostudio    1
Bayescan       1
GeneHunter     1
Easyfig        1
plink          1
BRIG           1
PGD Spider     1
Ikaros         1
Contiguity     1
GeneGO         1
FASTLINK       1
Arlequin       1
Structure      1
GeneMapper    

In [27]:
discipline_datasci_df['cost'].value_counts()

free    87
pay     22
Name: cost, dtype: int64

In [28]:
discipline_datasci_df['source code'].value_counts()

closed    62
open      47
Name: source code, dtype: int64

In [29]:
discipline_datasci_df['user interface'].value_counts()

graphical                     69
graphical and command line    20
command line                  17
website                        3
Name: user interface, dtype: int64

### Support tools

TODO: Add a category for "none" in the version control section (i.e. people who identify as using a programming language but aren't using version control)

In [18]:
support_tool_df = pandas.read_csv('../data/derived/people_support_tools.csv', header=0, encoding='utf-8-sig')

In [19]:
support_tool_gdf = support_tool_df.groupby(['category', 'task'])

In [20]:
# One tool-frequency table per (category, task) pair, headed
# '# <category>: <task>'.
for category_task, task_df in support_tool_gdf:
    heading = ': '.join(category_task)
    print('#', heading)
    task_tool_counts = task_df['tool'].value_counts()
    print(task_tool_counts)
    print('\n')
    # Disabled bar-chart view of the same counts.
    #task_df['tool'].value_counts().plot(kind='bar')
    #plt.title(category_task[0] + ': ' + category_task[1])
    #plt.ylabel('frequency')
    #plt.show()

# 3D design: molecular visualisation
PyMOL                  1
Molsoft ICM-Browser    1
Name: tool, dtype: int64


# code development: editors
Notepad++       7
Sublime         7
Vim             6
TextWrangler    3
Nedit           2
Emacs           1
LiveCode        1
Kate            1
Name: tool, dtype: int64


# code development: interactive development environments
RStudio             16
Jupyter notebook     5
Spyder               1
IDLE                 1
Name: tool, dtype: int64


# data collection: audio editing/transcribing
Express Scribe    1
Audacity          1
Name: tool, dtype: int64


# data collection: behavioural observation
Observer XT    1
Name: tool, dtype: int64


# data collection: surveys
Google Forms     3
Qualtrics        2
Inquisit         2
Survey Monkey    1
REDCap           1
Limesurvey       1
Name: tool, dtype: int64


# data collection: video editing
VLC                    3
Adobe Premier          2
Stop Motion Studio     1
iMovie                 1
CamStudio 