In [1]:
import pandas
import matplotlib.pyplot as plt
import seaborn



In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
# Called without a style name or rc overrides.
# NOTE(review): with no explicit style (e.g. 'darkgrid') this presumably leaves
# the active seaborn style unchanged — confirm that is the intent.
seaborn.set_style()

In [4]:
def count_people(df):
    """Replace the comma-separated entries in the 'people' column with counts.

    Each non-null entry is split on ',' and replaced by the number of pieces;
    null entries become 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'people' column holding comma-separated strings or nulls.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in, with its 'people' column
        overwritten by the integer counts.
    """

    # Assign the whole column at once. Writing to the rows yielded by
    # iterrows() is not guaranteed to reach the frame (the rows may be
    # copies), which can silently leave the original strings untouched.
    df['people'] = [
        len(entry.split(',')) if pandas.notnull(entry) else 0
        for entry in df['people']
    ]

    return df

## Demographics

### Field of research

In [5]:
# Load the research-group table and turn its 'people' column into head counts.
research_field_df = count_people(
    pandas.read_csv(
        '../data/anzsrc_research_groups.csv',
        header=0,
        encoding='utf-8-sig',
    )
)

In [6]:
# One group of research-group rows per research division.
research_divisions_gdf = research_field_df.groupby(by='research_division')

In [7]:
# Total head count per research division, largest first. Only the 'people'
# column is aggregated, so the remaining columns are never summed, and the
# sorted result is a new Series rather than an in-place mutation.
research_divisions_totals = (
    research_divisions_gdf['people']
    .sum()
    .sort_values(ascending=False)
)
print(research_divisions_totals)

research_division
biological sciences                           39
medical and health sciences                   34
psychology and cognitive sciences             11
engineering                                    8
studies in human society                       6
earth sciences                                 5
environmental sciences                         5
language, communication and culture            4
information and computing sciences             4
history and archaeology                        4
mathematical sciences                          4
commerce, management, tourism and services     3
economics                                      3
built environment and design                   2
physical sciences                              2
education                                      2
chemical sciences                              2
agricultural and veterinary sciences           1
studies in creative arts and writing           1
law and legal studies                          0
ph

In [8]:
# Research groups in the 'biological sciences' division, most people first.
research_field_df[
    research_field_df['research_division'] == 'biological sciences'
].sort_values(by='people', ascending=False)

Unnamed: 0,research_group,research_division,people
33,ecology,biological sciences,12
35,genetics,biological sciences,11
39,zoology,biological sciences,7
32,biochemistry and cell biology,biological sciences,6
34,evolutionary biology,biological sciences,2
37,physiology,biological sciences,1
36,microbiology,biological sciences,0
38,plant biology,biological sciences,0
40,other biological sciences,biological sciences,0


In [9]:
# Research groups in the 'medical and health sciences' division, most people first.
research_field_df[
    research_field_df['research_division'] == 'medical and health sciences'
].sort_values(by='people', ascending=False)

Unnamed: 0,research_group,research_division,people
89,neurosciences,medical and health sciences,11
97,public health and health services,medical and health sciences,6
86,human movement and sports science,medical and health sciences,4
92,oncology and carcinogenesis,medical and health sciences,4
93,ophthalmology and optometry,medical and health sciences,3
87,immunology,medical and health sciences,3
83,clinical sciences,medical and health sciences,1
88,medical microbiology,medical and health sciences,1
95,pharmacology and pharmaceutical sciences,medical and health sciences,1
81,medical biochemistry and metabolomics,medical and health sciences,0


### Career stage

In [10]:
# Load the people table; the first line of the file supplies the column names.
people_df = pandas.read_csv(
    '../data/people.csv',
    header=0,
    encoding='utf-8-sig',
)

In [11]:
# Group the people rows by the value in their 'career stage' column.
career_stage_gdf = people_df.groupby(by='career stage')

In [12]:
# Overall row count first, then the number of rows in each career-stage group.
print('Total people:', people_df.shape[0])
print('\n')
for career_stage, career_stage_df in career_stage_gdf:
    print(career_stage, career_stage_df.shape[0])

Total people: 138


early-career 6
honours 1
masters 8
mid-career 15
phd 87
postdoc 14
research assistant 6
senior-career 1


## Tools

### Programming languages

TODO: Add a category for "none" - for some disciplines, hardly anyone uses programming languages.

In [13]:
# Load the derived table of programming languages listed per person.
programming_language_df = pandas.read_csv(
    '../data/derived/people_programming_languages.csv',
    header=0,
    encoding='utf-8-sig',
)

In [14]:
# Print how often each value occurs in the 'tool' column, most frequent first.
print(programming_language_df['tool'].value_counts())

# Optional bar chart of the same counts; uncomment to draw it.
#programming_language_df['tool'].value_counts().plot(kind='bar')
#plt.ylabel('frequency')
#plt.show()

R                53
Python           27
MATLAB           24
Unix Shell       18
C / C++           6
HTML              5
Javascript        4
MySQL             4
Fortran           3
CSS               2
Perl              1
Make              1
PostgreSQL        1
Visual Basic      1
BigDataScript     1
PHP               1
Windows Shell     1
SQLite            1
AppleScript       1
Clojure           1
Name: tool, dtype: int64


### General data science tools

In [15]:
# Load the derived table of general data science tools listed per person.
general_datasci_df = pandas.read_csv(
    '../data/derived/people_general_datasci_tools.csv',
    header=0,
    encoding='utf-8-sig',
)

In [16]:
# Group the general tools by the value in their 'purpose' column.
general_datasci_purpose_gdf = general_datasci_df.groupby(by='purpose')

In [17]:
# For every 'purpose' group, print a '# <purpose>' heading followed by the
# frequency of each tool within that group.
for purpose, purpose_df in general_datasci_purpose_gdf:
    print('#', purpose)
    print(purpose_df['tool'].value_counts())
    
    # Optional per-purpose bar chart; uncomment to draw it.
    #purpose_df['tool'].value_counts().plot(kind='bar')
    #plt.title(purpose)
    #plt.ylabel('frequency')
    #plt.show()

    # Blank lines to separate one group's listing from the next.
    print('\n')

# databases
Access                   3
Navicat                  1
SQL Maestro for MySQL    1
Name: tool, dtype: int64


# file manipulation
NCO    5
Name: tool, dtype: int64


# general
GraphPad Prism    17
ggplot             6
Visio              5
Shiny              5
dplyr              3
Mathematica        3
Gephi              2
JMP                2
matplotlib         2
D3.js              1
Origin             1
Tableau            1
xarray             1
gnuplot            1
SigmaPlot          1
Igor Pro           1
Plotly             1
Gnumetric          1
SM                 1
Name: tool, dtype: int64


# qualitative data
NVivo         11
Leximancer     2
Diogenes       1
Name: tool, dtype: int64


# spatial data
ArcGIS          18
Google Earth     5
QGIS             4
IDL              2
SAGA             1
CARTO            1
cartopy          1
TileMill         1
Name: tool, dtype: int64


# spreadsheets
Excel            60
Minitab           5
Google Sheets     2
Name: tool, dtype: int

In [18]:
# Frequency of each 'cost' value; rows with a missing value are left out
# (dropna=True is the pandas default, spelled out here).
general_datasci_df['cost'].value_counts(dropna=True)

pay                               190
free                               47
free with advanced pay options      2
Name: cost, dtype: int64

In [19]:
# Frequency of each 'source code' value; rows with a missing value are left
# out (dropna=True is the pandas default, spelled out here).
general_datasci_df['source code'].value_counts(dropna=True)

closed    202
open       34
Name: source code, dtype: int64

In [20]:
# Frequency of each 'user interface' value; rows with a missing value are left
# out (dropna=True is the pandas default, spelled out here).
general_datasci_df['user interface'].value_counts(dropna=True)

graphical                 203
command line               31
graphical,command line      5
Name: user interface, dtype: int64

### Discipline specific tools

In [21]:
# Load the derived table of discipline-specific tools listed per person.
discipline_datasci_df = pandas.read_csv(
    '../data/derived/people_discipline_datasci_tools.csv',
    header=0,
    encoding='utf-8-sig',
)

In [22]:
# Despite the 'purpose' in the variable name, the grouping key here is the
# 'discipline' column.
discipline_datasci_purpose_gdf = discipline_datasci_df.groupby(by='discipline')

In [23]:
# For every 'discipline' group, print a '# <discipline>' heading followed by
# the frequency of each tool within that group.
for discipline, discipline_df in discipline_datasci_purpose_gdf:
    print('#', discipline)
    print(discipline_df['tool'].value_counts())
    
    # Optional per-discipline bar chart; uncomment to draw it. It uses this
    # loop's own variables so it plots the group currently being printed.
    #discipline_df['tool'].value_counts().plot(kind='bar')
    #plt.title(discipline)
    #plt.ylabel('frequency')
    #plt.show()

    # Blank lines to separate one group's listing from the next.
    print('\n')

# astronomy
S2PLOT        1
Swarp         1
TOPCAT        1
Sextractor    1
IRAF          1
Name: tool, dtype: int64


# bioinformatics
Bioconductor    1
Name: tool, dtype: int64


# biological image processing
Quantity One    1
Name: tool, dtype: int64


# biomedical
XLSTAT-Biomed    1
Galaxy           1
Name: tool, dtype: int64


# chemistry
DS Visualizer      1
moldraw            1
Aspen Plus         1
FINDSYM            1
RDKit              1
OpenEye toolkit    1
Autodock           1
Jmol               1
ChemOffice         1
GDIS               1
Name: tool, dtype: int64


# climate
CDO     4
iris    1
NCL     1
Name: tool, dtype: int64


# economics
EViews    1
Name: tool, dtype: int64


# epidemiology
EpiData    1
Name: tool, dtype: int64


# flow cytometry
FACSDIVA    2
FlowJo      1
Name: tool, dtype: int64


# genomics
Artemis                    2
PGD Spider                 2
Arlequin                   2
IGV                        2
VMD                        2
DnaSP           

In [24]:
# Frequency of each 'cost' value; rows with a missing value are left out
# (dropna=True is the pandas default, spelled out here).
discipline_datasci_df['cost'].value_counts(dropna=True)

free    112
pay      25
Name: cost, dtype: int64

In [25]:
# Frequency of each 'source code' value; rows with a missing value are left
# out (dropna=True is the pandas default, spelled out here).
discipline_datasci_df['source code'].value_counts(dropna=True)

closed    70
open      66
Name: source code, dtype: int64

In [26]:
# Frequency of each 'user interface' value; rows with a missing value are left
# out (dropna=True is the pandas default, spelled out here).
discipline_datasci_df['user interface'].value_counts(dropna=True)

graphical                     84
command line                  25
graphical and command line    23
website                        4
Name: user interface, dtype: int64

### Support tools

TODO: Add a category for "none" in the version control section (i.e. people who identify as using a programming language but aren't using version control)

In [27]:
# Load the derived table of support tools listed per person.
support_tool_df = pandas.read_csv(
    '../data/derived/people_support_tools.csv',
    header=0,
    encoding='utf-8-sig',
)

In [28]:
# Group by the (category, task) pair, so each group key is a 2-tuple.
support_tool_gdf = support_tool_df.groupby(by=['category', 'task'])

In [29]:
# For every (category, task) group, print a '# category: task' heading and
# then the frequency of each tool within that group.
for category_task, task_df in support_tool_gdf:
    # The group key is a (category, task) pair; join it into one heading.
    print('#', ': '.join(category_task))
    print(task_df['tool'].value_counts())
    print('\n')
    # Optional per-task bar chart; uncomment to draw it.
    #task_df['tool'].value_counts().plot(kind='bar')
    #plt.title(category_task[0] + ': ' + category_task[1])
    #plt.ylabel('frequency')
    #plt.show()

# 3D design: molecular visualisation
PyMOL                  1
Molsoft ICM-Browser    1
Name: tool, dtype: int64


# code development: editors
Sublime         9
Vim             8
Notepad++       8
TextWrangler    4
Atom            2
Nedit           2
JEdit           1
LiveCode        1
Emacs           1
GEdit           1
Kate            1
Name: tool, dtype: int64


# code development: interactive development environments
RStudio             26
Jupyter notebook     5
IDLE                 1
Spyder               1
Name: tool, dtype: int64


# data collection: audio editing/transcribing
Express Scribe    1
Audacity          1
Name: tool, dtype: int64


# data collection: behavioural observation
Observer XT    1
Name: tool, dtype: int64


# data collection: sound
Raven    1
Name: tool, dtype: int64


# data collection: surveys
Google Forms     3
Qualtrics        2
REDCap           2
Inquisit         2
Survey Monkey    1
Limesurvey       1
Name: tool, dtype: int64


# data collection: video e