In [43]:
import pandas
import matplotlib.pyplot as plt
import seaborn

In [44]:
%matplotlib inline

In [45]:
# Configure seaborn's plot style for the figures in this notebook.
# NOTE(review): called without a style name, so this presumably leaves the
# current matplotlib settings unchanged — confirm whether a named style
# (e.g. 'darkgrid') was intended.
seaborn.set_style()

In [46]:
def count_people(df):
    """Replace the comma-separated ``people`` column with a head count.

    Args:
        df: DataFrame with a ``people`` column whose entries are either
            comma-separated strings or missing values (NaN/None).

    Returns:
        The same DataFrame object, with ``people`` overwritten in place by
        integer counts. Missing entries count as 0.
    """

    def _head_count(entry):
        # A missing cell means nobody is listed for that row.
        if pandas.isnull(entry):
            return 0
        return len(entry.split(','))

    # Assign the whole column at once. Writing to the row objects yielded by
    # ``DataFrame.iterrows()`` is not guaranteed to reach the frame: they can
    # be copies, in which case the counts are silently discarded.
    df['people'] = df['people'].map(_head_count).astype(int)

    return df

## Demographics

### Field of research

In [47]:
# Read the ANZSRC research-group table and convert each comma-separated
# 'people' entry into a head count in a single step.
research_field_df = count_people(
    pandas.read_csv(
        '../data/anzsrc_research_groups.csv',
        header=0,
        encoding='utf-8-sig',
    )
)

In [48]:
# Group the research-group rows by their parent research division.
research_divisions_gdf = research_field_df.groupby(by='research_division')

In [49]:
# Total head count per research division, largest first.
# Select 'people' before aggregating so only that column is summed (the
# string columns are never needed), and sort without inplace so the cell
# builds its result from the grouped data in one expression.
research_divisions_totals = (
    research_divisions_gdf['people']
    .sum()
    .sort_values(ascending=False)
)
print(research_divisions_totals.to_string())

research_division
biological sciences                           45
medical and health sciences                   39
psychology and cognitive sciences             12
engineering                                   10
information and computing sciences             8
studies in human society                       7
physical sciences                              6
language, communication and culture            6
environmental sciences                         6
earth sciences                                 6
economics                                      5
mathematical sciences                          5
history and archaeology                        4
commerce, management, tourism and services     4
built environment and design                   3
agricultural and veterinary sciences           3
education                                      3
chemical sciences                              2
studies in creative arts and writing           1
law and legal studies                          0
ph

In [50]:
# Research groups within 'biological sciences', most people first.
(
    research_field_df
    .loc[research_field_df['research_division'] == 'biological sciences']
    .sort_values('people', ascending=False)
)

Unnamed: 0,research_group,research_division,people
35,genetics,biological sciences,14
33,ecology,biological sciences,12
32,biochemistry and cell biology,biological sciences,9
39,zoology,biological sciences,7
34,evolutionary biology,biological sciences,2
37,physiology,biological sciences,1
36,microbiology,biological sciences,0
38,plant biology,biological sciences,0
40,other biological sciences,biological sciences,0


In [51]:
# Research groups within 'medical and health sciences', most people first.
(
    research_field_df
    .loc[research_field_df['research_division'] == 'medical and health sciences']
    .sort_values('people', ascending=False)
)

Unnamed: 0,research_group,research_division,people
89,neurosciences,medical and health sciences,14
97,public health and health services,medical and health sciences,6
86,human movement and sports science,medical and health sciences,4
92,oncology and carcinogenesis,medical and health sciences,4
93,ophthalmology and optometry,medical and health sciences,3
83,clinical sciences,medical and health sciences,3
87,immunology,medical and health sciences,3
88,medical microbiology,medical and health sciences,1
95,pharmacology and pharmaceutical sciences,medical and health sciences,1
81,medical biochemistry and metabolomics,medical and health sciences,0


### Career stage

In [52]:
# Load the people table; 'utf-8-sig' strips a leading byte-order mark if present.
people_df = pandas.read_csv(
    '../data/people.csv',
    header=0,
    encoding='utf-8-sig',
)

In [53]:
# Group the people table by the 'career stage' column.
career_stage_gdf = people_df.groupby(by='career stage')

In [54]:
# Overall row count first, then one "<career stage> <count>" line per group.
print('Total people:', len(people_df), '\n')
for career_stage, career_stage_df in career_stage_gdf:
    # shape[0] is the number of rows that fell into this career stage.
    print(career_stage, career_stage_df.shape[0])

Total people: 173 

early-career 7
honours 1
masters 13
mid-career 17
phd 109
postdoc 19
research assistant 6
senior-career 1


## Tools

### Programming languages

TODO: Add a category for "none" - for some disciplines, hardly anyone uses programming languages.

In [55]:
# Load the derived programming-language table.
programming_language_df = pandas.read_csv(
    '../data/derived/people_programming_languages.csv',
    header=0,
    encoding='utf-8-sig',
)

In [56]:
# How often each entry of the 'tool' column appears, most frequent first.
print(
    programming_language_df['tool']
    .value_counts()
    .to_string()
)

# Optional bar chart of the same counts (currently disabled):
#programming_language_df['tool'].value_counts().plot(kind='bar')
#plt.ylabel('frequency')
#plt.show()

R                67
Python           45
MATLAB           30
Unix Shell       28
C / C++          12
HTML             11
Javascript       10
CSS               7
MySQL             7
Fortran           5
Perl              2
Make              2
SQLite            2
AppleScript       1
SQL               1
Visual Basic      1
Octave            1
Clojure           1
PostgreSQL        1
PHP               1
Windows Shell     1
BigDataScript     1


### General data science tools

In [57]:
# Load the derived table of general data science tools.
general_datasci_df = pandas.read_csv(
    '../data/derived/people_general_datasci_tools.csv',
    header=0,
    encoding='utf-8-sig',
)

In [58]:
# Group the general tools by the 'purpose' column.
general_datasci_purpose_gdf = general_datasci_df.groupby(by='purpose')

In [60]:
# For every purpose, print a '# <purpose>' heading followed by the tool
# frequencies within that purpose.
for purpose, purpose_df in general_datasci_purpose_gdf:
    tool_counts = purpose_df['tool'].value_counts()
    print('#', purpose)
    print(tool_counts.to_string(), '\n')

    # Optional per-purpose bar chart (currently disabled):
    #purpose_df['tool'].value_counts().plot(kind='bar')
    #plt.title(purpose)
    #plt.ylabel('frequency')
    #plt.show()

# databases
Access                   3
MySQL Workbench          1
Navicat                  1
SQL Maestro for MySQL    1 

# file manipulation
NCO    6 

# general
GraphPad Prism    19
ggplot             7
Visio              6
Shiny              5
matplotlib         4
Gephi              4
numpy              3
dplyr              3
Mathematica        3
gnuplot            2
Tableau            2
Plotly             2
D3.js              2
JMP                2
Gnumetric          1
xarray             1
SM                 1
Origin             1
Igor Pro           1
SigmaPlot          1 

# machine learning
Weka             1
NVIDIA DIGITS    1 

# qualitative data
NVivo         14
Leximancer     3
Diogenes       1 

# spatial data
ArcGIS          19
QGIS             5
Google Earth     5
IDL              3
CARTO            1
TileMill         1
cartopy          1
SAGA             1 

# spreadsheets
Excel            70
Minitab           5
pandas            2
Google Sheets     2 

# statistical mode

In [62]:
# Frequency of each value in the 'cost' column of the general tools table.
print(
    general_datasci_df['cost']
    .value_counts()
    .to_string()
)

pay                               215
free                               67
free with advanced pay options      3


In [63]:
# Frequency of each value in the 'source code' column of the general tools table.
print(
    general_datasci_df['source code']
    .value_counts()
    .to_string()
)

closed    230
open       52


In [64]:
# Frequency of each value in the 'user interface' column of the general tools table.
print(
    general_datasci_df['user interface']
    .value_counts()
    .to_string()
)

graphical                 233
command line               47
graphical,command line      5


### Discipline specific tools

In [65]:
# Load the derived table of discipline-specific data science tools.
discipline_datasci_df = pandas.read_csv(
    '../data/derived/people_discipline_datasci_tools.csv',
    header=0,
    encoding='utf-8-sig',
)

In [66]:
# Group the discipline-specific tools by the 'discipline' column.
# (Despite "purpose" in the variable name, the grouping key is the discipline.)
discipline_datasci_purpose_gdf = discipline_datasci_df.groupby(by='discipline')

In [67]:
# For every discipline, print a '# <discipline>' heading followed by the
# tool frequencies within that discipline.
for discipline, discipline_df in discipline_datasci_purpose_gdf:
    tool_counts = discipline_df['tool'].value_counts()
    print('#', discipline)
    print(tool_counts.to_string(), '\n')

    # Optional per-discipline bar chart (currently disabled). Uses this
    # loop's own variables rather than the ones left over from the
    # per-purpose loop:
    #discipline_df['tool'].value_counts().plot(kind='bar')
    #plt.title(discipline)
    #plt.ylabel('frequency')
    #plt.show()

# astronomy
IRAF            2
TOPCAT          2
astropy         1
SAOImage DS9    1
Swarp           1
Sextractor      1
S2PLOT          1 

# bioinformatics
Bioconductor    2
biopypthon      1 

# biological image processing
BEDOPS          1
Quantity One    1 

# biomedical
XLSTAT-Biomed    1
Galaxy           1 

# chemistry
Jmol               1
Aspen Plus         1
DS Visualizer      1
FINDSYM            1
OpenEye toolkit    1
GDIS               1
ChemOffice         1
moldraw            1
RDKit              1
Autodock           1 

# climate
CDO     4
iris    1
NCL     1 

# computational linguistics
NLTK     1
spaCy    1 

# economics
EViews    1 

# epidemiology
EpiData    1 

# flow cytometry
FACSDIVA    2
FlowJo      1 

# genomics
IGV                        4
Samtools                   3
MEME                       2
Artemis                    2
bedtools                   2
PGD Spider                 2
VMD                        2
VCFtools                   2
Arlequin            

In [68]:
# Frequency of each value in the 'cost' column of the discipline tools table.
print(
    discipline_datasci_df['cost']
    .value_counts()
    .to_string()
)

free    148
pay      26


In [69]:
# Frequency of each value in the 'source code' column of the discipline tools table.
print(
    discipline_datasci_df['source code']
    .value_counts()
    .to_string()
)

open      93
closed    80


In [70]:
# Frequency of each value in the 'user interface' column of the discipline tools table.
print(
    discipline_datasci_df['user interface']
    .value_counts()
    .to_string()
)

graphical                     93
command line                  49
graphical and command line    27
website                        4


### Support tools

TODO: Add a category for "none" in the version control section (i.e. people who identify as using a programming language but aren't using version control)

In [71]:
# Load the derived table of support tools.
support_tool_df = pandas.read_csv(
    '../data/derived/people_support_tools.csv',
    header=0,
    encoding='utf-8-sig',
)

In [72]:
# Group support tools by (category, task) pairs; each group key is a 2-tuple.
support_tool_gdf = support_tool_df.groupby(by=['category', 'task'])

In [73]:
# For every (category, task) pair, print a '# <category>: <task>' heading
# followed by the tool frequencies within that pair.
for category_task, task_df in support_tool_gdf:
    tool_counts = task_df['tool'].value_counts()
    print('#', ': '.join(category_task))
    print(tool_counts.to_string(), '\n')
    # Optional per-task bar chart (currently disabled):
    #task_df['tool'].value_counts().plot(kind='bar')
    #plt.title(category_task[0] + ': ' + category_task[1])
    #plt.ylabel('frequency')
    #plt.show()

# 3D design: molecular visualisation
Molsoft ICM-Browser    1
PyMOL                  1 

# code development: editors
Sublime         16
Notepad++       11
Vim             10
TextWrangler     6
Atom             4
Nedit            3
Emacs            3
GEdit            2
LiClipse         1
JEdit            1
Kate             1
LiveCode         1 

# code development: interactive development environments
RStudio             30
Jupyter notebook     9
PyCharm              4
Visual Studio        3
Spyder               2
Mars Eclipse         1
IDLE                 1 

# data collection: audio editing/transcribing
Audacity          1
Express Scribe    1 

# data collection: behavioural observation
Observer XT    1 

# data collection: sound
Raven    1 

# data collection: surveys
Google Forms     3
REDCap           3
Survey Monkey    2
Qualtrics        2
Inquisit         2
Limesurvey       1 

# data collection: video editing
VLC                    3
Adobe Premier          2
iMovie             