# Analysis

This file contains the public data analysis for "Characterizing the US Research Computing and Data (RCD) Workforce."

In [1]:
library(tidyverse)
library(janitor)

── Attaching packages ────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

✔ ggplot2 3.3.6     ✔ purrr   0.3.4
✔ tibble  3.1.7     ✔ dplyr   1.0.9
✔ tidyr   1.2.0     ✔ stringr 1.4.0
✔ readr   2.1.2     ✔ forcats 0.5.1

── Conflicts ───────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


Attaching package: ‘janitor’


The following objects are masked from ‘package:stats’:

    chisq.test, fisher.test




## Data

In [2]:
# Read each shared CSV extract from data_to_share/ into its own data frame.
all_freq <- read_csv(
  file = "data_to_share/var_frequencies_all_respondents.csv"
)
institutions <- read_csv(
  file = "data_to_share/institutions_all.csv"
)
all_edu_work <- read_csv(
  file = "data_to_share/education_work_all_respondents.csv"
)
rcd_edu_work <- read_csv(
  file = "data_to_share/education_work_rcd_respondents.csv"
)
gender <- read_csv(
  file = "data_to_share/gender_counts.csv"
)
ethnicity <- read_csv(
  file = "data_to_share/ethnicity_counts.csv"
)
jobs <- read_csv(
  file = "data_to_share/job_titles.csv"
)
facings <- read_csv(
  file = "data_to_share/facings.csv"
)
satisfaction <- read_csv(
  file = "data_to_share/satisfaction_rcd_respondents.csv"
)
inclusion <- read_csv(
  file = "data_to_share/inclusion.csv"
)

Rows: 150 Columns: 4
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): variable, category
dbl (2): n, prop

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 563 Columns: 6
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (5): employer_type, institution_type, institution_code, institution_comb...
lgl (1): academic_institution

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

## 3: SURVEYING THE WORKFORCE

Institution Counts

In [6]:
# Number of distinct institutions in the data.
# unique() keeps NA as its own value, so length(unique(x)) would count a
# missing institution code as one more institution; na.rm = TRUE leaves it out.
n_distinct(institutions$institution_code, na.rm = TRUE)

In [7]:
# Frequency table of institution_type, keeping a single row per
# institution_code so each institution is tallied once.
tabyl(
  distinct(institutions, institution_code, .keep_all = TRUE),
  institution_type
)

Unnamed: 0_level_0,institution_type,n,percent,valid_percent
Unnamed: 0_level_1,<chr>,<int>,<dbl>,<dbl>
1,Academic Computing Center,5,0.028571429,0.02873563
2,Company,8,0.045714286,0.04597701
3,Government,5,0.028571429,0.02873563
4,Medical,10,0.057142857,0.05747126
5,Other,7,0.04,0.04022989
6,Other Academic,18,0.102857143,0.10344828
7,R1,91,0.52,0.52298851
8,R2,30,0.171428571,0.17241379
9,,1,0.005714286,


In [8]:
# Rows per institution code (rows with a missing code dropped first), then
# the median and maximum of those per-institution counts.
institutions %>%
  drop_na(institution_code) %>%
  count(institution_code) %>%
  summarize(median(n), max(n))

median(n),max(n)
<dbl>,<int>
2,26


R1/R2

In [9]:
# Within academic institutions only:
#   R1_respondents - fraction of rows whose institution_type is "R1"
#   R1_inst        - fraction of distinct institution codes that are "R1"
institutions %>%
  filter(academic_institution) %>%
  mutate(is_r1 = institution_type == "R1") %>%
  summarize(
    R1_respondents = sum(is_r1) / n(),
    R1_inst = n_distinct(institution_code[is_r1]) /
      n_distinct(institution_code)
  )

R1_respondents,R1_inst
<dbl>,<dbl>
0.8163717,0.6319444


Respondents

In [10]:
# Frequency rows for the rcd_employed variable.
all_freq %>%
  filter(variable == "rcd_employed")

variable,category,n,prop
<chr>,<chr>,<dbl>,<dbl>
rcd_employed,No/No Answer,24,0.04262877
rcd_employed,Yes,539,0.95737123


In [11]:
# Share of TRUE values in academic_institution among the non-missing entries
# of rcd_edu_work.
with(
  rcd_edu_work,
  sum(academic_institution, na.rm = TRUE) / sum(!is.na(academic_institution))
)

## 4.1 Demographics

### Age

In [12]:
# Frequency rows for the age variable.
all_freq %>%
  filter(variable == "age")

variable,category,n,prop
<chr>,<chr>,<dbl>,<dbl>
age,20 - 24,11,0.01953819
age,25 - 34,96,0.1705151
age,35 - 44,167,0.29662522
age,45 - 54,171,0.30373002
age,55 - 64,99,0.17584369
age,65 - 74,19,0.03374778


### Race/Ethnicity

In [14]:
# Frequency rows for the ethnicity1 variable.
all_freq %>%
  filter(variable == "ethnicity1")

variable,category,n,prop
<chr>,<chr>,<dbl>,<dbl>
ethnicity1,Asian,59,0.10479574
ethnicity1,Black or African American,10,0.01776199
ethnicity1,Multiple/Native Hawaiian or Pacific Islander/Jewish/Middle Eastern/Mediterranean/Other Unspecified,15,0.02664298
ethnicity1,White (Hispanic),9,0.01598579
ethnicity1,White (Not Hispanic),435,0.77264654
ethnicity1,,35,0.06216696


In [15]:
# Frequency rows for the hispanic variable.
all_freq %>%
  filter(variable == "hispanic")

variable,category,n,prop
<chr>,<chr>,<dbl>,<dbl>
hispanic,False,516,0.91651865
hispanic,True,13,0.02309059
hispanic,,34,0.06039076
