# Imports

In [1]:
import warnings

import numpy as np
import pandas as pd
from dateutil.parser import UnknownTimezoneWarning

from khan_helpers.constants import RAW_DIR

Experiment & Participant classes, helper functions, and variables used across multiple notebooks can be found in `/opt/conda/lib/python3.9/site-packages/khan_helpers`, or on GitHub, [here](https://github.com/contextlab/efficient-learning-khan/tree/master/code/khan_helpers).<br />You can also view source code directly from the notebook with:<br /><pre>    from khan_helpers.functions import show_source<br />    show_source(foo)</pre>

# Load & format data

In [2]:
# Load the raw survey responses and normalize them for the summary cells
# that follow.
demographics = pd.read_csv(RAW_DIR.joinpath('demographics.csv'))

# convert "Birth year" to "Age": year of the response timestamp minus the
# reported birth year
with warnings.catch_warnings():
    # ignore UnknownTimezoneWarning due to local timezone not being
    # defined in container
    warnings.filterwarnings('ignore', category=UnknownTimezoneWarning)
    # the vectorized .dt accessor replaces a per-row lambda
    demographics['Age'] = (
        demographics['Timestamp'].astype('datetime64[ns]').dt.year
        - demographics['Birth year']
    )
# reassign instead of dropping in place so the statement has no hidden
# side effects on other references to the frame
demographics = demographics.drop(columns='Birth year')

# 1 participant responded "Yes" when asked whether they had viewed any
# Khan Academy lectures in the past, but then reported the total number
# of lectures they had viewed as 0, and also reported having viewed 0
# lectures from each individual category. For reporting demographic
# data, we assume this initial "Yes" was a mistake that the participant
# then attempted to correct with their answers to subsequent questions
demographics.loc[10, 'Khan Academy user'] = 'No'
demographics.loc[10, 'Khan courses watched':'Watched Birth of Stars'] = np.nan

# encode alertness responses as numeric values
demographics['Alertness'] = demographics['Alertness'].replace({
    'Very sluggish': -2,
    'A little sluggish': -1,
    'Neutral': 0,
    'Fairly alert': 1,
    'Very alert': 2
})

# correct entries that Google Forms automatically converted to dates
demographics['Hours of sleep'] = demographics['Hours of sleep'].replace({
    '0-2': '0 -- 2',
    '4-Feb': '2 -- 4',
    '6-Apr': '4 -- 6',
    '8-Jun': '6 -- 8',
})
demographics['Khan courses watched'] = demographics['Khan courses watched'].replace({
    '2-Jan': '1 -- 2',
    '5-Mar': '3 -- 5',
    '10-May': '5 -- 10'
})

# combine "undecided" & "Currently undecided"; remove long
# parentheticals from options. E.g.:
#   "Social sciences (e.g. psychology, sociology, economics, political
#   science, linguistics, anthropology, archaeology, etc.)" -->
#   "Social sciences"
# The pattern is a raw string so that "\(" and "\)" reach the regex
# engine without triggering Python's invalid-escape-sequence warning.
demographics['Undergraduate major (category)'] = demographics['Undergraduate major (category)'].replace({
    r' \([^;]+\)': '',
    'Currently undecided': 'undecided'
}, regex=True)

demographics.head()

Unnamed: 0.1,Unnamed: 0,posix_timestamp,Timestamp,Data share consent,Native English,Hearing impairments,Normal color vision,Impaired focus,Gender,Ethnicity,...,Khan Academy user,Khan courses watched,Khan subjects watched,Math courses,Science and engineering courses,Other platform subjects,In-person subjects,Watched Four Fundamental Fources,Watched Birth of Stars,Age
0,0,1556300000000.0,2019/04/26 12:38:19 PM EST,Yes,Yes,No,Yes,No,Female,Not Hispanic or Latino,...,Yes,10+,Math,Algebra 1;Geometry;Algebra 2;Trigonometry;Prec...,None of the above,None of the above,Math;Science & engineering;Arts & humanities;E...,No,No,19
1,1,1556560000000.0,2019/04/29 1:24:02 PM EST,Yes,Yes,No,Yes,No,Female,Not Hispanic or Latino,...,Yes,1 -- 2,Math;Science & engineering,AP Calculus AB,Physics,None of the above,Math;Science & engineering;Arts & humanities,No,No,22
2,2,1556570000000.0,2019/04/29 2:10:09 PM EST,Yes,Yes,No,Yes,No,Male,Not Hispanic or Latino,...,No,,,,,,,,,19
3,3,1556740000000.0,2019/05/01 2:12:46 PM EST,Yes,Yes,No,Yes,No,Female,Not Hispanic or Latino,...,Yes,3 -- 5,Math;Science & engineering;Economics & finance,AP Calculus AB;AP Calculus BC,AP Physics 1;Chemistry;AP Chemistry;Biology;Hi...,Economics & finance,None of the above,No,No,20
4,4,1556900000000.0,2019/05/03 12:31:51 PM EST,Yes,Yes,No,Yes,No,Female,Not Hispanic or Latino,...,Yes,10+,Math;Science & engineering;Economics & finance...,Multivariable Calculus;Differential Equations,Chemistry;AP Chemistry;AP Biology,Test prep,Math;Science & engineering;Computing;Arts & hu...,No,No,19


# Native English speakers

In [3]:
# number of participants giving each answer in the "Native English" column
demographics.value_counts(subset='Native English')

Native English
Yes    49
No      1
dtype: int64

# Hearing impairments

In [4]:
# number of participants giving each answer in "Hearing impairments"
(demographics
 .loc[:, 'Hearing impairments']
 .value_counts())

No     49
Yes     1
Name: Hearing impairments, dtype: int64

# Normal color vision

In [5]:
# number of participants giving each answer in "Normal color vision"
(demographics
 .loc[:, 'Normal color vision']
 .value_counts())

Yes    49
No      1
Name: Normal color vision, dtype: int64

# Impaired focus

In [6]:
# number of participants giving each answer in "Impaired focus"
(demographics
 .loc[:, 'Impaired focus']
 .value_counts())

No    50
Name: Impaired focus, dtype: int64

# Age

In [7]:
# summary statistics for the "Age" column
(demographics
 .loc[:, 'Age']
 .describe())

count    50.00000
mean     19.52000
std       1.09246
min      18.00000
25%      19.00000
50%      19.00000
75%      20.00000
max      22.00000
Name: Age, dtype: float64

# Gender

In [8]:
# number of participants per value of "Gender"
(demographics
 .loc[:, 'Gender']
 .value_counts())

Female    35
Male      15
Name: Gender, dtype: int64

# Ethnicity

In [9]:
# number of participants per value of "Ethnicity"
(demographics
 .loc[:, 'Ethnicity']
 .value_counts())

Not Hispanic or Latino    47
Hispanic or Latino         3
Name: Ethnicity, dtype: int64

# Race

In [10]:
# "Race" entries may hold several ';'-separated selections: split them
# into one column per selection, stack into a single Series, then tally
(demographics
 .loc[:, 'Race']
 .str.split(pat=';', expand=True)
 .stack()
 .value_counts())

White                                        31
Asian                                        14
Black or African American                     5
A mix: Indian and White                       1
Native Hawaiian or Other Pacific Islander     1
Prefer not to say                             1
dtype: int64

# Highest degree achieved

In [11]:
# number of participants per value of "Highest degree"
(demographics
 .loc[:, 'Highest degree']
 .value_counts())

Some college            33
High school graduate    16
Bachelor's degree        1
Name: Highest degree, dtype: int64

# Undergraduate major (category)

In [12]:
# entries may hold several ';'-separated major categories: split them
# apart, stack into a single Series, then tally each category
(
    demographics
    .loc[:, 'Undergraduate major (category)']
    .str.split(pat=';', expand=True)
    .stack()
    .value_counts()
)

Social sciences                28
Natural sciences               16
Professional                    8
Mathematics and engineering     7
Humanities                      4
undecided                       3
dtype: int64

# Sleep

In [13]:
# participants per "Hours of sleep" bin, ordered by bin label
(demographics
 .loc[:, 'Hours of sleep']
 .value_counts()
 .sort_index())

2 -- 4     1
4 -- 6     9
6 -- 8    35
8+         5
Name: Hours of sleep, dtype: int64

# Coffee consumption

In [14]:
# participants per "Cups of coffee" response, ordered by response label
(demographics
 .loc[:, 'Cups of coffee']
 .value_counts()
 .sort_index())

0     38
1     10
3      1
4+     1
Name: Cups of coffee, dtype: int64

# Alertness

In [15]:
# summary statistics for the "Alertness" column
(demographics
 .loc[:, 'Alertness']
 .describe())

count    50.000000
mean     -0.100000
std       0.839096
min      -2.000000
25%      -1.000000
50%       0.000000
75%       1.000000
max       1.000000
Name: Alertness, dtype: float64

# Khan Academy exposure

In [16]:
# number of participants per value of "Khan Academy user"
(demographics
 .loc[:, 'Khan Academy user']
 .value_counts())

Yes    45
No      5
Name: Khan Academy user, dtype: int64

In [17]:
# List the bins from fewest to most courses watched. Rows are selected by
# label: value_counts() orders its result by frequency, so picking rows by
# integer position would silently return the wrong bins if the counts
# changed. Bins with no responses are reported as 0.
(demographics['Khan courses watched']
 .value_counts()
 .reindex(['1 -- 2', '3 -- 5', '5 -- 10', '10+'], fill_value=0))

1 -- 2      7
3 -- 5     11
5 -- 10     8
10+        19
Name: Khan courses watched, dtype: int64

In [18]:
# Khan Academy math subjects watched
# A participant is included if they listed "Math" among the subjects they
# watched OR selected at least one specific math course. Both masks span
# the full index, so the `|` does not depend on implicit index alignment;
# a missing "Math courses" entry counts as "no course selected".
watched_any_math = (
    demographics['Khan subjects watched']
    .fillna('')
    .str
    .contains('Math', regex=False)
) | (
    demographics['Math courses'].notna()
    & demographics['Math courses'].ne('None of the above')
)

# split the ';'-separated course lists and tally each course
(demographics.loc[watched_any_math, 'Math courses']
 .str
 .split(';', expand=True)
 .stack()
 .value_counts())

AP Calculus AB              21
Precalculus                 17
Algebra 2                   14
AP Calculus BC              12
Trigonometry                11
Algebra 1                   10
Geometry                     8
Pre-algebra                  7
Multivariable Calculus       5
Differential Equations       5
None of the above            5
Statistics & Probability     4
AP Statistics                2
Linear Algebra               2
Early Math                   1
Arithmetic                   1
dtype: int64

In [19]:
# Khan Academy science & engineering subjects watched
# A participant is included if they listed "Science & engineering" among
# the subjects they watched OR selected at least one specific science and
# engineering course. Both masks span the full index, so the `|` does not
# depend on implicit index alignment; a missing course entry counts as
# "no course selected".
watched_any_science = (
    demographics['Khan subjects watched']
    .fillna('')
    .str
    .contains('Science & engineering', regex=False)
) | (
    demographics['Science and engineering courses'].notna()
    & demographics['Science and engineering courses'].ne('None of the above')
)

# split the ';'-separated course lists and tally each course
(demographics.loc[watched_any_science, 'Science and engineering courses']
 .str
 .split(';', expand=True)
 .stack()
 .value_counts())

Chemistry              12
Physics                10
AP Chemistry            8
AP Biology              7
AP Physics 1            5
Biology                 5
None of the above       5
High school Biology     3
AP Physics 2            3
Organic Chemistry       1
Health & Medicine       1
dtype: int64

In [20]:
# Watched the "Four Fundamental Forces" lecture?
# (the column header itself is spelled "Fources")
(demographics
 .loc[:, 'Watched Four Fundamental Fources']
 .value_counts())

No          44
Not sure     1
Name: Watched Four Fundamental Fources, dtype: int64

In [21]:
# Watched the "Birth of Stars" lecture?
(demographics
 .loc[:, 'Watched Birth of Stars']
 .value_counts())

No    45
Name: Watched Birth of Stars, dtype: int64

# Non-Khan Academy courses (online)

In [22]:
# entries may hold several ';'-separated subjects: split them apart,
# stack into a single Series, then tally each subject
(
    demographics
    .loc[:, 'Other platform subjects']
    .str.split(pat=';', expand=True)
    .stack()
    .value_counts()
)

None of the above        17
Math                     15
Science & engineering    11
Test prep                 9
Economics & finance       3
Arts & humanities         2
Computing                 2
dtype: int64

# In-person courses

In [23]:
# entries may hold several ';'-separated subjects: split them apart,
# stack into a single Series, then tally each subject
(
    demographics
    .loc[:, 'In-person subjects']
    .str.split(pat=';', expand=True)
    .stack()
    .value_counts()
)

Math                        38
Science & engineering       37
Arts & humanities           34
Test prep                   27
Economics & finance         26
Computing                   14
College, careers, & more     7
None of the above            6
dtype: int64