# Imports

In [None]:
import warnings

import numpy as np
import pandas as pd
from dateutil.parser import UnknownTimezoneWarning

from khan_helpers.constants import RAW_DIR

# Load & format data

In [None]:
# load the raw demographics survey responses
demographics = pd.read_csv(RAW_DIR.joinpath('demographics.csv'))

# convert "Birth year" to "Age": the year of each response's "Timestamp"
# minus the reported birth year
with warnings.catch_warnings():
    # ignore UnknownTimezoneWarning due to local timezone not being
    # defined in container
    warnings.filterwarnings('ignore', category=UnknownTimezoneWarning)
    demographics['Age'] = (demographics['Timestamp']
                           .astype('datetime64[ns]')
                           .apply(lambda x: x.year)) - demographics['Birth year']
demographics.drop('Birth year', axis=1, inplace=True)

# 1 participant responded "Yes" when asked whether they had viewed any
# Khan Academy lectures in the past, but then reported the total number
# of lectures they had viewed as 0, and also reported having viewed 0
# lectures from each individual category. For reporting demographic
# data, we assume this initial "Yes" was a mistake that the participant
# then attempted to correct with their answers to subsequent questions
demographics.loc[10, 'Khan Academy user'] = 'No'
demographics.loc[10, 'Khan courses watched':'Watched Birth of Stars'] = np.nan

# encode alertness responses as numeric values. replace() alone leaves an
# object-dtype column on pandas versions that no longer downcast silently,
# so infer_objects() restores a numeric dtype whenever every remaining
# value is numeric (required for numeric summary statistics from
# describe())
demographics['Alertness'] = demographics['Alertness'].replace({
    'Very sluggish': -2,
    'A little sluggish': -1,
    'Neutral': 0,
    'Fairly alert': 1,
    'Very alert': 2
}).infer_objects()

# correct entries that Google Forms automatically converted to dates
demographics['Hours of sleep'] = demographics['Hours of sleep'].replace({
    '0-2': '0 -- 2',
    '4-Feb': '2 -- 4',
    '6-Apr': '4 -- 6',
    '8-Jun': '6 -- 8',
})
demographics['Khan courses watched'] = demographics['Khan courses watched'].replace({
    '2-Jan': '1 -- 2',
    '5-Mar': '3 -- 5',
    '10-May': '5 -- 10'
})

# combine "undecided" & "Currently undecided"; remove long
# parentheticals from options. E.g.:
#   "Social sciences (e.g. psychology, sociology, economics, political
#   science, linguistics, anthropology, archaeology, etc.)" -->
#   "Social sciences"
# The pattern is a raw string so that "\(" and "\)" reach the regex engine
# as escaped parentheses instead of being (invalid) string escapes.
demographics['Undergraduate major (category)'] = demographics['Undergraduate major (category)'].replace({
    r' \([^;]+\)': '',
    'Currently undecided': 'undecided'
}, regex=True)

# preview the cleaned table
demographics.head()

# Native English speakers

In [None]:
# tally of each distinct 'Native English' response
demographics.value_counts(subset='Native English')

# Hearing impairments

In [None]:
# tally of each distinct 'Hearing impairments' response
demographics.loc[:, 'Hearing impairments'].value_counts()

# Normal color vision

In [None]:
# tally of each distinct 'Normal color vision' response
demographics.loc[:, 'Normal color vision'].value_counts()

# Impaired focus

In [None]:
# tally of each distinct 'Impaired focus' response
demographics.loc[:, 'Impaired focus'].value_counts()

# Age

In [None]:
# summary statistics for the derived 'Age' column
demographics.loc[:, 'Age'].describe()

# Gender

In [None]:
# tally of each distinct 'Gender' response
demographics.loc[:, 'Gender'].value_counts()

# Ethnicity

In [None]:
# tally of each distinct 'Ethnicity' response
demographics.loc[:, 'Ethnicity'].value_counts()

# Race

In [None]:
# 'Race' responses may hold several ';'-separated selections: split them
# into separate columns, pool all selections into one Series, and tally
# each selection individually
(
    demographics.loc[:, 'Race']
    .str.split(pat=';', expand=True)
    .stack()
    .value_counts()
)

# Highest degree achieved

In [None]:
# tally of each distinct 'Highest degree' response
demographics.loc[:, 'Highest degree'].value_counts()

# Undergraduate major (category)

In [None]:
# responses may hold several ';'-separated major categories: split them,
# pool all selections into one Series, and tally each category individually
(
    demographics.loc[:, 'Undergraduate major (category)']
    .str.split(pat=';', expand=True)
    .stack()
    .value_counts()
)

# Sleep

In [None]:
# tally 'Hours of sleep' responses, ordered by response label rather than
# by frequency
(
    demographics.loc[:, 'Hours of sleep']
    .value_counts()
    .sort_index()
)

# Coffee consumption

In [None]:
# tally 'Cups of coffee' responses, ordered by response label rather than
# by frequency
(
    demographics.loc[:, 'Cups of coffee']
    .value_counts()
    .sort_index()
)

# Alertness

In [None]:
# summary statistics for the 'Alertness' column
demographics.loc[:, 'Alertness'].describe()

# Khan Academy exposure

In [None]:
# tally of each distinct 'Khan Academy user' response
demographics.loc[:, 'Khan Academy user'].value_counts()

In [None]:
# Tally 'Khan courses watched' responses and reorder the rows for display.
# The tally is indexed by the response labels (strings), so the reordering
# is positional and must go through .iloc; plain [] with integer keys
# depends on a deprecated positional fallback.
# NOTE: the positions refer to the frequency-sorted order returned by
# value_counts(), so this ordering is tied to the counts in the data.
demographics['Khan courses watched'].value_counts().iloc[[3, 1, 2, 0]]

In [None]:
# Khan Academy math subjects watched
# A row is selected if its 'Math courses' answer is present and is anything
# other than 'None of the above', or if its 'Khan subjects watched' answer
# mentions 'Math'. Missing answers count as "no" for both conditions.
watched_any_math = (
    demographics['Math courses'].fillna('None of the above') != 'None of the above'
) | demographics['Khan subjects watched'].fillna('').str.contains('Math')

# 'Math courses' may hold several ';'-separated selections: tally each one
(
    demographics.loc[watched_any_math, 'Math courses']
    .str.split(pat=';', expand=True)
    .stack()
    .value_counts()
)

In [None]:
# Khan Academy science & engineering subjects watched
# A row is selected if its 'Science and engineering courses' answer is
# present and is anything other than 'None of the above', or if its
# 'Khan subjects watched' answer mentions 'Science & engineering'. Missing
# answers count as "no" for both conditions.
watched_any_science = (
    demographics['Science and engineering courses'].fillna('None of the above')
    != 'None of the above'
) | demographics['Khan subjects watched'].fillna('').str.contains('Science & engineering')

# the course column may hold several ';'-separated selections: tally each one
(
    demographics.loc[watched_any_science, 'Science and engineering courses']
    .str.split(pat=';', expand=True)
    .stack()
    .value_counts()
)

In [None]:
# Watched "Four Fundamental Forces"?
# NOTE(review): the 'Fources' spelling in the column name presumably mirrors
# the header in the raw data -- confirm before changing it
demographics.loc[:, 'Watched Four Fundamental Fources'].value_counts()

In [None]:
# Watched "Birth of Stars"? -- tally of each distinct response
demographics.loc[:, 'Watched Birth of Stars'].value_counts()

# Non-Khan Academy courses (online)

In [None]:
# responses may hold several ';'-separated subjects: split them, pool all
# selections into one Series, and tally each subject individually
(
    demographics.loc[:, 'Other platform subjects']
    .str.split(pat=';', expand=True)
    .stack()
    .value_counts()
)

# In-person courses

In [None]:
# responses may hold several ';'-separated subjects: split them, pool all
# selections into one Series, and tally each subject individually
(
    demographics.loc[:, 'In-person subjects']
    .str.split(pat=';', expand=True)
    .stack()
    .value_counts()
)