# Imports

In [1]:
import numpy as np
import pandas as pd
from scipy.signal import find_peaks
from scipy.spatial.distance import cdist

from khan_helpers import Experiment
from khan_helpers.functions import _ts_to_sec, get_top_words, show_source

Experiment & Participant classes, helper functions, and variables used across multiple notebooks can be found in `/opt/conda/lib/python3.9/site-packages/khan_helpers`, or on GitHub, [here](https://github.com/contextlab/efficient-learning-khan/tree/master/code/khan_helpers).<br />You can also view source code directly from the notebook with:<br /><pre>    from khan_helpers.functions import show_source<br />    show_source(foo)</pre>

# Show functions defined in `khan_helpers`

In [2]:
show_source(_ts_to_sec)

In [3]:
show_source(get_top_words)

# Load data

In [106]:
exp = Experiment()
questions = exp.questions

# Supplementary Table 1

In [107]:
questions.index.name = 'ID'
questions.columns = ['Question set', 
                     'Question', 
                     'Correct response', 
                     'Alternative 1', 
                     'Alternative 2', 
                     'Alternative 3']
questions['Question set'] = questions['Question set'].replace({1: 'FFF', 
                                                               2: 'BoS', 
                                                               0: 'GPK'})

col_format = '|r|p{0.375in}|p{1.275in}|p{0.75in}|p{0.75in}|p{0.75in}|p{0.75in}|'
questions.head()

Unnamed: 0_level_0,Question set,Question,Correct response,Alternative 1,Alternative 2,Alternative 3
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,FFF,Why is the gravitational attraction between yo...,Neither you nor your computer has enough mass ...,You and your computer are too close for the gr...,Humans are too small to detect the force of gr...,The gravitational attraction between you and y...
2,FFF,Which of the following is an example of the We...,A neutron in a radioactive Cesium atom is conv...,Light from the sun collides with a satellite o...,Two protons bound together in a Helium nucleus...,A distant galaxy exerts a small but detectable...
3,FFF,Roughly how many times stronger is the Weak In...,10000000000000000000000000,10,1000000,The Weak Interaction is less strong than gravity
4,FFF,Why don't you and your computer experience any...,The weak interaction only acts over extremely ...,The weak interaction between you and your comp...,You and your computer have no net charge,Neither you nor your computer has enough mass ...
5,FFF,Which of the following is a difference between...,Gravity is only ever attractive while the elec...,Gravity is a much more powerful force than ele...,Gravity can only act over large distances whil...,The electromagnetic force can only act over sm...


In [113]:
colnames_bold = [f'\\textbf{{{c}}}' for c in questions.columns]

with pd.option_context("max_colwidth", 300):
    latex_code = questions.to_latex(column_format=col_format, 
                                    header=colnames_bold, 
                                    escape=False)
    print(latex_code
          .replace(r'\\', r'\\\hline')
          .replace('^', r'\textasciicircum ')
          .replace(r'\toprule''\n', '')
          .replace(r'\midrule''\n', '')
          .replace(r'\bottomrule''\n', ''))

\begin{tabular}{|r|p{0.375in}|p{1.275in}|p{0.75in}|p{0.75in}|p{0.75in}|p{0.75in}|}
{} & \textbf{Question set} &                                                                                                                                                                                                                                                                \textbf{Question} &                                                                                                             \textbf{Correct response} &                                                                                \textbf{Alternative 1} &                                                                                                     \textbf{Alternative 2} &                                                                                                            \textbf{Alternative 3} \\\hline
ID &                       &                                                                                 

# Supplementary Table 2

In [7]:
top_words = pd.DataFrame(get_top_words(exp.fit_cv, exp.fit_lda)).T
top_words.columns = range(1, len(top_words.columns) + 1)
top_words.index = range(1, len(top_words.index) + 1)
top_words.columns.name = 'Topic'
top_words = top_words.drop(9)
top_words

Topic,1,2,3,4,5,6,7,8,9,10
1,star,helium,main,mass,atomic,sequence,get,energy,fuse,hydrogen
2,charge,force,mass,gravity,strong,attract,large,strength,distance,electromagnetic
3,huge,force,electromagnetic,macro,way,scale,concentration,apply,kind,charge
4,atom,dense,go,hydrogen,slow,get,huge,condense,mass,would
5,fusion,get,threshold,core,occur,mass,something,start,several,jupiter
6,enough,ignition,proton,force,get,close,nucleus,coulomb,fusion,would
7,energy,pressure,ignition,mass,little,keep,provide,fusion,get,hydrogen
8,proton,weak,neutron,interaction,one,go,nucleon,cesium,extra,get
10,huge,cloud,space,float,imagine,hydrogen,atom,say,distance,combine
11,one,hydrogen,helium,go,proton,neutron,keep,atomic,detail,fuse


In [105]:
top_words.index.name = top_words.columns.name
top_words.columns.name = None
top_words = top_words.reset_index()
colnames_bold = [f'\\textbf{{{c}}}' for c in top_words.columns]

latex_code = top_words.to_latex(index=False, 
                                column_format='|r|l|l|l|l|l|l|l|l|l|l|', 
                                escape=False, 
                                header=colnames_bold)
print(latex_code.replace(r'\\', r'\\\hline')
      .replace(r'\toprule''\n', '')
      .replace(r'\midrule''\n', '')
      .replace(r'\bottomrule''\n', ''))

\begin{tabular}{|r|l|l|l|l|l|l|l|l|l|l|}
\textbf{index} & \textbf{Topic} & \textbf{1} & \textbf{2} &      \textbf{3} &  \textbf{4} & \textbf{5} & \textbf{6} &    \textbf{7} &  \textbf{8} & \textbf{9} &     \textbf{10} \\\hline
             0 &              1 &       star &     helium &            main &        mass &     atomic &   sequence &           get &      energy &       fuse &        hydrogen \\\hline
             1 &              2 &     charge &      force &            mass &     gravity &     strong &    attract &         large &    strength &   distance & electromagnetic \\\hline
             2 &              3 &       huge &      force & electromagnetic &       macro &        way &      scale & concentration &       apply &       kind &          charge \\\hline
             3 &              4 &       atom &      dense &              go &    hydrogen &       slow &        get &          huge &    condense &       mass &           would \\\hline
             4 &             

# Supplementary Table 3

In [84]:
# quiz question topic vectors
forces_qs = exp.get_question_vecs(lectures='forces')
bos_qs = exp.get_question_vecs(lectures='bos')

forces_qcorrs = 1 - cdist(exp.forces_traj, forces_qs, 'correlation')
bos_qcorrs = 1 - cdist(exp.bos_traj, bos_qs, 'correlation')

forces_transcript_lines = exp.forces_transcript.splitlines()
forces_transcript_text = np.array(forces_transcript_lines[1::2])
forces_timestamps = np.array(forces_transcript_lines[::2])
forces_timestamps_float = np.fromiter(map(_ts_to_sec, forces_timestamps), 
                                      dtype=float)

bos_transcript_lines = exp.bos_transcript.splitlines()
bos_transcript_text = np.array(bos_transcript_lines[1::2])
bos_timestamps = np.array(bos_transcript_lines[::2])
bos_timestamps_float = np.fromiter(map(_ts_to_sec, bos_timestamps), dtype=float)

In [85]:
MIN_PROMINENCE = 0.1
MIN_WIDTH = 15      # seconds
REL_HEIGHT = 0.5    # default

In [86]:
matched_text_df = questions.query('`Question set` in ("FFF", "BoS")')[['Question set', 'Question']]
matched_text_df['Matched lecture transcript text'] = ''

In [87]:
for q_ix, tpt_corrs in enumerate(forces_qcorrs.T, start=1):    
    # pad the correlation timeseries with 0's on both sides so peak 
    # detection works for peaks at beinning and end. Round to 0.01 to 
    # remove some noise within peaks
    tpt_corrs_0pad = np.concatenate(
        ([tpt_corrs.min()], tpt_corrs, [tpt_corrs.min()])
    ).round(2)
    # find peaks in correlation timeseries
    peaks, peak_data = find_peaks(tpt_corrs_0pad, 
                                  prominence=MIN_PROMINENCE, 
                                  width=MIN_WIDTH, 
                                  rel_height=REL_HEIGHT)
    if len(peaks) == 0:
        continue
    
    # get timepoints where contour line at width evaluation height 
    # (REL_HEIGHT * peak's prominence) intersects with timeseries curve.
    # Intersections returned by find_peaks are interpolated, so round 
    # "away from" peak to get "full" timepoint.
    # Subtract 1 to correct for 0-padding timeseries, but account for 
    # intersections at x < 1 being rounded down already.
    left_tpts = [max(np.floor(i) - 1, 0) for i in peak_data['left_ips']]
    right_tpts = [np.ceil(i) - 1 for i in peak_data['right_ips']]
    
    # merge overlapping intervals
    tpt_intervals = [[left_tpts[0], right_tpts[0]]]
    for (onset_tpt, offset_tpt) in zip(left_tpts[1:], right_tpts[1:]):
        if tpt_intervals[-1][0] <= onset_tpt <= tpt_intervals[-1][1]:
            tpt_intervals[-1][1] = max(tpt_intervals[-1][1], offset_tpt)
        else:
            tpt_intervals.append([onset_tpt, offset_tpt])
            
    lecture_text_samples = []
    for (onset_tpt, offset_tpt) in tpt_intervals:
        # get text from corresponding period of lecture transcript
        text_ixs = np.where((forces_timestamps_float >= onset_tpt) & 
                            (forces_timestamps_float <= offset_tpt))[0]
        transcript_text = ' '.join(forces_transcript_text[text_ixs])
    
        # get timestamps in 'MM:SS[.ss]' format
        onset_timestamp = forces_timestamps[text_ixs[0]]
        offset_timestamp = forces_timestamps[text_ixs[-1]]
        
        transcript_text_fmt = f'\\textbf{{[{onset_timestamp}--{offset_timestamp}]}} '
        if onset_tpt != 0:
            transcript_text_fmt = f'{transcript_text_fmt}...'
        transcript_text_fmt = f'{transcript_text_fmt}{transcript_text}'
        if offset_tpt != len(forces_transcript_text) - 1:
            transcript_text_fmt = f'{transcript_text_fmt}...'
        
        lecture_text_samples.append(transcript_text_fmt)
        
    matched_text_df.loc[
        q_ix, 'Matched lecture transcript text'
    ] = '\newline'.join(lecture_text_samples)

In [88]:
# repeat for Birth of Stars lecture
for q_ix, tpt_corrs in enumerate(bos_qcorrs.T, start=16):    
    tpt_corrs_0pad = np.concatenate(
        ([tpt_corrs.min()], tpt_corrs, [tpt_corrs.min()])
    ).round(2)
    peaks, peak_data = find_peaks(tpt_corrs_0pad, 
                                  prominence=MIN_PROMINENCE, 
                                  width=MIN_WIDTH, 
                                  rel_height=REL_HEIGHT)
    if len(peaks) == 0:
        continue

    left_tpts = [max(np.floor(i) - 1, 0) for i in peak_data['left_ips']]
    right_tpts = [np.ceil(i) - 1 for i in peak_data['right_ips']]
    
    tpt_intervals = [[left_tpts[0], right_tpts[0]]]
    for (onset_tpt, offset_tpt) in zip(left_tpts[1:], right_tpts[1:]):
        if tpt_intervals[-1][0] <= onset_tpt <= tpt_intervals[-1][1]:
            tpt_intervals[-1][1] = max(tpt_intervals[-1][1], offset_tpt)
        else:
            tpt_intervals.append([onset_tpt, offset_tpt])
            
    lecture_text_samples = []
    for (onset_tpt, offset_tpt) in tpt_intervals:
        text_ixs = np.where((bos_timestamps_float >= onset_tpt) & 
                            (bos_timestamps_float <= offset_tpt))[0]
        transcript_text = ' '.join(bos_transcript_text[text_ixs])
    
        onset_timestamp = bos_timestamps[text_ixs[0]]
        offset_timestamp = bos_timestamps[text_ixs[-1]]
        
        transcript_text_fmt = f'\\textbf{{[{onset_timestamp}--{offset_timestamp}]}} '
        if onset_tpt != 0:
            transcript_text_fmt = f'{transcript_text_fmt}...'
        transcript_text_fmt = f'{transcript_text_fmt}{transcript_text}'
        if offset_tpt != len(bos_transcript_text) - 1:
            transcript_text_fmt = f'{transcript_text_fmt}...'
        
        lecture_text_samples.append(transcript_text_fmt)
        
    matched_text_df.loc[
        q_ix, 'Matched lecture transcript text'
    ] = '\newline'.join(lecture_text_samples)

In [89]:
matched_text_df

Unnamed: 0_level_0,Question set,Question,Matched lecture transcript text
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,FFF,Why is the gravitational attraction between yo...,\textbf{[00:00.03--01:33.509]} what I want to ...
2,FFF,Which of the following is an example of the We...,\textbf{[01:22.439--03:05.1]} ...this is maybe...
3,FFF,Roughly how many times stronger is the Weak In...,\textbf{[02:52.38--04:43.73]} ...anti-electron...
4,FFF,Why don't you and your computer experience any...,\textbf{[00:00.03--02:42.9]} what I want to do...
5,FFF,Which of the following is a difference between...,\textbf{[04:34.28--05:52.61]} ...times the str...
6,FFF,Electricity and magnetism can be shown to be t...,\textbf{[04:34.28--05:52.61]} ...times the str...
7,FFF,Which of the following are the primary two fun...,\textbf{[00:00.03--01:33.509]} what I want to ...
8,FFF,Why does the universe have a very uneven distr...,\textbf{[08:24.11--10:26.27]} ...just run away...
9,FFF,"In your body, there are a tremendous amount of...",\textbf{[00:00.03--01:33.509]} what I want to ...
10,FFF,Which of the following is a similarity between...,\textbf{[02:52.38--04:43.73]} ...anti-electron...


In [104]:
colnames_bold = [f'\\textbf{{{c}}}' for c in matched_text_df.columns]
with pd.option_context('max_colwidth', 5000):
    latex_code = matched_text_df.to_latex(column_format='|r|p{0.375in}|p{1.275in}|p{3.5in}|', 
                                          escape=False, 
                                          header=colnames_bold)
print(latex_code
      .replace(r'\\', r'\\\hline')
      .replace(r'\toprule''\n', '')
      .replace(r'\midrule''\n', '')
      .replace(r'\bottomrule''\n', ''))

\begin{tabular}{|r|p{0.375in}|p{1.275in}|p{3.5in}|}
{} & \textbf{Question set} &                                                                                                                                                                                                                                                                \textbf{Question} &                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     