## Imports

In [3]:
import re
import numpy as np
import pandas as pd
import hypertools as hyp
from datetime import timedelta
from decimal import Decimal
from os.path import join as opj
from nltk.corpus import stopwords
from scipy.interpolate import interp1d
from scipy.spatial.distance import cdist
from scipy.stats import entropy
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_context('talk')

## Set paths

In [5]:
# all locations are relative paths, so they resolve against the directory
# the notebook kernel is started in
datadir = '../../data/'
# raw inputs (lecture transcripts, quiz questions, quiz scores) are read from here
rawdir = opj(datadir, 'raw')
# figure directory (not used by the cells visible in this part of the notebook)
figdir = '../../figures/'

## Load lecture and question data

In [6]:
# Four Forces
# timestamped lecture transcript, read in as one raw string
# (it is split into timestamp/text lines later by parse_windows)
with open(opj(rawdir, 'forces_transcript_timestamped.txt'), 'r') as f:
    ff_transcript = f.read()
    
# Birth of Stars
# same layout as the Four Forces transcript
with open(opj(rawdir, 'bos_transcript_timestamped.txt'), 'r') as f:
    bos_transcript = f.read()
    
# quiz questions
# tab-separated; column names are supplied here, and because `names` is given
# without `header`, pandas treats the first row of the file as data
questions_df = pd.read_csv(opj(rawdir, 'questions.tsv'), sep='\t', 
                           names=['index', 'lecture', 'question', 
                                  'ans_A', 'ans_B', 'ans_C', 'ans_D'], 
                           index_col='index')

# quiz scores
# indexed by the column pandas labels 'Unnamed: 0' (i.e. a column with an
# empty header in the CSV -- presumably a saved DataFrame index; TODO confirm)
scores_df = pd.read_csv(opj(rawdir, 'Graded_results_19f_49.csv'), index_col='Unnamed: 0')

## Set some parameters

In [10]:
# lecture transcript sliding window length
# (number of transcript text lines per window, see parse_windows)
lecture_wsize = 15
# stop words corpus (see https://www.aclweb.org/anthology/W18-2502.pdf)
# NOTE: every entry must be lowercase. format_text() lowercases the text before
# filtering (and CountVectorizer lowercases by default), so a capitalized entry
# such as "I'll" could never match a token and would silently be ignored.
stop_words = stopwords.words('english') + ["let", "let's", "they'd", "they're",
                                           "they've", "they'll", "that's",
                                           "i'll", "i'm"]
# tokenizer params
# (keyword arguments, presumably for the CountVectorizer imported above)
cv_params = {
    'max_df': 0.95,
    'min_df': 2,
    'max_features': 500,
    'stop_words': stop_words
}

# topic model params
# (keyword arguments, presumably for LatentDirichletAllocation; the fixed
# random_state keeps the fit reproducible across runs)
lda_params = {
    'n_components': 20,
    'learning_method': 'batch',
    'random_state': 0
}

## Define some functions

In [11]:
def format_text(windows, sw=None):
    """Apply some simple text preprocessing to a sequence of text chunks.

    Each chunk is lowercased, stripped of everything except ASCII letters,
    whitespace, apostrophes and hyphens, has its hyphens turned into spaces,
    its stop words removed and, finally, its remaining apostrophes deleted.

    Parameters
    ----------
    windows : iterable of str
        Text chunks (e.g. transcript sliding windows) to clean.
    sw : iterable of str, optional
        Stop words to remove. Matching is case-insensitive. Defaults to the
        module-level ``stop_words``, looked up when the function is called so
        that re-running the parameters cell takes effect immediately.

    Returns
    -------
    list of str
        The cleaned chunks, in order. Chunks that are empty after cleaning
        are dropped, so the result may be shorter than ``windows``.
    """
    if sw is None:
        sw = stop_words
    # The text is lowercased below, so the stop words must be too; otherwise
    # capitalized entries (e.g. "I'm") never match. A set also makes the
    # per-word membership test O(1).
    sw = {word.lower() for word in sw}

    clean_text = []
    for chunk in windows:
        # raw string: "\s" in a normal string literal is an invalid escape
        no_punc = re.sub(r"[^a-zA-Z\s'-]+", '', chunk.lower()).replace('-', ' ')
        # apostrophes are still present here so contractions such as "let's"
        # can be matched against the stop words before being collapsed
        no_stop = ' '.join(word for word in no_punc.split() if word not in sw)
        clean = re.sub("'+", '', no_stop)
        if clean:
            clean_text.append(clean)
    return clean_text

In [12]:
def _ts_to_secs(ts):
    """Convert an ``M:SS`` or ``H:MM:SS`` timestamp string to seconds (float).

    Raises ``ValueError`` if the string has a different number of fields or a
    field is not an integer.
    """
    fields = ts.strip().split(':')
    if len(fields) not in (2, 3):
        raise ValueError(
            "expected an 'M:SS' or 'H:MM:SS' timestamp, got {!r}".format(ts)
        )
    # pad with a zero-hours field so both layouts unpack the same way
    hours, mins, secs = [0] * (3 - len(fields)) + [int(field) for field in fields]
    return timedelta(hours=hours, minutes=mins, seconds=secs).total_seconds()


def parse_windows(transcript, wsize):
    """Format a lecture transcript as overlapping sliding windows.

    The windows are meant to be fed as documents to the topic model; the
    timestamps are returned alongside them for later interpolation.

    The transcript is expected to alternate between a timestamp line and a
    line of transcribed speech, starting with a timestamp. Windows are built
    over the speech lines in three phases: a ramp-up (windows anchored at the
    first line, growing from 1 to ``wsize - 1`` lines), full ``wsize``-line
    windows advancing one line at a time, and a ramp-down (windows shrinking
    as they run off the end of the transcript).

    Parameters
    ----------
    transcript : str
        Raw transcript text.
    wsize : int
        Maximum number of speech lines per window (must be >= 1).

    Returns
    -------
    windows : list of str
        Space-joined text of each window.
    timestamps : list of float
        One value per window: the mean of the timestamps (in seconds) of the
        window's first and last lines.

    Raises
    ------
    ValueError
        If ``wsize`` is smaller than 1 or a timestamp line is malformed.
    """
    if wsize < 1:
        raise ValueError('wsize must be at least 1, got {!r}'.format(wsize))

    lines = transcript.splitlines()
    text_lines = lines[1::2]
    n_lines = len(text_lines)
    # Only timestamps that are followed by a speech line are used; a dangling
    # trailing timestamp would otherwise produce an empty final window.
    ts_lines = [_ts_to_secs(l) for l in lines[0::2][:n_lines]]

    windows = []
    timestamps = []

    # ramp-up: windows anchored at the first line, growing one line at a time.
    # Capped at the transcript length so short transcripts neither index past
    # the end nor repeat the full-transcript window emitted by the loop below.
    for end in range(1, min(wsize, n_lines)):
        windows.append(' '.join(text_lines[:end]))
        timestamps.append((ts_lines[0] + ts_lines[end - 1]) / 2)

    # full-size windows, then ramp-down as the window runs off the end
    for start in range(n_lines):
        end = min(start + wsize, n_lines)
        windows.append(' '.join(text_lines[start:end]))
        timestamps.append((ts_lines[start] + ts_lines[end - 1]) / 2)

    return windows, timestamps

In [13]:
def interp_lecture(lec_traj, timestamps):
    """Interpolate a lecture trajectory to one vector per second.

    Parameters
    ----------
    lec_traj : array-like
        Trajectory to resample; its first axis must line up with
        ``timestamps`` (one entry per timestamp).
    timestamps : sequence of float
        Time in seconds of each trajectory entry, in ascending order.

    Returns
    -------
    numpy.ndarray
        The trajectory linearly interpolated at ``0, 1, 2, ...`` seconds, up
        to (but not including) the last timestamp.
    """
    lec_traj = np.asarray(lec_traj)
    timestamps = np.asarray(timestamps, dtype=float)
    new_tpts = np.arange(timestamps[-1])
    # Sampling starts at t=0, which lies before the first timestamp whenever
    # the first window is not centered on 0 s. interp1d raises on such
    # out-of-range points by default, so hold the first (and, symmetrically,
    # the last) vector constant outside the sampled range instead.
    interp_func = interp1d(timestamps, lec_traj, axis=0, bounds_error=False,
                           fill_value=(lec_traj[0], lec_traj[-1]))
    return interp_func(new_tpts)

In [None]:
def symmetric_KL(a, b, c=.00000000001):
    """Symmetrized KL divergence between ``a`` and ``b``.

    The constant ``c`` is added to both inputs before the divergence is
    computed, which keeps zero entries from producing infinite values. The
    result is the mean of the two directed divergences, KL(a || b) and
    KL(b || a), each computed with ``scipy.stats.entropy``.
    """
    p = a + c
    q = b + c
    forward = entropy(p, q)
    backward = entropy(q, p)
    return np.divide(forward + backward, 2)
    
def reconstruct_trace(video_model, questions_model, acc):
    """Weight each row of ``video_model`` by similarity-weighted quiz accuracy.

    Steps:

    1. Build a (timepoints x questions) matrix of ``1 - symmetric_KL`` between
       every row of ``video_model`` and every row of ``questions_model``.
    2. Min-max rescale that matrix over all of its entries.
    3. For each timepoint, divide the summed weight of the questions flagged
       as correct in ``acc`` by the summed weight of all questions.
    4. Multiply each row of ``video_model`` by its resulting proportion.

    ``acc`` holds one truthy/falsy value per question (row of
    ``questions_model``), truthy meaning "answered correctly".

    NOTE(review): neither division is guarded. If every entry of the
    similarity matrix is equal, or a timepoint's weights sum to zero, the
    result contains NaN.
    """
    # similarity of every lecture timepoint to every question
    similarity = 1 - cdist(video_model, questions_model, metric=symmetric_KL)

    # rescale so the smallest entry is 0 and the largest is 1
    similarity -= similarity.min()
    similarity /= similarity.max()

    # boolean column mask selecting the correctly answered questions
    answered_correctly = [bool(score) for score in acc]

    total_weight = similarity.sum(axis=1)
    correct_weight = similarity[:, answered_correctly].sum(axis=1)
    proportion = np.divide(correct_weight, total_weight)

    # one proportion per timepoint, broadcast across that timepoint's row
    return video_model * proportion[:, np.newaxis]

## Process and reformat text

In [None]:
# get sliding windows & timestamps from lecture transcripts
ff_windows, ff_timestamps = parse_windows(ff_transcript, lecture_wsize)
bos_windows, bos_timestamps = parse_windows(bos_transcript, lecture_wsize)

# TODO: get text from quiz questions and answers (not implemented in this cell)

# remove punctuation, stop-words, digits, etc.
ff_windows = format_text(ff_windows)
bos_windows = format_text(bos_windows)

# format_text() drops any window that is empty after cleaning, but the
# timestamps are not filtered with it. A dropped window would shift every
# later window relative to its timestamp, so fail loudly here rather than
# carry misaligned data forward.
assert len(ff_windows) == len(ff_timestamps), (
    'Four Forces: {} cleaned windows but {} timestamps'.format(
        len(ff_windows), len(ff_timestamps))
)
assert len(bos_windows) == len(bos_timestamps), (
    'Birth of Stars: {} cleaned windows but {} timestamps'.format(
        len(bos_windows), len(bos_timestamps))
)



In [14]:
# display the quiz questions table (lecture id, question text and the four
# answer options, indexed by question number)
questions_df

Unnamed: 0_level_0,lecture,question,ans_A,ans_B,ans_C,ans_D
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,Why is the gravitational attraction between yo...,Neither you nor your computer has enough mass ...,You and your computer are too close for the gr...,Humans are too small to detect the force of gr...,The gravitational attraction between you and y...
2,1,Which of the following is an example of the We...,A neutron in a radioactive Cesium atom is conv...,Light from the sun collides with a satellite o...,Two protons bound together in a Helium nucleus...,A distant galaxy exerts a small but detectable...
3,1,Roughly how many times stronger is the Weak In...,10000000000000000000000000,10,1000000,The Weak Interaction is less strong than gravity
4,1,Why don't you and your computer experience any...,The weak interaction only acts over extremely ...,The weak interaction between you and your comp...,You and your computer have no net charge,Neither you nor your computer has enough mass ...
5,1,Which of the following is a difference between...,Gravity is only ever attractive while the elec...,Gravity is a much more powerful force than ele...,Gravity can only act over large distances whil...,The electromagnetic force can only act over sm...
6,1,Electricity and magnetism can be shown to be t...,View them in different frames of reference,Switch which charges we call positive and whic...,Consider both the effects over small distances...,Consider both the attractive and repulsive pro...
7,1,Which of the following are the primary two fun...,The Strong Force and the Electromagnetic Force,Gravity and the Weak Interaction,Gravity and the Electromagnetic Force,The Strong Force and the Weak Interaction
8,1,Why does the universe have a very uneven distr...,Positive and negative charges cancel out and b...,Masses tend to repel while charges tend to att...,Masses tend to attract while charges tend to r...,The gravitational interaction acting between m...
9,1,"In your body, there are a tremendous amount of...",The electrons' negative charges are balanced b...,An attractive gravitational force balances out...,The electromagnetic force only acts over very ...,The Electromagnetic force only acts over very ...
10,1,Which of the following is a similarity between...,Both act only over very small distances,Both are stronger than the Electromagnetic force,Both are weaker than Gravity,Both are responsible for attractions between d...
