## Imports

In [1]:
import pickle
import re
import numpy as np
import pandas as pd
from datetime import timedelta
from os.path import join as opj
from nltk.corpus import stopwords
from scipy.interpolate import interp1d
from scipy.spatial.distance import cdist
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

## Set paths

In [2]:
# relative path to the root data directory
datadir = '../../data/'
# raw inputs, saved trajectories, and saved models each get their own subdirectory
rawdir, trajs_dir, models_dir = (
    opj(datadir, subdir) for subdir in ('raw', 'trajectories', 'models')
)

## Load lecture and question data

In [3]:
# Four Forces lecture transcript, read in as a single string
ff_path = opj(rawdir, 'forces_transcript_timestamped.txt')
with open(ff_path, 'r') as f:
    ff_transcript = f.read()

# Birth of Stars lecture transcript, read in as a single string
bos_path = opj(rawdir, 'bos_transcript_timestamped.txt')
with open(bos_path, 'r') as f:
    bos_transcript = f.read()

# quiz questions: tab-separated file; column names are supplied explicitly
# and the 'index' column becomes the DataFrame index
question_cols = ['index', 'lecture', 'question',
                 'ans_A', 'ans_B', 'ans_C', 'ans_D']
questions_df = pd.read_csv(
    opj(rawdir, 'questions.tsv'),
    sep='\t',
    names=question_cols,
    index_col='index'
)

## Set some parameters

In [4]:
# length of the sliding window applied to the lecture transcripts
lecture_wsize = 15

# stop words corpus (see https://www.aclweb.org/anthology/W18-2502.pdf):
# NLTK's English list plus a few additional words and contractions
extra_stop_words = ["let", "let's", "they'd", "they're", "they've",
                    "they'll", "that's", "I'll", "I'm"]
stop_words = stopwords.words('english') + extra_stop_words

# keyword arguments for the CountVectorizer
cv_params = dict(
    max_df=0.95,
    min_df=2,
    max_features=500,
    stop_words=stop_words
)

# keyword arguments for the LatentDirichletAllocation topic model
lda_params = dict(
    n_components=20,
    learning_method='batch',
    random_state=0
)

## Define some functions

In [5]:
def format_text(windows, sw=stop_words):
    """Apply some simple text preprocessing to each document in `windows`.

    Each document is lowercased; every character that is not an ASCII
    letter, whitespace, an apostrophe or a hyphen is removed; hyphens are
    replaced with spaces; stop words are dropped; and finally any remaining
    apostrophes are stripped.

    Parameters
    ----------
    windows : iterable of str
        The documents to clean.
    sw : iterable of str, optional
        Stop words to remove. Matching is case-insensitive.

    Returns
    -------
    list of str
        One cleaned string per input document, in the same order.
    """
    # the text is lowercased before filtering, so the stop words must be
    # lowercased too -- otherwise capitalized entries (e.g. "I'll", "I'm")
    # could never match; a set also makes each membership test O(1)
    sw_lower = {word.lower() for word in sw}
    clean_text = []
    for chunk in windows:
        # raw string: "\s" is not a valid escape in an ordinary string literal
        no_punc = re.sub(r"[^a-zA-Z\s'-]+", '', chunk.lower()).replace('-', ' ')
        # stop words are matched while apostrophes are still present
        # (e.g. "let's"), so apostrophes are only stripped afterwards
        no_stop = ' '.join(word for word in no_punc.split()
                           if word not in sw_lower)
        clean_text.append(no_stop.replace("'", ''))
    return clean_text

In [6]:
def _ts_to_secs(ts):
    """Convert a 'minutes:seconds' timestamp string to seconds (float)."""
    mins, secs = (int(part) for part in ts.split(':'))
    return timedelta(minutes=mins, seconds=secs).total_seconds()


def parse_windows(transcript, wsize):
    """Format a lecture transcript as overlapping sliding windows.

    The transcript is expected to alternate between a timestamp line
    ('minutes:seconds') and the line of text spoken at that time, starting
    with a timestamp. The windows are meant to be fed as documents to the
    topic model; the timestamps are returned for later interpolation.

    Windows are produced in three phases: a ramp-up (windows that start at
    the first line and grow by one line at a time), full windows of `wsize`
    lines that slide forward one line at a time, and a ramp-down (windows
    that shrink as they run off the end of the transcript).

    Parameters
    ----------
    transcript : str
        Full transcript text.
    wsize : int
        Maximum number of text lines per window.

    Returns
    -------
    windows : list of str
        Text of each window (its lines joined by single spaces).
    timestamps : list of float
        One time per window, in seconds: the midpoint between the
        timestamps of the window's first and last lines.
    """
    lines = transcript.splitlines()
    # even-indexed lines are timestamps, odd-indexed lines are text
    ts_lines = [_ts_to_secs(line) for line in lines[::2]]
    text_lines = lines[1::2]
    n_text = len(text_lines)

    windows = []
    timestamps = []

    # ramp-up: windows anchored at the first line that are strictly shorter
    # than the first sliding window. Clamping to the number of available
    # lines keeps transcripts shorter than `wsize` from raising an
    # IndexError (or emitting a duplicate of the first sliding window);
    # for longer transcripts this is exactly range(1, wsize).
    for end in range(1, min(wsize, n_text)):
        windows.append(' '.join(text_lines[:end]))
        timestamps.append((ts_lines[0] + ts_lines[end - 1]) / 2)

    # sliding windows, truncated at the end of the transcript (ramp-down)
    for start in range(len(ts_lines)):
        end = min(start + wsize, n_text)
        windows.append(' '.join(text_lines[start:end]))
        timestamps.append((ts_lines[start] + ts_lines[end - 1]) / 2)

    return windows, timestamps

In [7]:
def interp_lecture(lec_traj, timestamps):
    """Resample a lecture trajectory to one vector per second.

    Parameters
    ----------
    lec_traj : array-like
        Trajectory samples; the first axis runs over samples.
    timestamps : sequence of float
        Time (in seconds) of each sample in `lec_traj`.

    Returns
    -------
    numpy.ndarray
        The trajectory interpolated along its first axis at times
        0, 1, 2, ... up to (not including) the last timestamp.
    """
    # query points start at 0, so a first timestamp greater than 0 would
    # put them outside the range interp1d accepts by default
    seconds = np.arange(timestamps[-1])
    resample = interp1d(timestamps, lec_traj, axis=0)
    return resample(seconds)

## Process and reformat text

In [8]:
# split each lecture transcript into overlapping sliding windows, keeping
# one timestamp per window
ff_windows, ff_timestamps = parse_windows(transcript=ff_transcript,
                                          wsize=lecture_wsize)
bos_windows, bos_timestamps = parse_windows(transcript=bos_transcript,
                                            wsize=lecture_wsize)

# clean the window text (remove punctuation, stop-words, digits, etc.)
ff_windows, bos_windows = format_text(ff_windows), format_text(bos_windows)

# clean the quiz questions and correct answers, one list per lecture group
# NOTE(review): the unpacking below relies on there being exactly three
# 'lecture' groups that come out in the order general, Four Forces,
# Birth of Stars -- confirm against questions.tsv
grouped_qdf = questions_df.groupby('lecture')
gen_qs, ff_qs, bos_qs = grouped_qdf['question'].apply(format_text)
gen_correct, ff_correct, bos_correct = grouped_qdf['ans_A'].apply(format_text)
all_qs = [*ff_qs, *bos_qs, *gen_qs]
all_ans_correct = [*ff_correct, *bos_correct, *gen_correct]

# clean every answer choice, keyed by the question's index in questions_df
all_ans = (
    questions_df
    .loc[:, 'ans_A':]
    .apply(format_text, axis=1)
    .to_dict()
)

## Model lectures and quiz questions

In [9]:
# training corpus: lecture windows, questions, and correct answers
corpus = [*ff_windows, *bos_windows, *all_qs, *all_ans_correct]

# fit the CountVectorizer, then vectorize the corpus for the topic model
tf_vectorizer = CountVectorizer(**cv_params)
tf_vectorizer.fit(corpus)
corpus_tf = tf_vectorizer.transform(corpus)

# fit the LatentDirichletAllocation topic model on the vectorized corpus
lda = LatentDirichletAllocation(**lda_params)
lda.fit(corpus_tf)

# vectorize the lecture windows and each set of questions
ff_lec_tf, bos_lec_tf = (
    tf_vectorizer.transform(docs) for docs in (ff_windows, bos_windows)
)
ff_qs_tf, bos_qs_tf, gen_qs_tf = (
    tf_vectorizer.transform(docs) for docs in (ff_qs, bos_qs, gen_qs)
)

# topic vectors for the lecture windows, interpolated to 1 sample per second
ff_traj = interp_lecture(lda.transform(ff_lec_tf), ff_timestamps)
bos_traj = interp_lecture(lda.transform(bos_lec_tf), bos_timestamps)

# topic vectors for the questions
ff_qs_vecs, bos_qs_vecs, gen_qs_vecs = (
    lda.transform(counts) for counts in (ff_qs_tf, bos_qs_tf, gen_qs_tf)
)

## Match question vectors to qIDs, model answers

In [10]:
# question topic vectors stacked in the order Four Forces, Birth of Stars,
# general
qs_concat = np.concatenate((ff_qs_vecs, bos_qs_vecs, gen_qs_vecs))

# both lookups start with keys 1 through 39, each mapped to None
all_questions = dict.fromkeys(range(1, 40))
all_answers = dict.fromkeys(all_questions.keys())

# qIDs follow row order in qs_concat, counting from 1
for qID, q_vec in zip(range(1, len(qs_concat) + 1), qs_concat):
    all_questions[qID] = q_vec
    # topic vectors for this question's formatted answer choices
    answers_tf = tf_vectorizer.transform(all_ans[qID])
    all_answers[qID] = lda.transform(answers_tf)

## Save trajectories and fitted models

In [11]:
# np.save(opj(trajs_dir, 'forces_lecture'), ff_traj)
# np.save(opj(trajs_dir, 'bos_lecture'), bos_traj)
# np.save(opj(trajs_dir, 'forces_questions'), ff_qs_vecs)
# np.save(opj(trajs_dir, 'bos_questions'), bos_qs_vecs)
# np.save(opj(trajs_dir, 'general_questions'), gen_qs_vecs)

# with open(opj(trajs_dir, 'all_questions.p'), 'wb') as f:
#     pickle.dump(all_questions, f)
# with open(opj(trajs_dir, 'all_answers.p'), 'wb') as f:
#     pickle.dump(all_answers, f)    

# np.save(opj(models_dir, 'fit_CV'), tf_vectorizer)
# np.save(opj(models_dir, 'fit_LDA'), lda)