This notebook fits a topic model to the Sherlock text descriptions and then transforms the recall transcripts with the model.

## Import libraries

In [None]:
import os
import re
import numpy as np
import pandas as pd
import hypertools as hyp
from os.path import join as opj
from scipy.interpolate import interp1d

%matplotlib inline

## Set data paths

In [None]:
# directory holding the raw inputs read by this notebook
# (the annotation spreadsheet and the per-subject recall transcripts)
rawdir = '../../../data/raw/' 
# directory the fitted models and windowed text are saved to
datadir = '../../../data/processed/'

## Define functions

In [None]:
def format_text(text):
    """Normalize a text sample: lowercase, drop possessives and punctuation.

    Parameters
    ----------
    text : str or pandas.Series
        A Series has its non-null entries joined with single spaces and
        *all* punctuation removed.  Any other input is treated as a string
        and keeps its periods, so the caller can still split the result
        into sentences on '.'.

    Returns
    -------
    str
        The cleaned text, with every run of whitespace collapsed to a
        single space and no leading or trailing whitespace.
    """
    if isinstance(text, pd.Series):
        text = ' '.join(text.dropna())
        pattern = r"[^\w\s]+"
    else:
        # keep periods: they mark sentence boundaries
        pattern = r"[^.\w\s]+"

    no_possessive = text.lower().replace("'s", '')
    punc_stripped = re.sub(pattern, '', no_possessive)
    # removing punctuation can leave doubled spaces; collapse them and
    # return the normalized string rather than the intermediate one
    return ' '.join(punc_stripped.split())
    
def parse_windows(textlist, wsize):
    """Split a list of text samples into overlapping sliding windows.

    The first ``wsize - 1`` windows all start at the first sample and grow
    by one sample each (a "ramp-up").  They are followed by one window per
    sample, starting at that sample and spanning up to ``wsize`` samples;
    windows near the end of the list are truncated.

    Parameters
    ----------
    textlist : list of str
        The text samples, in order.
    wsize : int
        Maximum number of samples per window.

    Returns
    -------
    windows : list of str
        The samples of each window joined with single spaces.
    w_lengths : list of (int, int)
        The ``(start, end)`` bounds of each window, such that the window is
        built from ``textlist[start:end]``.  ``end`` never exceeds
        ``len(textlist)``.
    """
    n_samples = len(textlist)
    windows = []
    w_lengths = []

    # ramp-up windows anchored at the first sample; clamp the end so the
    # reported bounds always match the text actually in the window, even
    # when there are fewer samples than wsize - 1
    for ix in range(1, wsize):
        start, end = 0, min(ix, n_samples)
        w_lengths.append((start, end))
        windows.append(' '.join(textlist[start:end]))

    # one window starting at each sample, truncated at the end of the list
    for start in range(n_samples):
        end = min(start + wsize, n_samples)
        w_lengths.append((start, end))
        windows.append(' '.join(textlist[start:end]))

    return windows, w_lengths


def get_video_timepoints(window_spans):
    """Return the temporal midpoint of each window of video segments.

    Looks up times in the notebook-level ``video_text`` table.

    Parameters
    ----------
    window_spans : iterable of (int, int)
        ``(first, last)`` row bounds of each window; the window covers rows
        ``first`` through ``last - 1`` of ``video_text``.

    Returns
    -------
    numpy.ndarray
        For each window, the mean of the 'Start Time (s) ' of its first row
        and the 'End Time (s) ' of its final row.
    """
    onset_col, offset_col = 'Start Time (s) ', 'End Time (s) '
    midpoints = [
        (video_text.loc[first, onset_col] + video_text.loc[last - 1, offset_col]) / 2
        for first, last in window_spans
    ]
    return np.array(midpoints)

## Set model parameters

In [None]:
# sliding-window sizes: number of annotated segments per video window and
# number of sentences per recall window
video_wsize, recall_wsize = 50, 10
# number of topics (components) in the topic model
n_topics = 100

# vectorizer parameters
vectorizer = dict(
    model='CountVectorizer',
    params=dict(stop_words='english'),
)

# topic model parameters
semantic = dict(
    model='LatentDirichletAllocation',
    params=dict(
        n_components=n_topics,
        learning_method='batch',
        random_state=0,
    ),
)

## Load and format data

In [None]:
# load the manual annotations of the video (one row per annotated segment)
video_text = pd.read_excel(opj(rawdir, 'Sherlock_Segments_1000_NN_2017.xlsx'))
# forward-fill the blank 'Scene Segments' cells. Assign the result back to the
# frame: filling a column selection in place is not guaranteed to update
# the parent DataFrame, and fillna(method=...) is deprecated.
video_text['Scene Segments'] = video_text['Scene Segments'].ffill()

# drop 1s shot & 6s of black screen after end of 1st scan
video_text = video_text.drop(index=[480, 481]).reset_index(drop=True)

# timestamps for 2nd scan restart from 0; add duration of 1st scan to values
video_text.loc[480:, 'Start Time (s) ': 'End Time (s) '] += video_text.loc[479, 'End Time (s) ']

## Fit topic model to manually-annotated movie

In [None]:
# create a list of text samples from the scene descriptions / details to train the topic model
video = video_text.loc[:,'Scene Details - A Level ':'Words on Screen '].apply(format_text, axis=1).tolist()
video_windows, window_bounds = parse_windows(video, video_wsize)

# create video model with hypertools
video_model = hyp.tools.format_data(video_windows, 
                                    vectorizer=vectorizer, 
                                    semantic=semantic, 
                                    corpus=video_windows)[0]

# descriptions are by scene, not TR, so stretch the model to be in TRs
# (tr_spans / starts / stops are not used by the interpolation below)
tr_spans = video_text[['Start Time (TRs, 1.5s)', 'End Time (TRs, 1.5s)']]
starts, stops = tr_spans.values.T

# number of TRs the model is resampled to
n_TRs = 1976

# midpoint of each window in seconds, rescaled to TR units
# NOTE(review): 2963 is presumably the total video duration in seconds - confirm
xvals = get_video_timepoints(window_bounds)
xvals_TR = xvals * n_TRs / 2963
TR_times = np.arange(1, n_TRs + 1)
# linearly interpolate each topic dimension at every TR; TRs outside the range
# of window midpoints are extrapolated. The result is built directly by the
# interpolator, so no pre-allocated output array is needed.
interp_func = interp1d(xvals_TR, video_model, axis=0, fill_value='extrapolate')
video_model_TRs = interp_func(TR_times)

## Transform recalls

In [None]:
# build the windowed recall text for every subject (NN1 ... NN17)
recall_w = []
for sub in range(1, 18):
    # read the subject's transcript
    transcript_path = opj(rawdir, f'NN{sub} transcript.txt')
    with open(transcript_path, 'r', encoding='cp1252') as f:
        recall = f.read()
    # swap the cp1252 byte-0x92 apostrophe for a plain one and trim whitespace
    recall = recall.replace(b'\x92'.decode('cp1252'), "'").strip()

    # split the cleaned transcript into sentences, discarding the empty
    # string left behind when the text ends with a period
    recall_fmt = format_text(recall).split('.')
    if recall_fmt[-1] == '':
        recall_fmt = recall_fmt[:-1]

    # overlapping windows of up to recall_wsize sentences (bounds not needed)
    sub_recall_w = parse_windows(recall_fmt, recall_wsize)[0]
    recall_w.append(sub_recall_w)

    # keep one example participant's recall windows on disk
    if sub == 17:
        np.save(opj(datadir, 'recall_text.npy'), sub_recall_w)

# transform every subject's windows with the model fit to the video windows
recall_models = hyp.tools.format_data(
    recall_w,
    vectorizer=vectorizer,
    semantic=semantic,
    corpus=video_windows,
)

## Save video model, recall models, and text corpus

In [None]:
# The video model and the recall models do not share a shape, so
# handing np.save a plain list makes NumPy build an array from a ragged
# sequence, which raises ValueError on recent NumPy versions. Pack them into
# an explicit 2-element object array instead (load with allow_pickle=True).
models = np.empty(2, dtype=object)
models[0] = video_model_TRs
models[1] = recall_models
np.save(opj(datadir, f'models_t{n_topics}_v{video_wsize}_r{recall_wsize}'), models)
# text of each video window, i.e. the corpus the topic model was fit to
np.save(opj(datadir, 'video_text.npy'), video_windows)