This notebook fits a topic model to the Sherlock text descriptions and then transforms the recall transcripts with the model.

## Import libraries

In [1]:
import sys
import numpy as np
import pandas as pd
import hypertools as hyp
from os.path import abspath, join as opj
from scipy.interpolate import interp1d

%matplotlib inline

## Import analysis helpers 

In [2]:
# put the shared helpers directory at the front of the import path so that
# `analysis_helpers` can be imported from this notebook's location
sys.path.insert(0, abspath('../../helpers/'))
# constants and text-processing functions used by the cells below
from analysis_helpers import (
    N_TOPICS,
    VIDEO_WSIZE,
    RECALL_WSIZE,
    VECTORIZER_PARAMS,
    SEMANTIC_PARAMS,
    format_text,
    parse_windows,
    get_video_timepoints
)

Functions and variables used across multiple notebooks are defined in [`analysis_helpers.py`](https://github.com/contextlab/sherlock-topic-model-paper/blob/master/code/helpers/analysis_helpers.py).

In [3]:
# Show the shared analysis parameters imported from analysis_helpers so the
# configuration used for this run is recorded alongside the results.
print(f'Number of topics: {N_TOPICS}')
print(f'Video window size: {VIDEO_WSIZE}')
# previously mislabelled as "Video window size"
print(f'Recall window size: {RECALL_WSIZE}')
print(f'Vectorizer params: {VECTORIZER_PARAMS}')
print(f'LDA params: {SEMANTIC_PARAMS}')

Number of topics: 100
Video window size: 50
Video window size: 10
Vectorizer params: {'model': 'CountVectorizer', 'params': {'stop_words': 'english'}}
LDA params: {'model': 'LatentDirichletAllocation', 'params': {'n_components': 100, 'learning_method': 'batch', 'random_state': 0}}


## Set data paths

In [4]:
# directory the processed outputs (models, text windows) are saved to
datadir = '../../../data/processed/'
# directory the raw annotation spreadsheet and recall transcripts are read from
rawdir = '../../../data/raw/'

## Load and format data

In [5]:
# load the scene-by-scene annotation spreadsheet
video_text = pd.read_excel(opj(rawdir, 'Sherlock_Segments_1000_NN_2017.xlsx'))

# forward-fill blank 'Scene Segments' cells with the last value above them.
# The result is assigned back explicitly: calling fillna(..., inplace=True) on a
# column selection relies on chained in-place mutation (which does not update the
# parent frame under pandas Copy-on-Write), and fillna's `method=` argument is deprecated.
video_text['Scene Segments'] = video_text['Scene Segments'].ffill()

# drop 1s shot & 6s of black screen after end of 1st scan, then renumber the rows
# so that positional labels below (479 / 480) refer to the end of scan 1 / start of scan 2
video_text = video_text.drop(index=[480, 481]).reset_index(drop=True)

# timestamps for 2nd scan restart from 0; add duration of 1st scan to values
video_text.loc[480:, 'Start Time (s) ': 'End Time (s) '] += video_text.loc[479, 'End Time (s) ']

## Fit topic model to manually-annotated movie

In [6]:
# create a list of text samples from the scene descriptions / details to train the topic model
video = video_text.loc[:, 'Scene Details - A Level ':'Words on Screen '].apply(format_text, axis=1).tolist()
video_windows, window_bounds = parse_windows(video, VIDEO_WSIZE)

# create video model with hypertools: the windows serve both as the training
# corpus and as the data being transformed
video_model = hyp.tools.format_data(video_windows,
                                    vectorizer=VECTORIZER_PARAMS,
                                    semantic=SEMANTIC_PARAMS,
                                    corpus=video_windows)[0]

# descriptions are by scene, not by TR, so stretch the model to be in TRs
# NOTE(review): tr_spans / starts / stops are not referenced again in this cell;
# they are kept in case other cells rely on them.
tr_spans = video_text[['Start Time (TRs, 1.5s)', 'End Time (TRs, 1.5s)']]
starts, stops = tr_spans.values.T

# number of TRs the model is resampled to
N_TRS = 1976
# NOTE(review): presumably the total video duration in the units returned by
# get_video_timepoints (seconds? 1976 TRs x 1.5 s ~= 2964) -- confirm
TIMELINE_END = 2963

# rescale each window's timepoint onto the TR axis, then linearly interpolate the
# topic vectors at every TR (extrapolating beyond the first / last window).
# The interpolation allocates its own output, so no pre-allocated array is needed.
xvals = get_video_timepoints(window_bounds, video_text)
xvals_TR = xvals * N_TRS / TIMELINE_END
TR_times = np.arange(1, N_TRS + 1)
interp_func = interp1d(xvals_TR, video_model, axis=0, fill_value='extrapolate')
video_model_TRs = interp_func(TR_times)

## Transform recalls

In [7]:
# character that byte 0x92 decodes to under cp1252; it is replaced with a plain
# apostrophe in every transcript
curly_apostrophe = b'\x92'.decode('cp1252')
# participant whose recall windows are saved as an example
example_sub = 17

# build one list of text windows per participant
recall_w = []
for sub in range(1, 18):
    # read this participant's transcript and normalize its apostrophes
    with open(opj(rawdir, f'NN{sub} transcript.txt'), 'r', encoding='cp1252') as transcript_file:
        recall = transcript_file.read().replace(curly_apostrophe, "'").strip()

    # split the formatted text on periods and discard a trailing empty fragment
    recall_fmt = format_text(recall).split('.')
    if recall_fmt[-1] == '':
        recall_fmt.pop()

    # create overlapping windows of n sentences
    sub_recall_w = parse_windows(recall_fmt, RECALL_WSIZE)[0]
    recall_w.append(sub_recall_w)

# save example participant's recall windows
np.save(opj(datadir, 'recall_text.npy'), recall_w[example_sub - 1])

# create recall models: transform every participant's windows with a model
# trained on the video windows
recall_models = hyp.tools.format_data(recall_w,
                                      vectorizer=VECTORIZER_PARAMS,
                                      semantic=SEMANTIC_PARAMS,
                                      corpus=video_windows)

## Save video model, recall models, and text corpus

In [8]:
# Bundle the video model and the per-participant recall models into one file.
# The two items have different shapes (a 2-D array and a list of arrays), so they
# are placed in an explicit 2-element object array: passing the ragged list
# straight to np.save relies on implicit ragged-array creation, which raises
# ValueError on NumPy >= 1.24. The saved layout is still [video_model_TRs, recall_models];
# load it with np.load(..., allow_pickle=True).
models = np.empty(2, dtype=object)
models[0] = video_model_TRs
models[1] = recall_models
np.save(opj(datadir, f'models_t{N_TOPICS}_v{VIDEO_WSIZE}_r{RECALL_WSIZE}'), models)

# save the text windows the topic model was trained on
np.save(opj(datadir, 'video_text.npy'), video_windows)