This notebook fits a topic model to the Sherlock text descriptions and then transforms the recall transcripts with the model.

## Import libraries

In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import hypertools as hyp
import numpy as np
from scipy.spatial.distance import cdist
from scipy.signal import resample
from scipy import ndimage
from scipy.stats import zscore
from skimage.transform import resize
from fastdtw import fastdtw
from scipy.spatial.distance import correlation

sns.set_context('poster')
%matplotlib inline
plt.rc('figure', figsize=(12, 8))

def _z2r(z):
    """
    Function that calculates the inverse Fisher z-transformation

    Parameters
    ----------
    z : int or ndarray
        Fishers z transformed correlation value

    Returns
    ----------
    result : int or ndarray
        Correlation value

    """
    with np.errstate(invalid='ignore', divide='ignore'):
        return (np.exp(2 * z) - 1) / (np.exp(2 * z) + 1)


def _r2z(r):
    """
    Function that calculates the Fisher z-transformation

    Parameters
    ----------
    r : int or ndarray
        Correlation value

    Returns
    ----------
    result : int or ndarray
        Fishers z transformed correlation value

    """
    with np.errstate(invalid='ignore', divide='ignore'):
        return 0.5 * (np.log(1 + r) - np.log(1 - r))


## Set paths/other settings

In [12]:
# Directory holding the annotation spreadsheet and the recall transcripts.
# Set the SHERLOCK_DATADIR environment variable to point somewhere else; the
# default is the original hard-coded location. os.path.join(..., '') makes sure
# the value ends with a path separator, because the cells below build file
# names with plain `datadir + filename` concatenation.
datadir = os.path.join(
    os.environ.get('SHERLOCK_DATADIR', '/Users/andrewheusser/Documents/cdl/projects/sherlock/data/'),
    ''
)

# cubehelix colour palette with 100 steps
cmap = sns.cubehelix_palette(100, light=.95)

## Load data and forward fill the segment labels

In [8]:
# annotation spreadsheet for the movie
movie_text = pd.read_excel(datadir+'Sherlock_Segments_1000_NN_2017.xlsx')

# Forward fill missing 'Scene Segments' labels from the last labelled row above.
# The filled column is assigned back explicitly: calling
# fillna(..., inplace=True) on a selected column is chained in-place mutation,
# which does not update `movie_text` when pandas copy-on-write is active, and
# the `method=` argument of fillna is deprecated in favour of ffill().
movie_text['Scene Segments'] = movie_text['Scene Segments'].ffill()

## Fit topic model to manually-annotated movie

In [9]:
# create a list of text samples from the scene descriptions / details to train the topic model
# (every column from 'Scene Details - A Level ' through 'Words on Screen ' is joined into
# one string per row; missing cells are treated as empty strings)
movie = movie_text.loc[:,'Scene Details - A Level ':'Words on Screen '].apply(lambda x: ', '.join(x.fillna('')), axis=1).values.tolist()

# create a list of overlapping text samples: sample i joins rows i .. i+wsize-1
# (the last samples are shorter because the slice runs off the end of the list)
wsize=50
movie50 = [','.join(movie[idx:idx+wsize]) for idx in range(len(movie))]

# vectorizer parameters
vectorizer = {
    'model' : 'CountVectorizer',
    'params' : {
        'stop_words' : 'english'
    }
}

# topic model parameters
semantic = {
    'model' : 'LatentDirichletAllocation',
    'params' : {
        'n_components' : 100,
        'learning_method' : 'batch',
        'random_state' : 0,
    }
}

# create movie model with hypertools (one row of topic weights per text sample)
movie_model = hyp.tools.format_data(movie50, vectorizer=vectorizer, semantic=semantic, corpus=movie50)[0]

# descriptions are by scene, not TR, so stretch the model to be in TRs
n_trs = 1976  # number of TRs the expanded model should have
# start / end TR of every annotated row; astype(float) fails loudly on non-numeric
# cells instead of letting a comparison error be silently swallowed further down
bounds = movie_text[['Start Time (TRs, 1.5s)', 'End Time (TRs, 1.5s)']].values.astype(float)
starts, ends = bounds[:, 0], bounds[:, 1]
ranges = bounds.tolist()  # [[start, end], ...] kept under its original name
expanded = []
for tr in range(n_trs):
    # rows whose [start, end] interval (inclusive on both sides) contains this TR
    covering = np.flatnonzero((starts <= tr) & (tr <= ends))
    # first matching row wins; TRs not covered by any row fall back to row 0
    idx = covering[0] if covering.size else 0
    expanded.append(movie_model[idx, :])
movie_model = np.array(expanded)

## Transform recalls

In [19]:
def _read_transcript_fields(path):
    """
    Read a recall transcript and split it into '.'-delimited fields.

    The file is parsed as a '.'-separated table with no header. Only the first
    parsed row is used, and its last field (whatever follows the final
    period) is dropped. Empty fields are returned as NaN, not as strings.
    Lines with too many fields are skipped with a warning.

    Parameters
    ----------
    path : str
        Path of the transcript file (read with latin-1 encoding)

    Returns
    ----------
    result : list
        Fields of the first parsed row, minus the trailing one

    """
    read_kwargs = dict(header=None, sep='.', encoding='latin-1')
    try:
        # pandas >= 1.3 spelling; 'warn' matches the old error_bad_lines=False
        # default of reporting and skipping malformed lines
        table = pd.read_csv(path, on_bad_lines='warn', **read_kwargs)
    except TypeError:
        # older pandas does not know on_bad_lines; use the original keyword
        table = pd.read_csv(path, error_bad_lines=False, **read_kwargs)
    return table.values.tolist()[0][:-1]


# loop over subjects
recall5 = []
wsize=10
for sub in range(1, 18):

    # load subject data
    recall = _read_transcript_fields(datadir+'NN'+str(sub)+' transcript.txt')

    # keep only the text fields (skips over nans and any other non-string values)
    rs = [sentence for sentence in recall if isinstance(sentence, str)]

    # create overlapping windows of n sentences: window i joins sentences i .. i+wsize-1
    sub_recall5 = [','.join(rs[idx:idx+wsize]) for idx in range(len(rs))]

    recall5.append(sub_recall5)

# create recall models (one array per subject, fit against the movie corpus)
recall_models = hyp.tools.format_data(recall5, vectorizer=vectorizer, semantic=semantic, corpus=movie50)

# resample the models so every subject has as many rows as the movie model has TRs (1976)
recall_models_rs = [resample(x, movie_model.shape[0]) for x in recall_models]

## Save video and recall models

In [9]:
def _as_object_array(items):
    """
    Pack `items` into a 1-D object array with one element per item.

    np.save converts its argument to an array first. For nested sequences of
    unequal length (per-subject recall windows, or a model array next to a
    list of per-subject arrays) NumPy >= 1.24 raises a ValueError instead of
    building an object array implicitly, so the object array is built
    explicitly here. Elements are stored as-is (lists stay lists).

    Parameters
    ----------
    items : sequence
        Objects to store

    Returns
    ----------
    result : ndarray
        Array of dtype object and shape (len(items),)

    """
    packed = np.empty(len(items), dtype=object)
    for i, item in enumerate(items):
        packed[i] = item
    return packed


# text samples used to fit / transform the models
np.save('../data/video_text', movie50)
np.save('../data/recall_text', _as_object_array(recall5))

# [movie model, per-subject recall models]; object arrays are pickled by np.save,
# so they must be read back with np.load(..., allow_pickle=True)
np.save('../data/models_t100_v50_r10', _as_object_array([movie_model, recall_models]))
np.save('../data/models_t100_v50_r10_resampled', _as_object_array([movie_model, recall_models_rs]))