In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import hypertools as hyp
import quail
from scipy.stats import pearsonr as corr
from scipy.spatial.distance import cdist
from scipy.signal import resample

%matplotlib inline
# seaborn 'poster' context: scales fonts and line widths up for large-format figures
sns.set_context('poster')
# default size (width, height in inches) for every new matplotlib figure
plt.rc('figure', figsize=(12, 8))

In [2]:
# data locations, relative to this notebook:
#   datadir - saved (processed) model arrays
#   rawdir  - raw annotation spreadsheet and recall transcripts
datadir = '../../data/processed/'
rawdir = '../../data/raw/'

In [3]:
# load in raw annotation file
movie_annotations = pd.read_excel(rawdir+'Sherlock_Segments_1000_NN_2017.xlsx')

# Forward-fill gaps in 'Scene Segments' so every row carries the most recent
# non-missing value. The result is assigned back explicitly: calling
# fillna(..., inplace=True) on a column selection is chained assignment, which
# is not guaranteed to modify the parent frame (it does not under pandas
# Copy-on-Write), and fillna(method=...) is deprecated in favour of ffill().
movie_annotations['Scene Segments'] = movie_annotations['Scene Segments'].ffill()

In [6]:
# ---- model parameters ----
ntopics = 100   # number of topic-model components (see `semantic` below)
m_wsize = 50    # sliding-window length; presumably for the movie text - TODO confirm
r_wsize = 10    # sliding-window length; presumably for the recall transcripts - TODO confirm

# ---- text vectorizer: model name plus its keyword arguments ----
vectorizer = {'model': 'CountVectorizer', 'params': {'stop_words': 'english'}}

# ---- topic model: model name plus its keyword arguments ----
semantic = {
    'model': 'LatentDirichletAllocation',
    'params': {
        'n_components': ntopics,
        'learning_method': 'batch',
        'random_state': 0,   # fixed seed
    },
}

## define some functions

In [9]:
def model_text(text, w_size, vec_params, sem_params, corpus=None):
    """Fit a topic model to overlapping windows of `text` and return the result.

    Parameters
    ----------
    text : list of str
        Ordered text samples to window.
    w_size : int
        Maximum number of consecutive samples joined into one window.
    vec_params : dict or str
        Passed to `hyp.tools.format_data` as `vectorizer`.
    sem_params : dict or str
        Passed to `hyp.tools.format_data` as `semantic`.
    corpus : list of str, optional
        Text passed to `hyp.tools.format_data` as `corpus`. When omitted, the
        windows built from `text` are used as their own corpus.

    Returns
    -------
    The first element of the list returned by `hyp.tools.format_data`.
    """
    # one window per starting sample; slicing truncates at the end of `text`,
    # so the last windows hold fewer than `w_size` samples
    windows = [','.join(text[idx:idx + w_size]) for idx in range(len(text))]

    if corpus is None:
        corpus = windows

    # create topic model with hypertools
    return hyp.tools.format_data(windows, vectorizer=vec_params, semantic=sem_params, corpus=corpus)[0]

In [10]:
def model_movie(text, ntopics, w_size, vec_params, sem_params, n_trs=1976):
    """Build the movie topic model and stretch it to one row per TR.

    Parameters
    ----------
    text : list of str
        Ordered annotation text; assumed to hold one entry per row of the
        notebook-level `movie_annotations` frame - TODO confirm.
    ntopics : int
        Not used in the body; kept so existing calls keep working.
    w_size : int
        Maximum number of consecutive entries joined into one window.
    vec_params, sem_params : dict or str
        Passed to `hyp.tools.format_data` as `vectorizer` / `semantic`.
    n_trs : int, optional
        Number of TRs (rows) in the returned model. Defaults to 1976, the
        value that was previously hard-coded.

    Returns
    -------
    numpy.ndarray with `n_trs` rows.

    Side effects
    ------------
    Stores the text windows on `model_movie.movie_windows` so that
    `model_recall` can use them as its training corpus.
    """
    # create sliding window of text samples (windows near the end are shorter)
    movie_windows = [','.join(text[idx:idx + w_size]) for idx in range(len(text))]

    # create topic model with hypertools, trained on the movie windows themselves
    m_model = hyp.tools.format_data(movie_windows, vectorizer=vec_params, semantic=sem_params, corpus=movie_windows)[0]

    # scene description are by shot, not TR, so stretch the model to be in TRs
    starts = movie_annotations['Start Time (TRs, 1.5s)'].values
    ends = movie_annotations['End Time (TRs, 1.5s)'].values
    expanded = []
    for tr in range(n_trs):
        try:
            # first annotation row whose [start, end] range (inclusive) contains this TR
            idx = np.where((starts <= tr) & (tr <= ends))[0][0]
            expanded.append(m_model[idx, :])
        except IndexError:
            # no annotation covers this TR (or the row has no model entry):
            # fall back to the first model row, as before
            expanded.append(m_model[0, :])

    # make movie windows accessible to model_recall function
    model_movie.movie_windows = movie_windows

    return np.array(expanded)

In [11]:
def model_recall(sub, w_size, ntopics, vec_params, sem_params, n_trs=1976):
    """Build one subject's recall topic model, resampled to `n_trs` rows.

    `model_movie` must have been run first: the recall model is trained on
    the movie text windows it stores on `model_movie.movie_windows`.

    Parameters
    ----------
    sub : int or str
        Subject identifier; the transcript read is
        `rawdir + 'NN<sub> transcript.txt'`.
    w_size : int
        Maximum number of consecutive sentences joined into one window.
    ntopics : int
        Not used in the body; kept so existing calls keep working.
    vec_params, sem_params : dict or str
        Passed to `hyp.tools.format_data` as `vectorizer` / `semantic`.
    n_trs : int, optional
        Number of rows to resample the model to. Defaults to 1976, the value
        that was previously hard-coded.

    Returns
    -------
    The recall model resampled with `scipy.signal.resample` to `n_trs` rows.

    Raises
    ------
    RuntimeError
        If `model_movie` has not stored its windows yet.
    """
    # load recall from text file: the transcript is split on '.', and the
    # fields of the first parsed row (minus the last one) are the sentences
    # NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed
    # in 2.0 (replaced by on_bad_lines='skip'); kept here to match the pandas
    # version this notebook targets - update when the environment is upgraded.
    text = pd.read_csv(rawdir+'NN'+str(sub)+' transcript.txt', header=None, sep='.', error_bad_lines=False, encoding='latin-1').values.tolist()[0][:-1]

    # separate sentences, skipping non-text fields (e.g. nans), which have no .encode
    sent = [sentence for sentence in text if hasattr(sentence, 'encode')]

    # create sliding window of sentences (windows near the end are shorter)
    windows = [','.join(sent[idx:idx + w_size]) for idx in range(len(sent))]

    movie_windows = getattr(model_movie, 'movie_windows', None)
    if movie_windows is None:
        raise RuntimeError('model_movie() must be run before model_recall(): '
                           'model_movie.movie_windows is not set')

    # create recall model trained on windows from movie model
    recall_model = hyp.tools.format_data(windows, vectorizer=vec_params, semantic=sem_params, corpus=movie_windows)[0]

    # resample model to movie model dimensions
    return resample(recall_model, n_trs)

In [None]:
# builds one recall model per subject, in the order the subjects are given
def model_all_subs(subs, r_wsize, ntopics, vec_params, sem_params):
    """Return a list holding `model_recall(...)` for each subject in `subs`.

    `r_wsize` is forwarded as the recall window size; `ntopics`,
    `vec_params` and `sem_params` are forwarded unchanged.
    """
    recall_models = []
    for sub in subs:
        recall_models.append(model_recall(sub, r_wsize, ntopics, vec_params, sem_params))
    return recall_models

In [12]:
# remove one feature (column) from the model
def drop_feature(df, feature):
    """Return a new DataFrame equal to `df` without the column `feature`.

    `df` itself is left unmodified.
    """
    return df.drop(labels=feature, axis='columns')

# correlates the row-by-row correlation structure of the movie and recall models
def compare_mr_models(m_model, r_model):
    """Return the Pearson r between the two models' flattened row-correlation matrices.

    For each model, every row is correlated with every other row; the
    resulting square matrices are flattened and correlated with each other.
    Both models must therefore have the same number of rows.
    """
    def _flat_row_corr(model):
        # transpose so DataFrame.corr(), which works column-wise, compares rows
        return pd.DataFrame(model).T.corr().values.ravel()

    result = corr(_flat_row_corr(m_model), _flat_row_corr(r_model))
    return result[0]

## Which features are important to the movie/recall model relationship?

In [13]:
# load in full movie model and resampled recall models
# (both live in the same file: index 0 is the movie model, index 1 the recall
# models - so read the file once instead of once per variable)
# NOTE(review): if this file stores a Python-object array, NumPy >= 1.16.3
# needs allow_pickle=True to load it - confirm against the saved file.
saved_models = np.load(datadir+'models_t100_v50_r10_resampled.npy')
movie_model = saved_models[0]
recall_models_rs = saved_models[1]

a


In [5]:
# select the block of annotation columns between the two labels (label-based
# slice, both endpoints included); the trailing spaces are part of the labels
features_df = movie_annotations.loc[
    :, slice('Scene Details - A Level ', 'Words on Screen ')
]
