In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import hypertools as hyp
import quail
from scipy.stats import pearsonr as corr
from scipy.spatial.distance import cdist
from scipy.signal import resample

%matplotlib inline
sns.set_context('poster')
plt.rc('figure', figsize=(12, 8))

In [2]:
rawdir = '../../data/raw/'
datadir = '../../data/processed/'

In [3]:
# load in raw annotation file
movie_annotations = pd.read_excel(rawdir+'Sherlock_Segments_1000_NN_2017.xlsx')
movie_annotations['Scene Segments'].fillna(method='ffill', inplace=True)


In [4]:
# model parameters
ntopics = 100
m_wsize = 50
r_wsize = 10

# vectorizer parameters
vectorizer = {
    'model' : 'CountVectorizer', 
    'params' : {
        'stop_words' : 'english'
    }
}

# topic model parameters
semantic = {
    'model' : 'LatentDirichletAllocation', 
    'params' : {
        'n_components' : ntopics,
        'learning_method' : 'batch',
        'random_state' : 0
    }
}

## define some functions

In [5]:
def model_text(text, w_size, vec_params, sem_params, corpus=None, panes=None):
    
    # create sliding window of text samples
    windows = []
    
    # if the function is being called by model_movie()
    if corpus is None:
        # train the model on the movie windows...
        corpus = windows
        # ...and make them accessible to model_movie
        model_text.windows = windows
    
    # similarly,    
    if panes is None:
        # create windows from the movie annotations
        panes = text
    
    for idx, sentence in enumerate(panes):
        windows.append(','.join(panes[idx:idx+m_wsize]))
        
    # vectorizer and topic model parameters
    vectorizer = vec_params
    semantic = sem_params
    
    
    # create topic model with hypertools
    return hyp.tools.format_data(windows, vectorizer=vec_params, semantic=sem_params, corpus=corpus)[0]

In [6]:
def model_movie(text, w_size, vec_params, sem_params):
    m_model = model_text(text, w_size, vec_params, sem_params)
    
    # scene description are by shot, not TR, so stretch the model to be in TRs
    ranges =[[d['Start Time (TRs, 1.5s)'],d['End Time (TRs, 1.5s)']] for i, d in movie_annotations.iterrows()] 
    expanded = []
    for i in range(1976):
        try:
            idx = np.where([i>=r[0] and i<=r[1] for r in ranges])[0][0]
            expanded.append(m_model[idx, :])
        except:
            expanded.append(m_model[0, :])

    # make movie windows accessible to model_recall function
    model_movie.movie_windows = model_text.windows
    
    return np.array(expanded)

In [7]:
def model_recall(sub, w_size, vec_params, sem_params):
    
    # load recall from text file
    text = pd.read_csv(rawdir+'NN'+str(sub)+' transcript.txt', header=None, sep='.', error_bad_lines=False, encoding='latin-1').values.tolist()[0][:-1]
    
    sent = []
    # separate sentences
    for sentence in text:
        try:
            s = sentence.encode('utf-8').strip()
            sent.append(sentence)
        except AttributeError:
            pass # skip over nans
    
    # create recall model trained on windows from movie model
    recall_model = model_text(text, w_size, vec_params, sem_params, corpus=model_movie.movie_windows, panes=sent)
    
    # resample model to movie model dimensions
    return resample(recall_model, 1976)

In [8]:
# creates a list of subjects' recall models
def model_all_subs(subs, w_size, vec_params, sem_params):
    return [model_recall(sub, r_wsize, vec_params, sem_params) for sub in subs]

In [9]:
# drop a single feature from the model
def drop_feature(df, feature):
    return df.drop(feature, axis=1)

# computes correlation between movie model and recall model
def compare_mr_models(m_model, r_model):
    return corr(pd.DataFrame(m_model).T.corr().values.ravel(), pd.DataFrame(r_model).T.corr().values.ravel())[0]

## Which features are important to the movie/recall model relationship?

In [10]:
# load in full movie model and resampled recall models
full_movie_model = np.load(datadir+'models_t100_v50_r10_resampled.npy')[0]
full_recall_models_rs = np.load(datadir+'models_t100_v50_r10_resampled.npy')[1]

In [13]:
# isolate features used in model
features_df = movie_annotations.loc[:,'Scene Details - A Level ':'Words on Screen ']
features_df.columns = ['Narrative Details', 'Indoor vs Outdoor', 'Characters on Screen', 'Character in Focus', 'Character Speaking', 'Setting', 'Camera Angle', 'Music Presence', 'Text on Screen']

In [14]:
dropfeat_corrs = {}

# compute average recall model correlation to full movie model
avg_corr_full = np.mean([compare_mr_models(full_movie_model, full_recall_models_rs[sub]) for sub in range(np.shape(full_recall_models_rs)[0])])
print('with all features, correlation is ' + str(avg_corr_full))
dropfeat_corrs['None'] = avg_corr_full

# iteratively leave out one feature from model and recompute correlation
for feat in features_df.columns:
    print('\ndropping '+str(feat))
    partial_df = drop_feature(features_df,feat)
    partial_movie_text = partial_df.apply(lambda x: ', '.join(x.fillna('')), axis=1).values.tolist()
    
    print('computing movie model')
    partial_movie_model = model_movie(partial_movie_text, m_wsize, vectorizer, semantic)
    
    # remodel recalls based on partial movie model windows
    print('computing recall models')
    recall_models = model_all_subs([sub for sub in range(1,18)], r_wsize, vectorizer, semantic)
    
    sub_corrs = []
    for rec_mod in recall_models:
        sub_corrs.append(compare_mr_models(partial_movie_model, rec_mod))
    sub_corr_mean = np.mean(sub_corrs)
    print('avg correlation is '+str(sub_corr_mean))
    dropfeat_corrs[feat] = sub_corr_mean
    
    print('saving partial models')
    np.save(datadir+'dropfeat_models/%s_dropped_models' % (str(feat)), [partial_movie_model, recall_models])

with all features, correlation is 0.6280980859010863

dropping Narrative Details
computing movie model
computing recall models
avg correlation is 0.3340514399310907
saving partial models

dropping Indoor vs Outdoor
computing movie model
computing recall models
avg correlation is 0.39230536923174214
saving partial models

dropping Characters on Screen
computing movie model
computing recall models
avg correlation is 0.39894321812691064
saving partial models

dropping Character in Focus
computing movie model
computing recall models
avg correlation is 0.4304075495529114
saving partial models

dropping Character Speaking
computing movie model
computing recall models
avg correlation is 0.4076379241469465
saving partial models

dropping Setting
computing movie model
computing recall models
avg correlation is 0.3795508405214756
saving partial models

dropping Camera Angle
computing movie model
computing recall models
avg correlation is 0.40568718379362634
saving partial models

dropping Music 

In [65]:
np.array_equal(recall_models[0], full_recall_models_rs[0])

False

In [62]:
for i in recall_models[0]:
    print(i)

[3.58422939e-05 3.58422939e-05 3.58422939e-05 3.58422939e-05
 3.58422939e-05 3.58422939e-05 3.58422939e-05 3.58422939e-05
 3.58422939e-05 3.58422939e-05 3.58422939e-05 3.58422939e-05
 3.58422939e-05 3.58422939e-05 3.58422939e-05 3.58422939e-05
 8.08203698e-02 3.58422939e-05 3.58422939e-05 3.58422939e-05
 3.58422939e-05 3.58422939e-05 3.58422939e-05 3.58422939e-05
 3.58422939e-05 3.58422939e-05 3.58422939e-05 3.51740719e-02
 1.20254954e-01 3.58422939e-05 3.58422939e-05 3.58422939e-05
 3.58422939e-05 6.55061315e-02 4.72375630e-02 2.89842578e-02
 3.58422939e-05 3.58422939e-05 3.58422939e-05 3.58422939e-05
 3.58422939e-05 3.58422939e-05 3.58422939e-05 3.58422939e-05
 3.99140028e-02 3.58422939e-05 2.45781809e-02 3.58422939e-05
 1.39198691e-01 3.58422939e-05 3.58422939e-05 3.58422939e-05
 6.46877995e-02 3.58422939e-05 3.58422939e-05 1.30004507e-02
 5.40949183e-02 3.58422939e-05 3.58422939e-05 3.58422939e-05
 3.58422939e-05 3.58422939e-05 3.58422939e-05 1.67087589e-01
 3.58422939e-05 3.584229

  2.20116271e-04  7.44521388e-02  2.20116271e-04  2.20116271e-04]
[ 2.24626917e-04  2.24626917e-04  2.24626917e-04  2.24626917e-04
  2.24626917e-04  2.24626917e-04  2.24626917e-04  2.24626917e-04
  2.24626917e-04  2.24626917e-04  2.24626917e-04  2.24626917e-04
  2.24626917e-04  2.24626917e-04  2.24626917e-04  2.24626917e-04
  7.49022209e-02  2.24626917e-04  2.24626917e-04 -7.09259704e-03
  2.24626917e-04  2.24626917e-04  2.24626917e-04  2.24626917e-04
  2.24626917e-04 -6.67784931e-04  2.24626917e-04  3.93190557e-02
  1.23917391e-01  2.24626917e-04  2.24626917e-04  2.24626917e-04
 -5.56342470e-03  9.77457278e-02  5.31932141e-02  5.83903904e-02
  2.24626917e-04  2.62937547e-04  2.24626917e-04  2.24626917e-04
  2.24626917e-04  2.24626917e-04  2.24626917e-04  2.24626917e-04
  3.14485291e-02  2.41400238e-04  2.83611130e-02  2.24626917e-04
  1.27179592e-01  2.24626917e-04  2.24626917e-04  2.24626917e-04
  5.38397908e-03  2.24626917e-04  2.24626917e-04  1.45945941e-02
  5.21464623e-02  2.2462

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [25]:
single_sub_rec_model = model_recall(12, r_wsize, vec_params=vectorizer, sem_params=semantic)


In [26]:
np.shape(single_sub_rec_model)

(1976, 100)