In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import hypertools as hyp
import quail
from scipy.stats import pearsonr as corr
from scipy.signal import resample

%matplotlib inline
# global plot appearance: large "poster" scaling with the "ticks" axes style
sns.set_context('poster')
sns.set_style('ticks')

# default size (inches) for every figure created in this notebook
plt.rcParams['figure.figsize'] = (12, 8)

In [2]:
# Relative data locations (both keep their trailing slash because file names
# are appended with plain string concatenation below).
# rawdir: where the raw annotation spreadsheet is read from.
rawdir = '../../../data/raw/'
# datadir: where the precomputed full movie model is loaded from and where
# this notebook's results are saved.
datadir = '../../../data/processed/'

In [3]:
# load in raw annotation file
movie_annotations = pd.read_excel(rawdir+'Sherlock_Segments_1000_NN_2017.xlsx')

# Forward-fill missing 'Scene Segments' values so each row carries the most
# recent non-missing value above it.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the column selection: the in-place form is
# chained mutation (it does not update the frame under pandas copy-on-write)
# and the `method=` argument of fillna is deprecated.
movie_annotations['Scene Segments'] = movie_annotations['Scene Segments'].ffill()

In [4]:
# --- model parameters ---
ntopics = 100   # number of topics (passed to the topic model as n_components)
m_wsize = 50    # sliding-window length, in annotation rows, used by model_movie

# --- text vectorizer settings (passed to hypertools as `vectorizer`) ---
vectorizer = {
    'model': 'CountVectorizer',
    'params': {'stop_words': 'english'},
}

# --- topic model settings (passed to hypertools as `semantic`) ---
# random_state is fixed so repeated runs use the same seed
semantic = {
    'model': 'LatentDirichletAllocation',
    'params': {
        'n_components': ntopics,
        'learning_method': 'batch',
        'random_state': 0,
    },
}

## define some functions

In [5]:
# train a topic model on text samples from the scene annotations
def model_movie(movie_df, w_size, vec_params, sem_params, n_trs=1976, annotations=None):
    """Fit a topic model to windowed annotation text and expand it to one row per TR.

    Parameters
    ----------
    movie_df : pandas.DataFrame
        Annotation feature columns, one row per annotated segment. The cells of
        each row are comma-joined into one text sample (missing cells become '').
    w_size : int
        Number of consecutive rows joined into each sliding-window document.
    vec_params : dict
        Passed to ``hyp.tools.format_data`` as ``vectorizer``.
    sem_params : dict
        Passed to ``hyp.tools.format_data`` as ``semantic``.
    n_trs : int, optional
        Number of TRs (rows) in the returned array. Defaults to 1976, the value
        that was previously hard-coded.
    annotations : pandas.DataFrame, optional
        Table providing the 'Start Time (TRs, 1.5s)' and 'End Time (TRs, 1.5s)'
        columns. Defaults to the notebook-level ``movie_annotations`` (the
        original behavior).
        NOTE(review): rows are assumed to line up positionally with ``movie_df``
        -- confirm when passing a different table.

    Returns
    -------
    numpy.ndarray
        Array with ``n_trs`` rows; row ``t`` is the model row of the first
        segment whose [start, end] range (inclusive) contains ``t``, or model
        row 0 when no segment contains ``t``.
    """
    if annotations is None:
        # fall back to the annotation table loaded at the top of the notebook
        annotations = movie_annotations

    # create list of text samples from annotations (one string per row)
    movie_text = movie_df.apply(lambda x: ','.join(x.fillna('')), axis=1).values.tolist()

    # create sliding window of text samples; windows that run past the end of
    # the list are simply shorter (slicing truncates)
    movie_windows = [
        ','.join(movie_text[idx:idx + w_size])
        for idx in range(len(movie_text))
    ]

    # use hypertools to create movie model (the windows serve as both the
    # data to transform and the training corpus)
    movie_model = hyp.tools.format_data(movie_windows, vectorizer=vec_params, semantic=sem_params, corpus=movie_windows)[0]

    # scene descriptions are by shot, not TR, so stretch the model to be in TRs
    # (segment bounds are extracted once instead of once per TR)
    starts = annotations['Start Time (TRs, 1.5s)'].values
    ends = annotations['End Time (TRs, 1.5s)'].values

    expanded = []
    for tr in range(n_trs):
        # positions of all segments whose inclusive range covers this TR
        hits = np.flatnonzero((starts <= tr) & (tr <= ends))
        # TRs not covered by any segment reuse the first model row; this is
        # the only case the old bare `except:` was needed for, so any other
        # error now surfaces instead of being silently swallowed
        row = hits[0] if hits.size else 0
        expanded.append(movie_model[row, :])

    return np.array(expanded)

In [6]:
# remove one feature column from the annotations
def drop_feature(df, feature):
    """Return a new DataFrame equal to `df` without the column(s) `feature`.

    `df` itself is left unmodified. Raises KeyError if `feature` is not a
    column of `df`.
    """
    return df.drop(columns=feature)

# similarity of two movie models, measured on their row-by-row correlation structure
def compare_m_models(model1, model2):
    """Correlate the row-correlation matrices of two models.

    For each model, the Pearson correlation between every pair of rows is
    computed (the model is transposed so that `DataFrame.corr`, which works on
    columns, compares rows). Both matrices are flattened in full -- diagonal
    and both triangles included -- and the Pearson r between the two flattened
    vectors is returned.
    """
    flattened = []
    for model in (model1, model2):
        row_corrmat = pd.DataFrame(model).T.corr()
        flattened.append(row_corrmat.values.ravel())

    # `corr` is scipy's pearsonr; element 0 of its result is the r statistic
    result = corr(flattened[0], flattened[1])
    return result[0]

## which features are important to movie model structure?

In [7]:
# keep only the annotation columns that feed the topic model: the contiguous
# label-based slice from 'Scene Details - A Level ' through 'Words on Screen '
features_df = movie_annotations.loc[:,'Scene Details - A Level ':'Words on Screen ']

# replace the raw spreadsheet headers with short display names
# (positional: one name per selected column, in order)
features_df.columns = [
    'Narrative details',
    'Indoor vs outdoor',
    'Characters on screen',
    'Character in focus',
    'Character speaking',
    'location',
    'Camera angle',
    'Music presence',
    'Text on screen',
]

In [8]:
# load the precomputed full movie model (first element of the saved array)
# NOTE: allow_pickle=True lets np.load unpickle arbitrary objects -- only use
# it on files from a trusted source
full_movie_model = np.load(
    datadir+'models_t100_v50_r10_resampled.npy',
    allow_pickle=True,
)[0]

In [9]:
# leave-one-feature-out analysis:
# maps each removed feature name -> correlation of the reduced model with the full model
dropfeat_corr = {}

for feat in features_df.columns:
    print('\ndropping '+str(feat))
    reduced_df = drop_feature(features_df,feat)

    # refit the movie model on every feature except `feat`
    print('computing movie model')
    reduced_model = model_movie(reduced_df, m_wsize, vectorizer, semantic)

    # how similar is the reduced model's structure to the full model's?
    similarity = compare_m_models(reduced_model, full_movie_model)
    print('model correlation is '+ str(similarity))
    dropfeat_corr[feat] = similarity


dropping Narrative details
computing movie model
model correlation is 0.8169994166199158

dropping Indoor vs outdoor
computing movie model
model correlation is 0.8766734235574383

dropping Characters on screen
computing movie model
model correlation is 0.8852409813813542

dropping Character in focus
computing movie model
model correlation is 0.9471895100684964

dropping Character speaking
computing movie model
model correlation is 0.8748901536160685

dropping location
computing movie model
model correlation is 0.9062585672012943

dropping Camera angle
computing movie model
model correlation is 0.8927190384602313

dropping Music presence
computing movie model
model correlation is 0.9288485515393562

dropping Text on screen
computing movie model
model correlation is 0.9994925539950315


In [10]:
# save the feature -> correlation dict for the plotting notebook
# (np.save appends '.npy' to the name and stores the dict as a pickled object,
# so it must be read back with allow_pickle=True)
np.save(
    datadir+'dropfeat_models/dropfeat_m_model_impact',
    dropfeat_corr,
)

In [11]:
# plot correlation with full model
# sns.set_palette('muted')

# series_ord = pd.Series(dropfeat_corr).sort_values()
# series_ord.plot(kind='bar', ylim=[0,1])
# plt.ylabel('Correlation with full model', labelpad=15)
# plt.xlabel('Feature removed', labelpad=20)

# for idx, r in enumerate(series_ord.values):  # `r`, not `corr`: avoid shadowing the pearsonr alias
#     plt.text(idx-.1, 0.42, '%.5f'%r, rotation=90)
    
# plt.savefig('../../paper/figs/5.1_dropfeat_m_model_corr.pdf', bbox_inches='tight')