## Imports

In [1]:
import numpy as np
import pandas as pd
import hypertools as hyp
from scipy.stats import pearsonr, sem
from scipy.interpolate import interp1d
from tqdm.notebook import tqdm

from sherlock_helpers.constants import (
    DATA_DIR, RAW_DIR, 
    SEMANTIC_PARAMS, 
    VECTORIZER_PARAMS, 
    VIDEO_WSIZE
)
from sherlock_helpers.functions import (
    create_diag_mask, 
    format_text, 
    get_video_timepoints, 
    parse_windows, 
    show_source
)

%matplotlib inline

## Define/inspect some functions

In [2]:
# inspect the imported `format_text` helper (applied to annotation text below)
show_source(format_text)

In [3]:
# inspect the imported `parse_windows` helper (called below with VIDEO_WSIZE)
show_source(parse_windows)

In [4]:
# inspect the imported `get_video_timepoints` helper (called below with window bounds)
show_source(get_video_timepoints)

In [5]:
# inspect the imported `create_diag_mask` helper (used below to index correlation matrices)
show_source(create_diag_mask)

In [6]:
def transform_single_feature(feature, annotations, n_TRs=1976, video_duration=2963):
    """
    Model a single annotation feature and resample the result to one row
    per TR.

    The text in ``annotations[feature]`` is cleaned with ``format_text``,
    split into sliding windows with ``parse_windows`` (window size
    ``VIDEO_WSIZE``), and passed through ``hyp.tools.format_data`` using
    ``VECTORIZER_PARAMS`` / ``SEMANTIC_PARAMS`` and the shared ``corpus``.
    The per-window output is then linearly interpolated (extrapolating
    beyond the first/last window) onto the integer grid ``1..n_TRs``.

    NOTE: this function reads two notebook-level globals that must be
    defined before it is called: ``corpus`` (text windows built from all
    features) and ``video_text`` (the annotations table handed to
    ``get_video_timepoints``).

    Parameters
    ----------
    feature : str
        Column of ``annotations`` to model.
    annotations : pandas.DataFrame
        Table holding one text column per feature.
    n_TRs : int, optional
        Number of output samples; the result is evaluated at
        ``1, 2, ..., n_TRs``. Defaults to 1976.
    video_duration : float, optional
        Divisor used to rescale window timepoints onto the TR grid
        (``timepoint * n_TRs / video_duration``). Defaults to 2963.
        Presumably the total video length in the same units as the values
        returned by ``get_video_timepoints`` -- TODO confirm.

    Returns
    -------
    numpy.ndarray
        Interpolated model with ``n_TRs`` rows (axis 0). The column count
        is whatever ``hyp.tools.format_data`` produces for
        ``SEMANTIC_PARAMS`` (presumably 100 topics -- TODO confirm).
    """
    # missing annotations become empty strings so format_text always gets text
    feat_list = annotations[feature].fillna('').apply(format_text).tolist()
    feature_windows, feature_bounds = parse_windows(feat_list, VIDEO_WSIZE)
    feature_model = hyp.tools.format_data(feature_windows,
                                          vectorizer=VECTORIZER_PARAMS,
                                          semantic=SEMANTIC_PARAMS,
                                          corpus=corpus)[0]

    # rescale each window's timepoint onto the TR axis
    xvals = get_video_timepoints(feature_bounds, video_text)
    xvals_TR = np.array(xvals) * n_TRs / video_duration
    TR_times = np.arange(1, n_TRs + 1)

    # 'extrapolate' covers TRs that fall before the first or after the last
    # window timepoint
    interp_func = interp1d(xvals_TR,
                           feature_model,
                           axis=0,
                           fill_value='extrapolate')
    return interp_func(TR_times)

## Load & format data

In [7]:
# names of the individual annotation features, in the order they are used
# to label the annotation columns and to iterate over single-feature models
features = [
    'Narrative details',
    'Indoor vs outdoor',
    'Characters on screen',
    'Character in focus',
    'Character speaking',
    'Location',
    'Camera angle',
    'Music presence',
    'Text on screen',
]

In [8]:
# raw scene-by-scene annotations of the video
video_text = pd.read_excel(RAW_DIR.joinpath('Sherlock_Segments_1000_NN_2017.xlsx'))

# forward-fill segment labels by explicit assignment: calling
# `.fillna(..., inplace=True)` on a selected column is chained assignment,
# which does not update the parent frame under pandas Copy-on-Write, and
# `fillna(method=...)` is deprecated in favour of `.ffill()`
video_text['Scene Segments'] = video_text['Scene Segments'].ffill()

# drop 1s shot & 6s of black screen after end of 1st scan
video_text = video_text.drop(index=[480, 481]).reset_index(drop=True)

# timestamps for 2nd scan restart from 0; add duration of 1st scan to values
video_text.loc[480:, 'Start Time (s) ': 'End Time (s) '] += video_text.loc[479, 'End Time (s) ']

# keep columns 1-4 plus the nine feature columns (6-14), then rename the
# feature columns to the labels in `features`
keep_cols = np.append(video_text.columns[1:5], video_text.columns[6:15])
video_text = video_text.loc[:, keep_cols]
video_text.columns = list(video_text.columns[:4]) + features

# trajectories created from all features
# NOTE: allow_pickle=True unpickles arbitrary objects; only load trusted files
full_traj = np.load(DATA_DIR.joinpath('models_t100_v50_r10.npy'),
                    allow_pickle=True)[0]

# create corpus from all features for fitting each individual feature topic model
features_df = video_text.loc[:, 'Narrative details':]
corpus = parse_windows(features_df.apply(format_text, axis=1).tolist(),
                       VIDEO_WSIZE)[0]

## Transform each individual feature

In [9]:
# temporal structure of the intact (all-features) trajectory: correlate every
# pair of timepoints, then keep only the entries selected by the diagonal mask
full_corrmat = np.corrcoef(full_traj)
diag_mask = create_diag_mask(full_corrmat)
proximal_corrs = full_corrmat[diag_mask]

# maps model name -> masked timepoint-by-timepoint correlations
feature_models = {'All features': proximal_corrs}

# build a model from each feature in isolation and extract the same
# masked correlations from its trajectory
for feature in tqdm(features, leave=False):
    print(f'transforming {feature}...')
    feature_traj = transform_single_feature(feature, features_df)

    # compute feature trajectory's temporal autocorrelation matrix
    feature_corrmat = np.corrcoef(feature_traj)
    proximal_corrs = feature_corrmat[diag_mask]
    feature_models[feature] = proximal_corrs

# compute correlation matrix of feature models' structures
# (one column per model, in insertion order)
feature_corrs = pd.DataFrame(feature_models).corr()

transforming Narrative details...
transforming Indoor vs outdoor...
transforming Characters on screen...
transforming Character in focus...
transforming Character speaking...
transforming Location...
transforming Camera angle...
transforming Music presence...
transforming Text on screen...


In [10]:
# Optional: uncomment the line below to save the feature-similarity matrix to DATA_DIR.
# feature_corrs.to_pickle(DATA_DIR.joinpath('feature_similarity.p'))