## Imports

In [1]:
import pickle
import re
import numpy as np
import pandas as pd
import hypertools as hyp
from os.path import join as opj
from scipy.stats import pearsonr, sem
from scipy.interpolate import interp1d
%matplotlib inline

## Set paths & parameters

In [2]:
# directories (relative to this notebook) holding the processed and raw data
datadir = '../../../data/processed/'
rawdir = '../../../data/raw/'

In [3]:
# number of topics and sliding-window size used for the video annotations
n_topics = 100
video_wsize = 50

# text vectorizer spec: model name plus its keyword arguments
vectorizer = dict(
    model='CountVectorizer',
    params=dict(stop_words='english'),
)

# topic model spec: model name plus its keyword arguments
semantic = dict(
    model='LatentDirichletAllocation',
    params=dict(
        n_components=n_topics,
        learning_method='batch',
        random_state=0,
    ),
)

## Define some functions

In [4]:
def format_text(text):
    """
    Normalize a piece of annotation text.

    Parameters
    ----------
    text : str or pandas.Series
        A single string, or a Series of strings. A Series has its missing
        values replaced with '' and its entries joined with single spaces
        before normalization.

    Returns
    -------
    str
        The lowercased text with every "'s" substring removed, all
        characters other than word characters and whitespace stripped, and
        runs of whitespace collapsed to single spaces.
    """
    if isinstance(text, pd.Series):
        text = ' '.join(list(text.fillna('')))

    # note: drops every "'s" substring, so contractions such as "it's"
    # lose their suffix along with true possessives
    no_possessive = text.lower().replace("'s", '')
    # raw string so the regex escapes are not parsed as (invalid) string
    # escapes, which emits a SyntaxWarning on recent Python versions
    punc_stripped = re.sub(r"[^\w\s]+", '', no_possessive)
    spaced = ' '.join(punc_stripped.split())
    return spaced

def parse_windows(textlist, wsize):
    """
    Split a list of strings into overlapping sliding windows.

    The first `wsize - 1` windows all start at item 0 and grow by one item
    each. After that, one window starts at every item and spans up to
    `wsize` items, truncated at the end of the list.

    Parameters
    ----------
    textlist : list of str
        Text items to be grouped into windows.
    wsize : int
        Maximum number of items per window.

    Returns
    -------
    windows : list of str
        The items of each window joined with single spaces.
    w_lengths : list of (int, int)
        The (start, end) bounds of each window, end-exclusive, so that
        window i is `textlist[start:end]`.
    """
    n_items = len(textlist)

    # ramp-up windows anchored at the first item; the end bound is clamped
    # so it never points past the list when wsize exceeds its length
    w_lengths = [(0, min(ix, n_items)) for ix in range(1, wsize)]
    # one window per starting item, truncated at the end of the list
    w_lengths += [(ix, min(ix + wsize, n_items)) for ix in range(n_items)]

    windows = [' '.join(textlist[start:end]) for start, end in w_lengths]
    return windows, w_lengths


def get_video_timepoints(window_spans, segments=None):
    """
    Compute the temporal midpoint of each sliding window.

    Parameters
    ----------
    window_spans : iterable of (int, int)
        (first, last) row bounds of each window, with `last` exclusive.
    segments : pandas.DataFrame, optional
        Table with 'Start Time (s) ' and 'End Time (s) ' columns (note the
        trailing spaces), looked up by index label. Defaults to the
        notebook-level `video_text`, which must already be defined when the
        function is called without this argument.

    Returns
    -------
    numpy.ndarray
        For each window, the mean of the start time of its first row and
        the end time of its final row.
    """
    if segments is None:
        # fall back to the annotation table defined at notebook level
        segments = video_text

    timepoints = []
    for first, last in window_spans:
        window_onset = segments.loc[first, 'Start Time (s) ']
        # `last` is exclusive, so the window's final row is `last - 1`
        window_offset = segments.loc[last - 1, 'End Time (s) ']
        timepoints.append((window_onset + window_offset) / 2)

    return np.array(timepoints)

In [5]:
def transform_single_feature(feature, annotations, n_TRs=1976, video_duration=2963):
    """
    Build a per-TR trajectory from a single annotation feature.

    The feature's text is normalized with `format_text`, split into sliding
    windows of `video_wsize` items, and passed to `hyp.tools.format_data`
    together with the notebook-level `vectorizer`, `semantic` and `corpus`
    settings. The resulting per-window rows are then linearly interpolated
    onto integer TR times 1..n_TRs.

    Parameters
    ----------
    feature : str
        Name of the column in `annotations` to transform.
    annotations : pandas.DataFrame
        Table of annotation features; missing values in the selected column
        are treated as empty strings.
    n_TRs : int, optional
        Number of TRs to resample the trajectory to.
    video_duration : float, optional
        Divisor used to rescale window midpoints (in seconds) to TR units
        -- presumably the total video duration in seconds; TODO confirm.

    Returns
    -------
    numpy.ndarray
        The model output for the feature, interpolated along axis 0 at each
        of the `n_TRs` TR times.
    """
    feat_list = annotations[feature].fillna('').apply(format_text).tolist()
    feature_windows, feature_bounds = parse_windows(feat_list, video_wsize)
    feature_model = hyp.tools.format_data(feature_windows,
                                          vectorizer=vectorizer,
                                          semantic=semantic,
                                          corpus=corpus)[0]

    # midpoint of each window, rescaled from seconds to TR units
    window_times = get_video_timepoints(feature_bounds)
    window_times_TR = window_times * n_TRs / video_duration
    TR_times = np.arange(1, n_TRs + 1)
    # extrapolate so TRs outside the range of window midpoints get values
    interp_func = interp1d(window_times_TR, feature_model, axis=0, fill_value='extrapolate')
    return interp_func(TR_times)

## Load & format data

In [6]:
video_text = pd.read_excel(opj(rawdir, 'Sherlock_Segments_1000_NN_2017.xlsx'))
# assign the forward-filled column back to the frame: an in-place fillna on
# a selected column is a chained operation that does not reliably update the
# parent frame (and `fillna(method=...)` is deprecated)
video_text['Scene Segments'] = video_text['Scene Segments'].ffill()
# drop rows 480-481 and renumber so the index is contiguous again
video_text = video_text.drop(index=[480, 481]).reset_index(drop=True)
# shift timestamps from row 480 onward by the end time of row 479
# (presumably the annotations restart from zero at that point -- confirm)
video_text.loc[480:, 'Start Time (s) ': 'End Time (s) '] += video_text.loc[479, 'End Time (s) ']
# keep columns 1-4 and 6-14, then rename the last nine to short feature names
keep_cols = np.append(video_text.columns[1:5], video_text.columns[6:15])
video_text = video_text.loc[:, keep_cols]
features = ['Narrative details', 'Indoor vs outdoor', 'Characters on screen', 
            'Character in focus', 'Character speaking', 'location', 
            'Camera angle', 'Music presence', 'Text on screen']
video_text.columns = list(video_text.columns[:4]) + features

# trajectories created from all features
# NOTE(review): allow_pickle=True unpickles arbitrary objects -- only load
# this file from a trusted source
full_traj = np.load(opj(datadir, 'models_t100_v50_r10.npy'), allow_pickle=True)[0]

# create corpus from all features for fitting each individual feature topic model
features_df = video_text.loc[:, 'Narrative details':]
corpus = parse_windows(features_df.apply(format_text, axis=1).tolist(), video_wsize)[0]

## Transform each individual feature

In [7]:
# maps each feature name to the temporal correlation matrix of its trajectory
feature_models = {}

# build a model from one annotation feature at a time
for feature in features:
    print(f'transforming {feature}...')
    feature_traj = transform_single_feature(feature, features_df)
    # timepoint-by-timepoint correlation matrix of this feature's trajectory
    feature_structure = np.corrcoef(feature_traj)
    feature_models[feature] = feature_structure

# same structure measure for the trajectory built from all features together
feature_models['All features'] = np.corrcoef(full_traj)

# flatten every structure matrix into one column, then correlate the columns
feature_corrs = pd.DataFrame(
    {label: structure.ravel() for label, structure in feature_models.items()}
).corr()

transforming Narrative details...
transforming Indoor vs outdoor...
transforming Characters on screen...
transforming Character in focus...
transforming Character speaking...
transforming location...
transforming Camera angle...
transforming Music presence...
transforming Text on screen...


In [8]:
# save the feature-by-feature similarity matrix alongside the processed data
pd.to_pickle(feature_corrs, opj(datadir, 'feature_similarity.p'))