In [None]:
## Lifelines
import glob
import json

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy import signal
from scipy.stats import sem
from itertools import groupby
import spacy
import copy
from PIL import Image
import PIL

# Small English spaCy pipeline; used further down to read part-of-speech tags.
nlp = spacy.load("en_core_web_sm")

# Genre names. Each one is used in plot titles and to build the per-genre column names.
titles = [
    'Horror',
    'Adventure',
    'Drama',
    'Biography',
    'Action',
    'Fantasy',
    'SciFi',
    'Animation',
]

In [None]:
def plot_lines(ys, title, wtp_mean, wtp_std, mean=False, word=None):
    """Plot an enjoyment lifeline with its degree-10 polynomial fit and save it as a PNG.

    The function works in one of two modes, selected by ``wtp_std``:

    * ``wtp_std`` is not None (analysis plot): full title, axis labels, saved to
      ``./plots/analysis_plots/lifeline_<title>.png`` at 300 dpi. The figure is
      left open, so it is still available for display afterwards.
    * ``wtp_std`` is None (individual plot): short "WTP / Word" title, hidden
      x-axis, saved to ``./plots/individual/lifeline_<title>.png`` at 50 dpi.
      The figure is closed after saving, because this mode is called once per
      participant per genre and open figures would otherwise accumulate.

    Parameters
    ----------
    ys : sequence
        With ``mean=True``: a collection of equal-length traces that is stacked
        and averaged; a +/- one standard-error band is drawn around the mean.
        With ``mean=False``: a single trace. Either way the plotted trace must
        have 900 points, because it is drawn against a fixed 900-point axis.
    title : str
        Used in the analysis plot title and in the output file name.
    wtp_mean, wtp_std
        Values shown in the title; ``wtp_std=None`` selects individual mode.
    mean : bool
        Whether ``ys`` holds several traces to average.
    word
        Shown in the title of an individual plot.

    Returns
    -------
    None
    """
    individual = wtp_std is None

    if mean:
        stacked = np.stack(ys)
        ys_mean = stacked.mean(axis=0)
        ys_sem = sem(stacked, axis=0)
    else:
        ys_mean = ys

    # Fixed time axis: 900 points from 0 to 90 (labelled as seconds below).
    xs = np.linspace(0, 90, num=900)

    fig = plt.figure()
    poly = np.polyfit(xs, ys_mean, 10)
    poly_y = np.poly1d(poly)(xs)
    plt.plot(xs, poly_y, color="#3da803", label="Smooth")
    plt.plot(xs, ys_mean, color="#040dc2", label="Original")

    if mean:
        plt.fill_between(xs, ys_mean - ys_sem, ys_mean + ys_sem, alpha=0.2)

    # Only one title is ever visible, so set the one that applies to this mode.
    if individual:
        plt.title("WTP = {}, Word = {}".format(str(wtp_mean), word), fontsize = 25)
    else:
        plt.title("Enjoyment of {} Movie Trailer Over Time\nWTP M={}, SD={}".format(title, str(wtp_mean), str(wtp_std)), )
        plt.ylabel("Enjoyment")
        plt.xlabel("Time (sec)")

    plt.tight_layout()

    ax = plt.gca()
    ax.set_ylim([0, 100])
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    if individual:
        ax.get_xaxis().set_visible(False)
        plt.savefig("./plots/individual/lifeline_{}.png".format(title), dpi=50)
        # Release the figure: pyplot keeps every figure alive until it is
        # closed, and this branch runs for every participant and genre.
        plt.close(fig)
    else:
        plt.savefig("./plots/analysis_plots/lifeline_{}.png".format(title), dpi=300)

In [None]:

## Load the raw survey export
raw_csv_path = './data/data_prolific.csv'
data = pd.read_csv(raw_csv_path)

## Perform exclusions: drop every row whose `Finished` value is not the string 'True'
unfinished_rows = data[data.Finished != 'True'].index
data = data.drop(unfinished_rows)

## All passed at least one of the two attention and comprehension checks
## Drop low connections
#data = data.drop(data[data.ResponseId == 'R_10ovXcMfOnkIZYm'].index) # ON MTURK DATA

lengths = []

def listify(row):
    """Turn a comma-separated value such as ``"1,2.5,3"`` into a list of floats.

    The argument is converted with ``str`` first, so a non-string value is
    accepted as long as its string form parses. A piece that is not a valid
    float raises ``ValueError``.
    """
    return [float(piece) for piece in str(row).split(',')]

## Parse every genre's comma-separated enjoyment string into a list of floats
for title in titles:
    enjoyment_col = '{}Enjoyment'.format(title)
    data.loc[:, enjoyment_col] = data.loc[:, enjoyment_col].apply(listify)

## Remove data with too few or too many points (more than 300 away from 900),
## and exclude those whose enjoyment stayed at the same value for more than
## 300 consecutive samples (30 seconds)
rms = []
for index, row in data.iterrows():
    for title in titles:
        ratings = row['{}Enjoyment'.format(title)]

        if abs(len(ratings) - 900) > 300:
            print("Low data qual {}".format(row['ResponseId']))
            print(len(ratings))
            rms.append(row['ResponseId'])

        # Length of the longest run of identical consecutive values, computed once.
        longest_run = max(sum(1 for _ in run) for _, run in groupby(ratings))
        if longest_run > 300:
            print("Bigger than 300.. {}".format(row['ResponseId']))
            print(longest_run)
            rms.append(row['ResponseId'])

# A response can be flagged several times; keep each id once, in first-seen order.
rms = list(dict.fromkeys(rms))
data = data[~data['ResponseId'].isin(rms)]
print(rms)
print(len(rms))

def resample_time(row):
    """Resample one trace to exactly 900 samples using ``scipy.signal.resample``."""
    target_samples = 900
    return signal.resample(row, target_samples)

##  Resample every trace to 900 samples, then plot the mean lifeline per genre
for title in titles:
    enjoyment_col = '{}Enjoyment'.format(title)
    data.loc[:, enjoyment_col] = data.loc[:, enjoyment_col].apply(resample_time)

    # The `<genre>_willing_<n>` column ends in 1 for Action and Adventure, 4 otherwise.
    estr = '1' if title in ['Action', 'Adventure'] else '4'
    wtp = data['{}_willing_{}'.format(title.lower(), estr)].astype(int)

    plot_lines(
        data.loc[:, enjoyment_col],
        title='{}'.format(title),
        wtp_mean=round(wtp.mean(), 2),
        wtp_std=round(wtp.std(), 2),
        mean=True,
    )

print(0)

In [None]:
import copy

# Calculate metrics for predicting enjoyments...

# Shared x-axis for the polynomial fits: 900 evenly spaced points from 0 to 90.
xs = np.linspace(start=0, stop=90, num=900)

# Number of Peaks
def get_num_peaks(row):
    """Count the local maxima of the degree-10 polynomial fitted to one trace.

    Parameters
    ----------
    row : sequence of numbers
        The trace, treated as evenly spaced over the interval 0..90.

    Returns
    -------
    int
        Number of interior peaks reported by ``scipy.signal.find_peaks`` on the
        fitted curve (the two end points are never counted).
    """
    # Build the x-axis from the row itself rather than reading the module-level
    # `xs`; for a 900-sample row this is exactly the same grid.
    grid = np.linspace(0, 90, num=len(row))
    smooth = np.poly1d(np.polyfit(grid, row, 10))(grid)

    peaks, _ = signal.find_peaks(smooth)
    return len(peaks)

# Number of Valleys
def get_num_valleys(row):
    """Count the local minima of the degree-10 polynomial fitted to one trace.

    Parameters
    ----------
    row : sequence of numbers
        The trace, treated as evenly spaced over the interval 0..90.

    Returns
    -------
    int
        Number of interior valleys of the fitted curve (the two end points are
        never counted).
    """
    # Build the x-axis from the row itself rather than reading the module-level
    # `xs`; for a 900-sample row this is exactly the same grid.
    grid = np.linspace(0, 90, num=len(row))
    smooth = np.poly1d(np.polyfit(grid, row, 10))(grid)

    # A valley of the curve is a peak of the negated curve.
    valleys, _ = signal.find_peaks(-smooth)
    return len(valleys)

# Number of Extrema
def get_num_extrema(row):
    """Return the total number of peaks and valleys of the fitted curve for one trace."""
    n_peaks = get_num_peaks(row)
    n_valleys = get_num_valleys(row)
    return n_peaks + n_valleys

def get_first_derivative(row):
    """Return the derivative of the degree-10 polynomial fit as a JSON string.

    Parameters
    ----------
    row : sequence of numbers
        The trace, treated as evenly spaced over the interval 0..90.

    Returns
    -------
    str
        JSON array with the 10 coefficients of the first derivative, highest
        power first.
    """
    # Build the x-axis from the row itself rather than reading the module-level
    # `xs`; for a 900-sample row this is exactly the same grid.
    grid = np.linspace(0, 90, num=len(row))
    coeffs = np.polyfit(grid, row, 10)
    return json.dumps(np.polyder(coeffs).tolist())

def get_poly(row):
    """Return the coefficients of the degree-10 polynomial fit as a JSON string.

    Parameters
    ----------
    row : sequence of numbers
        The trace, treated as evenly spaced over the interval 0..90.

    Returns
    -------
    str
        JSON array with the 11 fitted coefficients, highest power first.
    """
    # Build the x-axis from the row itself rather than reading the module-level
    # `xs`; for a 900-sample row this is exactly the same grid.
    grid = np.linspace(0, 90, num=len(row))
    return json.dumps(np.polyfit(grid, row, 10).tolist())

## Save the polynomials for R
# Column-name suffix -> function that computes that feature from one trace.
feature_builders = {
    '_first_derivative': get_first_derivative,
    '_equation': get_poly,
    '_number_peaks': get_num_peaks,
    '_number_valleys': get_num_valleys,
    '_number_extrema': get_num_extrema,
}

for title in titles:
    lifelines = data.loc[:, '{}Enjoyment'.format(title)]
    for suffix, builder in feature_builders.items():
        data[title + suffix] = lifelines.apply(builder)


data.to_csv('./data/lifelines_cleaned.csv')

In [None]:
################## FLATTEN THE DATA ##################

#    Input: dataframe with one row per subject (n_subjects rows)
#    Output: dataframe with one row per subject and genre
#            (n_subjects * n_genres rows, n_genres = 8)
# NOTE(review): this reads lifelines_w_features.csv, while the previous cell
# writes lifelines_cleaned.csv -- presumably the features file is produced by a
# step outside this notebook; confirm.
include_points = True
data = pd.read_csv('./data/lifelines_w_features.csv')

# Initial values handed to flatten_data below.
df, count = None, 1

def flatten_data(data, count, df, include_points=False):
    """Reshape the wide per-subject frame into long format: one row per subject and genre.

    For every genre in the module-level ``titles`` list, the columns whose
    names match the genre (capitalised form and lower-case form) are selected,
    the leading ``<genre>_`` prefix is stripped from their names, and ``genre``,
    ``subject`` and ``movie_choice`` columns are added. The per-genre frames
    are then stacked on top of each other.

    Parameters
    ----------
    data : pandas.DataFrame
        Wide frame. Must contain a ``movie_choice`` column and, when
        ``include_points`` is true, one ``<Genre>Enjoyment`` column per genre.
    count
        Unused. Kept only so existing callers keep working.
    df : pandas.DataFrame or None
        Optional frame to append the new rows to.
    include_points : bool
        When true, add a ``points`` column holding each trace as a list of floats.

    Returns
    -------
    pandas.DataFrame or None
        ``df`` (if given) followed by one frame per genre; ``df`` itself when
        ``titles`` is empty.
    """
    def parse_points(cell):
        """Parse the text form of a numeric array (``"[1. 2.\\n 3.]"``) into floats."""
        text = str(cell).replace('[', '').replace(']', '').replace('\n', '')
        return [float(tok) for tok in text.split(' ') if tok != '']

    genre_frames = []

    for title in titles:
        prefix = title.lower() + "_"

        # Split the columns, based on name.
        # NOTE(review): the offsets 4 and 5 skip the first matching columns by
        # position, so they depend on the column order of the input file --
        # confirm they still match the CSV layout.
        capitalised = data.filter(regex=(title)).iloc[:, 4:]
        lowercase = data.filter(regex=(title.lower())).iloc[:, 5:]
        X = pd.concat([capitalised, lowercase], axis=1)

        # Strip the "<genre>_" prefix; 'willing' columns additionally lose
        # their last two characters.
        new_colnames = {}
        for old in list(X.columns):
            short = old.lower().split(prefix)[1]
            if 'willing' in short:
                short = short[:-2]
            new_colnames[old] = short

        if include_points:
            X['points'] = data['{}Enjoyment'.format(title)].apply(parse_points)

        X = X.rename(columns = new_colnames)
        X['genre'] = title

        # Subjects are numbered 1..n in row order.
        X['subject'] = list(range(1, data.shape[0] + 1))
        X['movie_choice'] = data['movie_choice']

        genre_frames.append(X)

    if not genre_frames:
        return df

    if df is not None:
        genre_frames.insert(0, df)

    # One concat at the end instead of growing the result inside the loop.
    return pd.concat(genre_frames, axis=0)

# Build the long-format frame and order it by subject, then genre.
df = flatten_data(data, count, df, include_points).sort_values(by=['subject', 'genre'])

def get_token(row):
    """Return the part-of-speech tag (spaCy ``pos_``) of the first token in ``row``."""
    doc = nlp(row)
    return doc[0].pos_

def get_tokens_sentences(row):
    """Return the part-of-speech tag (spaCy ``pos_``) of every token in ``row``, in order."""
    pos_tags = []
    for token in nlp(row):
        pos_tags.append(token.pos_)
    return pos_tags

# Tag each entry of the `word` column with the part of speech of its first token.
df.loc[:, 'word_tag'] = df.loc[:, 'word'].apply(get_token)

# The output file name depends on whether the `points` column was included.
long_csv_path = './data/data_long.csv' if include_points else './data/data.csv'
df.to_csv(long_csv_path)

In [None]:
################## PLOTTING ALL PARTICIPANTS ##################

# Plot each participant separately

plots = {}


def _as_points(cell):
    """Return a trace as a list of floats, parsing it first when it is a string.

    After `data` has been re-read from CSV the Enjoyment cells are text
    (e.g. "[1. 2. 3.]"); brackets and commas are treated as separators.
    Anything that is not a string is returned unchanged.
    """
    if isinstance(cell, str):
        text = cell.replace('[', ' ').replace(']', ' ').replace(',', ' ')
        return [float(tok) for tok in text.split()]
    return cell


for title in titles:
    enjoyment_col = '{}Enjoyment'.format(title)
    willing_pattern = '{}_willing'.format(title).lower()
    word_pattern = '{}_word'.format(title).lower()

    # `count` is the 1-based participant number used in the output file name.
    for count, (index, row) in enumerate(data.iterrows(), start=1):
        plot_lines(
            _as_points(row[enjoyment_col]),
            str(count) + "_{}".format(title),
            # .iloc[0] takes the first matching entry by position; plain [0]
            # on a string-labelled Series is deprecated positional access.
            wtp_mean=row.filter(regex=willing_pattern).iloc[0],
            wtp_std=None,
            word=row.filter(regex=word_pattern).iloc[0],
        )


In [None]:
# Concatenate vertically
def get_concat_v(im1, im2):
    """Stack two images vertically: ``im1`` on top, ``im2`` directly below it.

    Returns ``im1`` itself when ``im2`` is None. Otherwise returns a new RGB
    image that is as wide as ``im1`` and as tall as both inputs together.
    """
    if im2 is not None:
        top_height = im1.height
        canvas = Image.new('RGB', (im1.width, top_height + im2.height))
        canvas.paste(im1, (0, 0))
        canvas.paste(im2, (0, top_height))
        return canvas
    return im1


# Concatenate horizontally
def get_concat_h(im1, im2):
    """Place two images side by side: ``im1`` on the left, ``im2`` to its right.

    Returns ``im1`` itself when ``im2`` is None. Otherwise returns a new RGB
    image that is as tall as ``im1`` and as wide as both inputs together.
    """
    if im2 is not None:
        left_width = im1.width
        canvas = Image.new('RGB', (left_width + im2.width, im1.height))
        canvas.paste(im1, (0, 0))
        canvas.paste(im2, (left_width, 0))
        return canvas
    return im1


# Concatenate all participant plots into one huge plot:
# one column per genre, one individual plot per row.
# NOTE(review): 233 is hard-coded; it has to match the number of individual
# plots saved per genre -- confirm against the participant count.
n_participants = 233

himg2 = None
for title in titles:
    im2 = None
    for i in range(1, n_participants + 1):
        # Open the file by its exact path: a missing plot then fails with a
        # FileNotFoundError that names the file.
        img_file = './plots/individual/lifeline_{}_{}.png'.format(i, title)
        print(img_file)
        # Each new image goes on top of the column built so far.
        im2 = get_concat_v(Image.open(img_file), im2)

    im2.save("{}_combined.jpg".format(title))
    # Each new genre column goes to the left of the ones built so far.
    himg2 = get_concat_h(im2, himg2)

himg2.save("all_combined.jpg")
