# Summarize all data for the dashboard
***
## Workflow
1. Imports
2. Load the data and merge it into a single DataFrame

## 1. Imports

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import ast
import re
from IPython.display import HTML

## 2. Load data and merge
- simple features
- timestamps
- sentiments
- explanations

In [3]:
# Simple word-count-like features: drop the three text-processing columns and
# rename 'Person' to 'Participant', the key used for the merges further down.
simple_features = (
    pd.read_csv('data/wordcount_uniquewordcount.csv')
    .drop(columns=['text_unprocessed', 'tokenize', 'stemming'])
    .rename(columns={'Person': 'Participant'})
)

In [4]:
def timestamp_extraction(string):
    """Parse a stringified list of ``[start, stop]`` pairs into a float array.

    Args:
        string: Text such as ``"[[0.0, 1.5], [nan, 3.0]]"``. Pairs may be
            separated by ``],[`` with any amount of whitespace around the
            comma.

    Returns:
        A 2-D ``numpy`` float array with one row per pair. The text ``nan``
        is parsed as ``numpy.nan``.

    Raises:
        ValueError: If an entry cannot be converted to ``float`` or the
            pairs do not all have the same number of entries.
    """
    # Strip the outer brackets so only "a, b], [c, d" remains.
    inner = string.replace(']]', '').replace('[[', '')
    # Split between pairs; tolerate "],[" as well as "], [".
    pairs = re.split(r'\]\s*,\s*\[', inner)
    # float() ignores surrounding whitespace, so " 1.5" parses fine.
    rows = [[float(value) for value in pair.split(',')] for pair in pairs]
    return np.array(rows, dtype=float)

def fill_nan_timestamps(list_of_lists, length):
    """Replace missing (NaN) start/stop times in place.

    A missing start time becomes ``0`` and a missing stop time becomes
    ``length``.

    Args:
        list_of_lists: Iterable of mutable ``[start, stop]`` pairs (for
            example the rows of a 2-D float array). Modified in place.
        length: Value written into a missing stop time; the caller in this
            file passes the row's ``interview_length``.

    Returns:
        None. The input is mutated.
    """
    for start_stop in list_of_lists:
        # Check both ends independently: with an if/elif chain a pair whose
        # start AND stop are NaN would keep its NaN stop time.
        if np.isnan(start_stop[0]):
            start_stop[0] = 0
        if np.isnan(start_stop[1]):
            start_stop[1] = length

In [6]:
# Load the timestamp table without the 'Unnamed: 0' and 'tmp1' columns.
timestamps = pd.read_csv('data/new_time_stamps.csv').drop(columns=['Unnamed: 0', 'tmp1'])

# Both columns arrive as text: 'tokenized_sentences' is evaluated as a Python
# literal, 'new_time_stamps' is parsed into a float array.
timestamps['tokenized_sentences'] = timestamps['tokenized_sentences'].apply(ast.literal_eval)
timestamps['new_time_stamps'] = timestamps['new_time_stamps'].apply(timestamp_extraction)

In [8]:
# Join the word-count features with the timestamps on 'Participant';
# validate='one_to_one' makes pandas raise if the key is duplicated on either side.
merged = simple_features.merge(timestamps, on='Participant', validate='one_to_one')

# fill_nan_timestamps mutates each timestamp array in place and returns None,
# so iterate explicitly instead of using DataFrame.apply, which would build
# (and discard) a Series of None values just for the side effect.
for stamps, length in zip(merged['new_time_stamps'], merged['interview_length']):
    fill_nan_timestamps(stamps, length)

In [10]:
# Load the explanations table without the 'tokenize_sentence' column.
explanations = pd.read_csv('data/explanations_all.csv').drop(columns=['tokenize_sentence'])
# The 'explanations' column is stored as text; evaluate each cell as a Python literal.
explanations['explanations'] = explanations['explanations'].apply(ast.literal_eval)

In [12]:
def make_list(string):
    """Turn the text form of numpy float32 arrays into list-literal text.

    Removes the ``array(`` prefixes and ``dtype=float32)`` suffixes and
    tidies whitespace so the result can be handed to ``ast.literal_eval``.

    Args:
        string: Text such as ``"[array([0.1, 0.2], dtype=float32)]"``,
            possibly containing line breaks.

    Returns:
        The cleaned string, e.g. ``"[[0.1, 0.2]]"``.
    """
    # Raw strings throughout: "\(" in a normal string literal is an invalid
    # escape sequence and triggers a warning on recent Python versions.
    out = re.sub(r"array\(", "", string)  # drop the "array(" prefix
    out = re.sub(r",\n[ ]*dtype=float32\)", "", out)  # dtype suffix wrapped onto its own line
    out = re.sub(r",[ ]*dtype=float32\)", " ", out)  # dtype suffix on the same line
    out = re.sub(r"\n", " ", out)  # remaining line breaks become spaces
    out = re.sub(r"[ ]*,", ",", out)  # no spaces before a comma
    out = re.sub(r"[ ]*\]", "]", out)  # no spaces before a closing bracket
    # Single pass: each pair of spaces becomes one space (longer runs are halved).
    out = re.sub(r"[ ]{2}", " ", out)
    return out

In [13]:
# Load the emotion predictions and drop the columns that are not carried forward.
emotions = pd.read_csv('data/MIT_dataset_emotion_prediction_percentage.csv').drop(
    columns=['Unnamed: 0', 'Unnamed: 0.1', 'text_remove_interview_signs', 'tokenize_sentence']
)

# 'prediction' is stored as text: clean it with make_list, then evaluate the
# result as a Python literal.
emotions['prediction'] = emotions['prediction'].apply(make_list).apply(ast.literal_eval)

In [14]:
# Attach the emotion predictions, then the explanations, matching on 'Participant'.
df = pd.merge(
    pd.merge(merged, emotions, on='Participant'),
    explanations,
    on='Participant',
)
# Convert every timestamp array to a nested Python list.
df['new_time_stamps'] = df['new_time_stamps'].apply(lambda stamps: stamps.tolist())

In [103]:
# Write the merged table to 'data/full_scores' as CSV; index=False leaves the
# row index out of the file.
df.to_csv('data/full_scores', index=False)

In [106]:
# Reload the table from 'data/full_scores', replacing the in-memory frame.
# NOTE(review): list-valued columns are presumably read back as plain strings
# after the CSV round trip — confirm whether they need ast.literal_eval again.
df = pd.read_csv('data/full_scores')