# Feature-rich free recall analyses

### Import required libraries

In [1]:
from sqlalchemy import create_engine, MetaData, Table
import json
import pandas as pd
import numpy as np
import math
from __future__ import division
import re
import csv
import seaborn as sns
import matplotlib.pyplot as plt
import numpy.ma as ma
from itertools import izip_longest
from collections import Counter



### Load the data into a pandas dataframe

In [2]:
db_url = "sqlite:///../data/encoding/participants-test-room1.db"
table_name = 'turkdemo'
data_column_name = 'datastring'

# boilerplace sqlalchemy setup
engine = create_engine(db_url)
metadata = MetaData()
metadata.bind = engine
table = Table(table_name, metadata, autoload=True)

# make a query and loop through
s = table.select()
rows = s.execute()

data = []
for row in rows:
    data.append(row[data_column_name])
    
# Now we have all participant datastrings in a list.
# Let's make it a bit easier to work with:

# parse each participant's datastring as json object
# and take the 'data' sub-object
data = [json.loads(part)['data'] for part in data if part is not None]

# insert uniqueid field into trialdata in case it wasn't added
# in experiment:
for part in data:
    for record in part:
#         print(record)
        if type(record['trialdata']) is list:

            record['trialdata'] = {record['trialdata'][0]:record['trialdata'][1]}
        record['trialdata']['uniqueid'] = record['uniqueid']
        
# flatten nested list so we just have a list of the trialdata recorded
# each time psiturk.recordTrialData(trialdata) was called.
def isNotNumber(s):
    try:
        float(s)
        return False
    except ValueError:
        return True

data = [record['trialdata'] for part in data for record in part]

# filter out fields that we dont want using isNotNumber function
filtered_data = [{k:v for (k,v) in part.items() if isNotNumber(k)} for part in data]
    
# Put all subjects' trial data into a dataframe object from the
# 'pandas' python library: one option among many for analysis
data_frame = pd.DataFrame(filtered_data)

### Add a column to keep track of experiment version number

In [3]:
db_url = "sqlite:///../data/encoding/participants-test-room1.db"
table_name = 'turkdemo'
data_column_name = 'codeversion'

# boilerplace sqlalchemy setup
engine = create_engine(db_url)
metadata = MetaData()
metadata.bind = engine
table = Table(table_name, metadata, autoload=True)

# make a query and loop through
s = table.select()
rows = s.execute()

versions = []
for row in rows:
    versions.append(row[data_column_name])
    
version_col = []
for idx,sub in enumerate(data_frame['uniqueid'].unique()):
    for i in range(sum(data_frame['uniqueid']==sub)):
        version_col.append(versions[idx])
data_frame['exp_version']=version_col

#print(data_frame['exp_version'])

### Number of subjects in each experiment

In [4]:
subids = list(data_frame[data_frame['listNumber']==15]['uniqueid'].unique())

d = dict()
for sub in subids:
    key = data_frame[data_frame['uniqueid']==sub]['exp_version'].values[0]
    if key in d:
        d[key] += 1
    else:
        d[key] = 1
print('Here is a count of how many subjects we have in each experiment: ',d)

('Here is a count of how many subjects we have in each experiment: ', {u'5.1': 18, u'4.1': 23, u'1.1': 16, u'0.0': 2, u'3.2': 21, u'3.1': 14, u'1.0': 3, u'2.1': 21})


## List audio files for each experiment

In [5]:
subids = list(data_frame[data_frame['listNumber']==15]['uniqueid'].unique())

d = dict()
for sub in subids:
    key = data_frame[data_frame['uniqueid']==sub]['exp_version'].values[0]
    if key in d:
        d[key].append(sub)
    else:
        d[key]=[sub]


#reaplce these values with the experiment number
#three values for the case of experiment 1 only
#print (d["0.0"], d['1.0'], d['1.1'])

exp1=d["0.0"]+d['1.0']+d['1.1']
print(exp1)

[u'debugN8TPWO:debugF1XWCH', u'debugZQ55YL:debug5WQHPC', u'debugWF2JFB:debugPNRZFQ', u'debugLXMXTP:debugJAXRZL', u'debugQRX0V3:debugFIWAG8', u'debugIAU8V9:debugT1DECK', u'debug02E4FI:debugF7UOXH', u'debugGPNALW:debugXSJ1FD', u'debugS4GATI:debug2LRP6X', u'debugJAPX2W:debugFZOLSG', u'debugA98B98:debug5H8QRL', u'debugKDM8HT:debugH2I05W', u'debugQS9870:debugKM1SRC', u'debugHP65NS:debugLWS9KB', u'debugVFPD79:debugIP75FV', u'debugX84L2K:debugCDN40O', u'debugSU1T93:debugKCB9VM', u'debugE1CAO3:debugONZ2R5', u'debug8DEMRS:debugC55CO6', u'debugKUWU41:debug9FG9EP', u'debugXUZA8U:debugMR3K3X']


### Read in word pool

In [6]:
# read in stimulus library
wordpool = pd.read_csv('../stimuli/cut_wordpool.csv')

### Define data processing functions

In [7]:
# this function takes the data frame and returns subject specific data based on the subid variable
def filterData(data_frame,subid):
    filtered_stim_data = data_frame[data_frame['stimulus'].notnull() & data_frame['listNumber'].notnull()]
    filtered_stim_data = filtered_stim_data[filtered_stim_data['trial_type']=='single-stim']
    filtered_stim_data =  filtered_stim_data[filtered_stim_data['uniqueid']==subid]
    return filtered_stim_data

# this function parses the data creating an array of dictionaries, where each dictionary represents a trial (word presented) along with the stimulus attributes
def createStimDict(data):
    stimDict = []
    for index, row in data.iterrows():
        stimDict.append({
                'text': str(re.findall('>(.+)<',row['stimulus'])[0]),
                'color' : { 'r' : int(re.findall('rgb\((.+)\)',row['stimulus'])[0].split(',')[0]),
                           'g' : int(re.findall('rgb\((.+)\)',row['stimulus'])[0].split(',')[1]),
                           'b' : int(re.findall('rgb\((.+)\)',row['stimulus'])[0].split(',')[2])
                           },
                'location' : {
                    'top': float(re.findall('top:(.+)\%;', row['stimulus'])[0]),
                    'left' : float(re.findall('left:(.+)\%', row['stimulus'])[0])
                    },
                'category' : wordpool['CATEGORY'].iloc[list(wordpool['WORD'].values).index(str(re.findall('>(.+)<',row['stimulus'])[0]))],
                'size' : wordpool['SIZE'].iloc[list(wordpool['WORD'].values).index(str(re.findall('>(.+)<',row['stimulus'])[0]))],
                'wordLength' : len(str(re.findall('>(.+)<',row['stimulus'])[0])),
                'firstLetter' : str(re.findall('>(.+)<',row['stimulus'])[0])[0],
                'listnum' : row['listNumber']
            })
    return stimDict

# this function loads in the recall data into an array of arrays, where each array represents a list of words
def loadRecallData(subid):
    recalledWords = []
    for i in range(0,16):
        try:
            f = open('../data/recall/room1/' + subid + '/' + subid + '-' + str(i) + '.wav.txt', 'rb')
            spamreader = csv.reader(f, delimiter=' ', quotechar='|')
        except (IOError, OSError) as e:
            print(e)
        for row in spamreader:
            recalledWords.append(row[0].split(','))
    return recalledWords

# this function computes accuracy for a series of lists
def computeListAcc(stimDict,recalledWords):
    accVec = []
    for i in range(0,16):
        stim = [stim['text'] for stim in stimDict if stim['listnum']==i]
        recalled= recalledWords[i]
        
        acc = 0
        tmpstim = stim[:]
        for word in recalled:
            if word in tmpstim:
                tmpstim.remove(word)
                acc+=1
        accVec.append(acc/len(stim))
    return accVec

### Define fingerprint class (this will be moved to an importable module at some point)

In [8]:
# class that computes the fingerprint based on stimulus features and recall organization

# -*- coding: utf-8 -*-
import math
import numpy as np

class Pyfingerprint(object):
    '''pyfingerprint module'''

    def __init__(self, state=None, features=['category', 'size', 'firstLetter', 'wordLength', 'location', 'color', 'temporal'], weights=None, alpha=4, tau=1, sortby=None):
        self.state = state
        self.features = features
        self.weights = weights
        self.alpha = alpha
        self.tau = tau
        self.sortby = sortby

    #### public functions ####

    # given a stimulus list and recalled words, compute the weights
    def computeWeights(self, currentList, recalledWords):
        currentList = self._computeDistance(currentList)
        return self._computeFeatureWeights(currentList, recalledWords, self.features)

    def updateWeights(self,newWeights):
        if self.weights is not None:
            print('weights exist, updating..')
            for feature in self.weights: 
                self.weights[feature].append(newWeights[feature]);
        else:
            print('new weights..')
            self.weights = {};
            for feature in newWeights:
                self.weights[feature] = [];
                self.weights[feature].append(newWeights[feature]);
        print('weights: ', self.weights)

    def getReorderedList(self,nextList):
        print('Reordering list according to state: ' + str(self.state))
        if self.state == 'feature-based':
            return _featurizeList(nextList)
        elif self.state == 'random':
            return _randomizeList(nextList)
        elif self.state == 'optimal':
            return _optimizeList(nextList)
        elif self.state == 'opposite':
            return _oppositizeList(nextList)
        elif self.state == 'strip-features':
            return _stripFeatures(nextList)
        else:
            print('Warning: No fingerprint state assigned, returning same list..')
            return nextList
        
    #### private functions ####
    
    def _computeDistance(self,stimArray):
        
        # initialize distance dictionary
        for stimulus in stimArray:
            stimulus['distances'] = {}
            for feature in self.features:
                stimulus['distances'][feature] = []
                
        # loop over the lists to create distance matrices
        for i,stimulus1 in enumerate(stimArray):
            for j,stimulus2 in enumerate(stimArray):
                
                # logic for temporal clustering
                stimArray[i]['distances']['temporal'].append({
                        'word' : stimArray[j]['text'],
                        'dist' : abs(i - j)
                    })
                
                # logic for category, need to add if statement if we are using category as a feature
                stimArray[i]['distances']['category'].append({
                        'word' : stimArray[j]['text'],
                        'dist' : int(stimArray[i]['category'] != stimArray[j]['category'])
                    })

                # logic for size
                stimArray[i]['distances']['size'].append({
                    'word': stimArray[j]['text'],
                    'dist': int(stimArray[i]['size'] != stimArray[j]['size'])
                })

                # logic for first letter
                stimArray[i]['distances']['firstLetter'].append({
                    'word': stimArray[j]['text'],
                    'dist': int(stimArray[i]['firstLetter'] != stimArray[j]['firstLetter'])
                })

                # logic for word length
                stimArray[i]['distances']['wordLength'].append({
                    'word': stimArray[j]['text'],
                    'dist': abs(stimArray[i]['wordLength'] - stimArray[j]['wordLength'])
                });

                # logic for color distance
                stimArray[i]['distances']['color'].append({
                    'word': stimArray[j]['text'],
                    'dist': math.sqrt(math.pow(stimArray[i]['color']['r'] - stimArray[j]['color']['r'], 2) + math.pow(stimArray[i]['color']['g'] - stimArray[j]['color']['g'], 2) +
                        math.pow(stimArray[i]['color']['b'] - stimArray[j]['color']['b'], 2))
                });

                # logic for spatial distance
                stimArray[i]['distances']['location'].append({
                    'word': stimArray[j]['text'],
                    'dist': math.sqrt(pow(stimArray[i]['location']['top'] - stimArray[j]['location']['top'], 2) + pow(stimArray[i]['location']['left'] - stimArray[j]['location']['left'], 2))
                })
                
        return stimArray
    
    def _computeFeatureWeights(self,currentList, recalledWords, features):

        # initialize the weights object for just this list
        listWeights = {}
        for feature in self.features:
            listWeights[feature] = []

        # return default list if there is not enough data to compute the fingerprint
        if len(recalledWords) <= 2:
            print('Not enough recalls to compute fingerprint, returning default fingerprint.. (everything is .5)')
            for feature in features:
                listWeights[feature] = .5
            return listWeights
        
        # initialize pastWords list
        pastWords = []

        # finger print analysis
        for i in range(0,len(recalledWords)-1):

            # grab current word
            currentWord = recalledWords[i]

            # grab the next word
            nextWord = recalledWords[i + 1]
            
            # grab the words from the encoding list
            encodingWords = [stimulus['text'] for stimulus in currentList]
            
            # append current word to past words log
            # pastWords.append(currentWord)
            
            # if both recalled words are in the encoding list
            if (currentWord in encodingWords and nextWord in encodingWords) and (currentWord not in pastWords and nextWord not in pastWords): 
                # print(currentWord,nextWord,encodingWords,pastWords)
                

                for feature in features:

                    # get the distance vector for the current word
                    distVec = currentList[encodingWords.index(currentWord)]['distances'][feature]

                    # filter distVec removing the words that have already been analyzed from future calculations
                    filteredDistVec = []
                    for word in distVec:
                        if word['word'] in pastWords:
                            pass
                        else:
                            filteredDistVec.append(word)
                            

                    # sort distWords by distances
                    filteredDistVec = sorted(filteredDistVec, key=lambda item:item['dist'])
                    
                    # compute the category listWeights
                    nextWordIdx = [word['word'] for word in filteredDistVec].index(nextWord)

                    # not sure about this part
                    idxs = []
                    for idx,word in enumerate(filteredDistVec):
                        if filteredDistVec[nextWordIdx]['dist'] == word['dist']:
                            idxs.append(idx)

                    listWeights[feature].append(1 - (sum(idxs)/len(idxs) / len(filteredDistVec)))

                pastWords.append(currentWord)

        for feature in listWeights:
            listWeights[feature] = np.mean(listWeights[feature])

        return listWeights

In [9]:
# subjects who have completed the exp
subids = list(data_frame[data_frame['listNumber']==15]['uniqueid'].unique())

# issue with this subject - need to look into it further
# subids.remove('debugGPNALW:debugXSJ1FD')
# subids.remove('debug4PXFJG:debug3V9BT9')
subids.remove('debugAD2211:debugB3TKJQ') # this was Andy testing all the way through
subids.remove('debug7XDZDR:debugO8OCCV') # another test
subids.remove('debugTX7U35:debugZFTPLT') # another test - allison

# for each subject that completed the experiment
for idx,sub in enumerate(subids):
    
    #print('Running analysis for subject: ', sub)    
        
    # get the subjects data
    filteredStimData = filterData(data_frame,sub)
    
    # parse the subjects data
    stimDict = createStimDict(filteredStimData)
    
    # load in the recall data
    recalledWords = loadRecallData(sub)
    
    # initialize the fingerprint
    pyfingerprint = Pyfingerprint()
    fingerprints= []
    
    # compute a fingerprint for each list
    for i in range(0,16):
        fingerprints.append(pyfingerprint.computeWeights([stim for stim in stimDict if stim['listnum']==i],recalledWords[i]))
        fingerprints[i]['listNum']=i
    tmp = pd.DataFrame(fingerprints)
    
    # compute accuracy
    accVec = computeListAcc(stimDict,recalledWords)
    
    # organize the data
    tmp['accuracy']=accVec
    tmp['subId']=idx
    tmp['experiment']=filteredStimData['exp_version'].values[0]
    cols = ['experiment','subId','listNum','category','color','firstLetter','location','size','wordLength','temporal','accuracy']
    
    if idx==0:
        fingerprintsDF = tmp[cols]
    else:
        fingerprintsDF = fingerprintsDF.append(tmp[cols],ignore_index=True)

fingerprintsDF['experiment'] = fingerprintsDF['experiment'].replace('0.0','1.1')

  out=out, **kwargs)


Not enough recalls to compute fingerprint, returning default fingerprint.. (everything is .5)
Not enough recalls to compute fingerprint, returning default fingerprint.. (everything is .5)
Not enough recalls to compute fingerprint, returning default fingerprint.. (everything is .5)
Not enough recalls to compute fingerprint, returning default fingerprint.. (everything is .5)
Not enough recalls to compute fingerprint, returning default fingerprint.. (everything is .5)
Not enough recalls to compute fingerprint, returning default fingerprint.. (everything is .5)
Not enough recalls to compute fingerprint, returning default fingerprint.. (everything is .5)
Not enough recalls to compute fingerprint, returning default fingerprint.. (everything is .5)


<h1>Show Fingerprint</h1>

In [10]:
fingerprintsDF

Unnamed: 0,experiment,subId,listNum,category,color,firstLetter,location,size,wordLength,temporal,accuracy
0,1.1,0,0,0.613908,0.449856,0.466893,0.523690,0.563436,0.520399,0.384419,0.5000
1,1.1,0,1,0.530546,0.435829,0.482738,0.459776,0.535331,0.414692,0.617519,0.3125
2,1.1,0,2,0.614249,0.590496,0.489744,0.486057,0.634865,0.579871,0.683409,0.5625
3,1.1,0,3,0.744383,0.402122,0.478443,0.450027,0.669413,0.494155,0.538344,0.6875
4,1.1,0,4,0.745429,0.335629,0.463408,0.665979,0.607054,0.403548,0.513316,0.5000
5,1.1,0,5,0.619295,0.434533,0.485034,0.549787,0.419429,0.432732,0.499359,0.5000
6,1.1,0,6,0.721030,0.519652,0.458439,0.152940,0.676690,0.544460,0.867386,0.3750
7,1.1,0,7,0.483290,0.514537,0.479527,0.540226,0.491559,0.392564,0.455210,0.4375
8,1.1,0,8,0.656989,0.501429,0.490412,0.559808,0.654244,0.621819,0.541401,0.5625
9,1.1,0,9,0.757992,0.428349,0.584105,0.605766,0.560245,0.571774,0.491764,0.6250


In [11]:
# Single subject's average fingerprint

# all subjects who have completed the exp
subids = list(data_frame[data_frame['listNumber']==15]['uniqueid'].unique())

# issue with this subject - need to look into it further
# subids.remove('debugGPNALW:debugXSJ1FD')
# subids.remove('debug4PXFJG:debug3V9BT9')
subids.remove('debugAD2211:debugB3TKJQ') # this was Andy testing all the way through
subids.remove('debug7XDZDR:debugO8OCCV') # another test
subids.remove('debugTX7U35:debugZFTPLT') # another test - allison

# just take a subject at some index in the list
singlesub = subids[1]

# get the subjects data
filteredStimData = filterData(data_frame,singlesub)

# parse the subjects data
stimDict = createStimDict(filteredStimData)

# load in the recall data
recalledWords = loadRecallData(singlesub)

# initialize the fingerprint
pyfingerprint = Pyfingerprint()
fingerprintlist= []

# create fingerprint for subject, append it to fingerprintlist
pyfingerprint.computeWeights((stim for stim in stimDict), recalledWords)

fingerprintlist.append(pyfingerprint)

# compute accuracy
accVec = computeListAcc(stimDict,recalledWords)

# organize the data
tmp['accuracy']=accVec
tmp['subId']=idx
tmp['experiment']=filteredStimData['exp_version'].values[0]
cols = ['experiment','subId','listNum','category','color','firstLetter','location','size','wordLength','temporal','accuracy']

singlesubfingerprintsDF = tmp[cols]


singlesubfingerprintsDF['experiment'] = singlesubfingerprintsDF['experiment'].replace('0.0','1.1')

<h2>Fingerprint for Single Sub -PF</h2>

In [12]:
singlesubfingerprintsDF

Unnamed: 0,experiment,subId,listNum,category,color,firstLetter,location,size,wordLength,temporal,accuracy
0,1.1,114,0,0.491497,0.560442,0.559276,0.550024,0.452118,0.456853,0.828156,1.0
1,1.1,114,1,0.483748,0.662833,0.469532,0.661872,0.608865,0.517761,0.380684,0.875
2,1.1,114,2,0.61564,0.281046,0.614713,0.574537,0.63719,0.392201,0.658044,0.9375
3,1.1,114,3,0.791137,0.668873,0.484939,0.460256,0.66504,0.330723,0.584413,0.75
4,1.1,114,4,0.523579,0.369518,0.531692,0.453442,0.598689,0.476019,0.633413,0.5
5,1.1,114,5,0.40158,0.537273,0.547348,0.545065,0.361209,0.618717,0.70626,0.9375
6,1.1,114,6,0.404859,0.460371,0.598644,0.741918,0.673449,0.706851,0.456078,0.875
7,1.1,114,7,0.580075,0.606857,0.487103,0.392104,0.553467,0.378532,0.587677,1.0
8,1.1,114,8,0.699181,0.538834,0.528447,0.558202,0.703407,0.496765,0.542528,0.9375
9,1.1,114,9,0.543754,0.411191,0.464595,0.608696,0.559088,0.469566,0.676312,1.0


<h2> take the average </h2>

In [13]:
avg = singlesubfingerprintsDF.mean()
avg = pd.DataFrame(avg)
avg.transpose()

Unnamed: 0,subId,listNum,category,color,firstLetter,location,size,wordLength,temporal,accuracy
0,114.0,7.5,0.586465,0.511002,0.513262,0.484277,0.576388,0.452089,0.610132,0.875


In [14]:
# average fingerprint for one experiment

# all subjects who have completed the exp
subids = list(data_frame[data_frame['listNumber']==15]['uniqueid'].unique())

# issue with this subject - need to look into it further
# subids.remove('debugGPNALW:debugXSJ1FD')
# subids.remove('debug4PXFJG:debug3V9BT9')
subids.remove('debugAD2211:debugB3TKJQ') # this was Andy testing all the way through
subids.remove('debug7XDZDR:debugO8OCCV') # another test
subids.remove('debugTX7U35:debugZFTPLT') # another test - allison

# for each subject that completed the experiment
for idx,sub in enumerate(subids):
    
    if float(data_frame.loc[data_frame['uniqueid'] == sub, 'exp_version'].iloc[0])==1.1 or 0.0:
      
        
        # get the subjects data
        filteredStimData = filterData(data_frame,sub)

        # parse the subjects data
        stimDict = createStimDict(filteredStimData)

        # load in the recall data
        recalledWords = loadRecallData(sub)

        # initialize the fingerprint
        pyfingerprint = Pyfingerprint()
        fingerprints= []

        # compute a fingerprint for each list
        for i in range(0,16):
            fingerprints.append(pyfingerprint.computeWeights([stim for stim in stimDict if stim['listnum']==i],recalledWords[i]))
            fingerprints[i]['listNum']=i
        tmp = pd.DataFrame(fingerprints)

        # compute accuracy
        accVec = computeListAcc(stimDict,recalledWords)

        # organize the data
        tmp['accuracy']=accVec
        tmp['subId']=idx
        tmp['experiment']=filteredStimData['exp_version'].values[0]
        cols = ['experiment','subId','listNum','category','color','firstLetter','location','size','wordLength','temporal','accuracy']

        if idx==0:
            exp1fingerprintsDF = tmp[cols]
        else:
            exp1fingerprintsDF = exp1fingerprintsDF.append(tmp[cols],ignore_index=True)

exp1fingerprintsDF['experiment'] = exp1fingerprintsDF['experiment'].replace('0.0','1.1')

<h2>Fingerprint for one experiment -PF</h2>

In [15]:
exp1fingerprintsDF

Unnamed: 0,experiment,subId,listNum,category,color,firstLetter,location,size,wordLength,temporal,accuracy
0,1.1,0,0,0.613908,0.449856,0.466893,0.523690,0.563436,0.520399,0.384419,0.5000
1,1.1,0,1,0.530546,0.435829,0.482738,0.459776,0.535331,0.414692,0.617519,0.3125
2,1.1,0,2,0.614249,0.590496,0.489744,0.486057,0.634865,0.579871,0.683409,0.5625
3,1.1,0,3,0.744383,0.402122,0.478443,0.450027,0.669413,0.494155,0.538344,0.6875
4,1.1,0,4,0.745429,0.335629,0.463408,0.665979,0.607054,0.403548,0.513316,0.5000
5,1.1,0,5,0.619295,0.434533,0.485034,0.549787,0.419429,0.432732,0.499359,0.5000
6,1.1,0,6,0.721030,0.519652,0.458439,0.152940,0.676690,0.544460,0.867386,0.3750
7,1.1,0,7,0.483290,0.514537,0.479527,0.540226,0.491559,0.392564,0.455210,0.4375
8,1.1,0,8,0.656989,0.501429,0.490412,0.559808,0.654244,0.621819,0.541401,0.5625
9,1.1,0,9,0.757992,0.428349,0.584105,0.605766,0.560245,0.571774,0.491764,0.6250


<h2>take the average</h2>

In [23]:
avgexp1 = exp1fingerprintsDF.mean()
avgexp1 = pd.DataFrame(avg)
avgexp1.transpose()

# KZ 
# make sure this is averaging correctly
# (curious as to why subID is 114; that does not seem like an average as the subIDs appear to span 0-15)
# no need to output subID AND Listnum here as they make no sense averaged


Unnamed: 0,subId,listNum,category,color,firstLetter,location,size,wordLength,temporal,accuracy
0,114.0,7.5,0.586465,0.511002,0.513262,0.484277,0.576388,0.452089,0.610132,0.875


<h2>average fingerprint for each list in exp 1 -PF</h2>

In [24]:
# average fingerprint for each list within exp 1 (across subjects)

# create empty list for average values
avgs = []
for i in range (0,16):
    # store the averaged value in a temporary variable and append it to the list
    temp = exp1fingerprintsDF.loc[exp1fingerprintsDF['listNum']==i].mean(0)
    avgs.append(temp)

# create a new dataframe out of the averaged values
exp1listfingerprintsDF = pd.DataFrame(avgs)

exp1listfingerprintsDF

# KZ
# this looks great
# similiarly, can remove subID as it doesn't make sense as an average (optional)

Unnamed: 0,subId,listNum,category,color,firstLetter,location,size,wordLength,temporal,accuracy
0,7.5,0.0,0.595564,0.46749,0.489089,0.458735,0.582434,0.499952,0.569963,0.572266
1,7.5,1.0,0.594761,0.533797,0.512854,0.530738,0.528231,0.482654,0.658908,0.511719
2,7.5,2.0,0.644803,0.537601,0.493365,0.521587,0.59853,0.475277,0.536675,0.527344
3,7.5,3.0,0.659154,0.512319,0.499461,0.492886,0.622583,0.513314,0.543502,0.535156
4,7.5,4.0,0.706469,0.504555,0.503407,0.525877,0.610511,0.456016,0.576971,0.527344
5,7.5,5.0,0.702303,0.483377,0.489964,0.507556,0.632906,0.474284,0.536131,0.541016
6,7.5,6.0,0.712568,0.508406,0.499946,0.494165,0.663509,0.49379,0.604089,0.513672
7,7.5,7.0,0.652349,0.498059,0.498897,0.516936,0.570573,0.472783,0.574437,0.611328
8,7.5,8.0,0.665244,0.470366,0.481672,0.538124,0.571317,0.513382,0.482898,0.548828
9,7.5,9.0,0.655702,0.521961,0.529776,0.466462,0.622921,0.506045,0.547674,0.490234


<h2>average fingerprint trajectory, exp 1</h2>

In [35]:
avgexp1

# KZ
# as far as I know, this data structure cannot be passed into hypertools, nor is this the data you want

Unnamed: 0,0
subId,114.0
listNum,7.5
category,0.586465
color,0.511002
firstLetter,0.513262
location,0.484277
size,0.576388
wordLength,0.452089
temporal,0.610132
accuracy,0.875


In [33]:
# DESIRED: plot of av. fingerprint traj. for exp1 (rows = lists, cols = fingerprint dims. )

# TO COMPLETE:
# 1.) remove subID and listNum from exp1listfingerprintsDF
# 2.) convert exp1listfingerprintsDF into numpy array
# 3.) pass the numpy array into hypertools


import hypertools as hyp
hyp.plot(avgexp1, color='g')



# plot for average fingerprint trajectory should have S=1, or "1 sample" so it should be in two dimensions, right?

# KZ - we are treating each fingerprint feature (color, size, word length, etc) as a dimension, 
# thus we are working in a multi-dimensional space. I think it would reduce to 3d by default, but we could also look
# at 2d plots if you are interested. I suspect Jeremy is expectin 3d, but sometimes 2D looks better in certain cases


TypeError: unhashable type

<h2>all subjects' fingerprint trajectories (exp1)</h2>

In [34]:
# DESIRED: plot of all subs' fingerprint trajectories (exp1)

# TO COMPLETE:
# 1.) split exp1fingerprintsDF into smaller data frames, by subject (one per subject)
# 2.) remove experiment, subId, and listNum from each
# 3.) convert each to numpy array
# 4.) make a list containing all of the resulting numpy arrays (order does not matter)
# 5.) pass the list of arrays into hypertools
    # it should automatically present each subject's line in a different color
 



hyp.plot(exp1fingerprintsDF, color='b')
exp1fingerprintsDF



# tried to make each subject's color different using "group=fingerprints" and get TypeError: unhashable type: 'dict'
# tried to make each subject's color different using "group=subids" and the plot displays, but no data lines display

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

### 2.) Select Subject Data

In [32]:
##SELECT EXPERIMENT
subids = list(data_frame[data_frame['listNumber']==15]['uniqueid'].unique())

d = dict()
for sub in subids:
    key = data_frame[data_frame['uniqueid']==sub]['exp_version'].values[0]
    if key in d:
        d[key].append(sub)
    else:
        d[key]=[sub]

#reaplce these values with the experiment number
#three values for the case of experiment 1 only
#print (d["0.0"], d['1.0'], d['1.1'])

exp1=d["0.0"]+d['1.0']+d['1.1']


exp1.remove('debugGPNALW:debugXSJ1FD')
# subids.remove('debug4PXFJG:debug3V9BT9')
# subids.remove('debugAD2211:debugB3TKJQ')
# subids.remove('debug7XDZDR:debugO8OCCV') # all of the audio files are empty ?!
# subids.remove('debugTX7U35:debugZFTPLT')
##################



In [None]:
newData = []
for index, row in fingerprintsDF.iterrows():
    for feature in ['category','color','firstLetter','location','size','wordLength','temporal']:
        newData.append({
                'experiment': row['experiment'],
                'subId': row['subId'],
                'listNum': row['listNum'],
                'feature': feature,
                'accuracy': row['accuracy'],
                'value': row[feature]
            })
fingerprintsDF = pd.DataFrame(newData)

### import libraries and config for seaborn and statistical tests

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.stats import ttest_ind as ttest

sns.set_style("whitegrid")
sns.set_context("notebook", font_scale=2, rc={"lines.linewidth": 2.5})
%matplotlib inline

### avg accuracy for exp 1 and 2 together - all lists

In [None]:
plt.figure(figsize=(12, 10))
exp1 = fingerprintsDF['experiment']=='1.1'
exp2 = fingerprintsDF['experiment']=='2.1'
data = fingerprintsDF[(exp1 | exp2)]
data = data.groupby(['subId','experiment']).mean().reset_index(level=['experiment'])
ax = sns.violinplot(y='accuracy',x='experiment', data=data)
plt.ylabel('Proportion of words recalled')
plt.ylim(0,1)
plt.show()

exp1 = data[data['experiment']=='1.1']['accuracy']
exp2 = data[data['experiment']=='2.1']['accuracy']
ttest(exp1,exp2)

# plt.savefig('avgAcc_exp1&2.pdf',format='pdf')

### avg accuracy for exp 1 and 2 together - first half

In [None]:
plt.figure(figsize=(12, 10))
exp1 = fingerprintsDF['experiment']=='1.1'
exp2 = fingerprintsDF['experiment']=='2.1'
firsthalf = fingerprintsDF['listNum']<8
data = fingerprintsDF[(exp1 | exp2) & firsthalf].groupby(['subId','experiment']).mean().reset_index(level=['experiment'])
ax = sns.violinplot(y='accuracy',x='experiment', data=data)
plt.ylim(0,1)
plt.ylabel('Proportion of words recalled')
plt.show()

exp1 = data[data['experiment']=='1.1']['accuracy']
exp2 = data[data['experiment']=='2.1']['accuracy']
ttest(exp1,exp2)

# plt.savefig('avgAcc_exp1&2_firstHalf.pdf',format='pdf')

### avg accuracy for exp 1 and 2 together - second half

In [None]:
plt.figure(figsize=(12, 10))
sns.set_style("whitegrid")
exp1 = fingerprintsDF['experiment']=='1.1'
exp2 = fingerprintsDF['experiment']=='2.1'
secondhalf = fingerprintsDF['listNum']>7
data = fingerprintsDF[(exp1 | exp2) & secondhalf].groupby(['subId','experiment']).mean().reset_index(level=['experiment'])
ax = sns.violinplot(y='accuracy',x='experiment', data=data)
plt.ylabel('Proportion of words recalled')
plt.ylim(0,1)
plt.show()

exp1 = data[data['experiment']=='1.1']['accuracy']
exp2 = data[data['experiment']=='2.1']['accuracy']
ttest(exp1,exp2)

# plt.savefig('avgAcc_exp1&2_secondHalf.pdf',format='pdf')

### avg fingerprint for exp 1

In [None]:
plt.figure(figsize=(12, 10))
exp1 = fingerprintsDF['experiment']=='1.1'
data = fingerprintsDF[exp1].groupby(['subId','experiment','feature']).mean().reset_index(level=['experiment','feature'])
ax = sns.violinplot(y='value',x='feature', data=data,order=['category','color','firstLetter', 'location','size','wordLength','temporal'])
plt.ylabel('Clustering score')
plt.ylim(.3,1)
plt.show()

# plt.savefig('avgFingerprint_exp1.pdf',format='pdf')

### avg fingerprint for exp 2

In [None]:
plt.figure(figsize=(12, 10))
exp2 = fingerprintsDF['experiment']=='2.1'
data = fingerprintsDF[exp2].groupby(['subId','experiment','feature']).mean().reset_index(level=['experiment','feature'])
ax = sns.violinplot(y='value',x='feature', data=data,order=['category','color','firstLetter', 'location','size','wordLength','temporal'])
plt.ylabel('Clustering score')
plt.ylim(.3,1)

#plt.savefig('avgFingerprint_exp2.pdf',format='pdf')

### avg fingerprint for exp 1 and 2 together

In [None]:
plt.figure(figsize=(12, 10))
exp1 = fingerprintsDF['experiment']=='1.1'
exp2 = fingerprintsDF['experiment']=='2.1'
data = fingerprintsDF[exp1 | exp2].groupby(['subId','experiment','feature']).mean().reset_index(level=['experiment','feature'])
ax = sns.violinplot(y='value',x='feature',hue='experiment',data=data,split=True,scale="count", inner="quartile",order=['category','color','firstLetter', 'location','size','wordLength','temporal'])
plt.ylabel('Clustering score')
plt.ylim(.3,1)
plt.show()

exp1=data['experiment']=='1.1'
exp2=data['experiment']=='2.1'

for feature in ['category','color','firstLetter','location','size','temporal','wordLength']:
    featuren=data['feature']==feature
    data1=data[exp1&featuren]['value']
    data2=data[exp2&featuren]['value']
    print(feature + ' ttest (two-sided)',ttest(data1,data2))
    
# plt.savefig('avgFingerprint_exp1&2.pdf',format='pdf')

### avg fingerprint for exp 1 and 2 together - first half

In [None]:
plt.figure(figsize=(12, 10))
exp1 = fingerprintsDF['experiment']=='1.1'
exp2 = fingerprintsDF['experiment']=='2.1'
firsthalf = fingerprintsDF['listNum']<8
data = fingerprintsDF[(exp1 | exp2) & firsthalf].groupby(['subId','experiment','feature']).mean().reset_index(level=['experiment','feature'])
ax = sns.violinplot(y='value',x='feature',hue='experiment',data=data,split=True,scale="count", inner="quartile",order=['category','color','firstLetter', 'location','size','wordLength','temporal'])
plt.ylabel('Clustering score')
plt.ylim(.3,1)
plt.show()

exp1=data['experiment']=='1.1'
exp2=data['experiment']=='3.2'

for feature in ['category','color','firstLetter','location','size','temporal','wordLength']:
    featuren=data['feature']==feature
    data1=data[exp1&featuren]['value']
    data2=data[exp2&featuren]['value']
    print(feature + ' ttest (two-sided)',ttest(data1,data2))
    
# plt.savefig('avgFingerprint_exp1&2_firstHalf.pdf',format='pdf')

### avg fingerprint for exp 1 and 2 together - second half

In [None]:
plt.figure(figsize=(12, 10))
exp1 = fingerprintsDF['experiment']=='1.1'
exp2 = fingerprintsDF['experiment']=='2.1'
secondhalf = fingerprintsDF['listNum']>7
data = fingerprintsDF[(exp1 | exp2) & secondhalf].groupby(['subId','experiment','feature']).mean().reset_index(level=['experiment','feature'])
ax = sns.violinplot(y='value',x='feature',hue='experiment',data=data,split=True,scale="count", inner="quartile",order=['category','color','firstLetter', 'location','size','wordLength','temporal'])
plt.ylabel('Clustering score')
plt.ylim(.3,1)
plt.show()

exp1=data['experiment']=='1.1'
exp2=data['experiment']=='3.2'

for feature in ['category','color','firstLetter','location','size','temporal','wordLength']:
    featuren=data['feature']==feature
    data1=data[exp1&featuren]['value']
    data2=data[exp2&featuren]['value']
    print(feature + ' ttest (two-sided)',ttest(data1,data2))
    
# plt.savefig('avgFingerprint_exp1&2_secondHalf.pdf',format='pdf')

### avg fingerprint for exp 1 and 2 together - second half - split by accuracy

In [None]:
plt.figure(figsize=(12, 10))
exp1 = fingerprintsDF['experiment']=='1.1'
exp2 = fingerprintsDF['experiment']=='2.1'
secondhalf = fingerprintsDF['listNum']>7
data = fingerprintsDF[(exp1) & secondhalf].groupby(['subId','experiment','feature']).mean().reset_index(level=['experiment','feature'])
accMean = fingerprintsDF[(exp1) & secondhalf].groupby(['subId']).mean()['accuracy'].mean()

def f(row):
    if row['accuracy'] > accMean:
        val = 'high'
    else:
        val = 'low'
    return val

data['accSplit'] = data.apply(f, axis=1)

ax = sns.violinplot(y='value',x='feature',hue='accSplit',hue_order=['low','high'],data=data,split=True, scale="count", inner="quartile",order=['category','color','firstLetter', 'location','size','wordLength','temporal'])
plt.ylabel('Clustering score')
plt.ylim(.3,1)
plt.show()

# high=data['accSplit']=='high'
# low=data['accSplit']=='low'

# for feature in ['category','color','firstLetter','location','size','temporal','wordLength']:
#     featuren=data['feature']==feature
#     data1=data[high&featuren]['value']
#     data2=data[low&featuren]['value']
#     print(feature + ' ttest (two-sided)',ttest(data1,data2))

# plt.savefig('avgFingerprint_exp1_secondHalf_accSplit.pdf',format='pdf')

### avg fingerprint for exp 1 and 2 together - second half - split by accuracy

In [None]:
plt.figure(figsize=(12, 10))
exp1 = fingerprintsDF['experiment']=='1.1'
exp2 = fingerprintsDF['experiment']=='2.1'
secondhalf = fingerprintsDF['listNum']<8
data = fingerprintsDF[(exp2) & secondhalf].groupby(['subId','experiment','feature']).mean().reset_index(level=['experiment','feature'])
accMean = fingerprintsDF[(exp2) & secondhalf].groupby(['subId']).mean()['accuracy'].mean()

def f(row):
    if row['accuracy'] > accMean:
        val = 'high'
    else:
        val = 'low'
    return val

data['accSplit'] = data.apply(f, axis=1)

ax = sns.violinplot(y='value',x='feature',hue='accSplit',hue_order=['low','high'],data=data,split=True, scale="count", inner="quartile",order=['category','color','firstLetter', 'location','size','wordLength','temporal'])
plt.ylabel('Clustering score')
plt.ylim(.3,1)
plt.show()

# high=data['accSplit']=='high'
# low=data['accSplit']=='low'

# for feature in ['category','color','firstLetter','location','size','temporal','wordLength']:
#     featuren=data['feature']==feature
#     data1=data[high&featuren]['value']
#     data2=data[low&featuren]['value']
#     print(feature + ' ttest (two-sided)',ttest(data1,data2))

#plt.savefig('avgFingerprint_exp2_firstHalf_accSplit.pdf',format='pdf')


### avg fingerprint for exp 1 and 2 together - second half - split by accuracy

In [None]:
plt.figure(figsize=(12, 10))
exp1 = fingerprintsDF['experiment']=='1.1'
exp2 = fingerprintsDF['experiment']=='2.1'
def f(row):
    if row['listNum'] < 8 :
        val = 'first half'
    else:
        val = 'second half'
    return val

fingerprintsDF['time'] = fingerprintsDF.apply(f, axis=1)

data = fingerprintsDF[exp2].groupby(['subId','time','feature']).mean().reset_index(level=['feature','time'])
ax = sns.violinplot(y='value',x='feature',hue='time',hue_order=['first half','second half'],data=data,split=True, scale="count", inner="quartile",order=['category','color','firstLetter', 'location','size','wordLength','temporal'])
plt.ylabel('Clustering score')
plt.ylim(.3,1)
# plt.show()

# plt.savefig('avgFingerprint_exp2_splitByTime.pdf',format='pdf')
# plt.show()
# # high=data['accSplit']=='high'
# # low=data['accSplit']=='low'

# # for feature in ['category','color','firstLetter','location','size','temporal','wordLength']:
# #     featuren=data['feature']==feature
# #     data1=data[high&featuren]['value']
# #     data2=data[low&featuren]['value']
# #     print(feature + ' ttest (two-sided)',ttest(data1,data2))
# data

In [None]:
# exp1 = fingerprintsDF['experiment']=='1.1'
# exp2 = fingerprintsDF['experiment']=='2.1'
# secondHalf = fingerprintsDF['listNum']>7
# feature=fingerprintsDF['feature']=='temporal'
# data = fingerprintsDF[exp1 & feature].groupby(['subId','feature']).mean().reset_index(level=['feature'])
# data = data[['feature','accuracy','value']]

# g = sns.pairplot(data,hue='feature')
# plt.show()

# feature='temporal'
# a=data[data['feature']==feature]['accuracy']
# b=data[data['feature']==feature]['value']
# import scipy
# print(scipy.stats.pearsonr(a,b))

# corrs={}
# corrs['category']=0.43111982873068738
# corrs['color']=0.042551908675165041
# corrs['size']=0.56232253397336707
# corrs['location']=-0.15929961652715122
# corrs['wordLength']=0.057218271958420434
# corrs['firstLetter']=0.50747948723671299
# corrs['temporal']=-0.52346593256870289

# x=['category','color','size','location','wordLength','firstLetter','temporal']
# y=['clustering score / recall accuracy correlation']
# data = pd.Series(corrs)
# sns.barplot(data=data,y=[key for key in corrs])