In [1]:
# Import
import pandas as pd
import pickle
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
import bz2
import difflib
import logging
import time
import zlib
import itertools
import collections
import seaborn as sns
import glob
import os
import random
import re
import statsmodels.api as sm
import seaborn as sn
from itertools import chain, combinations, tee, islice, permutations
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from statsmodels.formula.api import ols
from pathlib import Path
from collections import Counter 
from IPython.display import display
from os import path

-------------------------------
## Content
### A. For general use:
* [A1. Similarity measures](#A1.-Similarity-measures) 
    * [A1.1.: Set-theoretic measures (Intersection, Overlap, Jaccard, SMC, LCS, ngram_abs)](#A1.1.:-Set-theoretic-measures-(Intersection,-Overlap,-Jaccard,-SMC,-LCS,-ngram_abs)) 
    * [A1.2.: ITR measures](#A1.2.:-ITR-measures)
        * [A1.2.1. Here we first calculate the observed, expected and maximum items of a specific unit size](#A1.2.1.-Here-we-first-calculate-the-observed,-expected-and-maximum-items-of-a-specific-unit-size)
        * [A1.2.2. Then, we summarize them in final metrics (e.g., OmE (Pair Frequency/SOMA), OdE, OmEdM, OdM, OmEdMmE)](#A1.2.2.-Then,-we-summarize-them-in-final-metrics-(e.g.,-OmE-(Pair-Frequency/SOMA),-OdE,-OmEdM,-OdM,-OmEdMmE))
    * [A1.3: Edit distance measures](#A1.3:-Edit-distance-measures)
    * [A1.4: Graveyard for old similarity measure functions](#1.4:-Gaveyard-or-old-similarity-measure-functions)

### B. Project specific: Data wrangling for experiments
* [B1. Experiment & data overview](#B1.-Experiment-&-data-overview)
* [B2. Pickle file conversion (old --> new)](#B2.-Pickle-file-conversion-(old--->-new))
* [B3. Data frame creation (incl. spell check); to be used for subsequent analysis](#B3.-Data-frame-creation-(incl.-spell-check);-to-be-used-for-subsequent-analysis)
* [B4. Similarity for dyads by sub-group](#B4.-Similarity-for-dyads-by-sub-group)
* [B5. Group means (Results presented in Manuscript)](#B5.-Group-means-(Results-presented-in-Manuscript))

### A1. Similarity measures
#### A1.1.: Set-theoretic measures (Intersection, Overlap, Jaccard, SMC, LCS, ngram_abs)

In [3]:
def intersection(str1, str2):
    intersection = len(np.intersect1d(str1, str2))
    return intersection

In [4]:
def overlap(str1, str2):
    intersection = len(np.intersect1d(str1, str2))

    # Transform to correct input format, if it isn't already a list or a numpy array
    try:
        str1 = str1.values.tolist()
        str2 = str2.values.tolist()
    except:
        pass
    
    if len(str1) >= len(str2):
        larger = len(str1) 
        smaller = len(str2)
    else:
        smaller = len(str1)
        larger = len(str2) 
    overlap = intersection / smaller
    return overlap

In [5]:
def Jaccard(str1, str2): 
    # Transform to correct input format, if it isn't already a list or a numpy array
    try:
        str1 = str1.values.tolist()
        str2 = str2.values.tolist()
    except:
        pass
    
    union = len(np.union1d(str1, str2))
    intersection = len(np.intersect1d(str1, str2))

    Jaccard = intersection / union
    #print(Jaccard)
    return Jaccard

In [6]:
def SMC(str1, str2): # This counts the number of muturally forgotten items as "similar". TBD in case of Experiments 1B and 3
    encoding = list(range(1,85)) + [157,158,159,160] # Manual entry to get all words from the original study list             
    
    try:
        str1 = str1.values.tolist()
        str2 = str2.values.tolist()
    except:
        pass
    
    intersection = len(np.intersect1d(str1, str2))
    
    Forgotten = 0
    for word in encoding:
        if word in str1 or word in str2:
            continue
        else:
            Forgotten += 1
    
    return (((Forgotten+intersection) / len(encoding)))

In [7]:
def ngram_abs(str1, str2, unitSize, y, unidirectional=True):
    
    """
    USAGE:
    #of bigrams/trigrams/ngrams = ngram_abs(str1, str2, unitSize, y, unidirectional=True)

    ARGUMENTS:
    str1            The first string
    str2            The second string
    unitSize        Size of unit of interest (word pair, triple, quartuple etc.)
    unidirectional  Order/Direction of unit (only 1-2 valid, or also 2-1)

    DESCRIPTION:
    Can be used to calculate the joint number of word pairs/bigrams or triplets/trigrams across two participants
    """
     
    #str1 = str1.values.tolist()
    #str2 = str2.values.tolist()
    str1 = None
    str2 = None
    
    if unidirectional == True:
        #return [np.array(x) for x in zip(string[0:-1], string[1:])]
        iters = tee(str1, unitSize)                                                     
        for i, it in enumerate(iters):                                               
            next(islice(it, i, i), None)
            
        iters2 = tee(str2, unitSize)                                                     
        for i, it in enumerate(iters2):                                               
            next(islice(it, i, i), None)
           
        str1 = list(zip(*iters))
        str2 = list(zip(*iters2))
    else:
        s = list(str1)
        powerset = chain.from_iterable(combinations(s, r) for r in range(len(s)+1))
        s2 = list(str2)
        powerset2 = chain.from_iterable(combinations(s2, r) for r in range(len(s2)+1))
        str1 = [(x) for x in powerset if len(x)==unitSize]
        str2 = [(x) for x in powerset2 if len(x)==unitSize]
    #print(str1)
    #print(str2)

    # This is for assining unique numbers to the different n-grams. At the moment it is executed elsewhere,\ 
    # but one could also execute it in here
    #y = permu(unitSize)
    #all_p = pd.DataFrame()
    #all_p['AllPermutations'] = y
    
    a_list = []
    for i in (str1):
        if i in y:
            x = y.index(i)
        else:
            continue
        a_list.append(x)

    b_list = []
    for i in (str2):
        if i in y:
            x = y.index(i)
        else:
            continue
        b_list.append(x)
    
    count = 0
    for i in str1:
        if i in str1 and i in str2:
            count += 1
        else:
            count += 0
    #print(count)
    #a = np.array(a_list)
    #b = np.array(b_list)
    
    return count

In [9]:
# Adapted from source: https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_substring#Python
def lcs(str1, str2):
    m = [[0] * (1 + len(str2)) for i in range(1 + len(str1))]
    longest, x_longest = 0, 0
    for x in range(1, 1 + len(str1)):
        for y in range(1, 1 + len(str2)):
            if str1[x - 1] == str2[y - 1]:
                m[x][y] = m[x - 1][y - 1] + 1
                if m[x][y] > longest:
                    longest = m[x][y]
                    x_longest = x
            else:
                m[x][y] = 0
    string = (str1[x_longest - longest: x_longest])
    lcs = len(string)
    #print(string)
    return lcs

## A1.2.: ITR measures

### A1.2.1. Here we first calculate the observed, expected and maximum items of a specific unit size

In [10]:
def observed(str1, str2, unitSize, unidirectional=True):
    """
    USAGE:
    score = observed(str1, str2, unitSize, unidirectional=True)

    ARGUMENTS:
    str1            The first string
    str2            The second string
    unitSize        Size of unit of interest (word pair, triple, quartuple etc.)
    unidirectional  Direction of unit (only word1-word2 valid, or also word2-word1)

    DESCRIPTION:
    This calculates the observed words of a specific unit size to be used in subsequent shared organization measures
    """
    
    # Transform to correct input format, if it isn't already a list or a numpy array
    try:
        str1 = str1.values.tolist()
        str2 = str2.values.tolist()
    except:
        pass
    
    if len(str1) == 0:
        return []
    elif len(str2) ==0:
        return []
            
    ob_freq = 0
    for i in range(len(str1)-1):
        p1 = str1[i]
        p2 = str1[i+1]
        if unitSize ==3:
            p3 = str1[i+2]
        elif unitSize ==4:
            p3 = str1[i+2]
            p4 = str1[i+3]

        if unitSize == 2:
            if p1 in str2 and p2 in str2:
                i1 = np.nonzero(np.array(str2) == p1)
                i2 = np.nonzero(np.array(str2) == p2)
                # Directionality. Difference absolute or not absolute
                if unidirectional == True:
                    if (i2[0] - i1[0]) == 1:
                        ob_freq += 1
                else:
                    if abs(i2[0] - i1[0]) == 1:
                        ob_freq += 1                
            
        elif unitSize == 3:
            if p1 in str2 and p2 in str2 and p3 in str2:
                i1 = np.nonzero(np.array(str2) == p1)
                i2 = np.nonzero(np.array(str2) == p2)
                i3 = np.nonzero(np.array(str2) == p3)
                # Directionality. Difference absolute or not absolute
                if unidirectional == True:
                    if (i2[0] - i1[0]) == 1 and (i3[0] - i1[0]) == 2 and (i3[0] - i2[0]) == 1:
                        ob_freq += 1
                else:
                    if abs(i2[0] - i1[0]) <= 2 and abs(i3[0] - i2[0]) <= 2 and abs(i3[0] - i1[0]) <= 2: # I think I might not need the last and
                        ob_freq += 1  
            
        elif unitSize == 4:
            if p1 in str2 and p2 in str2 and p3 in str2:
                i1 = np.nonzero(np.array(str2) == p1)
                i2 = np.nonzero(np.array(str2) == p2)
                i3 = np.nonzero(np.array(str2) == p3)
                i4 = np.nonzero(np.array(str2) == p4)
                # Directionality. Difference absolute or not absolute
                if unidirectional == True:
                    if (i4[0] - i1[0]) == 3 and (i3[0] - i1[0]) == 2 and (i2[0] - i1[0]) == 1 and \
                    (i4[0] - i2[0]) == 2 and (i4[0] - i3[0] == 1) and (i3[0]-i2[0] == 1):
                        ob_freq += 1
                else:
                    if \
                    abs(i2[0] - i1[0]) <= 3 and abs(i3[0] - i1[0]) <= 3 and abs(i4[0] - i1[0]) <= 3 and \
                    abs(i3[0] - i2[0]) <= 3 and abs(i4[0] - i2[0]) <= 3 and \
                    abs(i4[0] - i3[0]) <= 3: 
                        ob_freq += 1     
           
    return ob_freq

In [11]:
def expected_BB(str1, str2, unitSize, unidirectional=True):
    """
    USAGE:
    score = expected_BB(str1, str2, unitSize, unidirectional=True)

    ARGUMENTS:
    str1            The first string
    str2            The second string
    unitSize        Size of unit of interest (word pair, triple, quartuple etc.) --> Is always 2 in this case
    unidirectional  Direction of unit (only 1-2 valid, or also 2-1)

    DESCRIPTION:
    This calculates the expected organization of word pairs to be used in subsequent shared organization measures
    """
    
    # Transform to correct input format, if it isn't already a list or a numpy array
    try:
        str1 = str1.values.tolist()
        str2 = str2.values.tolist()
    except:
        pass
    
    # This is only for pairs   
    if unitSize != 2:
        return "Use different expected calculation for higher order unit sizes"
    
    # Calc expected
    num_common_items = len(np.intersect1d(str1, str2))
    if unidirectional == True:
        x = 1
    else:
        x = 2
        
    exp_freq = (x*num_common_items*(num_common_items-1)) / float(len(str1)*len(str2))

    if num_common_items == 0:
        return 0
    else:
        return exp_freq

In [12]:
def expected_generalized(str1, str2, unitSize, unidirectional=True):
    """
    USAGE:
    score = expected_generalized(str1, str2, unitSize, unidirectional=True)

    ARGUMENTS:
    str1            The first string
    str2            The second string
    unitSize        Size of unit of interest (word pair, triple, quartuple etc.)
    unidirectional  Direction of unit (only 1-2 valid, or also 2-1)

    DESCRIPTION:
    This calculates the expected organization of word pairs: ((N-X-1)! * A * (M-X + 1-R)) / N!
    Not used.
    """
    
    # Transform to correct input format, if it isn't already a list or a numpy array
    try:
        str1 = str1.values.tolist()
        str2 = str2.values.tolist()
    except:
        pass
    
    
    # Calculate expected value according to Boulsfield & Boulsfield
    num_common_items = len(np.intersect1d(str1, str2))
    if unidirectional == True:
        x = 1
    else:
        x = 2
    exp_freq = (x*num_common_items*(num_common_items-1)) / float(len(str1)*len(str2))
    #print('x', x)
    
    return exp_freq

In [13]:
def maximum(str1, str2, unitSize, unidirectional=True):
    """
    USAGE:
    score = maximum(str1, str2, unitSize, unidirectional=True)

    ARGUMENTS:
    str1            The first string
    str2            The second string
    unitSize        Size of unit of interest (word pair, triple, quartuple etc.)
    unidirectional  Direction of unit (only 1-2 valid, or also 2-1)

    DESCRIPTION:
    This calculates the maximum possible organization of word pairs to be used in subsequent shared organization measures: (c - x + 1)
    """
    
    # Transform to correct input format, if it isn't already a list or a numpy array
    try:
        str1 = str1.values.tolist()
        str2 = str2.values.tolist()
    except:
        pass
    
    num_common_items = len(np.intersect1d(str1, str2))

    if num_common_items == 0:
        return 0
    elif num_common_items == 1:
        return 0
    else:
        return (num_common_items - unitSize + 1)

#### A1.2.2. Then, we summarize them in final metrics (e.g., OmE (Pair Frequency/SOMA), OdE, OmEdM, OdM, OmEdMmE)

In [14]:
def OmE(str1, str2, unitSize, unidirectional=True):   
    """
    DESCRIPTION:
    Unidirectional & pair = ITR (Bousfield & Bousfield (1966))
    Bidirectional & pari = Pair(ed) Frequency (Anderson & Watts (1969); Rosner (1970))
    """
    
    if unidirectional==True:
        return observed(str1, str2, unitSize, unidirectional=True) - expected_BB(str1, str2, unitSize, unidirectional=True)
    else:
        return observed(str1, str2, unitSize, unidirectional=False) - expected_BB(str1, str2, unitSize, unidirectional=False)

In [15]:
def OdE(str1, str2, unitSize, unidirectional=True):
    """
    DESCRIPTION:
    Referred to as Sequential Consistency by Sternberg & Tulving (1977), developed by Gorfein, Blair, & Rowland (1968)
    """
        
    if expected_BB(str1, str2, unitSize, unidirectional=True)==0:
        return 0
    else:
        if unidirectional==True:
            return observed(str1, str2, unitSize, unidirectional=True) / expected_BB(str1, str2, unitSize, unidirectional=True)
        else:
            return observed(str1, str2, unitSize, unidirectional=False) / expected_BB(str1, str2, unitSize, unidirectional=False)


In [16]:
def OmEdM(str1, str2, unitSize, unidirectional=True):
    """
    DESCRIPTION:
    Referred to as Sequential Consistency by Sternberg & Tulving (1977)
    Unidirectional pairs, Fagan (1968)
    Bidirectional pairs, Postman (1970)
    """
    
    if maximum(str1, str2, unitSize, unidirectional=True)==0:
        return 0
    else:
        if unidirectional==True:
            return ((observed(str1, str2, unitSize, unidirectional=True) - expected_BB(str1, str2, unitSize, unidirectional=True)) / maximum(str1, str2, unitSize, unidirectional=True))
        else:
            return ((observed(str1, str2, unitSize, unidirectional=False) - expected_BB(str1, str2, unitSize, unidirectional=False)) / maximum(str1, str2, unitSize, unidirectional=False))

In [17]:
def OdM(str1, str2, unitSize, unidirectional=True):
    """
    DESCRIPTION:
    For bidirectional pairs referred to as ITR(2) (Mandler & Dean (1969))
    """
    
    if maximum(str1, str2, unitSize, unidirectional=True)==0:
        return 0
    else:
        if unidirectional==True:
            return (observed(str1, str2, unitSize, unidirectional=True) / maximum(str1, str2, unitSize, unidirectional=True))
        else:
            return (observed(str1, str2, unitSize, unidirectional=False) / maximum(str1, str2, unitSize, unidirectional=False))

In [18]:
def OmEdMmE(str1, str2, unitSize, unidirectional=True):
    """
    DESCRIPTION:
    Generally referred to as ARC' (Pellegrino (1971); Pellegrino & Battig (1974))
    """
    
    if maximum(str1, str2, unitSize, unidirectional=True)==0 and expected_BB(str1, str2, unitSize, unidirectional=True)==0:
        return 0
    else:
        if unidirectional==True:
            return ((observed(str1, str2, unitSize, unidirectional=True) - expected_BB(str1, str2, unitSize, unidirectional=True)) / (maximum(str1, str2, unitSize, unidirectional=True) - expected_BB(str1, str2, unitSize, unidirectional=True)))
        else:
            return ((observed(str1, str2, unitSize, unidirectional=False) - expected_BB(str1, str2, unitSize, unidirectional=False)) / (maximum(str1, str2, unitSize, unidirectional=False) - expected_BB(str1, str2, unitSize, unidirectional=False)))

#### A1.3: Edit distance measures

In [19]:
def edit_dists(str1, str2, insert=True, delete=True, substitute=True, transpose=True):
    """
    USAGE:
    score = edit_dists(str1, str2, insert=True, delete=True, substitute=True, transpose=True)

    ARGUMENTS:
    str1            The first string
    str2            The second string
    insert          Is insertion as an operation allowed?
    delete          Is deletion as an operation allowed?
    substitute      Is substition as an operation allowed?
    transpose       Is transpotion as an operation allowed?

    DESCRIPTION:
    This calculates different types of edit distances dependent on which operations are allowed
    """
    

    if len(str1) > len(str2):
        str1, str2 = str2, str1

    n1 = len(str1)
    n2 = len(str2)
    d = np.zeros((n1 + 1, n2 + 1), dtype=int)

    for i in range(n1 + 1):
        d[i, 0] = i

    for j in range(n2 + 1):
        d[0, j] = j

    for i in range(n1):
        for j in range(n2):
            options = []
            # insertion
            if insert:
                options += [d[i, j+1] + 1]

            # deletion
            if delete:
                options += [d[i+1, j] + 1]

            # substitution
            if substitute and not(str1[i] == str2[j]):
                options += [d[i, j] + 1]

            # identical entries are free
            elif str1[i] == str2[j]:
                options += [d[i, j] + 0]


            d[i+1, j+1] = min(options)
            #d[i+1, j+1] = min(d[i, j+1] + 1, # insert
            #                  d[i+1, j] + 1, # delete
            #                  d[i, j] + cost) # replace


            if transpose:
                if i > 0 and j > 0 and str1[i] == str2[j-1] and str1[i-1] == str2[j]:
                    d[i+1, j+1] = min(d[i+1, j+1], d[i-1, j-1] + int(not(str1[i] == str2[j]))) # transpose


    # if substitution, max # of edits is max(n1, n2)
    if substitute:
        return 1 - ( d[n1, n2] / max(n1, n2) )

    # otherwise, it's n1 + n2 (delete each of s1, then insert each of s2)
    else:
        return 1 - ( d[n1, n2] / (n1 + n2) )
    #return d

#### A1.4: Graveyard for old similarity measure functions

In [20]:
def pairedFreq(str2, str1):
    
    # Transform to correct input format, if it isn't already a list or a numpy array
    try:
        str1 = str1.values.tolist()
        str2 = str2.values.tolist()
    except:
        pass
    
    if len(str2) == 0:
        return []
    ob_freq = 0
    for i in range(len(str2)-1):
        p1 = str2[i]
        p2 = str2[i+1]
        if p1 in str1 and p2 in str1:
            i1 = np.nonzero(np.array(str1) == p1)
            i2 = np.nonzero(np.array(str1) == p2)
            if abs(i1[0] - i2[0]) == 1:
                ob_freq += 1
                #print(ob_freq)
    #print(str2)
    #print(str1)
    #print("ob_freq=", ob_freq)
    
    if str2 == []:
        return 0

    
    # This is the code from Christian
    # num_common_items = len(final)
    # 
    # num_common_items = np.intersect1d(final, orig)
    # print("num_common_items=", num_common_items)
    # 
    # if type(num_common_items) == np.dtype(int):
    #    num_common_items = 1
    # else:
    #    print(num_common_items)
    #    num_common_items = len(num_common_items)

    num_common_items = len(np.intersect1d(str1, str2)) #Alex New

    exp_freq = (2*num_common_items*(num_common_items-1)) / float(len(str2)*len(str1))
    #print("exp_freq=", exp_freq)
    PF = ob_freq - exp_freq
    #print("PF=", PF)
    return PF

In [21]:
# Normalized observed bidirectional Inter-Trial-Repetition (O(ITR2) - Max(ITR2)) (by Alex, build on adapted PF from Christian)
# "The maximum ITR value is a function of the number of items common to both sets of events and does not depend on the absolute 
# number of words recalled or presented. It is equal to the number of items common to both events minus one." (Mandler & Dean, 1969)
def ITR2(str1, str2, shortest = True):
    
    # Transform to correct input format, if it isn't already a list or a numpy array
    try:
        str1 = str1.values.tolist()
        str2 = str2.values.tolist()
    except:
        pass
    
    if len(str2) == 0:
        return []
    ob_freq = 0
    for i in range(len(str2)-1):
        p1 = str2[i]
        p2 = str2[i+1]
        if p1 in str1 and p2 in str1:
            i1 = np.nonzero(np.array(str1) == p1)
            i2 = np.nonzero(np.array(str1) == p2)
            if abs(i1[0] - i2[0]) == 1:
                ob_freq += 1
    #print("ob_freq=", ob_freq)
    
    if str2 == []:
        return 0

    num_common_items = len(np.intersect1d(str1, str2))
    if num_common_items == 0: #Check with Christian
        ITR2 = 0
    elif num_common_items == 1:
        ITR2 = 0
    else:
        ITR2 = (ob_freq / (num_common_items - 1)) #M(ITR) = M(ITR2) = c-1
    
    return ITR2

In [22]:
# Normalized observed bidirectional Inter-Trial-Repetition (O(ITR2) - Max(ITR2)) (by Alex, build on adapted PF from Christian)
# "The maximum ITR value is a function of the number of items common to both sets of events and does not depend on the absolute 
# number of words recalled or presented. It is equal to the number of items common to both events minus one." (Mandler & Dean, 1969)
# According to Pellegrino (1971) M(ITRx) can either be calculated M(ITR) can be calculated either using   
# MAX(ITRa/x) = M - X + 1 -R    or   MAX(ITRa/ x) =M- X + 1
# where M=number of items recalled on Trial t, x=ngram-size and R=number of units of Size X from Trial t that have one or more items not recalled on t + 1
def ITRx(str1, str2, x):
    
    # Transform to correct input format, if it isn't already a list or a numpy array
    try:
        str1 = str1.values.tolist()
        str2 = str2.values.tolist()
    except:
        pass
    
    if len(str2) == 0:
        return []
    ob_freq = 0
    for i in range(len(str2)-1):
        p1 = str2[i]
        p2 = str2[i+1]
        if p1 in str1 and p2 in str1:
            i1 = np.nonzero(np.array(str1) == p1)
            i2 = np.nonzero(np.array(str1) == p2)
            if abs(i1[0] - i2[0]) == 1:
                ob_freq += 1
    #print("ob_freq=", ob_freq)
    
    if str2 == []:
        return 0

    num_common_items = len(np.intersect1d(str1, str2))
    if num_common_items == 0:
        ITRx = 0
    elif num_common_items == 1:
        ITRx = 0
    else:
        ITRx = (ob_freq / (num_common_items - x + 1))
    
    return ITRx

In [23]:
# Different normalization for observed bidirectional Inter-Trial-Repetition 
# (O(ITR2) - E(ITR2)) / (M(ITR2) - E(ITR2)) (by Alex, build on adapted PF from Christian)
def ARC2(str1, str2):
    
    # Transform to correct input format, if it isn't already a list or a numpy array
    try:
        str1 = str1.values.tolist()
        str2 = str2.values.tolist()
    except:
        pass
    
    if len(str2) == 0:
        return []
    ob_freq = 0
    for i in range(len(str2)-1):
        p1 = str2[i]
        p2 = str2[i+1]
        if p1 in str1 and p2 in str1:
            i1 = np.nonzero(np.array(str1) == p1)
            i2 = np.nonzero(np.array(str1) == p2)
            if abs(i1[0] - i2[0]) == 1:
                ob_freq += 1
    #print("ob_freq=", ob_freq)
    
    if str2 == []:
        return 0

    num_common_items = len(np.intersect1d(str1, str2))
    #print("num_common_items=", num_common_items) 

    exp_freq = (2*num_common_items*(num_common_items-1)) / float(len(str2)*len(str1))
    #print("exp_freq=", exp_freq) 

    max_freq = (num_common_items - 1)
    #print("max_freq=", max_freq) 
    
    if (max_freq - exp_freq)== 0 : # Double check with Christian (happens when c=1)
        ARC2 = 0
        #print('watch out')
    else:
        ARC2 = (ob_freq - exp_freq) / (max_freq - exp_freq)
    #print(ob_freq, exp_freq, max_freq)
    return ARC2

In [24]:
def editdist(str1, str2, min_threshold = None):
  #"""Return approximate string comparator measure (between 0.0 and 1.0)
   #  using the edit (or Levenshtein) distance.

#  USAGE:
#    score = editdist(str1, str2, min_threshold)

#  ARGUMENTS:
#    str1           The first string
#    str2           The second string
#    min_threshold  Minimum threshold between 0 and 1
#
#  DESCRIPTION:
#    The edit distance is the minimal number of insertions, deletions and
#    substitutions needed to make two strings equal.#
#
#    For more information on the modified Soundex see:
#    - http://www.nist.gov/dads/HTML/editdistance.html
#  """

#  # Quick check if the strings are empty or the same - - - - - - - - - - - - -
#  #
#  #if (str1 == '') or (str2 == ''):
    if (str1.size == 0) or (str2.size == 0):
        return 0.0
    #if len(str1) == len(str2) and len(str1) > 1 and all(str1 == str2):
    #    return 1.0
    #elif len(str1) == len(str2) and len(str1) == 1 and (str1 == str2):
    #    return 1.0

    n = len(str1)
    m = len(str2)
    max_len = max(n,m)

    if (min_threshold != None):
        if (isinstance(min_threshold, float)) and (min_threshold > 0.0) and (min_threshold > 0.0):

            len_diff = abs(n-m)
            w = 1.0 - float(len_diff) / float(max_len)

            if (w  < min_threshold):
                return 0.0  # Similariy is smaller than minimum threshold

        else: # Calculate the maximum distance possible with this threshold
            max_dist = (1.0-min_threshold)*max_len

    else:
        logging.exception('Illegal value for minimum threshold (not between' + \
                        ' 0 and 1): %f' % (min_threshold))
        raise Exception

    if (n > m):  # Make sure n <= m, to use O(min(n,m)) space
        str1, str2 = str2, str1
        n, m =       m, n

    current = range(n+1)

    for i in range(1, m+1):

        previous = current
        current =  [i]+n*[0]
        str2char = str2[i-1]

        for j in range(1,n+1):
            substitute = previous[j-1]
            if (str1[j-1] != str2char):
                substitute += 1

      # Get minimum of insert, delete and substitute
      #
            current[j] = min(previous[j]+1, current[j-1]+1, substitute)

        if (min_threshold != None) and (min(current) > max_dist):
            return 1.0 - float(max_dist+1) / float(max_len)

    w = 1.0 - float(current[n]) / float(max_len)

    assert (w >= 0.0) and (w <= 1.0), 'Similarity weight outside 0-1: %f' % (w)

  # A log message - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
    logging.debug('Edit-distance comparator string "%s" with "%s" value: %.3f' \
                % (str1, str2, w))
    return w

In [25]:
def mod_editdist(str1, str2, min_threshold = None):
#Return approximate string comparator measure (between 0.0 and 1.0)
#     using a modified edit (or Levenshtein) distance that counts transpositions
#     as elementary operations as well. This is also called the Damerau-
#     Levenshtein distance.

#  USAGE:
#    score = mod_editdist(str1, str2, min_threshold)

#  ARGUMENTS:
#    str1           The first string
#    str2           The second string
#    min_threshold  Minimum threshold between 0 and 1

#  DESCRIPTION:
#    The edit distance is the minimal number of insertions, deletions,
#    substitutions and transpositions needed to make two strings equal.

#    Compared to the original editdist function, which handles a transposition
#    (like: 'sydney' <-> 'sydeny' as 2 operations (two substitutions or one
#    insert and one delet), this modified version handles this as 1 operation.

#    Based on code from Justin Zobel's 'vrank'.
    
# Quick check if the strings are empty or the same - - - - - - - - - - - - -
#
#if (str1 == '') or (str2 == ''):
#print([str1, str2])
    #assert(0)
    if (str1.size == 0) or (str2.size == 0):
        return 0.0
    #elif (str1 == str2):
    elif np.array_equal(str1, str2):
        return 1.0

    n = len(str1)
    m = len(str2)
    max_len = max(n,m)
    #print('n', n)
    #print('m', m)
    #print('max_len',max_len)

    if (min_threshold != None):
        if (isinstance(min_threshold, float)) and (min_threshold > 0.0) and (min_threshold > 0.0): #I don't get this one
        
            len_diff = abs(n-m)
            w = 1.0 - float(len_diff) / float(max_len)

            if (w  < min_threshold):
                return 0.0  # Similariy is smaller than minimum threshold

        else: # Calculate the maximum distance possible with this threshold
            max_dist = (1.0-min_threshold)*max_len

    else:
        logging.exception('Illegal value for minimum threshold (not between' + ' 0 and 1): %f' % (min_threshold))
        raise Exception

    if (n > m):  # Make sure n <= m, to use O(min(n,m)) space
        str1, str2 = str2, str1
        n, m =       m, n

    d = []  # Table with the full distance matrix

    current = range(n+1)
    d.append(current)

    for i in range(1,m+1):

        previous = current
        current =  [i]+n*[0]
        str2char = str2[i-1]

        for j in range(1,n+1):
            substitute = previous[j-1]
            if (str1[j-1] != str2char):
                substitute += 1

            if (i == 1) or (j == 1):  # First characters, no transposition possible

            # Get minimum of insert, delete and substitute
            #
                current[j] = min(previous[j]+1, current[j-1]+1, substitute)

            else:
                if (str1[j-2] == str2[i-1]) and (str1[j-1] == str2[i-2]):
                    transpose = d[i-2][j-2] + 1
                else:
                    transpose = d[i-2][j-2] + 3

                current[j] = min(previous[j]+1, current[j-1]+1, substitute, transpose)

        d.append(current)

        if (min_threshold != None) and (min(current) > max_dist):
            return 1.0 - float(max_dist+1) / float(max_len)

    w = 1.0 - float(current[n]) / float(max_len)
    
    assert (w >= 0.0) and (w <= 1.0), 'Similarity weight outside 0-1: %f' % (w)

  # A log message - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
    logging.debug('Modified edit-distance comparator string "%s" with "%s" ' % \
                (str1, str2) + 'value: %.3f' % (w))

    return w

-------------------------------
### B. Project specific: Data wrangling for experiments
#### B1. Experiment & Data overview
* 4 phases: encoding --> individual recall --> individual or collaborative recall --> individual recall
* 2 "real" variables: individual/collaborative recall; biased/non-biased (deep/shallow encoding)
* 2 "counter balancing" variables were considered: word list 1 or 2; order of lists encoded for biased participants
* always a biased and unbiased participant collaborated
* order is only relevant for biased participants. two orders are possible dependent on subject number

#### __B2. Pickle file conversion (old --> new)__

In [None]:
# Convert files to new pickle files as the files were created with an older pickle version

# TO DO
# Check whether I can also just use a * 


# Which participants to include
first = 1
last = 9192 # Change to max SN number
skip = []
# Files not usable from SNs: 13, 14, 20, 21, 25, 27, 44, 45, 63, 65 (do not have to be skipped as they are in different folder)

# Encoding files
for subjectNumber in range(first, last+1):

    if subjectNumber in skip:
        continue

    try:
        original = ('encoding_' + str(subjectNumber) +'_1.pkl')
        destination = ('new_encoding_' + str(subjectNumber) +'_1.pkl')

        content = ''
        outsize = 0
        with open(original, 'rb') as infile:
            content = infile.read()
        with open(destination, 'wb') as output:
            for line in content.splitlines():
                outsize += len(line) + 1
                output.write(line + str.encode('\n'))

        with open(destination, 'rb') as f:
            try:
                test = pickle.load(f, encoding='latin1')
                print(test)
            except EOFError:
                pass

    except FileNotFoundError:
        pass

print("Done. Saved %s bytes." % (len(content)-outsize))

for subjectNumber in range(first, last+1):

    if subjectNumber in skip:
        continue

    try:
        original = ('encoding_' + str(subjectNumber) +'_0.pkl')
        destination = ('new_encoding_' + str(subjectNumber) +'_0.pkl')

        content = ''
        outsize = 0
        with open(original, 'rb') as infile:
            content = infile.read()
        with open(destination, 'wb') as output:
            for line in content.splitlines():
                outsize += len(line) + 1
                output.write(line + str.encode('\n'))

        with open(destination, 'rb') as f:
            try:
                test = pickle.load(f, encoding='latin1')
                print(test)
            except EOFError:
                pass

    except FileNotFoundError:
        pass

print("Done. Saved %s bytes." % (len(content)-outsize))


# Retrieval 1  files
for subjectNumber in range(first, last+1):

    if subjectNumber in skip:
        continue

    try:
        original = ('retrieve_' + str(subjectNumber) +'_0_1.pkl')
        destination = ('new_retrieve_' + str(subjectNumber) +'_0_1.pkl')

        content = ''
        outsize = 0
        with open(original, 'rb') as infile:
            content = infile.read()
        with open(destination, 'wb') as output:
            for line in content.splitlines():
                outsize += len(line) + 1
                output.write(line + str.encode('\n'))

        with open(destination, 'rb') as f:
            try:
                test = pickle.load(f, encoding='latin1')
                print(test)
            except EOFError:
                pass

    except FileNotFoundError:
        pass

# Retrieval 2  files
for subjectNumber in range(first, last+1):

    if subjectNumber in skip:
        continue

    try:
        original = ('retrieve_' + str(subjectNumber) +'_0_2.pkl')
        destination = ('new_retrieve_' + str(subjectNumber) +'_0_2.pkl')

        content = ''
        outsize = 0
        with open(original, 'rb') as infile:
            content = infile.read()
        with open(destination, 'wb') as output:
            for line in content.splitlines():
                outsize += len(line) + 1
                output.write(line + str.encode('\n'))

        with open(destination, 'rb') as f:
            try:
                test = pickle.load(f, encoding='latin1')
                print(test)
            except EOFError:
                pass

    except FileNotFoundError:
        pass

print("Done. Saved %s bytes." % (len(content)-outsize))



# Retrieval 3  files
for subjectNumber in range(first, last+1):

    if subjectNumber in skip:
        continue

    try:
        original = ('retrieve_' + str(subjectNumber) +'_0_3.pkl')
        destination = ('new_retrieve_' + str(subjectNumber) +'_0_3.pkl')

        content = ''
        outsize = 0
        with open(original, 'rb') as infile:
            content = infile.read()
        with open(destination, 'wb') as output:
            for line in content.splitlines():
                outsize += len(line) + 1
                output.write(line + str.encode('\n'))

        with open(destination, 'rb') as f:
            try:
                test = pickle.load(f, encoding='latin1')
                print(test)
            except EOFError:
                pass

    except FileNotFoundError:
        pass

print("Done. Saved %s bytes." % (len(content)-outsize))

# Collaborative Retrieval 2  files
for subjectNumber in range(first, last+1):

    if subjectNumber in skip:
        continue

    try:
        original = ('retrieve_' + str(subjectNumber) +'_1_2.pkl')
        destination = ('new_retrieve_' + str(subjectNumber) +'_1_2.pkl')

        content = ''
        outsize = 0
        with open(original, 'rb') as infile:
            content = infile.read()
        with open(destination, 'wb') as output:
            for line in content.splitlines():
                outsize += len(line) + 1
                output.write(line + str.encode('\n'))

        with open(destination, 'rb') as f:
            try:
                test = pickle.load(f, encoding='latin1')
                print(test)
            except EOFError:
                pass

    except FileNotFoundError:
        pass

print("Done. Saved %s bytes." % (len(content)-outsize))

#### __B3. Data frame creation (incl. spell check); to be used for subsequent analysis__

In [52]:
#-------------------------------------DESCRIPTION----------------------------------------
# Here we set up the dataframe and save it as a csv that will be analyzed

#-------------------------------------Notes----------------------------------------------
# This is for recalls 1 & 2 only as recall 3 was saved as dlm files up until SN58
# Therefore, next step will import all of recall3 as dlm files

#-------------------------------------Input----------------------------------------------
studyList1 = ['crow','eagle','finch','parrot','pigeon','cardinal','nitrogen','helium','chlorine','calcium','oxygen','mercury','trout',\
              'flounder','halibut','guppy','piranha','shark','carnation','orchid','pansy','daffodil','violet','rose','nectarine','pear',\
              'apple','grape','raspberry','cherry','tuba','drum','trumpet','saxophone','piano','organ','tree','ocean','canyon','mountain',\
              'plateau','cave','cinnamon','mustard','basil','oregano','paprika','cotton','wool', 'velvet','linen','leather','flyer',\
              'newspaper','comic','essay','pamphlet','tornado','hail','blizzard','rain','drought','jacket','dress','blouse','underwear',\
              'shoes','lamp','desk','bookcase','dresser','chair','banker','dentist','secretary','engineer','nurse','hour', 'arms', 'green',\
              'uncle']
    
studyList2 = ['crow','eagle','finch','parrot','pigeon','nitrogen','helium','chlorine','calcium','oxygen','trout','flounder','halibut',\
              'guppy','piranha','carnation','orchid','pansy','daffodil','violet','nectarine','pear','apple','grape','raspberry','tuba',\
              'drum','trumpet','saxophone','piano','tree','ocean','canyon','mountain','plateau','cinnamon','mustard','basil','oregano',\
              'paprika','salt','cotton','wool','velvet','linen','leather','denim','flyer','newspaper','comic','essay','pamphlet','book',\
              'tornado','hail','blizzard','rain','drought','lightning','jacket','dress','blouse','underwear','shoes','shirt','lamp','desk',\
              'bookcase','dresser','chair','recliner','banker','dentist','secretary','engineer','nurse','doctor','hour', 'arms', 'green', 'uncle']    

#-------------------------------------Create Data Frame----------------------------------------------

df = pd.DataFrame(columns=['SN', 'biased','order', 'collaboration', 'collaborator', 'word', 'correct', 'buffer', 'phase'])

first = 1
last = 9192
skip = [13, 14, 20, 21, 25, 27, 44, 45, 63, 65] 
# according to the "non-usable"-folder these participants were allocated to wrong ["because there were some participant condition \
# or subject numbers that were entered incorrectly by the RA. This then made the data unusable because the scripts use these values to make sure that participants in the \
# collaboration condition are assigned properly."]

# Just to have it written down somewhere, these are the collaborative pairs: 
# 56,78,1112,1718,2223,3031,3435,4041,4849,5253,5657,6061,6970,7374,7778,8182,8384,8586,8788,8990,9192



# A: Any subjects we should exclude?
for subjectNumber in range(first, last+1):

    if subjectNumber in skip:
        continue

# A: For pre and post collaboration
    j = [1, 2] # This is for the retrieval phase
    k = [0, 1] # This is indicating collaboration or not
    for z in j:
        for y in k:
        
            # A: Import all words from all participants for recall phase 1 and/or 2
            try:
                with open('new_retrieve_' + str(subjectNumber) +'_' + str(y) + '_' + str(z) + '.pkl', 'rb') as f:
                    try:
                        exp_data = pickle.load(f, encoding='latin1')
                    except EOFError:
                        pass

                    for trial in exp_data:
                        biased = 0
                        order = 0
                        collaboration = 0
                        phase = z
                        correct = 0
                        buffer = 0

                        # Select the word from the pickle files
                        word = trial['word']

                        # Spell checking
                        for x in word:
                            word = word.lower()
                            word = word.strip()
                            df = df.replace("\\", '')
                            # From experiment 1
                            word = ''.join('canyon' if word == 'canyoan' else word for word in word.split())
                            word = ''.join('canyon' if word == 'canyons' else word for word in word.split())
                            word = ''.join('cherry' if word == 'cherries' else word for word in word.split())
                            word = ''.join('chlorine' if word == 'clorine' else word for word in word.split())      
                            word = ''.join('cinnamon' if word == 'cinammon' else word for word in word.split())  
                            word = ''.join('cinnamon' if word == 'cinnaman' else word for word in word.split())      
                            word = ''.join('cinnamon' if word == 'cinnimon' else word for word in word.split())
                            word = ''.join('cinnamon' if word == 'cinomman' else word for word in word.split())
                            word = ''.join('cotton' if word == 'cotten' else word for word in word.split())
                            word = ''.join('daffodil' if word == 'daffidil' else word for word in word.split())
                            word = ''.join('daffodil' if word == 'dafodill' else word for word in word.split())      
                            word = ''.join('daffodil' if word == 'daphodile' else word for word in word.split())
                            word = ''.join('daffodil' if word == 'doffodil' else word for word in word.split())   
                            word = ''.join('denim' if word == 'denin' else word for word in word.split())
                            word = ''.join('drum' if word == 'drums' else word for word in word.split())
                            word = ''.join('engineer' if word == 'enginner' else word for word in word.split())
                            word = ''.join('flounder' if word == 'flunder' else word for word in word.split())
                            word = ''.join('flyer' if word == 'flier' else word for word in word.split())      
                            word = ''.join('flyer' if word == 'flyers' else word for word in word.split())
                            word = ''.join('grape' if word == 'grapes' else word for word in word.split())   
                            word = ''.join('guppy' if word == 'gupppy' else word for word in word.split())
                            word = ''.join('halibut' if word == 'hailbut' else word for word in word.split())
                            word = ''.join('halibut' if word == 'halibet' else word for word in word.split())
                            word = ''.join('linen' if word == 'linens' else word for word in word.split())      
                            word = ''.join('linen' if word == 'linnen' else word for word in word.split())
                            word = ''.join('mercury' if word == 'ercury' else word for word in word.split())   
                            word = ''.join('mountain' if word == 'mountains' else word for word in word.split())
                            word = ''.join('mountain' if word == 'moutain' else word for word in word.split())
                            word = ''.join('nectarine' if word == 'necratine' else word for word in word.split())
                            word = ''.join('nectarine' if word == 'necterine' else word for word in word.split())      
                            word = ''.join('orchid' if word == 'orchad' else word for word in word.split())
                            word = ''.join('oregano' if word == 'aregano' else word for word in word.split())   
                            word = ''.join('oregano' if word == 'orageno' else word for word in word.split())
                            word = ''.join('oregano' if word == 'regano' else word for word in word.split())
                            word = ''.join('pamphlet' if word == 'amphlet' else word for word in word.split())
                            word = ''.join('pamphlet' if word == 'pamplet' else word for word in word.split())      
                            word = ''.join('pamphlet' if word == 'panplet' else word for word in word.split())
                            word = ''.join('pamphlet' if word == 'phamlet' else word for word in word.split())
                            word = ''.join('pansy' if word == 'ansy' else word for word in word.split())
                            word = ''.join('paprika' if word == 'paparika' else word for word in word.split())
                            word = ''.join('paprika' if word == 'papprika' else word for word in word.split())
                            word = ''.join('paprika' if word == 'paprica' else word for word in word.split())      
                            word = ''.join('paprika' if word == 'peprica' else word for word in word.split())
                            word = ''.join('pigeon' if word == 'pegion' else word for word in word.split())
                            word = ''.join('pigeon' if word == 'pidgeon' else word for word in word.split())
                            word = ''.join('pigeon' if word == 'pidgeons' else word for word in word.split())
                            word = ''.join('pigeon' if word == 'pidgieon' else word for word in word.split())      
                            word = ''.join('piranha' if word == 'paranha' else word for word in word.split())
                            word = ''.join('piranha' if word == 'pirahana' else word for word in word.split())      
                            word = ''.join('piranha' if word == 'pirahna' else word for word in word.split())
                            word = ''.join('piranha' if word == 'pirahnna' else word for word in word.split())
                            word = ''.join('piranha' if word == 'pirannah' else word for word in word.split())
                            word = ''.join('piranha' if word == 'pirannha' else word for word in word.split())      
                            word = ''.join('piranha' if word == 'pirhana' else word for word in word.split())
                            word = ''.join('piranha' if word == 'pirranha' else word for word in word.split())   
                            word = ''.join('plateau' if word == 'plateu' else word for word in word.split())
                            word = ''.join('plateau' if word == 'platue' else word for word in word.split())
                            word = ''.join('raspberry' if word == 'rasberries' else word for word in word.split())
                            word = ''.join('raspberry' if word == 'rasberry' else word for word in word.split())      
                            word = ''.join('raspberry' if word == 'raspberries' else word for word in word.split())
                            word = ''.join('raspberry' if word == 'rasphberry' else word for word in word.split())   
                            word = ''.join('saxophone' if word == 'saxaphone' else word for word in word.split())
                            word = ''.join('saxophone' if word == 'saxephone' else word for word in word.split())
                            word = ''.join('saxophone' if word == 'saxiphone' else word for word in word.split())
                            word = ''.join('secretary' if word == 'secratary' else word for word in word.split())      
                            word = ''.join('secretary' if word == 'secrertary' else word for word in word.split())
                            word = ''.join('shoes' if word == 'shoe' else word for word in word.split())   
                            word = ''.join('trumpet' if word == 'trumphet' else word for word in word.split())
                            word = ''.join('trumpet' if word == 'tumpet' else word for word in word.split())
                            word = ''.join('underwear' if word == 'nderwear' else word for word in word.split())
                            word = ''.join('underwear' if word == 'udnerwear' else word for word in word.split())      
                            word = ''.join('bookcase' if word == 'bookshelf' else word for word in word.split())
                            word = ''.join('chair' if word == 'chari' else word for word in word.split())   
                            word = ''.join('lightning' if word == 'lightening' else word for word in word.split())
                            word = ''.join('lightning' if word == 'lightenings' else word for word in word.split())
                            word = ''.join('lightning' if word == 'lighting' else word for word in word.split())
                            word = ''.join('lightning' if word == 'lightning' else word for word in word.split())
                            word = ''.join('lightning' if word == 'lightnening' else word for word in word.split())      
                            word = ''.join('tree' if word == 'tress' else word for word in word.split())
                            word = ''.join('violet' if word == 'voilet' else word for word in word.split())
                            word = ''.join('blizzard' if word == 'blizarrd' else word for word in word.split())            
                            word = ''.join('velvet' if word == 'velvey' else word for word in word.split())
                            word = ''.join('daffodil' if word == 'dafodil' else word for word in word.split())
                            word = ''.join('halibut' if word == 'halibit' else word for word in word.split())
                            word = ''.join('book' if word == 'books' else word for word in word.split())
                            word = ''.join('hurricane' if word == 'hurricaine' else word for word in word.split())                    

                            # From experiment 2
                            word = ''.join('' if word == 'ado' else word for word in word.split()) # one row was ado, second tornao, third tornado               
                            word = ''.join('dandelion' if word == 'dandalione' else word for word in word.split())                
                            word = ''.join('dandelion' if word == 'dandeline' else word for word in word.split())                
                            word = ''.join('dandelion' if word == 'dandelion' else word for word in word.split())                
                            word = ''.join('dandelion' if word == 'dandelion' else word for word in word.split())                  
                            word = ''.join('dandelion' if word == 'dandalione' else word for word in word.split())                
                            word = ''.join('dandelion' if word == 'dandilion' else word for word in word.split())                  
                            #word = ''.join('finch' if word == 'flinch' else word for word in word.split()) as flinch has a different meaning, we didn't change it                   
                            word = ''.join('oregano' if word == 'gano' else word for word in word.split()) 
                            word = ''.join('halibut' if word == 'halbait' else word for word in word.split())                
                            #word = ''.join('hail' if word == 'hale' else word for word in word.split()) as hale has a different meaning, we didn't change it                
                            #word = ''.join('linen' if word == 'lenin' else word for word in word.split()) as lenin has a different meaning, we didn't change it                 
                            word = ''.join('velvet' if word == 'lvet' else word for word in word.split())                  
                            word = ''.join('mountain' if word == 'ntain' else word for word in word.split())                
                            word = ''.join('nurse' if word == 'nurde' else word for word in word.split())                    
                            word = ''.join('' if word == 'ado' else word for word in word.split()) # one row was ado, second tornao, third tornado               
                            word = ''.join('blouse' if word == 'ouse' else word for word in word.split())                
                            word = ''.join('pants' if word == 'pant' else word for word in word.split())                
                            word = ''.join('piranha' if word == 'parhna' else word for word in word.split())
                            #word = ''.join('pear' if word == 'peat' else word for word in word.split()) as peat has a different meaning, we didn't change it
                            word = ''.join('oregano' if word == 'pregeno' else word for word in word.split())                  
                            word = ''.join('dresser' if word == 'sser' else word for word in word.split()) # difficult, but all other recalled items from that participant \
                            # had a couple of letters missing in the front (see ntain or tton)
                            word = ''.join('cotton' if word == 'tton' else word for word in word.split())                  
                            word = ''.join('basil' if word == 'brasil' else word for word in word.split())                  
                            word = ''.join('tangerine' if word == 'tangarine' else word for word in word.split())                
                            word = ''.join('cloth' if word == 'clothes' else word for word in word.split())                
                            word = ''.join('cloth' if word == 'clothing' else word for word in word.split())                
                            word = ''.join('cloud' if word == 'clouds' else word for word in word.split())                  
                            word = ''.join('book' if word == 'books' else word for word in word.split())                
                            word = ''.join('lily' if word == 'lilly' else word for word in word.split())
                            word = ''.join('oregano' if word == 'oragano' else word for word in word.split())                
                            word = ''.join('tornado' if word == 'tronado' else word for word in word.split())                
                            word = ''.join('apple' if word == 'apll' else word for word in word.split())                
                            word = ''.join('comic' if word == 'comics' else word for word in word.split())                
                            word = ''.join('daffodil' if word == 'fodil' else word for word in word.split())                
                            word = ''.join('halibut' if word == 'habut' else word for word in word.split())                  
                            word = ''.join('lightning' if word == 'lighnting' else word for word in word.split())                
                            word = ''.join('lightning' if word == 'lightining' else word for word in word.split())             
                            word = ''.join('lightning' if word == 'lightning' else word for word in word.split())      
                            word = ''.join('organ' if word == 'organs' else word for word in word.split()) # Organ has multiple meanings by itself, hence potentially ok to change
                            word = ''.join('pigeon' if word == 'piogen' else word for word in word.split())      
                            word = ''.join('paprika' if word == 'pipraki' else word for word in word.split())
                            word = ''.join('piranha' if word == 'purrana' else word for word in word.split())
                            word = ''.join('drought' if word == 'rought' else word for word in word.split())
                            word = ''.join('trumpet' if word == 'trumphent' else word for word in word.split())
                            word = ''.join('trumpet' if word == 'trumpht' else word for word in word.split())      
                            word = ''.join('trumpet' if word == 'trumpt' else word for word in word.split())
                            word = ''.join('bookcase' if word == 'bookself' else word for word in word.split())                                   
                            word = ''.join('bookcase' if word == 'bookshlelf' else word for word in word.split())                
                            word = ''.join('bookcase' if word == 'boookcase' else word for word in word.split())                
                            word = ''.join('chlorine' if word == 'chloride' else word for word in word.split())
                            word = ''.join('chlorine' if word == 'chorline' else word for word in word.split())  
                            word = ''.join('chlorine' if word == 'cholrine' else word for word in word.split())                  
                            word = ''.join('chlorine' if word == 'chroine' else word for word in word.split())                
                            word = ''.join('cinnamon' if word == 'cinamon' else word for word in word.split())
                            word = ''.join('cinnamon' if word == 'cinnimin' else word for word in word.split())                
                            word = ''.join('cinnamon' if word == 'cinnoman' else word for word in word.split())                
                            word = ''.join('cinnamon' if word == 'cinnomin' else word for word in word.split())                
                            word = ''.join('cotton' if word == 'coten' else word for word in word.split())                  
                            word = ''.join('halibut' if word == 'halibat' else word for word in word.split())                
                            word = ''.join('halibut' if word == 'halibiut' else word for word in word.split())
                            word = ''.join('halibut' if word == 'hallibut' else word for word in word.split())                
                            word = ''.join('halibut' if word == 'halubit' else word for word in word.split())                
                            word = ''.join('helium' if word == 'heliu,' else word for word in word.split())
                            word = ''.join('helium' if word == 'elium' else word for word in word.split())
                            word = ''.join('helium' if word == 'heliu' else word for word in word.split())   
                            word = ''.join('hour' if word == 'hours' else word for word in word.split())                  
                            word = ''.join('paprika' if word == 'prika' else word for word in word.split())                
                            word = ''.join('rose' if word == 'roses' else word for word in word.split())
                            word = ''.join('oregano' if word == 'aregeno' else word for word in word.split())                
                            word = ''.join('blizzard' if word == 'blizzzard' else word for word in word.split())                
                            word = ''.join('blouse' if word == 'blousse' else word for word in word.split())                
                            word = ''.join('daffodil' if word == 'dafadil' else word for word in word.split())                  
                            word = ''.join('daffodil' if word == 'daffadil' else word for word in word.split())                
                            word = ''.join('daffodil' if word == 'daffildil' else word for word in word.split())                
                            word = ''.join('daffodil' if word == 'daffodils' else word for word in word.split())                
                            word = ''.join('daffodil' if word == 'daffoldil' else word for word in word.split())                
                            word = ''.join('daffodil' if word == 'dafidil' else word for word in word.split())                
                            word = ''.join('daffodil' if word == 'dafidill' else word for word in word.split())                  
                            word = ''.join('daffodil' if word == 'dafidill' else word for word in word.split())                
                            word = ''.join('daffodil' if word == 'daphadil' else word for word in word.split())
                            word = ''.join('daffodil' if word == 'daphadile' else word for word in word.split())                
                            word = ''.join('daffodil' if word == 'daphadile' else word for word in word.split())                
                            word = ''.join('daffodil' if word == 'daphadill' else word for word in word.split())                
                            word = ''.join('daffodil' if word == 'daphidile' else word for word in word.split())                  
                            word = ''.join('daffodil' if word == 'daphodil' else word for word in word.split())                
                            word = ''.join('daffodil' if word == 'daphodil' else word for word in word.split())                
                            word = ''.join('daffodil' if word == 'dapphodil' else word for word in word.split())                
                            word = ''.join('dentist' if word == 'dentis' else word for word in word.split())                
                            word = ''.join('dresser' if word == 'dreser' else word for word in word.split())                
                            word = ''.join('dress' if word == 'dresses' else word for word in word.split()) #very difficult, either dress or dresser, therefore we left it as is                
                            word = ''.join('drought' if word == 'drout' else word for word in word.split())                
                            word = ''.join('drum' if word == 'dum' else word for word in word.split())                   
                            word = ''.join('engineer' if word == 'engeneer' else word for word in word.split())                
                            word = ''.join('engineer' if word == 'enigineer' else word for word in word.split())                
                            word = ''.join('helium' if word == 'helim' else word for word in word.split())                
                            word = ''.join('linen' if word == 'linene' else word for word in word.split())
                            word = ''.join('lamp' if word == 'lamo' else word for word in word.split()) 
                            word = ''.join('mountain' if word == 'mountian' else word for word in word.split())                
                            word = ''.join('nectarine' if word == 'necatrina' else word for word in word.split())                   
                            word = ''.join('nectarine' if word == 'necatrine' else word for word in word.split())                
                            word = ''.join('nectarine' if word == 'nectarine' else word for word in word.split())                
                            word = ''.join('nectarine' if word == 'nectatine' else word for word in word.split())                
                            word = ''.join('nectarine' if word == 'nectorin' else word for word in word.split())                  
                            word = ''.join('nectarine' if word == 'nectorine' else word for word in word.split())                
                            word = ''.join('nectarine' if word == 'nectrine' else word for word in word.split())                   
                            word = ''.join('nectarine' if word == 'necturine' else word for word in word.split())                
                            word = ''.join('nectarine' if word == 'nictarine' else word for word in word.split())                                             
                            word = ''.join('nitrogen' if word == 'nigrogen' else word for word in word.split())                  
                            word = ''.join('nitrogen' if word == 'nitrogren' else word for word in word.split())                
                            word = ''.join('oxygen' if word == 'ocygen' else word for word in word.split())                
                            word = ''.join('orchid' if word == 'orchird' else word for word in word.split())                
                            word = ''.join('oregano' if word == 'oreganno' else word for word in word.split())                
                            word = ''.join('oregano' if word == 'oregeno' else word for word in word.split())                
                            word = ''.join('oregano' if word == 'oregno' else word for word in word.split())                  
                            word = ''.join('oregano' if word == 'oregno' else word for word in word.split())                
                            word = ''.join('oregano' if word == 'oregono' else word for word in word.split())
                            word = ''.join('oxygen' if word == 'oxogen' else word for word in word.split())                
                            word = ''.join('paprika' if word == 'papricka' else word for word in word.split())                
                            word = ''.join('parrot' if word == 'parot' else word for word in word.split())                
                            word = ''.join('paprika' if word == 'peperika' else word for word in word.split())                  
                            word = ''.join('pamphlet' if word == 'phamplet' else word for word in word.split())                
                            word = ''.join('piranha' if word == 'phirana' else word for word in word.split())                
                            word = ''.join('piranha' if word == 'pirana' else word for word in word.split())                                
                            word = ''.join('piranha' if word == 'piranah' else word for word in word.split())                  
                            word = ''.join('piranha' if word == 'piranna' else word for word in word.split())                
                            word = ''.join('piranha' if word == 'pirhanna' else word for word in word.split())                
                            word = ''.join('piranha' if word == 'pirnaha' else word for word in word.split())                
                            word = ''.join('piranha' if word == 'pirrhana' else word for word in word.split())                
                            word = ''.join('plateau' if word == 'plaeatu' else word for word in word.split())                
                            word = ''.join('plateau' if word == 'plataue' else word for word in word.split())                  
                            word = ''.join('plateau' if word == 'plateua' else word for word in word.split())                
                            word = ''.join('plateau' if word == 'plateux' else word for word in word.split())
                            word = ''.join('plateau' if word == 'plaute' else word for word in word.split())                
                            word = ''.join('plateau' if word == 'pleatu' else word for word in word.split())                
                            word = ''.join('plateau' if word == 'pleteau' else word for word in word.split())                
                            word = ''.join('raspberry' if word == 'rasperry' else word for word in word.split())                  
                            word = ''.join('trumpet' if word == 'rumpet' else word for word in word.split())                
                            word = ''.join('secretary' if word == 'sactatary' else word for word in word.split())                
                            word = ''.join('saxophone' if word == 'saxphone' else word for word in word.split())
                            word = ''.join('tornado' if word == 'torando' else word for word in word.split())                  
                            word = ''.join('tornado' if word == 'tornadeo' else word for word in word.split())                
                            word = ''.join('tornado' if word == 'tornadoe' else word for word in word.split())                
                            word = ''.join('tornado' if word == 'tornadoe' else word for word in word.split())                
                            word = ''.join('tornado' if word == 'tornador' else word for word in word.split())                
                            word = ''.join('tornado' if word == 'tornando' else word for word in word.split())                
                            word = ''.join('tornado' if word == 'tornao' else word for word in word.split())                  
                            word = ''.join('tree' if word == 'trees' else word for word in word.split())                
                            word = ''.join('trumpet' if word == 'trmpet' else word for word in word.split())
                            word = ''.join('trumpet' if word == 'trmpet' else word for word in word.split())                
                            word = ''.join('trumpet' if word == 'trupmet' else word for word in word.split())                
                            word = ''.join('underwear' if word == 'undewear' else word for word in word.split())                
                            word = ''.join('velvet' if word == 'velvot' else word for word in word.split())                  
                            word = ''.join('velvet' if word == 'vevelt' else word for word in word.split())                
                            word = ''.join('velvet' if word == 'volvet' else word for word in word.split())                
                            word = ''.join('saxophone' if word == 'xophone' else word for word in word.split())                
                            word = ''.join('oxygen' if word == 'xygen' else word for word in word.split())                  
                            word = ''.join('daffodil' if word == 'dafodil' else word for word in word.split())
                            word = ''.join('halibut' if word == 'halibit' else word for word in word.split())
                            word = ''.join('raspberry, grape' if word == 'raspberrygrape' else word for word in word.split())
                            word = ''.join('desk, chair, lamp, hour' if word == 'deskchairlamphour' else word for word in word.split())
                            word = ''.join('tornado, hail' if word == 'tornadohail' else word for word in word.split())                

                            #Buffer
                            word = ''.join('arms' if word == 'arm' else word for word in word.split())
                            word = ''.join('green' if word == 'grreen' else word for word in word.split())


                            # From experiment 3
                            word = ''.join('apple' if word == 'appl' else word for word in word.split())
                            word = ''.join('oregano' if word == 'arangano' else word for word in word.split())
                            word = ''.join('arms' if word == 'arm' else word for word in word.split())
                            word = ''.join('oregano' if word == 'arregano' else word for word in word.split())
                            word = ''.join('saxophone' if word == 'axophpne' else word for word in word.split())
                            word = ''.join('blizzard' if word == 'Bizzard' else word for word in word.split())
                            word = ''.join('blizzard' if word == 'blizard' else word for word in word.split())
                            word = ''.join('blouse' if word == 'Blouse' else word for word in word.split())
                            word = ''.join('blouse' if word == 'blouser' else word for word in word.split())
                            word = ''.join('bookcase' if word == 'bookshelf' else word for word in word.split())
                            word = ''.join('book' if word == 'boooks' else word for word in word.split())
                            word = ''.join('cardinal' if word == 'Cardinal' else word for word in word.split())
                            word = ''.join('cardigan' if word == 'cartigan' else word for word in word.split())
                            word = ''.join('chlorine' if word == 'chloride' else word for word in word.split())
                            word = ''.join('comic' if word == 'Comic' else word for word in word.split())
                            word = ''.join('cotton' if word == 'cutton' else word for word in word.split())
                            word = ''.join('daffodil' if word == 'daffadil' else word for word in word.split())
                            word = ''.join('daffodil' if word == 'daffildol' else word for word in word.split())
                            word = ''.join('daffodil' if word == 'daffodile' else word for word in word.split())
                            word = ''.join('daffodil' if word == 'daffodils' else word for word in word.split())
                            word = ''.join('daffodil' if word == 'dafidil' else word for word in word.split())
                            word = ''.join('daffodil' if word == 'dafiodil' else word for word in word.split())
                            word = ''.join('dandelion' if word == 'dandelion' else word for word in word.split())
                            word = ''.join('dandelion' if word == 'danelion' else word for word in word.split())
                            word = ''.join('daffodil' if word == 'daphodil' else word for word in word.split())
                            word = ''.join('daffodil' if word == 'daphodyl' else word for word in word.split())
                            word = ''.join('doctor' if word == 'doctore' else word for word in word.split())
                            word = ''.join('dress' if word == 'Dress' else word for word in word.split())
                            word = ''.join('dress' if word == 'dresses' else word for word in word.split())
                            word = ''.join('dresser' if word == 'dressor' else word for word in word.split())
                            word = ''.join('drought' if word == 'drough' else word for word in word.split())
                            word = ''.join('drum' if word == 'drums' else word for word in word.split())
                            word = ''.join('eagle' if word == 'Eagle' else word for word in word.split())
                            word = ''.join('eagle' if word == 'egale' else word for word in word.split())
                            word = ''.join('engineer' if word == 'eingineer' else word for word in word.split())
                            word = ''.join('finch' if word == 'finc' else word for word in word.split())
                            word = ''.join('finch' if word == 'Finch' else word for word in word.split())
                            word = ''.join('grape' if word == 'graoe' else word for word in word.split())
                            word = ''.join('grape' if word == 'grap' else word for word in word.split())
                            word = ''.join('grape' if word == 'Grape' else word for word in word.split())
                            word = ''.join('grape' if word == 'grapes' else word for word in word.split())
                            word = ''.join('green' if word == 'Green' else word for word in word.split())
                            word = ''.join('guppy' if word == 'gruppy' else word for word in word.split())
                            word = ''.join('guppy' if word == 'gubby' else word for word in word.split())
                            word = ''.join('guppy' if word == 'Guppy' else word for word in word.split())
                            word = ''.join('halibut' if word == 'hailbut' else word for word in word.split())
                            word = ''.join('halibut' if word == 'halbit' else word for word in word.split())
                            word = ''.join('halibut' if word == 'halibat' else word for word in word.split())
                            word = ''.join('helium' if word == 'helium\'' else word for word in word.split())
                            word = ''.join('helium' if word == 'heluim' else word for word in word.split())
                            word = ''.join('nitrogen' if word == 'hitrogen' else word for word in word.split())
                            word = ''.join('hour' if word == 'Hour' else word for word in word.split())
                            word = ''.join('linen' if word == 'inen' else word for word in word.split())
                            word = ''.join('jeans' if word == 'jean' else word for word in word.split())
                            word = ''.join('jesus' if word == 'Jesus' else word for word in word.split())
                            word = ''.join('jupiter' if word == 'jupitor' else word for word in word.split())
                            word = ''.join('leather' if word == 'lether' else word for word in word.split())
                            word = ''.join('lightning' if word == 'lightining' else word for word in word.split())
                            word = ''.join('lightning' if word == 'lightning' else word for word in word.split())
                            word = ''.join('linen' if word == 'linnen' else word for word in word.split())
                            word = ''.join('mountain' if word == 'mountains' else word for word in word.split())
                            word = ''.join('trumpet' if word == 'mpet' else word for word in word.split())
                            word = ''.join('nectarine' if word == 'nectorine' else word for word in word.split())
                            word = ''.join('nectarine' if word == 'nectraine' else word for word in word.split())
                            word = ''.join('nitrogen' if word == 'nitrgoen' else word for word in word.split())
                            word = ''.join('nitrogen' if word == 'Nitrogen' else word for word in word.split())
                            word = ''.join('nitrogen' if word == 'nitrogen\'' else word for word in word.split())
                            word = ''.join('ocean' if word == 'Ocean' else word for word in word.split())
                            word = ''.join('orchid' if word == 'ochid' else word for word in word.split())
                            word = ''.join('octopus' if word == 'octupus' else word for word in word.split())
                            word = ''.join('orchid' if word == 'orchids' else word for word in word.split())
                            word = ''.join('orchid' if word == 'orchird' else word for word in word.split())
                            word = ''.join('orchid' if word == 'orcid' else word for word in word.split())
                            #word = ''.join('oregano' if word == 'oregon' else word for word in word.split())
                            word = ''.join('oregano' if word == 'organo' else word for word in word.split())
                            word = ''.join('oxygen' if word == 'Oxygen' else word for word in word.split())
                            word = ''.join('piano' if word == 'paino' else word for word in word.split())
                            word = ''.join('pamphlet' if word == 'pamphelt' else word for word in word.split())
                            word = ''.join('pamphlet' if word == 'pamplet' else word for word in word.split())
                            word = ''.join('paprika' if word == 'paprikka' else word for word in word.split())
                            word = ''.join('piranha' if word == 'parahana' else word for word in word.split())
                            word = ''.join('piranha' if word == 'paranha' else word for word in word.split())
                            word = ''.join('pansy' if word == 'pasny' else word for word in word.split())
                            word = ''.join('piano' if word == 'pianp' else word for word in word.split())
                            word = ''.join('pigeon' if word == 'pidgeon' else word for word in word.split())
                            word = ''.join('pigeon' if word == 'piegon' else word for word in word.split())
                            word = ''.join('piranha' if word == 'pihrana' else word for word in word.split())
                            word = ''.join('piranha' if word == 'pirahna' else word for word in word.split())
                            word = ''.join('piranha' if word == 'pirahnna' else word for word in word.split())
                            word = ''.join('piranha' if word == 'piranah' else word for word in word.split())
                            word = ''.join('piranha' if word == 'pirannhea' else word for word in word.split())
                            word = ''.join('piranha' if word == 'pirhana' else word for word in word.split())
                            word = ''.join('piranha' if word == 'pirhanna' else word for word in word.split())
                            word = ''.join('piranha' if word == 'pirranha' else word for word in word.split())
                            word = ''.join('piranha' if word == 'pirranhea' else word for word in word.split())
                            word = ''.join('piranha' if word == 'pirrhana' else word for word in word.split())
                            word = ''.join('plateau' if word == 'plataeu' else word for word in word.split())
                            word = ''.join('plateau' if word == 'plateu' else word for word in word.split())
                            word = ''.join('pleasant' if word == 'pleasent' else word for word in word.split())
                            word = ''.join('plateau' if word == 'pleatou' else word for word in word.split())
                            word = ''.join('oxygen' if word == 'poxygen' else word for word in word.split())
                            word = ''.join('piranha' if word == 'prihana' else word for word in word.split())
                            word = ''.join('raspberry' if word == 'rasberry' else word for word in word.split())
                            word = ''.join('raspberry' if word == 'raseberry' else word for word in word.split())
                            word = ''.join('robin' if word == 'robbin' else word for word in word.split())
                            word = ''.join('saxophone' if word == 'saxaphone' else word for word in word.split())
                            word = ''.join('saxophone' if word == 'saxapphone' else word for word in word.split())
                            word = ''.join('saxophone' if word == 'saxohpone' else word for word in word.split())
                            word = ''.join('saxophone' if word == 'Saxophone' else word for word in word.split())
                            word = ''.join('saxophone' if word == 'saxopon' else word for word in word.split())
                            word = ''.join('saxophone' if word == 'saxphone' else word for word in word.split())
                            word = ''.join('secretary' if word == 'secatary' else word for word in word.split())
                            word = ''.join('secretary' if word == 'Secretary' else word for word in word.split())
                            word = ''.join('shoes' if word == 'shoe' else word for word in word.split())
                            word = ''.join('sulfur' if word == 'sulfure' else word for word in word.split())
                            word = ''.join('tangerine' if word == 'tangerene' else word for word in word.split())
                            word = ''.join('tornado' if word == 'tornadeo' else word for word in word.split())
                            word = ''.join('tree' if word == 'Tree' else word for word in word.split())
                            word = ''.join('tree' if word == 'trees' else word for word in word.split())
                            word = ''.join('drought' if word == 'trought' else word for word in word.split())
                            word = ''.join('trombone' if word == 'trumbone' else word for word in word.split())
                            word = ''.join('trumpet' if word == 'Trumpet' else word for word in word.split())
                            word = ''.join('trumpet' if word == 'trumphet' else word for word in word.split())
                            word = ''.join('turmeric' if word == 'tumeric' else word for word in word.split())
                            word = ''.join('trumpet' if word == 'umpet' else word for word in word.split())
                            word = ''.join('velvet' if word == 'Velvet' else word for word in word.split())
                            word = ''.join('velvet' if word == 'velvet3' else word for word in word.split())
                            word = ''.join('velvet' if word == 'velviot' else word for word in word.split())
                            word = ''.join('violet' if word == 'Violet' else word for word in word.split())
                            word = ''.join('women' if word == 'woman' else word for word in word.split())
                            word = ''.join('wool' if word == 'wool\\' else word for word in word.split())
                            word = ''.join('saxophone' if word == 'xophone' else word for word in word.split())
                            word = ''.join('chair' if word == 'chair\\' else word for word in word.split())
                            word = ''.join('daffodil' if word == 'daffodil\\' else word for word in word.split())
                            word = ''.join('ocean' if word == 'ocean\\' else word for word in word.split())
                            word = ''.join('leather' if word == 'eather' else word for word in word.split())
                            word = ''.join('choir' if word == 'chior' else word for word in word.split())
                            word = ''.join('halibut' if word == 'haliberd' else word for word in word.split())
                                                        
                            # Experiment 3, recall 2
                            word = ''.join('tree' if word == 'dree' else word for word in word.split())
                            word = ''.join('piranha' if word == 'piranhea' else word for word in word.split())
                            word = ''.join('oxygen' if word == 'oxigen' else word for word in word.split())
                            word = ''.join('organ' if word == 'ogran' else word for word in word.split())
                            word = ''.join('chair' if word == 'chaor' else word for word in word.split())
                            word = ''.join('rain' if word == 'rain\'' else word for word in word.split())
                            word = ''.join('blouse' if word == 'blosue' else word for word in word.split())
                            word = ''.join('oxygen' if word == 'oxegen' else word for word in word.split())
                            word = ''.join('parrot' if word == 'parrot\\' else word for word in word.split())
                            word = ''.join('engineer' if word == 'enigneer' else word for word in word.split())
                            word = ''.join('halibut' if word == 'halabit' else word for word in word.split())
                            word = ''.join('secretary' if word == 'secetary' else word for word in word.split())
                            word = ''.join('dentist' if word == 'demtist' else word for word in word.split())
                            word = ''.join('dentist' if word == 'denist' else word for word in word.split())                       
                            
                            
                        # Biase comes from file name
                        if os.path.exists('new_encoding_' + str(subjectNumber) +'_0.pkl'):
                            biased = 0
                        else:
                            biased = 1

                        # Order comes from SN and impacts correct study list reference
                        if (subjectNumber%2 == 0) and (biased == 1):
                            order = 1
                            if word in studyList2:
                                correct = 1
                            else:
                                correct = 0
                        elif (subjectNumber%2 == 1) and (biased == 1):
                            order = 0     
                            if word in studyList1:
                                correct = 1
                            else:
                                correct = 0
                        if (subjectNumber%2 == 0) and (biased == 0):
                            order = 0
                            if word in studyList1:
                                correct = 1
                            else:
                                correct = 0
                        elif (subjectNumber%2 == 1) and (biased == 0):
                            order = 1     
                            if word in studyList2:
                                correct = 1
                            else:
                                correct = 0
                      

                        # Indicate buffer
                        buffer = 0
                        for x in word:
                            if word in ['arms', 'green', 'uncle', 'hour']:
                                buffer = 1
                            else:
                                buffer = 0

                        # Collaboration comes from retrieval_1_2 files (1_2 collaborated, 0_2 did not)
                        if os.path.exists('new_retrieve_' + str(subjectNumber) +'_0_2.pkl'):
                            collaboration = 0
                        else:
                            collaboration = 1

                        # Collaborator for recall 1 and 3 easy to identify
                        # However, at the moment, phase 2 is included with a specific SN that is SN1 and SN2 concatenanted
                        # Therefore, SN56 in recall 1 is NOT SN56 in recall 2. All SN in recall 2 are concatenated and have bias=2 & order=2
                        collaborator = 0
                        if (subjectNumber == 56) and (collaboration == 1) and (phase == 2):
                            collaborator = str(0)
                        elif (subjectNumber == 78) and (collaboration == 1) and (phase == 2):
                            collaborator = str(0)
                        elif os.path.exists('new_retrieve_' + str(subjectNumber) + str(subjectNumber+1) + '_1_2.pkl'):
                            collaborator = str(subjectNumber+1)
                        elif os.path.exists('new_retrieve_' + str(subjectNumber-1) + str(subjectNumber) + '_1_2.pkl'):
                            collaborator = str(subjectNumber-1)
                        else:
                            collaborator = 0

                        # all collaborating dyads (SN with numbers > 94 and the two 56 and 78) get a 2 (similar to na) for order and bias
                        if (subjectNumber > 94):
                            collaboration=1
                            order = 2
                            biased = 2
                        elif (subjectNumber == 56) and (collaboration == 1) and (phase == 2):
                            collaboration=1
                            order = 2
                            biased = 2  
                        elif (subjectNumber == 78) and (collaboration == 1) and (phase == 2):
                            collaboration=1
                            order = 2
                            biased = 2
                        else:
                            pass
                            
                        trialDict = {'SN':subjectNumber, 'biased':biased, 'order':order, 'phase':phase, 'collaboration':collaboration, 'collaborator':collaborator, 
                                     'word':word, 'correct':correct, 'buffer':buffer}
                        df = df.append(trialDict, ignore_index=True)
            except OSError:
                logging.info('File missing' + str(f))
                pass

df = df.drop_duplicates(['SN','word','phase']) # Delete duplicate recalls in the same phase by the same participant
df = df.replace('', np.nan) 
df = df.dropna(axis=0, how='any', subset=['word']) # Delete rows with empty word



#df.to_csv('2020-09-17_Similarity_Exp3_AllWords_Clean1_Check.csv', index=False)
df['number'] = df['word']

# Includes misspellings and buffer words, same as in experiments 1 and 2 plus new intrusions and buffer
translator = {'crow':1,'eagle':2,'finch':3,'parrot':4,'pigeon':5,'cardinal':6,'nitrogen':7,'helium':8,'chlorine':9,'calcium':10,'oxygen':11,
          'mercury':12,'trout':13,'flounder':14,'halibut':15,'guppy':16,'piranha':17,'shark':18,'carnation':19,'orchid':20,'pansy':21,'daffodil':22,'violet':23,'rose':24,
          'nectarine':25,'pear':26,'apple':27,'grape':28,'raspberry':29,'cherry':30,'tuba':31,'drum':32,'trumpet':33,'saxophone':34,'piano':35,'organ':36,'tree':37,'ocean':38,
          'canyon':39,'mountain':40,'plateau':41,'cave':42,'cinnamon':43,'mustard':44,'basil':45,'oregano':46,'paprika':47,'salt':48,'cotton':49,'wool':50,'velvet':51,
          'linen':52,'leather':53,'denim':54,'flyer':55,'newspaper':56,'comic':57,'essay':58,'pamphlet':59,'book':60,'tornado':61,'hail':62,'blizzard':63,'rain':64,'drought':65,
          'lightning':66,'jacket':67,'dress':68,'blouse':69,'underwear':70,'shoes':71,'shirt':72,'lamp':73,'desk':74,'bookcase':75,'dresser':76,'chair':77,'recliner':78,
          'banker':79,'dentist':80,'secretary':81,'engineer':82,'nurse':83,'doctor':84,'a':85,'architect':86,'avidafil?':87,'ballet':88,'baseball':89,'bed':90,
          'bird':91,'blueberry':92,'cage':93,'canary':94,'carbon':95,'chemical':96,'clarinet':97,'coat':98,'concrete':99,'daisy':100,'dog':101,'experimenter':102,
          'fish':103,'flamenco':104,'flower':105,'flute':106,'fruit':107,'give':108,'grass':109,'hydrogen':110,'instrument':111,'lawyer':112,'library':113,'lily':114,
          'lithium':115,'lung':116,'melon':117,'music':118,'nickel':119,'nylon':120,'pants':121,'parsley':122,'pepper':123,'project':124,'red':125,'salmon':126,
          'satin':127,'sea':128,'sky':129,'slave':130,'sofa':131,'storm':132,'table':133,'tango':134,'teacher':135,'the':136,'thunder':137,'thunderstorm':138,'tissue':139,
          'to':140,'trombone':141,'trousers':142,'tulip':143,'tuna':144,'willing':145,'address':146,'cavern':147,'homework':148,'magazine':149,'are':150,
          'dove':151,'sodium':152,'hurricaine':153,'hurricane':154,'violent':155,'violin':156,'green':157,'uncle':158,'hour':159,'arms':160,'orchard':161,
          'paper':162,'sunflower':163,'dandelion':164,'snow':165,'green':166,'professor':167,'socks':168,'purse':169,'ar':170,'e':171,'mium':172,
          't':173,'rk':174, 'hid':175,'breakfast':176,'uncle':177,'hat':178,'hale':179,'speacker':180,'dance':181,'baker':182,'couch':183,'tube':184,
          'valley':185,'tangerine':186,'guitar':187,'orange':188,'skirt':189,'robin':190,'peach':191,'lavender':192,'thyme':193,'drawer':194,
          'cloud':195,'word':196,'paper':197,'carnelian':198,'article':199,'daydream':200,'tectonicplate':201,'blossom':202,'cloth':203,'tsunami':204,
          'legs':205,'keyboard':206,'fiction':207,'water':208,'earth':209,'light':210,'hair':211,'rocks':212,'moon':213,'weather':214,'wind':215,
          'door':216,'letter':217,'time':218,'yellow':219,'river':220,'lake':221,'typhoon':222,'gloves':223,'magnesium':224,'ear':225,'camp':226,
          'seed':227,'phone':228,'fresh':229,'lenin':230,'in':231,'halogen':232,'wager':233,'research':234,'positive':235,'success':236,'pelican':237,
          'computer':238,'plum':239,'although':240,'brazil':241,'fly':242,'brochure':243,'flood':244,'perch':245,'year':246,'lilac':247,'silk':248,
          'disaster':249,'bloom':250,'politician':251,'onion':252,'jazz':253,'peat':254,'flinch':255,'weatherphenomenon':256,'natural':257,
          'flavoring':258,'food':259,'furniture':260,'formation':261,'papya':262,'snowstorm':263,'basketball':264, 'jeans':265,'park':266,'cactus':267,
          'window':268,'bankteller':269,'keys':270,'raven':271,'purple':272,'cat':273,'candy':274,'mom':275,'dad':276,'brother':277,'sister':278,'chicken':279,'bread':280,
          'science':281,'english':282,'psy':283,'biology':284,'canopy':285,'run':286,'fast':287,'school':288,'waitress':289,'cow':290,
          'turmeric':291,'cinnamone':292,'perple':293,'fear':294,'male':295,'pleasant':296,'nicotine':297,'sparrow':298,'6312414461hmu':299,
          'cardigan':300,'bank':301,'clementine':302,'house':303,'squash':304,'basket':305,'cilantro':306,'owl':307,'cinemmon':308,'saffron':309,
          'erect':310,'exit':311,'egress':312,'laptop':314,'clemintine':315,'home':316,'ught':317,'blackberry':318, 'seagull':319, 'bear':320,
          'bass':321,'god':322,'love':323,'mercy':324,'feel':325,'nephew':326,'sun':327,'jesus':328,'man':329,'women':330,'men':331,'university':332,
          'take':333,'get':334,'have':335,'go':336,'meet':337,'someone':338,'somebody':339,'piece':340,'female':341,'song':342,'hiphop':343,'hoody':344,
          'but':345,'what':346,'which':347,'when':348,'who':349,'whose':350,'how':351,'mean':352,'iq':353,'friend':354,'boy':355,'girl':356,'all':357,
          'guest':358,'same':359,'silmilar':360,'hope':361,'peace':362,'forgive':363,'beat':364,'hello':365,'hi':366,'nice':367,'good':368,'carrot':369,
          'banana':370,'strawberry':371,'cranberry':372,'car':373,'truck':374,'sneak':375,'monkey':376,'monster':377,'dark':378,'cyber':379,'space':380,
          'teach':381,'learn':382,'me':383,'you':384,'i':385,'puppy':386,'mermaid':387,'bye':388,'se':389,'see':390,'next':391,'goes':392,'breeze':393,
          'deer':394,'crayon':395,'mustered':396,'waves':397,'suit':398,'bluejay':399,'pieceoffurniture':400,'typeofinstrument':401,'gout':402,
          'box':403,'mouse':404,'rat':405,'whale':406,'week':407,'minute':408,'appointment':409,'schedule':410,'leaf':411,'news':412,'sulfur':413,
          'wardrobe':414,'halibit':415,'polyester':416,'choir':417,'octopus':418,'snake':419,'jupiter':420,'afruit':421,'ajoborprofession':422,
          'aweathercondition':423,'abird':424,'aflower':425,'apieceoffurniture':426,'afish':427,'ageographicalformation':428,'amusicalinstrument':429,
          'nan':430,'nen':431,'ton':432, 'shelf':433,'apply':434,'glove':435,'cabarnet':436,'floud':437,'happy':438,'guinea':439,'torso':440,'parsley':441,
          'ran':442,'valor':443,'honor':444,'courage':445,'homework':446,'work':447,'money':448,'envelope':449,'life':450,'rail':451,'office':452,'oregon':453,
          'carnage':454, 'birch':455, 'pine':456}
            # changed 'haliberd':313 as intrusion to correction
df.number = [translator[item] for item in df.number] 

df.to_csv('2020-12-17_Similarity_Exp3_AllWords_Clean1.csv', index=False)



In [53]:
#-------------------------------------DESCRIPTION-----------------------------------------
# This is for recall 3, adding the dlm files


#-------------------------------------Input----------------------------------------------
studyList1 = ['crow','eagle','finch','parrot','pigeon','cardinal','nitrogen','helium','chlorine','calcium','oxygen','mercury','trout',\
              'flounder','halibut','guppy','piranha','shark','carnation','orchid','pansy','daffodil','violet','rose','nectarine','pear',\
              'apple','grape','raspberry','cherry','tuba','drum','trumpet','saxophone','piano','organ','tree','ocean','canyon','mountain',\
              'plateau','cave','cinnamon','mustard','basil','oregano','paprika','cotton','wool', 'velvet','linen','leather','flyer',\
              'newspaper','comic','essay','pamphlet','tornado','hail','blizzard','rain','drought','jacket','dress','blouse','underwear',\
              'shoes','lamp','desk','bookcase','dresser','chair','banker','dentist','secretary','engineer','nurse','hour', 'arms', 'green',\
              'uncle']
    
studyList2 = ['crow','eagle','finch','parrot','pigeon','nitrogen','helium','chlorine','calcium','oxygen','trout','flounder','halibut',\
              'guppy','piranha','carnation','orchid','pansy','daffodil','violet','nectarine','pear','apple','grape','raspberry','tuba',\
              'drum','trumpet','saxophone','piano','tree','ocean','canyon','mountain','plateau','cinnamon','mustard','basil','oregano',\
              'paprika','salt','cotton','wool','velvet','linen','leather','denim','flyer','newspaper','comic','essay','pamphlet','book',\
              'tornado','hail','blizzard','rain','drought','lightning','jacket','dress','blouse','underwear','shoes','shirt','lamp','desk',\
              'bookcase','dresser','chair','recliner','banker','dentist','secretary','engineer','nurse','doctor','hour', 'arms', 'green', 'uncle']    
    
#-------------------------------------Create Data Frame----------------------------------------------

df_dlm = pd.DataFrame(columns=['SN', 'biased','order', 'collaboration', 'collaborator', 'word', 'correct', 'buffer', 'phase'])

#df_dlm = pd.read_table('retrieve_27_0_2.dlm', sep='\t', names=["time-del", "time", "word-del", "word"])

first = 1
last = 93
skip = [13, 14, 20, 21, 25, 27, 44, 45, 63, 65] 
# according to the "non-usable"-folder these participants were allocated to wrong ["because there were some participant condition \
# or subject numbers that were entered incorrectly by the RA. This then made the data unusable because the scripts use these values to make sure that participants in the \
# collaboration condition are assigned properly."]
# 1_0_3 is missing pkl file, only has dlm



# A: Any subjects we should exclude?
for subjectNumber in range(first, last+1):

    if subjectNumber in skip:
        continue

       
    # A: Import all words from all participants for recall phase 3
    try:
        exp_data = pd.read_table('retrieve_'+str(subjectNumber)+'_0_3.dlm', sep='\t', names=["time-del", "time", "word-del", "word"])
        exp_data = exp_data.drop(['time-del', 'time', 'word-del'], axis=1)
        exp_data = exp_data.drop([0])
        words = exp_data['word'].to_list()
        #type(word)
        
        #word2 = " ".join([str(i) for i in word])

        #print(word2)

        for i in words:
            word = str(i)
            biased = 0
            order = 0
            collaboration = 0
            phase = 3
            correct = 0
            buffer = 0
            
            # From experiment 1
            word = word.replace(' ', '')
            word = word.replace('\\', '')
            word = word.lower()
            word = re.sub(r'\bcanyoan\b', 'canyon', word)
            word = re.sub(r'\bcanyons\b', 'canyon', word)
            word = re.sub(r'\bcherries\b', 'cherry', word)
            word = re.sub(r'\bclorine\b', 'chlorine', word)
            word = re.sub(r'\bcinammon\b', 'cinnamon', word)
            word = re.sub(r'\bcinnaman\b', 'cinnamon', word)
            word = re.sub(r'\bcinnimon\b', 'cinnamon', word)
            word = re.sub(r'\bcinomman\b', 'cinnamon', word)
            word = re.sub(r'\bcotten\b', 'cotton', word)
            word = re.sub(r'\bdaffidil\b', 'daffodil', word)
            word = re.sub(r'\bdafodill\b', 'daffodil', word)
            word = re.sub(r'\bdaphodile\b', 'daffodil', word)
            word = re.sub(r'\bdoffodil\b', 'daffodil', word)
            word = re.sub(r'\bdenin\b', 'denim', word)
            word = re.sub(r'\bdrums\b', 'drum', word)
            word = re.sub(r'\benginner\b', 'engineer', word)
            word = re.sub(r'\bflunder\b', 'flounder', word)
            word = re.sub(r'\bflier\b', 'flyer', word)
            word = re.sub(r'\bflyers\b', 'flyer', word)
            word = re.sub(r'\bgrapes\b', 'grape', word)
            word = re.sub(r'\bgupppy\b', 'guppy', word)
            word = re.sub(r'\bhailbut\b', 'halibut', word)
            word = re.sub(r'\bhalibet\b', 'halibut', word)
            word = re.sub(r'\blinens\b', 'linen', word)
            word = re.sub(r'\blinnen\b', 'linen', word)
            word = re.sub(r'\bercury\b', 'mercury', word)
            word = re.sub(r'\bmountains\b', 'mountain', word)
            word = re.sub(r'\bmoutain\b', 'mountain', word)
            word = re.sub(r'\bnecratine\b', 'nectarine', word)
            word = re.sub(r'\bnecterine\b', 'nectarine', word)
            word = re.sub(r'\borchad\b', 'orchid', word)
            word = re.sub(r'\baregano\b', 'oregano', word)
            word = re.sub(r'\borageno\b', 'oregano', word)
            word = re.sub(r'\bregano\b', 'oregano', word)
            word = re.sub(r'\bamphlet\b', 'pamphlet', word)
            word = re.sub(r'\bpamplet\b', 'pamphlet', word)
            word = re.sub(r'\bpanplet\b', 'pamphlet', word)
            word = re.sub(r'\bphamlet\b', 'pamphlet', word)
            word = re.sub(r'\bansy\b', 'pansy', word)
            word = re.sub(r'\bpaparika\b', 'paprika', word)
            word = re.sub(r'\bpapprika\b', 'paprika', word)
            word = re.sub(r'\bpaprica\b', 'paprika', word)
            word = re.sub(r'\bpeprica\b', 'paprika', word)
            word = re.sub(r'\bpegion\b', 'pigeon', word)
            word = re.sub(r'\bpidgeon\b', 'pigeon', word)
            word = re.sub(r'\bpidgeons\b', 'pigeon', word)
            word = re.sub(r'\bpidgieon\b', 'pigeon', word)
            word = re.sub(r'\bparanha\b', 'piranha', word)
            word = re.sub(r'\bpirahana\b', 'piranha', word)
            word = re.sub(r'\bpirahna\b', 'piranha', word)
            word = re.sub(r'\bpirahnna\b', 'piranha', word)
            word = re.sub(r'\bpirannah\b', 'piranha', word)
            word = re.sub(r'\bpirannha\b', 'piranha', word)
            word = re.sub(r'\bpirhana\b', 'piranha', word)
            word = re.sub(r'\bpirranha\b', 'piranha', word)
            word = re.sub(r'\bplateu\b', 'plateau', word)
            word = re.sub(r'\bplatue\b', 'plateau', word)
            word = re.sub(r'\brasberries\b', 'raspberry', word)
            word = re.sub(r'\brasberry\b', 'raspberry', word)
            word = re.sub(r'\braspberries\b', 'raspberry', word)
            word = re.sub(r'\brasphberry\b', 'raspberry', word)
            word = re.sub(r'\bsaxaphone\b', 'saxophone', word)
            word = re.sub(r'\bsaxephone\b', 'saxophone', word)
            word = re.sub(r'\bsaxiphone\b', 'saxophone', word)
            word = re.sub(r'\bsecratary\b', 'secretary', word)
            word = re.sub(r'\bsecrertary\b', 'secretary', word)
            word = re.sub(r'\bshoe\b', 'shoes', word)
            word = re.sub(r'\btrumphet\b', 'trumpet', word)
            word = re.sub(r'\btumpet\b', 'trumpet', word)
            word = re.sub(r'\bnderwear\b', 'underwear', word)
            word = re.sub(r'\budnerwear\b', 'underwear', word)
            word = re.sub(r'\bbookshelf\b', 'bookcase', word)
            word = re.sub(r'\bchari\b', 'chair', word)
            word = re.sub(r'\blightening\b', 'lightning', word)
            word = re.sub(r'\blightenings\b', 'lightning', word)
            word = re.sub(r'\blighting\b', 'lightning', word)
            word = re.sub(r'\blightning\b', 'lightning', word)
            word = re.sub(r'\blightnening\b', 'lightning', word)
            word = re.sub(r'\btress\b', 'tree', word)
            word = re.sub(r'\bvoilet\b', 'violet', word)
            word = re.sub(r'\bblizarrd\b', 'blizzard', word)
            word = re.sub(r'\bvelvey\b', 'velvet', word)
            word = re.sub(r'\bdafodil\b', 'daffodil', word)            
            word = re.sub(r'\bhalibit\b', 'halibut', word)
            word = re.sub(r'\bbooks\b', 'book', word)
            word = re.sub(r'\bhurricaine\b', 'hurricane', word)          
            
            #From experiment 2
            #word = re.sub(r'\bado\b', '', word)
            word = re.sub(r'\bdandalione\b', 'dandelion', word)
            word = re.sub(r'\bdandeline\b', 'dandelion', word)
            word = re.sub(r'\bdandelion\b', 'dandelion', word)
            word = re.sub(r'\bdandelion\b', 'dandelion', word)
            word = re.sub(r'\bdandalione\b', 'dandelion', word)
            word = re.sub(r'\bdandilion\b', 'dandelion', word)
            #word = re.sub(r'\bflinch\b', 'finch', word)
            word = re.sub(r'\bgano\b', 'oregano', word)
            word = re.sub(r'\bhalbait\b', 'halibut', word)
            word = re.sub(r'\bhale\b', 'hail', word)
            word = re.sub(r'\blenin\b', 'linen', word)
            word = re.sub(r'\blvet\b', 'velvet', word)
            word = re.sub(r'\bntain\b', 'mountain', word)
            word = re.sub(r'\bnurde\b', 'nurse', word)
            word = re.sub(r'\bado\b', '', word)
            word = re.sub(r'\bouse\b', 'blouse', word)
            word = re.sub(r'\bpant\b', 'pants', word)
            word = re.sub(r'\bparhna\b', 'piranha', word)
            word = re.sub(r'\bpeat\b', 'pear', word)
            word = re.sub(r'\bpregeno\b', 'oregano', word)
            word = re.sub(r'\bsser\b', 'dresser', word)
            word = re.sub(r'\btton\b', 'cotton', word)
            word = re.sub(r'\bbrasil\b', 'basil', word)
            word = re.sub(r'\btangarine\b', 'tangerine', word)
            word = re.sub(r'\bclothes\b', 'cloth', word)
            word = re.sub(r'\bclothing\b', 'cloth', word)
            word = re.sub(r'\bclouds\b', 'cloud', word)
            word = re.sub(r'\bbooks\b', 'book', word)
            word = re.sub(r'\blilly\b', 'lily', word)
            word = re.sub(r'\boragano\b', 'oregano', word)
            word = re.sub(r'\btronado\b', 'tornado', word)
            word = re.sub(r'\bapll\b', 'apple', word)
            word = re.sub(r'\bcomics\b', 'comic', word)
            word = re.sub(r'\bfodil\b', 'daffodil', word)
            word = re.sub(r'\bhabut\b', 'halibut', word)
            word = re.sub(r'\blighnting\b', 'lightning', word)
            word = re.sub(r'\blightining\b', 'lightning', word)
            word = re.sub(r'\blightning\b', 'lightning', word)
            word = re.sub(r'\borgans\b', 'organ', word)
            word = re.sub(r'\bpiogen\b', 'pigeon', word)
            word = re.sub(r'\bpipraki\b', 'paprika', word)
            word = re.sub(r'\bpurrana\b', 'piranha', word)
            word = re.sub(r'\brought\b', 'drought', word)
            word = re.sub(r'\btrumphent\b', 'trumpet', word)
            word = re.sub(r'\btrumpht\b', 'trumpet', word)
            word = re.sub(r'\btrumpt\b', 'trumpet', word)
            word = re.sub(r'\bbookself\b', 'bookcase', word)
            word = re.sub(r'\bbookshlelf\b', 'bookcase', word)
            word = re.sub(r'\bboookcase\b', 'bookcase', word)
            word = re.sub(r'\bchloride\b', 'chlorine', word)
            word = re.sub(r'\bchorline\b', 'chlorine', word)
            word = re.sub(r'\bcholrine\b', 'chlorine', word)
            word = re.sub(r'\bchroine\b', 'chlorine', word)
            word = re.sub(r'\bcinamon\b', 'cinnamon', word)
            word = re.sub(r'\bcinnimin\b', 'cinnamon', word)
            word = re.sub(r'\bcinnoman\b', 'cinnamon', word)
            word = re.sub(r'\bcinnomin\b', 'cinnamon', word)
            word = re.sub(r'\bcoten\b', 'cotton', word)
            word = re.sub(r'\bhalibat\b', 'halibut', word)
            word = re.sub(r'\bhalibiut\b', 'halibut', word)
            word = re.sub(r'\bhallibut\b', 'halibut', word)
            word = re.sub(r'\bhalubit\b', 'halibut', word)
            word = re.sub(r'\bheliu,\b', 'helium', word)
            word = re.sub(r'\belium\b', 'helium', word)
            word = re.sub(r'\bheliu\b', 'helium', word)
            word = re.sub(r'\bhours\b', 'hour', word)
            word = re.sub(r'\bprika\b', 'paprika', word)
            word = re.sub(r'\broses\b', 'rose', word)
            word = re.sub(r'\baregeno\b', 'oregano', word)
            word = re.sub(r'\bblizzzard\b', 'blizzard', word)
            word = re.sub(r'\bblousse\b', 'blouse', word)
            word = re.sub(r'\bdafadil\b', 'daffodil', word)
            word = re.sub(r'\bdaffadil\b', 'daffodil', word)
            word = re.sub(r'\bdaffildil\b', 'daffodil', word)
            word = re.sub(r'\bdaffodils\b', 'daffodil', word)
            word = re.sub(r'\bdaffoldil\b', 'daffodil', word)
            word = re.sub(r'\bdafidil\b', 'daffodil', word)
            word = re.sub(r'\bdafidill\b', 'daffodil', word)
            word = re.sub(r'\bdafidill\b', 'daffodil', word)
            word = re.sub(r'\bdaphadil\b', 'daffodil', word)
            word = re.sub(r'\bdaphadile\b', 'daffodil', word)
            word = re.sub(r'\bdaphadile\b', 'daffodil', word)
            word = re.sub(r'\bdaphadill\b', 'daffodil', word)
            word = re.sub(r'\bdaphidile\b', 'daffodil', word)
            word = re.sub(r'\bdaphodil\b', 'daffodil', word)
            word = re.sub(r'\bdaphodil\b', 'daffodil', word)
            word = re.sub(r'\bdapphodil\b', 'daffodil', word)
            word = re.sub(r'\bdentis\b', 'dentist', word)
            word = re.sub(r'\bdreser\b', 'dresser', word)
            word = re.sub(r'\bdresses\b', 'dress', word)
            word = re.sub(r'\bdrout\b', 'drought', word)
            word = re.sub(r'\bdum\b', 'drum', word)
            word = re.sub(r'\bengeneer\b', 'engineer', word)
            word = re.sub(r'\benigineer\b', 'engineer', word)
            word = re.sub(r'\bhelim\b', 'helium', word)
            word = re.sub(r'\blinene\b', 'linen', word)
            word = re.sub(r'\blamo\b', 'lamp', word)
            word = re.sub(r'\bmountian\b', 'mountain', word)
            word = re.sub(r'\bnecatrina\b', 'nectarine', word)
            word = re.sub(r'\bnecatrine\b', 'nectarine', word)
            word = re.sub(r'\bnectarine\b', 'nectarine', word)
            word = re.sub(r'\bnectatine\b', 'nectarine', word)
            word = re.sub(r'\bnectorin\b', 'nectarine', word)
            word = re.sub(r'\bnectorine\b', 'nectarine', word)
            word = re.sub(r'\bnectrine\b', 'nectarine', word)
            word = re.sub(r'\bnecturine\b', 'nectarine', word)
            word = re.sub(r'\bnictarine\b', 'nectarine', word)
            word = re.sub(r'\bnigrogen\b', 'nitrogen', word)
            word = re.sub(r'\bnitrogren\b', 'nitrogen', word)
            word = re.sub(r'\bocygen\b', 'oxygen', word)
            word = re.sub(r'\borchird\b', 'orchid', word)
            word = re.sub(r'\boreganno\b', 'oregano', word)
            word = re.sub(r'\boregeno\b', 'oregano', word)
            word = re.sub(r'\boregno\b', 'oregano', word)
            word = re.sub(r'\boregno\b', 'oregano', word)
            word = re.sub(r'\boregono\b', 'oregano', word)
            word = re.sub(r'\boxogen\b', 'oxygen', word)
            word = re.sub(r'\bpapricka\b', 'paprika', word)
            word = re.sub(r'\bparot\b', 'parrot', word)
            word = re.sub(r'\bpeperika\b', 'paprika', word)
            word = re.sub(r'\bphamplet\b', 'pamphlet', word)
            word = re.sub(r'\bphirana\b', 'piranha', word)
            word = re.sub(r'\bpirana\b', 'piranha', word)
            word = re.sub(r'\bpiranah\b', 'piranha', word)
            word = re.sub(r'\bpiranna\b', 'piranha', word)
            word = re.sub(r'\bpirhanna\b', 'piranha', word)
            word = re.sub(r'\bpirnaha\b', 'piranha', word)
            word = re.sub(r'\bpirrhana\b', 'piranha', word)
            word = re.sub(r'\bplaeatu\b', 'plateau', word)
            word = re.sub(r'\bplataue\b', 'plateau', word)
            word = re.sub(r'\bplateua\b', 'plateau', word)
            word = re.sub(r'\bplateux\b', 'plateau', word)
            word = re.sub(r'\bplaute\b', 'plateau', word)
            word = re.sub(r'\bpleatu\b', 'plateau', word)
            word = re.sub(r'\bpleteau\b', 'plateau', word)
            word = re.sub(r'\brasperry\b', 'raspberry', word)
            word = re.sub(r'\brumpet\b', 'trumpet', word)
            word = re.sub(r'\bsactatary\b', 'secretary', word)
            word = re.sub(r'\bsaxphone\b', 'saxophone', word)
            word = re.sub(r'\btorando\b', 'tornado', word)
            word = re.sub(r'\btornadeo\b', 'tornado', word)
            word = re.sub(r'\btornadoe\b', 'tornado', word)
            word = re.sub(r'\btornadoe\b', 'tornado', word)
            word = re.sub(r'\btornador\b', 'tornado', word)
            word = re.sub(r'\btornando\b', 'tornado', word)
            word = re.sub(r'\btornao\b', 'tornado', word)
            word = re.sub(r'\btrees\b', 'tree', word)
            word = re.sub(r'\btrmpet\b', 'trumpet', word)
            word = re.sub(r'\btrmpet\b', 'trumpet', word)
            word = re.sub(r'\btrupmet\b', 'trumpet', word)
            word = re.sub(r'\bundewear\b', 'underwear', word)
            word = re.sub(r'\bvelvot\b', 'velvet', word)
            word = re.sub(r'\bvevelt\b', 'velvet', word)
            word = re.sub(r'\bvolvet\b', 'velvet', word)
            word = re.sub(r'\bxophone\b', 'saxophone', word)
            word = re.sub(r'\bxygen\b', 'oxygen', word)
            word = re.sub(r'\bdafodil\b', 'daffodil', word)
            word = re.sub(r'\bhalibit\b', 'halibut', word)

            # From experiment 3
            word = re.sub(r'\beather\b', 'leather', word)
            word = re.sub(r'\bappl\b', 'apple', word)
            word = re.sub(r'\barangano\b', 'oregano', word)
            word = re.sub(r'\barm\b', 'arms', word)
            word = re.sub(r'\barregano\b', 'oregano', word)
            word = re.sub(r'\baxophpne\b', 'saxophone', word)
            word = re.sub(r'\bbizzard\b', 'blizzard', word)
            word = re.sub(r'\bblizard\b', 'blizzard', word)
            #word = re.sub(r'\bBlouse\b', 'blouse', word)
            word = re.sub(r'\bblouser\b', 'blouse', word)
            word = re.sub(r'\bbookshelf\b', 'bookcase', word)
            word = re.sub(r'\bboooks\b', 'book', word)
            #word = re.sub(r'\bCardinal\b', 'cardinal', word)
            word = re.sub(r'\bcartigan\b', 'cardigan', word)
            word = re.sub(r'\bchloride\b', 'chlorine', word)
            #word = re.sub(r'\bComic\b', 'comic', word)
            word = re.sub(r'\bcutton\b', 'cotton', word)
            word = re.sub(r'\bdaffadil\b', 'daffodil', word)
            word = re.sub(r'\bdaffildol\b', 'daffodil', word)
            word = re.sub(r'\bdaffodile\b', 'daffodil', word)
            word = re.sub(r'\bdaffodils\b', 'daffodil', word)
            word = re.sub(r'\bdafidil\b', 'daffodil', word)
            word = re.sub(r'\bdafiodil\b', 'daffodil', word)
            word = re.sub(r'\bdandelion\b', 'dandelion', word)
            word = re.sub(r'\bdanelion\b', 'dandelion', word)
            word = re.sub(r'\bdaphodil\b', 'daffodil', word)
            word = re.sub(r'\bdaphodyl\b', 'daffodil', word)
            word = re.sub(r'\bdoctore\b', 'doctor', word)
            #word = re.sub(r'\bDress\b', 'dress', word)
            word = re.sub(r'\bdresses\b', 'dress', word)
            word = re.sub(r'\bdressor\b', 'dresser', word)
            word = re.sub(r'\bdrough\b', 'drought', word)
            word = re.sub(r'\bdrums\b', 'drum', word)
            #word = re.sub(r'\bEagle\b', 'eagle', word)
            word = re.sub(r'\begale\b', 'eagle', word)
            word = re.sub(r'\beingineer\b', 'engineer', word)
            word = re.sub(r'\bfinc\b', 'finch', word)
            word = re.sub(r'\bFinch\b', 'finch', word)
            word = re.sub(r'\bgraoe\b', 'grape', word)
            word = re.sub(r'\bgrap\b', 'grape', word)
            #word = re.sub(r'\bGrape\b', 'grape', word)
            word = re.sub(r'\bgrapes\b', 'grape', word)
            #word = re.sub(r'\bGreen\b', 'green', word)
            word = re.sub(r'\bgruppy\b', 'guppy', word)
            word = re.sub(r'\bgubby\b', 'guppy', word)
            #word = re.sub(r'\bGuppy\b', 'guppy', word)
            word = re.sub(r'\bhailbut\b', 'halibut', word)
            word = re.sub(r'\bhalbit\b', 'halibut', word)
            word = re.sub(r'\bhalibat\b', 'halibut', word)
            word = re.sub(r'\bhelium\'\b', 'helium', word)
            word = re.sub(r'\bheluim\b', 'helium', word)
            word = re.sub(r'\bhitrogen\b', 'nitrogen', word)
            #word = re.sub(r'\bHour\b', 'hour', word)
            word = re.sub(r'\binen\b', 'linen', word)
            word = re.sub(r'\bjean\b', 'jeans', word)
            word = re.sub(r'\bJesus\b', 'jesus', word)
            word = re.sub(r'\bjupitor\b', 'jupiter', word)
            word = re.sub(r'\blether\b', 'leather', word)
            word = re.sub(r'\blightining\b', 'lightning', word)
            word = re.sub(r'\blightning\b', 'lightning', word)
            word = re.sub(r'\blinnen\b', 'linen', word)
            word = re.sub(r'\bmountains\b', 'mountain', word)
            word = re.sub(r'\bmpet\b', 'trumpet', word)
            word = re.sub(r'\bnectorine\b', 'nectarine', word)
            word = re.sub(r'\bnectraine\b', 'nectarine', word)
            word = re.sub(r'\bnitrgoen\b', 'nitrogen', word)
            #word = re.sub(r'\bNitrogen\b', 'nitrogen', word)
            word = re.sub(r'\bnitrogen\'\b', 'nitrogen', word)
            word = re.sub(r'\bOcean\b', 'ocean', word)
            word = re.sub(r'\bochid\b', 'orchid', word)
            word = re.sub(r'\boctupus\b', 'octopus', word)
            word = re.sub(r'\borchids\b', 'orchid', word)
            word = re.sub(r'\borchird\b', 'orchid', word)
            word = re.sub(r'\borcid\b', 'orchid', word)
            #word = re.sub(r'\boregon\b', 'oregano', word)
            word = re.sub(r'\borgano\b', 'oregano', word)
            word = re.sub(r'\bOxygen\b', 'oxygen', word)
            word = re.sub(r'\bpaino\b', 'piano', word)
            word = re.sub(r'\bpamphelt\b', 'pamphlet', word)
            word = re.sub(r'\bpamplet\b', 'pamphlet', word)
            word = re.sub(r'\bpaprikka\b', 'paprika', word)
            word = re.sub(r'\bparahana\b', 'piranha', word)
            word = re.sub(r'\bparanha\b', 'piranha', word)
            word = re.sub(r'\bpasny\b', 'pansy', word)
            word = re.sub(r'\bpianp\b', 'piano', word)
            word = re.sub(r'\bpidgeon\b', 'pigeon', word)
            word = re.sub(r'\bpiegon\b', 'pigeon', word)
            word = re.sub(r'\bpihrana\b', 'piranha', word)
            word = re.sub(r'\bpirahna\b', 'piranha', word)
            word = re.sub(r'\bpirahnna\b', 'piranha', word)
            word = re.sub(r'\bpiranah\b', 'piranha', word)
            word = re.sub(r'\bpirannhea\b', 'piranha', word)
            word = re.sub(r'\bpirhana\b', 'piranha', word)
            word = re.sub(r'\bpirhanna\b', 'piranha', word)
            word = re.sub(r'\bpirranha\b', 'piranha', word)
            word = re.sub(r'\bpirranhea\b', 'piranha', word)
            word = re.sub(r'\bpirrhana\b', 'piranha', word)
            word = re.sub(r'\bplataeu\b', 'plateau', word)
            word = re.sub(r'\bplateu\b', 'plateau', word)
            word = re.sub(r'\bpleasent\b', 'pleasant', word)
            word = re.sub(r'\bpleatou\b', 'plateau', word)
            word = re.sub(r'\bpoxygen\b', 'oxygen', word)
            word = re.sub(r'\bprihana\b', 'piranha', word)
            word = re.sub(r'\brasberry\b', 'raspberry', word)
            word = re.sub(r'\braseberry\b', 'raspberry', word)
            word = re.sub(r'\brobbin\b', 'robin', word)
            word = re.sub(r'\bsaxaphone\b', 'saxophone', word)
            word = re.sub(r'\bsaxapphone\b', 'saxophone', word)
            word = re.sub(r'\bsaxohpone\b', 'saxophone', word)
            word = re.sub(r'\bSaxophone\b', 'saxophone', word)
            word = re.sub(r'\bsaxopon\b', 'saxophone', word)
            word = re.sub(r'\bsaxphone\b', 'saxophone', word)
            word = re.sub(r'\bsecatary\b', 'secretary', word)
            word = re.sub(r'\bSecretary\b', 'secretary', word)
            word = re.sub(r'\bshoe\b', 'shoes', word)
            word = re.sub(r'\bsulfure\b', 'sulfur', word)
            word = re.sub(r'\btangerene\b', 'tangerine', word)
            word = re.sub(r'\bTangerine\b', 'tangerine', word)
            word = re.sub(r'\btornadeo\b', 'tornado', word)
            #word = re.sub(r'\bTree\b', 'tree', word)
            word = re.sub(r'\btrees\b', 'tree', word)
            word = re.sub(r'\btrought\b', 'drought', word)
            word = re.sub(r'\btrumbone\b', 'trombone', word)
            word = re.sub(r'\bTrumpet\b', 'trumpet', word)
            word = re.sub(r'\btrumphet\b', 'trumpet', word)
            word = re.sub(r'\btumeric\b', 'turmeric', word)
            word = re.sub(r'\bumpet\b', 'trumpet', word)
            word = re.sub(r'\bVelvet\b', 'velvet', word)
            word = re.sub(r'\bvelvet3\b', 'velvet', word)
            word = re.sub(r'\bvelviot\b', 'velvet', word)
            word = re.sub(r'\bViolet\b', 'violet', word)
            word = re.sub(r'\bwoman\b', 'women', word)
            word = re.sub(r'\bwool\\b', 'wool', word)
            word = re.sub(r'\bxophone\b', 'saxophone', word)
            word = re.sub(r'\bcinammon\b', 'cinnamon', word)
            word = re.sub(r'\bcinamon\b', 'cinnamon', word)
            word = re.sub(r'\bcinemmon\b', 'cinnamon', word)
            word = re.sub(r'\bcinnamone\b', 'cinnamon', word)
            word = re.sub(r'\bcinnemon\b', 'cinnamon', word)
            word = re.sub(r'\bchior\b', 'choir', word)
            word = re.sub(r'\bhaliberd\b', 'halibut', word)
            
            #Buffer
            word = re.sub(r'\barm\b', 'arms', word)
            word = re.sub(r'\bgrreen\b', 'green', word)

            #Experiment 3, recall 2
            word = re.sub(r'\bdree\b', 'tree', word)
            word = re.sub(r'\bpiranhea\b', 'piranha', word)
            word = re.sub(r'\boxigen\b', 'oxygen', word)
            word = re.sub(r'\bogran\b', 'organ', word)
            word = re.sub(r'\bchaor\b', 'chair', word)
            word = re.sub(r'\brain\'\b', 'rain', word)
            word = re.sub(r'\bblosue\b', 'blouse', word)
            word = re.sub(r'\boxegen\b', 'oxygen', word)
            word = re.sub(r'\bparrot\\b', 'parrot', word)
            word = re.sub(r'\benigneer\b', 'engineer', word)
            word = re.sub(r'\bhalabit\b', 'halibut', word)
            word = re.sub(r'\bsecetary\b', 'secretary', word)
            word = re.sub(r'\bdemtist\b', 'dentist', word)
            word = re.sub(r'\bdenist\b', 'dentist', word)



            
            # Bias comes from file name
            if os.path.exists('new_encoding_' + str(subjectNumber) +'_0.pkl'):
                biased = 0
            else:
                biased = 1

            # Order comes from SN and impacts correct study list reference
            if (subjectNumber%2 == 0) and (biased == 1):
                order = 1
                if word in studyList2:
                    correct = 1
                else:
                    correct = 0
            elif (subjectNumber%2 == 1) and (biased == 1):
                order = 0     
                if word in studyList1:
                    correct = 1
                else:
                    correct = 0
            if (subjectNumber%2 == 0) and (biased == 0):
                order = 0
                if word in studyList1:
                    correct = 1
                else:
                    correct = 0
            elif (subjectNumber%2 == 1) and (biased == 0):
                order = 1     
                if word in studyList2:
                    correct = 1
                else:
                    correct = 0

            # Indicate buffer
            buffer = 0

            if i in ['arms', 'green', 'uncle', 'hour']:
                buffer = 1
            else:
                buffer = 0

            # Collaboration comes from retrieval_1_2 files (1_2 collaborated, 0_2 did not)
            if os.path.exists('new_retrieve_' + str(subjectNumber) +'_0_2.pkl'):
                collaboration = 0
            else:
                collaboration = 1

            #Problem: This is only the words from the 3. recall. And the column with who they collaborated
            collaborator = 0
            if os.path.exists('new_retrieve_' + str(subjectNumber) + str(subjectNumber+1) + '_1_2.pkl'):
                collaborator = str(subjectNumber+1)
            elif os.path.exists('new_retrieve_' + str(subjectNumber-1) + str(subjectNumber) + '_1_2.pkl'):
                collaborator = str(subjectNumber-1)
            else:
                collaborator = 0         
            
            trialDict = {'SN':subjectNumber, 'biased':biased, 'order':order, 'phase':phase, 'collaboration':collaboration, 'collaborator':collaborator, 
                     'word':word, 'correct':correct, 'buffer':buffer}   
            df_dlm = df_dlm.append(trialDict, ignore_index=True)
            
    except EOFError:
        pass

df_dlm = df_dlm.drop_duplicates(['SN','word', 'phase']) #Delete duplicates
df_dlm = df_dlm.replace('', np.nan)
df_dlm = df_dlm.dropna(axis=0, how='any', subset=['word'])



df_dlm.to_csv('2020-12-17_Similarity_Exp3_AllWords_Clean2.csv', index=False)
df_dlm['number'] = df_dlm['word']
# Includes misspellings and buffer words, same as in experiment 1 plus new intrusions and buffer

translator = {'crow':1,'eagle':2,'finch':3,'parrot':4,'pigeon':5,'cardinal':6,'nitrogen':7,'helium':8,'chlorine':9,'calcium':10,'oxygen':11,
          'mercury':12,'trout':13,'flounder':14,'halibut':15,'guppy':16,'piranha':17,'shark':18,'carnation':19,'orchid':20,'pansy':21,'daffodil':22,'violet':23,'rose':24,
          'nectarine':25,'pear':26,'apple':27,'grape':28,'raspberry':29,'cherry':30,'tuba':31,'drum':32,'trumpet':33,'saxophone':34,'piano':35,'organ':36,'tree':37,'ocean':38,
          'canyon':39,'mountain':40,'plateau':41,'cave':42,'cinnamon':43,'mustard':44,'basil':45,'oregano':46,'paprika':47,'salt':48,'cotton':49,'wool':50,'velvet':51,
          'linen':52,'leather':53,'denim':54,'flyer':55,'newspaper':56,'comic':57,'essay':58,'pamphlet':59,'book':60,'tornado':61,'hail':62,'blizzard':63,'rain':64,'drought':65,
          'lightning':66,'jacket':67,'dress':68,'blouse':69,'underwear':70,'shoes':71,'shirt':72,'lamp':73,'desk':74,'bookcase':75,'dresser':76,'chair':77,'recliner':78,
          'banker':79,'dentist':80,'secretary':81,'engineer':82,'nurse':83,'doctor':84,'a':85,'architect':86,'avidafil?':87,'ballet':88,'baseball':89,'bed':90,
          'bird':91,'blueberry':92,'cage':93,'canary':94,'carbon':95,'chemical':96,'clarinet':97,'coat':98,'concrete':99,'daisy':100,'dog':101,'experimenter':102,
          'fish':103,'flamenco':104,'flower':105,'flute':106,'fruit':107,'give':108,'grass':109,'hydrogen':110,'instrument':111,'lawyer':112,'library':113,'lily':114,
          'lithium':115,'lung':116,'melon':117,'music':118,'nickel':119,'nylon':120,'pants':121,'parsley':122,'pepper':123,'project':124,'red':125,'salmon':126,
          'satin':127,'sea':128,'sky':129,'slave':130,'sofa':131,'storm':132,'table':133,'tango':134,'teacher':135,'the':136,'thunder':137,'thunderstorm':138,'tissue':139,
          'to':140,'trombone':141,'trousers':142,'tulip':143,'tuna':144,'willing':145,'address':146,'cavern':147,'homework':148,'magazine':149,'are':150,
          'dove':151,'sodium':152,'hurricaine':153,'hurricane':154,'violent':155,'violin':156,'green':157,'uncle':158,'hour':159,'arms':160,'orchard':161,
          'paper':162,'sunflower':163,'dandelion':164,'snow':165,'green':166,'professor':167,'socks':168,'purse':169,'ar':170,'e':171,'mium':172,
          't':173,'rk':174, 'hid':175,'breakfast':176,'uncle':177,'hat':178,'hale':179,'speacker':180,'dance':181,'baker':182,'couch':183,'tube':184,
          'valley':185,'tangerine':186,'guitar':187,'orange':188,'skirt':189,'robin':190,'peach':191,'lavender':192,'thyme':193,'drawer':194,
          'cloud':195,'word':196,'paper':197,'carnelian':198,'article':199,'daydream':200,'tectonicplate':201,'blossom':202,'cloth':203,'tsunami':204,
          'legs':205,'keyboard':206,'fiction':207,'water':208,'earth':209,'light':210,'hair':211,'rocks':212,'moon':213,'weather':214,'wind':215,
          'door':216,'letter':217,'time':218,'yellow':219,'river':220,'lake':221,'typhoon':222,'gloves':223,'magnesium':224,'ear':225,'camp':226,
          'seed':227,'phone':228,'fresh':229,'lenin':230,'in':231,'halogen':232,'wager':233,'research':234,'positive':235,'success':236,'pelican':237,
          'computer':238,'plum':239,'although':240,'brazil':241,'fly':242,'brochure':243,'flood':244,'perch':245,'year':246,'lilac':247,'silk':248,
          'disaster':249,'bloom':250,'politician':251,'onion':252,'jazz':253,'peat':254,'flinch':255,'weatherphenomenon':256,'natural':257,
          'flavoring':258,'food':259,'furniture':260,'formation':261,'papya':262,'snowstorm':263,'basketball':264, 'jeans':265,'park':266,'cactus':267,
          'window':268,'bankteller':269,'keys':270,'raven':271,'purple':272,'cat':273,'candy':274,'mom':275,'dad':276,'brother':277,'sister':278,'chicken':279,'bread':280,
          'science':281,'english':282,'psy':283,'biology':284,'canopy':285,'run':286,'fast':287,'school':288,'waitress':289,'cow':290,
          'turmeric':291,'cinnamone':292,'perple':293,'fear':294,'male':295,'pleasant':296,'nicotine':297,'sparrow':298,'6312414461hmu':299,
          'cardigan':300,'bank':301,'clementine':302,'house':303,'squash':304,'basket':305,'cilantro':306,'owl':307,'cinemmon':308,'saffron':309,
          'erect':310,'exit':311,'egress':312,'laptop':314,'clemintine':315,'home':316,'ught':317,'blackberry':318, 'seagull':319, 'bear':320,
          'bass':321,'god':322,'love':323,'mercy':324,'feel':325,'nephew':326,'sun':327,'jesus':328,'man':329,'women':330,'men':331,'university':332,
          'take':333,'get':334,'have':335,'go':336,'meet':337,'someone':338,'somebody':339,'piece':340,'female':341,'song':342,'hiphop':343,'hoody':344,
          'but':345,'what':346,'which':347,'when':348,'who':349,'whose':350,'how':351,'mean':352,'iq':353,'friend':354,'boy':355,'girl':356,'all':357,
          'guest':358,'same':359,'silmilar':360,'hope':361,'peace':362,'forgive':363,'beat':364,'hello':365,'hi':366,'nice':367,'good':368,'carrot':369,
          'banana':370,'strawberry':371,'cranberry':372,'car':373,'truck':374,'sneak':375,'monkey':376,'monster':377,'dark':378,'cyber':379,'space':380,
          'teach':381,'learn':382,'me':383,'you':384,'i':385,'puppy':386,'mermaid':387,'bye':388,'se':389,'see':390,'next':391,'goes':392,'breeze':393,
          'deer':394,'crayon':395,'mustered':396,'waves':397,'suit':398,'bluejay':399,'pieceoffurniture':400,'typeofinstrument':401,'gout':402,
          'box':403,'mouse':404,'rat':405,'whale':406,'week':407,'minute':408,'appointment':409,'schedule':410,'leaf':411,'news':412,'sulfur':413,
          'wardrobe':414,'halibit':415,'polyester':416,'choir':417,'octopus':418,'snake':419,'jupiter':420,'afruit':421,'ajoborprofession':422,
          'aweathercondition':423,'abird':424,'aflower':425,'apieceoffurniture':426,'afish':427,'ageographicalformation':428,'amusicalinstrument':429,
          'nan':430,'nen':431,'ton':432, 'shelf':433,'apply':434,'glove':435,'cabarnet':436,'floud':437,'happy':438,'guinea':439,'torso':440,'parsley':441,
          'ran':442,'valor':443,'honor':444,'courage':445,'homework':446,'work':447,'money':448,'envelope':449,'life':450,'rail':451,'office':452,'oregon':453,
          'carnage':454, 'birch':455, 'pine':456}
            # Changed 'haliberd':313 from intrusion to correction into halibut
            # Carnage, birch added due to intrusion participant from dlm file


df_dlm.number = [translator[item] for item in df_dlm.number] 
                          
# Import files with recalls 1 & 2                          
df_dlm.to_csv('2020-12-17_Similarity_Exp3_AllWords_Clean2.csv', index=False)
print(df_dlm)
df = pd.read_csv('2020-12-17_Similarity_Exp3_AllWords_Clean1.csv')                        

# Combine recalls (1 & 2) & 3
frames = [df, df_dlm]
df_both = pd.concat(frames)
df_both.to_csv('2020-12-17_Similarity_Exp3_AllWords_Clean.csv', index=False)

      SN  biased  order  collaboration collaborator      word  correct  \
0      1       0      1              0            0      tree        1   
1      1       0      1              0            0     trout        1   
2      1       0      1              0            0     ocean        1   
3      1       0      1              0            0  mountain        1   
4      1       0      1              0            0     green        1   
...   ..     ...    ...            ...          ...       ...      ...   
2275  93       1      0              0            0     apple        1   
2276  93       1      0              0            0      rain        1   
2278  93       1      0              0            0    helium        1   
2279  93       1      0              0            0   calcium        1   
2280  93       1      0              0            0    cotton        1   

      buffer  phase  number  
0          0      3      37  
1          0      3      13  
2          0      3  

In [8]:
# ATTENTION! This needs to be manually adjusted over the experiments to identify the max of permutations, which is specific to the experiment
def permu(n):
    df_max = pd.read_csv('2020-08-14_Similarity_Availability_AllWords_Clean.csv')
    maxi = max(df_max.number) 
    #print(maxi)
    all = list(range(0,maxi))
    y = list(itertools.permutations(all, n))
    return y

#### __B4. Similarity for dyads by sub-group__

* Attention: These analyses only work if no participant recalled no words (all participants recalled at least 1 word)
* The order of str1 and str2 in the code is important, as str1 always needs to correspond to the first SN in the combo and str2 to the second, however, it is completely irrelevant of a specific SN is str1 or str2 as the code usually starts with the longer one anyways, if it makes a difference at all

In [26]:
df = pd.read_csv('2020-12-17_Similarity_Exp3_AllWords_Clean.csv')

In [5]:
# Quick overview about how many participants are in which condition
df1 = df.drop_duplicates(subset=['SN'], keep='first')
#print(df1)
df_000 = len(df1.SN[(df1['collaboration']==0) & (df1['biased']==0) & (df1['order']==0)].tolist())
df_011 = len(df1.SN[(df1['collaboration']==0) & (df1['biased']==1) & (df1['order']==1)].tolist())
df_010 = len(df1.SN[(df1['collaboration']==0) & (df1['biased']==1) & (df1['order']==0)].tolist())
df_001 = len(df1.SN[(df1['collaboration']==0) & (df1['biased']==0) & (df1['order']==1)].tolist())
df_100 = len(df1.SN[(df1['collaboration']==1) & (df1['biased']==0) & (df1['order']==0)].tolist())
df_111 = len(df1.SN[(df1['collaboration']==1) & (df1['biased']==1) & (df1['order']==1)].tolist())
df_110 = len(df1.SN[(df1['collaboration']==1) & (df1['biased']==1) & (df1['order']==0)].tolist())
df_101 = len(df1.SN[(df1['collaboration']==1) & (df1['biased']==0) & (df1['order']==1)].tolist())
print('000=', df_000)
print('011=', df_011)
print('010=', df_010)
print('001=', df_001)
print('100=', df_100)
print('111=', df_111)
print('110=', df_110)
print('101=', df_101)

000= 11
011= 11
010= 10
001= 9
100= 9
111= 12
110= 9
101= 12


In [10]:
# Calculate mean averages
print('sum_pP_count_phase1', df['SN'].unique())
# Total word recalled phase 1
sum_pP_count_phase1 = df[(df['phase'] == 1)].groupby('SN').count()
mean_total_phase1 = sum_pP_count_phase1.mean()
sd_total_phase1 = sum_pP_count_phase1.std()
print('mean_total', mean_total_phase1)
print('sd_total', sd_total_phase1)
# Total words recalled phase 3
sum_pP_count_phase3 = df[(df['phase'] == 3)].groupby('SN').count()
mean_total_phase3 = sum_pP_count_phase3.mean()
sd_total_phase3 = sum_pP_count_phase3.std()
print('mean_total', mean_total_phase3)
print('sd_total', sd_total_phase3)
# Total words recalled phase 3 collaborated
sum_pP_count_phase3_Collab1 = df[(df['phase'] == 3)&(df['collaboration'] == 1)].groupby('SN').count()
mean_total_phase3_Collab1 = sum_pP_count_phase3_Collab1.mean()
sd_total_phase3_Collab1 = sum_pP_count_phase3_Collab1.std()
print('mean_total', mean_total_phase3_Collab1)
print('sd_total', sd_total_phase3_Collab1)
# Total words recalled phase 3 not collaborated
sum_pP_count_phase3_Collab0 = df[(df['phase'] == 3)&(df['collaboration'] == 0)].groupby('SN').count()
mean_total_phase3_Collab0 = sum_pP_count_phase3_Collab0.mean()
sd_total_phase3_Collab0 = sum_pP_count_phase3_Collab0.std()
print('mean_total', mean_total_phase3_Collab0)
print('sd_total', sd_total_phase3_Collab0)
# Total words recalled phase 1 not collaborated
sum_pP_count_phase3_Collab0 = df[(df['phase'] == 1)&(df['collaboration'] == 0)].groupby('SN').count()
mean_total_phase3_Collab0 = sum_pP_count_phase3_Collab0.mean()
sd_total_phase3_Collab0 = sum_pP_count_phase3_Collab0.std()
print('mean_total', mean_total_phase3_Collab0)
print('sd_total', sd_total_phase3_Collab0)

sum_pP_count_phase1 [   1    2    3    4    5    6    7    8    9   10   11   12   15   16
   17   18   19   22   23   24   26   28   29   30   31   32   33   34
   35   36   37   38   39   40   41   42   43   46   47   48   49   50
   51   52   53   54   55   56   57   58   59   60   61   62   64   66
   67   68   69   70   71   72   73   74   75   76   77   78   79   80
   81   82   83   84   85   86   87   88   89   90   91   92   93 1112
 1718 2223 3031 3435 4041 4849 5253 5657 6061 6970 7374 7778 8182 8384
 8586 8788 8990 9192]
mean_total biased           20.26506
order            20.26506
collaboration    20.26506
collaborator     20.26506
word             20.26506
correct          20.26506
buffer           20.26506
phase            20.26506
number           20.26506
dtype: float64
sd_total biased           9.357512
order            9.357512
collaboration    9.357512
collaborator     9.357512
word             9.357512
correct          9.357512
buffer           9.357512
phase     

In [28]:
def similarity(df, collab1, bias1, order1, phase1, collab2, bias2, order2, phase2, TrueCollab=True, Self=True):
  
    """
    USAGE:
    all similarity measures = similarity(df, collab1, bias1, order1, phase1, collab2, bias2, order2, phase2, TrueCollab=True, Self=True)
    Attention!!! If synthetic (nominal collaborative) groups are included in the comparison, they always have to be in the second position!!!

    ARGUMENTS:
    df          Dataframe to be used
    
    collab1     Collaborative condition for group 1: 1=yes, 0=no
    bias1       Bias condition for group1: 0=no bias, 1=bias1
    order1      If biased, then order 0 or 1
    phase1      Which recall phase? Recall 2 for experiments 1(A) and 2(1B)
    partner1    If this is a 100 group one can indicate whether 100 collaborated with 110 or 111 (collab, bias, order). 
                A collaborating, unbiased, order 0 (100) participant always collaborated with a collaborating, biased, of any or specific order (110 or 111?)
    
    collab2     Collaborative condition for group 2: 1=yes, 0=no
    bias2       Bias condition for group 2: 0=no bias, 1=bias1
    order2      If biased, then order 0 or 1
    phase2      Which recall phase? Recall 2 for experiments 1(A) and 2(1B)
    partner2    If this is a 100 group one can indicate whether 100 collaborated with 110 or 111. 
    
    TrueCollab  Did the participants on the two groups actually collaborate: Collaborated with each other=True, Nominal/Synthetic groups=False
    Self        Only for experiment 3: Is this a pre-post collaboration comparison? (How about within-group?)

    DESCRIPTION:
    This helps to calculate the different specific group comparisons
    
    QUESTIONS:
    why would bias be 2?
    what is the partner for in partner1==1?
    where does df come from? it contains all? 
    
    REMINDER:
       
    """

    # Identify all participants in the two groups
    # Select the correct group 1 (identifies all participants in this group)
    Group1 = df.SN[(df['collaboration'] == collab1) & (df['biased'] == bias1)  & (df['order'] == order1) & (df['phase'] == phase1)]
    Group1 = Group1.unique().tolist()
    print('normalGroup1')
    print('SN1', Group1)
           
    # Select the correct group 2
    Group2 = df.SN[(df['collaboration'] == collab2) & (df['biased'] == bias2)  & (df['order'] == order2) & (df['phase'] == phase2)]
    Group2 = Group2.unique().tolist()
    print('normalGroup2')
    print('SN2', Group2)    
    
    # Merge all participants for both groups
    groupSN = Group2 + Group1
    print('groupSN', groupSN)
    
    # Calculate all potential permutations (aka combinations of participants) for ngram "index"
    # For bigrams
    maxi = max(df.number) 
    all = list(range(0,maxi))
    p2 = list(itertools.permutations(all, 2))
    # For trigrams
    maxi = max(df.number) 
    all = list(range(0,maxi))
    
    # Calculate the average descriptive stats (# true, fals words) per participant in first group (1. set up list, 2. calculate mean per SN, 3. save mean per SN, 4. calculate mean across group)
    average_recalled_t_Group1 = []
    average_correct_recalled_t_Group1 = []
    average_intrusion_recalled_t_Group1 = []
    for i in Group1:            
        average_recalled_Group1 = df.number[(df['SN'] == i) & (df['collaboration'] == collab1) & (df['biased'] == bias1)  & (df['order'] == order1) & (df['phase'] == phase1)].count()
        average_correct_recalled_Group1 = df.number[(df['SN'] == i) & (df['correct'] == 1) & (df['collaboration'] == collab1) & (df['biased'] == bias1)  & (df['order'] == order1) & (df['phase'] == phase1)].count()
        average_intrusion_recalled_Group1 = df.number[(df['SN'] == i) & (df['correct'] == 0) & (df['collaboration'] == collab1) & (df['biased'] == bias1)  & (df['order'] == order1) & (df['phase'] == phase1)].count()
        average_recalled_t_Group1.append(average_recalled_Group1)
        average_correct_recalled_t_Group1.append(average_correct_recalled_Group1)
        average_intrusion_recalled_t_Group1.append(average_intrusion_recalled_Group1)
    average_recalled_t_Group1 = np.mean(average_recalled_t_Group1)
    average_correct_recalled_t_Group1 = np.mean(average_correct_recalled_t_Group1)
    average_intrusion_recalled_t_Group1 = np.mean(average_intrusion_recalled_t_Group1)
    print('average_recalled_t_Group1=', average_recalled_t_Group1)
    print('average_correct_recalled_t_Group1=', average_correct_recalled_t_Group1)
    print('average_intrusion_recalled_t_Group1=', average_intrusion_recalled_t_Group1)

    # Calculate the average descriptive stats (# true, fals words) per participant in second group (1. set up list, 2. calculate mean per SN, 3. save mean per SN, 4. calculate mean across group)
    average_recalled_t_Group2 = []
    average_correct_recalled_t_Group2 = []
    average_intrusion_recalled_t_Group2 = []
    for i in Group2:
        average_recalled_Group2 = df.number[(df['SN'] == i) & (df['collaboration'] == collab2) & (df['biased'] == bias2)  & (df['order'] == order2) & (df['phase'] == phase2)].count()
        average_correct_recalled_Group2 = df.number[(df['SN'] == i) & (df['correct'] == 1) & (df['collaboration'] == collab2) & (df['biased'] == bias2)  & (df['order'] == order2) & (df['phase'] == phase2)].count()
        average_intrusion_recalled_Group2 = df.number[(df['SN'] == i) & (df['correct'] == 0) & (df['collaboration'] == collab2) & (df['biased'] == bias2)  & (df['order'] == order2) & (df['phase'] == phase2)].count()
        average_recalled_t_Group2.append(average_recalled_Group2)
        average_correct_recalled_t_Group2.append(average_correct_recalled_Group2)
        average_intrusion_recalled_t_Group2.append(average_intrusion_recalled_Group2)
    average_recalled_t_Group2 = np.mean(average_recalled_t_Group2)
    average_correct_recalled_t_Group2 = np.mean(average_correct_recalled_t_Group2)
    average_intrusion_recalled_t_Group2 = np.mean(average_intrusion_recalled_t_Group2)
    print('average_recalled_t_Group2=', average_recalled_t_Group2)
    print('average_correct_recalled_t_Group2=', average_correct_recalled_t_Group2)
    print('average_intrusion_recalled_t_Group2=', average_intrusion_recalled_t_Group2)



    # Calculate the dyad specific similarity measures 

    # Identify all potential/theoretical dyads across the two groups
    # In theory I only need this for nominal and nominal collaborative groups, but this was the easiest (albeit not the fastest way)
    allCombos = list(itertools.product(Group1, Group2))
    allCombos2 = len(allCombos)
    
    # Set up data frame to save results per dyad in
    dfresults = pd.DataFrame(columns=['Comparison','Dyad', 'intersection', 'overlap', 'Jaccard', 'SMC', 'lcs', \
                                      'OmEuni', 'OmEbi', 'OdEuni', 'OdEbi', 'OdMuni', 'OdMbi', 'OmEdMuni', 'OmEdMbi', \
                                      'OmEdMmEMuni', 'OmEdMmEMbi', 'pairedFreq', 'ITR2', 'ARC2', 'editdist','mod_editdist', \
                                      'editdist_IDST', 'editdist_IDS', 'editdist_ID', 'editdist_IDT'])

    # Calculate similarity measures for every dyad
    # for every dyad (in all combos)
    for j in allCombos:
        # Reset all values
        if Self == False and TrueCollab == False:            
            #str1 = df.number[(df['collaboration'] == collab1) & (df['biased'] == bias1) & (df['order'] == order1) & (df['SN'] == j[0]) & (df['phase'] == phase1)]
            #str2 = df.number[(df['collaboration'] == collab2) & (df['biased'] == bias2) & (df['order'] == order2) & (df['SN'] == j[1]) & (df['phase'] == phase2)]    
            str1 = df.number[(df['collaboration'] == collab1) & (df['biased'] == bias1) & (df['order'] == order1) & (df['SN'] == j[0]) & (df['collaborator'] != j[1]) & (df['phase'] == phase1)]
            str2 = df.number[(df['collaboration'] == collab2) & (df['biased'] == bias2) & (df['order'] == order2) & (df['SN'] == j[1]) & (df['collaborator'] != j[0]) & (df['phase'] == phase2)] 
        # This would only need to be done for each SN and not all combos
        elif Self == True and TrueCollab == False:
            str1 = df.number[(df['collaboration'] == collab1) & (df['biased'] == bias1) & (df['order'] == order1) & (df['SN'] == j[0]) & (df['phase'] == phase1)]
            str2 = df.number[(df['collaboration'] == collab2) & (df['biased'] == bias2) & (df['order'] == order2) & (df['SN'] == j[0]) & (df['phase'] == phase2)]           
        elif Self == False and TrueCollab == True:
            str1 = df.number[(df['collaboration'] == collab1) & (df['biased'] == bias1) & (df['order'] == order1) & (df['SN'] == j[0]) & (df['collaborator'] == j[1]) & (df['phase'] == phase1)]
            str2 = df.number[(df['collaboration'] == collab2) & (df['biased'] == bias2) & (df['order'] == order2) & (df['SN'] == j[1]) & (df['collaborator'] == j[0]) & (df['phase'] == phase2)] 
        
        num_comm = len(np.intersect1d(str1, str2))

        if len(str1)==0 or len(str2)==0:
            pass
        else:
            #num_pairs = ngram_abs(str1, str2, 2, p2, unidirectional=True)
            #num_trip = ngram_abs(str1, str2, 3, p3, unidirectional=True) 

            a = intersection(str1, str2)
            b = overlap(str1, str2)
            c = Jaccard(str1, str2)
            d = SMC(str1, str2)
            f = OmE(str1, str2, 2, unidirectional=True)
            g = OmE(str1, str2, 2, unidirectional=False)
            h = OdE(str1, str2, 2, unidirectional=True)
            z = OdE(str1, str2, 2, unidirectional=False)
            k = OdM(str1, str2, 2, unidirectional=True)
            l = OdM(str1, str2, 2, unidirectional=False)
            m = OmEdM(str1, str2, 2, unidirectional=True)
            n = OmEdM(str1, str2, 2, unidirectional=False)
            o = OmEdMmE(str1, str2, 2, unidirectional=True)
            p = OmEdMmE(str1, str2, 2, unidirectional=False)
            q = pairedFreq(str1, str2)
            r = ITR2(str1, str2)
            s = ARC2(str1, str2)

            str1 = str1.reset_index(drop=True)
            str2 = str2.reset_index(drop=True)
            str1 = np.array(str1)
            str2 = np.array(str2)
            e = lcs(str1, str2)
            t = editdist(str1, str2, min_threshold = 0)
            u = mod_editdist(str1, str2, min_threshold = 0)
            v = edit_dists(str1, str2, insert=True, delete=True, substitute=True, transpose=True)
            w = edit_dists(str1, str2, insert=True, delete=True, substitute=True, transpose=False)
            x = edit_dists(str1, str2, insert=True, delete=True, substitute=False, transpose=False)
            y = edit_dists(str1, str2, insert=True, delete=True, substitute=False, transpose=True)

            # it saves all prior values for abc... if j exists but no update. so have to reset all to zero
            trialDict = {'Comparison':(str(collab1) + str(bias1) + str(order1) + str(phase1) + str(int(TrueCollab)) + '_' \
                         + str(collab2) + str(bias2) + str(order2) + str(phase2) + str(int(TrueCollab))),'Dyad':j, \
                         'intersection':a, 'overlap':b, 'Jaccard':c, 'SMC':d, 'lcs':e,  \
                         'OmEuni':f, 'OmEbi':g, 'OdEuni':h, 'OdEbi':z, 'OdMuni':k, 'OdMbi':l, 'OmEdMuni':m, 'OmEdMbi':n, 'OmEdMmEMuni':o, \
                         'OmEdMmEMbi':p, 'pairedFreq':q, 'ITR2':r, 'ARC2':s, 'editdist':t,'mod_editdist':u, \
                         'editdist_IDST':v, 'editdist_IDS':w, 'editdist_ID':x, 'editdist_IDT':y}              
                        # 'num_pairs':num_pairs, 'num_trip':num_trip,
            dfresults = dfresults.append(trialDict, ignore_index=True)    
        
    # Calculate mean for descriptive statistics
    mean = dfresults.mean()
    #mode_0000_0001 = dfresults.mode()
    #median_0000_0001 = dfresults.median()
    print('Mean=', mean)
    #print('Mode=',mode_0000_0001)
    #print('Median=', median_0000_0001)
    # If file name is changed, it has to also be changed below when merging the different files
    dfresults.to_csv('2022-04-18_Similarity_Exp3_AllWords_Clean_'+ str(collab1) + str(bias1) + str(order1) + str(phase1) + str(int(TrueCollab)) + str(int(Self))+ '_' +  str(collab2) + str(bias2) + str(order2) + str(phase2) + str(int(TrueCollab))+ str(int(Self)) +'_Results.csv', index=False)

In [47]:
# RESEARCH QUESTION 1: Typical comparisons
# similarity(df, collab1, bias1, order1, phase1, collab2, bias2, order2, phase2, TrueCollab=True, Self=True)
# Reminder: Experiment 3 had half of the lists longer than the other half. Order in experiment 3 means which lists was longer. 
# Reminder cntd.: Always one biased and one unbiased person collaborated. Participants always collaborated with a person with the same order (two consecutive numbers).
# Nominal Pre
similarity(df, 0,0,0,1,0,1,0,1,False,False) #unbiased,order1 & biased,order1
similarity(df, 0,0,1,1,0,1,1,1,False,False) #unbiased,order2 & biased,order2
# Nominal collaborative Pre
similarity(df, 1,1,1,1,1,0,1,1,False,False) #biased,order2 & unbiased,order2
similarity(df, 1,1,0,1,1,0,0,1,False,False) #biased,order1 & unbiased,order1
# Collaborative Pre
similarity(df, 1,1,1,1,1,0,1,1,True,False) #biased,order2 & unbiased,order2
similarity(df, 1,1,0,1,1,0,0,1,True,False) #biased,order1 & unbiased,order1
# Nominal Post
similarity(df, 0,0,0,3,0,1,0,3,False,False) #unbiased,order1 & biased,order1
similarity(df, 0,0,1,3,0,1,1,3,False,False) #unbiased,order2 & biased,order2
# Nominal collaborative Post
similarity(df, 1,1,1,3,1,0,1,3,False,False) #biased,order2 & unbiased,order2
similarity(df, 1,1,0,3,1,0,0,3,False,False) #biased,order1 & unbiased,order1
# Collaborative Post
similarity(df, 1,1,1,3,1,0,1,3,True,False) #biased,order2 & unbiased,order2
similarity(df, 1,1,0,3,1,0,0,3,True,False) #biased,order1 & unbiased,order1

normalGroup1
SN1 [24, 28, 32, 36, 38, 42, 46, 50, 54, 58, 62]
normalGroup2
SN2 [29, 33, 37, 39, 43, 47, 51, 55, 59, 93]
groupSN [29, 33, 37, 39, 43, 47, 51, 55, 59, 93, 24, 28, 32, 36, 38, 42, 46, 50, 54, 58, 62]
average_recalled_t_Group1= 22.363636363636363
average_correct_recalled_t_Group1= 21.636363636363637
average_intrusion_recalled_t_Group1= 0.7272727272727273
average_recalled_t_Group2= 13.8
average_correct_recalled_t_Group2= 12.6
average_intrusion_recalled_t_Group2= 1.2
[(24, 29), (24, 33), (24, 37), (24, 39), (24, 43), (24, 47), (24, 51), (24, 55), (24, 59), (24, 93), (28, 29), (28, 33), (28, 37), (28, 39), (28, 43), (28, 47), (28, 51), (28, 55), (28, 59), (28, 93), (32, 29), (32, 33), (32, 37), (32, 39), (32, 43), (32, 47), (32, 51), (32, 55), (32, 59), (32, 93), (36, 29), (36, 33), (36, 37), (36, 39), (36, 43), (36, 47), (36, 51), (36, 55), (36, 59), (36, 93), (38, 29), (38, 33), (38, 37), (38, 39), (38, 43), (38, 47), (38, 51), (38, 55), (38, 59), (38, 93), (42, 29), (42, 33

In [48]:
# RESEARCH QUESTION 2: Similarity of people before and after collaboration
# Participant pre & post collab (so they needed to collab; therefore only 4 and not 8 comaprisons)
similarity(df, 1,0,0,1,1,0,0,3,False,True)
similarity(df, 1,0,1,1,1,0,1,3,False,True)
similarity(df, 1,1,0,1,1,1,0,3,False,True)
similarity(df, 1,1,1,1,1,1,1,3,False,True)
# Partner pre & participant post collab
similarity(df, 1,1,0,1,1,0,0,3,True,False) 
similarity(df, 1,1,1,1,1,0,1,3,True,False) 
similarity(df, 1,0,0,1,1,1,0,3,True,False) 
similarity(df, 1,0,1,1,1,1,1,3,True,False) 
# non-partner pre & participant post collab
similarity(df, 1,1,0,1,1,0,0,3,False,False) 
similarity(df, 1,1,1,1,1,0,1,3,False,False) 
similarity(df, 1,0,0,1,1,1,0,3,False,False) 
similarity(df, 1,0,1,1,1,1,1,3,False,False) 

normalGroup1
SN1 [6, 8, 18, 70, 74, 78, 82, 86, 90]
normalGroup2
SN2 [6, 8, 18, 70, 74, 78, 82, 86, 90]
groupSN [6, 8, 18, 70, 74, 78, 82, 86, 90, 6, 8, 18, 70, 74, 78, 82, 86, 90]
average_recalled_t_Group1= 29.555555555555557
average_correct_recalled_t_Group1= 26.0
average_intrusion_recalled_t_Group1= 3.5555555555555554
average_recalled_t_Group2= 44.44444444444444
average_correct_recalled_t_Group2= 32.44444444444444
average_intrusion_recalled_t_Group2= 12.0
[(6, 6), (6, 8), (6, 18), (6, 70), (6, 74), (6, 78), (6, 82), (6, 86), (6, 90), (8, 6), (8, 8), (8, 18), (8, 70), (8, 74), (8, 78), (8, 82), (8, 86), (8, 90), (18, 6), (18, 8), (18, 18), (18, 70), (18, 74), (18, 78), (18, 82), (18, 86), (18, 90), (70, 6), (70, 8), (70, 18), (70, 70), (70, 74), (70, 78), (70, 82), (70, 86), (70, 90), (74, 6), (74, 8), (74, 18), (74, 70), (74, 74), (74, 78), (74, 82), (74, 86), (74, 90), (78, 6), (78, 8), (78, 18), (78, 70), (78, 74), (78, 78), (78, 82), (78, 86), (78, 90), (82, 6), (82, 8), (82, 18)

In [49]:
# RESEARCH QUESTION 3: Individual memory similairt pre/post collab
# Self-similarity recall 1-2
similarity(df, 0,0,0,1,0,0,0,2,False,True) 
similarity(df, 0,0,1,1,0,0,1,2,False,True) 
similarity(df, 0,1,0,1,0,1,0,2,False,True) 
similarity(df, 0,1,1,1,0,1,1,2,False,True) 

# Self-similarity recall 2-3
similarity(df, 0,0,0,2,0,0,0,3,False,True) 
similarity(df, 0,0,1,2,0,0,1,3,False,True) 
similarity(df, 0,1,0,2,0,1,0,3,False,True) 
similarity(df, 0,1,1,2,0,1,1,3,False,True) 

# Self-similarity recall 1-3
similarity(df, 0,0,0,1,0,0,0,3,False,True) 
similarity(df, 0,0,1,1,0,0,1,3,False,True) 
similarity(df, 0,1,0,1,0,1,0,3,False,True) 
similarity(df, 0,1,1,1,0,1,1,3,False,True) 

normalGroup1
SN1 [24, 28, 32, 36, 38, 42, 46, 50, 54, 58, 62]
normalGroup2
SN2 [24, 28, 32, 36, 38, 42, 46, 50, 54, 58, 62]
groupSN [24, 28, 32, 36, 38, 42, 46, 50, 54, 58, 62, 24, 28, 32, 36, 38, 42, 46, 50, 54, 58, 62]
average_recalled_t_Group1= 22.363636363636363
average_correct_recalled_t_Group1= 21.636363636363637
average_intrusion_recalled_t_Group1= 0.7272727272727273
average_recalled_t_Group2= 21.818181818181817
average_correct_recalled_t_Group2= 20.454545454545453
average_intrusion_recalled_t_Group2= 1.3636363636363635
[(24, 24), (24, 28), (24, 32), (24, 36), (24, 38), (24, 42), (24, 46), (24, 50), (24, 54), (24, 58), (24, 62), (28, 24), (28, 28), (28, 32), (28, 36), (28, 38), (28, 42), (28, 46), (28, 50), (28, 54), (28, 58), (28, 62), (32, 24), (32, 28), (32, 32), (32, 36), (32, 38), (32, 42), (32, 46), (32, 50), (32, 54), (32, 58), (32, 62), (36, 24), (36, 28), (36, 32), (36, 36), (36, 38), (36, 42), (36, 46), (36, 50), (36, 54), (36, 58), (36, 62), (38, 24), (38, 28), (38, 3

#### B5. Group means (Results presented in Manuscript)

In [57]:
# Merge all self-similarity in one file
# If file name was changed above, it has to also be changed here
all_files = glob.glob("2022-04-18_Similarity_Exp3_AllWords_Clean_*_Results.csv") 
print(all_files)
results = pd.concat((pd.read_csv(f) for f in all_files),sort=False)
results.to_csv('2022-04-18_Similarity_Exp3_AllWords_Clean_Comparisons.csv', index=False)

['2022-04-18_Similarity_Exp3_AllWords_Clean_000100_010100_Results.csv', '2022-04-18_Similarity_Exp3_AllWords_Clean_000101_000201_Results.csv', '2022-04-18_Similarity_Exp3_AllWords_Clean_000101_000301_Results.csv', '2022-04-18_Similarity_Exp3_AllWords_Clean_000201_000301_Results.csv', '2022-04-18_Similarity_Exp3_AllWords_Clean_000300_010300_Results.csv', '2022-04-18_Similarity_Exp3_AllWords_Clean_001100_011100_Results.csv', '2022-04-18_Similarity_Exp3_AllWords_Clean_001101_001201_Results.csv', '2022-04-18_Similarity_Exp3_AllWords_Clean_001101_001301_Results.csv', '2022-04-18_Similarity_Exp3_AllWords_Clean_001201_001301_Results.csv', '2022-04-18_Similarity_Exp3_AllWords_Clean_001300_011300_Results.csv', '2022-04-18_Similarity_Exp3_AllWords_Clean_010101_010201_Results.csv', '2022-04-18_Similarity_Exp3_AllWords_Clean_010101_010301_Results.csv', '2022-04-18_Similarity_Exp3_AllWords_Clean_010201_010301_Results.csv', '2022-04-18_Similarity_Exp3_AllWords_Clean_011101_011201_Results.csv', '2022

In [75]:
# Reminder: Naming convention of groups
# collab1) + (bias1) + (order1) + (phase1) + ((TrueCollab)) + ((Self))+ '_' +  str(collab2) + str(bias2) + str(order2) + str(phase2) + str(int(TrueCollab))+ str(int(Self))

# Step 1: Introduce new naming that allows to average across order
#RESEARCH Q1 (usual groups)
results.loc[results['Comparison'] == '00010_01010', 'Group'] = '11_Nominal_Pre' 
results.loc[results['Comparison'] == '00110_01110', 'Group'] = '11_Nominal_Pre' 
results.loc[results['Comparison'] == '11010_10010', 'Group'] = '12_NominalCollab_Pre' 
results.loc[results['Comparison'] == '11110_10110', 'Group'] = '12_NominalCollab_Pre' 
results.loc[results['Comparison'] == '11011_10011', 'Group'] = '13_Collab_Pre' 
results.loc[results['Comparison'] == '11111_10111', 'Group'] = '13_Collab_Pre' 

results.loc[results['Comparison'] == '00030_01030', 'Group'] = '14_Nominal_Post' 
results.loc[results['Comparison'] == '00130_01130', 'Group'] = '14_Nominal_Post' 
results.loc[results['Comparison'] == '11030_10030', 'Group'] = '15_NominalCollab_Post' 
results.loc[results['Comparison'] == '11130_10130', 'Group'] = '15_NominalCollab_Post' 
results.loc[results['Comparison'] == '11031_10031', 'Group'] = '16_Collab_Post' 
results.loc[results['Comparison'] == '11131_10131', 'Group'] = '16_Collab_Post' 

# RESEARCH Q2 (collab only)
results.loc[results['Comparison'] == '10010_10030', 'Group'] = '21_SelfPrePost' 
results.loc[results['Comparison'] == '10110_10130', 'Group'] = '21_SelfPrePost' 
results.loc[results['Comparison'] == '11010_11030', 'Group'] = '21_SelfPrePost' 
results.loc[results['Comparison'] == '11110_11130', 'Group'] = '21_SelfPrePost' 

results.loc[results['Comparison'] == '11011_10031', 'Group'] = '22_PartnerPre_SelfPost' 
results.loc[results['Comparison'] == '11111_10131', 'Group'] = '22_PartnerPre_SelfPost'
results.loc[results['Comparison'] == '10011_11031', 'Group'] = '22_PartnerPre_SelfPost'
results.loc[results['Comparison'] == '10111_11131', 'Group'] = '22_PartnerPre_SelfPost'

results.loc[results['Comparison'] == '11010_10030', 'Group'] = '23_NonPartnerPre_SelfPost' 
results.loc[results['Comparison'] == '11110_10130', 'Group'] = '23_NonPartnerPre_SelfPost'
results.loc[results['Comparison'] == '10010_11030', 'Group'] = '23_NonPartnerPre_SelfPost'
results.loc[results['Comparison'] == '10110_11130', 'Group'] = '23_NonPartnerPre_SelfPost'


# RESEARCH Q3 (individual only) 
results.loc[results['Comparison'] == '00010_00020', 'Group'] = '31_Self1_2' 
results.loc[results['Comparison'] == '00110_00120', 'Group'] = '31_Self1_2'
results.loc[results['Comparison'] == '01010_01020', 'Group'] = '31_Self1_2'
results.loc[results['Comparison'] == '01110_01120', 'Group'] = '31_Self1_2'

results.loc[results['Comparison'] == '00020_00030', 'Group'] = '32_Self2_3'
results.loc[results['Comparison'] == '00120_00130', 'Group'] = '32_Self2_3'
results.loc[results['Comparison'] == '01020_01030', 'Group'] = '32_Self2_3'
results.loc[results['Comparison'] == '01120_01130', 'Group'] = '32_Self2_3'

results.loc[results['Comparison'] == '00010_00030', 'Group'] = '33_Self1_3'
results.loc[results['Comparison'] == '00110_00130', 'Group'] = '33_Self1_3'
results.loc[results['Comparison'] == '01010_01030', 'Group'] = '33_Self1_3'
results.loc[results['Comparison'] == '01110_01130', 'Group'] = '33_Self1_3'

# Step 2: Average across order using new naming convention
results.to_csv('2022-04-18_Similarity_Exp3_AllWords_Clean-RelevantComparisons2.csv', index=False)
GroupMeans = round(results.groupby('Group').mean(),2)
print('MEANS', GroupMeans)
GroupSD = round(results.groupby('Group').std(),2)
print('SD', GroupSD)

MEANS                            intersection  overlap  Jaccard   SMC   lcs  OmEuni  \
Group                                                                           
11_Nominal_Pre                     4.66     0.31     0.13  0.68  1.05    0.09   
12_NominalCollab_Pre               5.23     0.35     0.14  0.70  1.04    0.08   
13_Collab_Pre                      5.10     0.33     0.13  0.69  1.10    0.22   
14_Nominal_Post                    5.91     0.33     0.14  0.66  1.18    0.30   
15_NominalCollab_Post             10.10     0.45     0.21  0.65  1.50    0.61   
16_Collab_Post                    15.29     0.67     0.37  0.76  1.76    1.18   
21_SelfPrePost                    15.83     0.75     0.47  0.86  2.07    1.68   
22_PartnerPre_SelfPost             9.74     0.55     0.25  0.72  1.31    0.31   
23_NonPartnerPre_SelfPost          7.16     0.40     0.17  0.66  1.21    0.25   
31_Self1_2                        16.00     0.85     0.66  0.94  1.98    1.75   
32_Self2_3            

In [2]:
# To be Continued. Alex to add semi-partial calculation