In [4]:
import numpy as np
import pandas as pd

# Task Data #

In [5]:
# Load the raw task-response export. The path is relative, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv('../data/raw/sample_task_data.csv')

# Show the frame as loaded so the raw encoded responses can be inspected.
df

Unnamed: 0,Participant Number,iPad Number,T1 Snack,T2 Bathroom,T3 Marker,T4 Multipy,T5 Use Tool,T6 Movie,T7 Breadstick,T8 Nickname,T9 Joke,T10 Weekend
0,1,1,C+6,F+4,F+4,C+5,C+3,C+6,C+4,C+2,C+11,F+2
1,2,2,C+7,C+7,C+5,C+6,C+9,C+6,C+5,C+3,C-2,C+7
2,3,3,F+3,F+4,F+4,C+7,F+2,F+4,C+3,C+2,F+8,C+3
3,4,1,C+3,C+6,C+5,C+6,C+3,S-1,S+1,F+2,I,C+3
4,5,2,C+4,F+4,F+5,C+6,C+6,C+6,S+1,C+4,C-5,F+3
5,6,3,F+6,C+5,F+4,F+5,F+6,F+6,F+6,C+2,C+2,F+8
6,7,1,F+3,F+4,C+5,C+6,C+7,C+6,C+6,C+2,C+13,F+5
7,8,2,F+4,F+3,F+2,F+8,C+7,F+5,F+5,C+2,F+11,F+6
8,9,3,C+6,C+6,C+4,F+5,C+4,C+7,C+4,C+5,C+11,C+6
9,10,1,F+4,F+3,F+4,F+5,F+4,C+7,F+4,C+2,F+7,C+6


The following two cells were used to clean a previous sample. They are left as a reference in case future files have similar issues.

In [6]:
# df = df.drop(columns=['Unnamed: 12'])
# df

In [7]:
# df = df.dropna()
# df

I rename the columns to short, snake_case labels (for example, `T1 Snack` becomes `t1`) so that they are easier to reference in code during the analysis portion.

In [8]:
# Map each raw CSV header to the short label used by the rest of the notebook.
# Keys must match the headers in the file exactly.
labels = {"Participant Number":"p_number",
          "iPad Number":"ipad_number",
          "T1 Snack":"t1",
          "T2 Bathroom":"t2",
          "T3 Marker":"t3",
          # "Multipy" (sic) is how the header is spelled in the raw data; keep it as-is.
          "T4 Multipy":"t4",
          "T5 Use Tool":"t5",
          "T6 Movie":"t6",
          "T7 Breadstick":"t7",
          "T8 Nickname":"t8",
          "T9 Joke":"t9",
          "T10 Weekend":"t10"}
# rename() returns a new frame; columns not listed in `labels` are left untouched.
df = df.rename(columns=labels)
df

Unnamed: 0,p_number,ipad_number,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10
0,1,1,C+6,F+4,F+4,C+5,C+3,C+6,C+4,C+2,C+11,F+2
1,2,2,C+7,C+7,C+5,C+6,C+9,C+6,C+5,C+3,C-2,C+7
2,3,3,F+3,F+4,F+4,C+7,F+2,F+4,C+3,C+2,F+8,C+3
3,4,1,C+3,C+6,C+5,C+6,C+3,S-1,S+1,F+2,I,C+3
4,5,2,C+4,F+4,F+5,C+6,C+6,C+6,S+1,C+4,C-5,F+3
5,6,3,F+6,C+5,F+4,F+5,F+6,F+6,F+6,C+2,C+2,F+8
6,7,1,F+3,F+4,C+5,C+6,C+7,C+6,C+6,C+2,C+13,F+5
7,8,2,F+4,F+3,F+2,F+8,C+7,F+5,F+5,C+2,F+11,F+6
8,9,3,C+6,C+6,C+4,F+5,C+4,C+7,C+4,C+5,C+11,C+6
9,10,1,F+4,F+3,F+4,F+5,F+4,C+7,F+4,C+2,F+7,C+6


In [9]:
# Cast both identifier columns to 32-bit integers in one call; the task
# columns keep their original (object) dtype.
df = df.astype({'p_number': 'int32', 'ipad_number': 'int32'})

# Confirm the resulting column dtypes.
df.dtypes

p_number        int32
ipad_number     int32
t1             object
t2             object
t3             object
t4             object
t5             object
t6             object
t7             object
t8             object
t9             object
t10            object
dtype: object

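The following function parses each encoded response (for example, `C+6`) into its three components: the answer type (the leading letter, mapped to a number: `I` = 0, `S` = 1, `F` = 2, `C` = 3), whether the answer was a success (`+`) or not (`-`), and the number of words (the trailing integer). A bare one-letter response such as `I` is recorded as type 0, not successful, with zero words. These values are then converted to the appropriate data types and stored as separate columns in the clean data.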

In [10]:
# Numeric code for the leading answer-type letter of an encoded response.
_ANSWER_TYPE_CODES = {"I": 0, "S": 1, "F": 2, "C": 3}


def _parse_response(value):
    """Split one encoded response string into (type_code, success, num_words).

    A response longer than one character is read as: first character = answer
    type letter, second character = success sign, remainder = word count.
    Anything of length 0 or 1 (e.g. a bare "I") yields (0, False, 0).

    Raises:
        ValueError: if the text after the second character is not an integer.
        TypeError: if ``value`` has no length (e.g. a missing value / NaN).
    """
    if len(value) <= 1:
        # No success sign or word count is present on a bare one-letter code.
        return 0, False, 0

    letter = value[0]
    # Letters outside the known set are passed through unchanged.
    type_code = _ANSWER_TYPE_CODES.get(letter, letter)
    # Only "+" counts as a success; any other second character is a failure.
    return type_code, value[1] == "+", int(value[2:])


def print_vals(df):
    """Expand every encoded task column into type / success / word-count columns.

    Despite its name, this function prints nothing: it returns a new DataFrame.
    For each column ``<label>`` of ``df`` the result holds, in this order,
    ``<label>_type``, ``<label>_success`` and ``<label>_numWords``.

    The result carries the same index as ``df`` so that it lines up row-for-row
    when concatenated column-wise with other columns of the source frame, even
    after rows have been filtered out of ``df``.

    Args:
        df: DataFrame whose columns all contain encoded response strings such
            as "C+6", "F-3" or "I". Column labels must be strings.

    Returns:
        A DataFrame with three columns per input column, indexed like ``df``.

    Raises:
        ValueError: if a response's word count is not an integer.
        TypeError: if a cell is not a string-like value (e.g. NaN).
    """
    parsed_columns = {}
    for label, content in df.items():
        parsed = [_parse_response(value) for value in content]
        parsed_columns[label + "_type"] = [item[0] for item in parsed]
        parsed_columns[label + "_success"] = [item[1] for item in parsed]
        parsed_columns[label + "_numWords"] = [item[2] for item in parsed]

    # Build the frame once, reusing the caller's index instead of a fresh 0..n-1
    # range, so column-wise concatenation with the source frame stays aligned.
    return pd.DataFrame(parsed_columns, index=df.index)

The function only works on task data, so the resulting DataFrame must then be concatenated with the participant number and the iPad number.

In [11]:
# Parse only the ten task columns; the identifier columns are not encoded responses.
new_df = print_vals(
    df.loc[:, ["t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "t10"]]
)
# Put the identifiers first, followed by the parsed task columns.
frames = [df.loc[:, ["p_number", "ipad_number"]], new_df]
# Column-wise concatenation pairs rows by index label.
clean_df = pd.concat(frames, axis="columns")

In [12]:
# Display the combined frame (identifiers plus parsed task columns) for a visual check.
clean_df

Unnamed: 0,p_number,ipad_number,t1_type,t1_success,t1_numWords,t2_type,t2_success,t2_numWords,t3_type,t3_success,...,t7_numWords,t8_type,t8_success,t8_numWords,t9_type,t9_success,t9_numWords,t10_type,t10_success,t10_numWords
0,1,1,3,True,6,2,True,4,2,True,...,4,3,True,2,3,True,11,2,True,2
1,2,2,3,True,7,3,True,7,3,True,...,5,3,True,3,3,False,2,3,True,7
2,3,3,2,True,3,2,True,4,2,True,...,3,3,True,2,2,True,8,3,True,3
3,4,1,3,True,3,3,True,6,3,True,...,1,2,True,2,0,False,0,3,True,3
4,5,2,3,True,4,2,True,4,2,True,...,1,3,True,4,3,False,5,2,True,3
5,6,3,2,True,6,3,True,5,2,True,...,6,3,True,2,3,True,2,2,True,8
6,7,1,2,True,3,2,True,4,3,True,...,6,3,True,2,3,True,13,2,True,5
7,8,2,2,True,4,2,True,3,2,True,...,5,3,True,2,2,True,11,2,True,6
8,9,3,3,True,6,3,True,6,3,True,...,4,3,True,5,3,True,11,3,True,6
9,10,1,2,True,4,2,True,3,2,True,...,4,3,True,2,2,True,7,3,True,6


# Survey Data #