In [1]:
import pandas as pd
import numpy as np

**Labeling open text questions from the study results.**
- input: formatted results export from UXtweak study
- output: auto-labeled results, with placeholder columns for the answers that still need to be labeled manually

# Result labeling

In [2]:
# Load the formatted study export; the first CSV column is used as the row index.
unlabeled_path = "../data/results_unlabeled.csv"
results = pd.read_csv(unlabeled_path, index_col=0)

In [3]:
# Summarise the frame: column names, non-null counts and dtypes.
results.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 726 entries, 0 to 725
Data columns (total 19 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   respondent   726 non-null    int64  
 1   task         726 non-null    int64  
 2   interval     726 non-null    object 
 3   time         726 non-null    float64
 4   company      725 non-null    object 
 5   purpose      725 non-null    object 
 6   detail       723 non-null    object 
 7   description  725 non-null    object 
 8   PQ1          726 non-null    float64
 9   ATT1         726 non-null    float64
 10  PQ2          726 non-null    float64
 11  HQ1          726 non-null    float64
 12  PQ3          726 non-null    float64
 13  HQ2          726 non-null    float64
 14  HQ3          726 non-null    float64
 15  ATT2         726 non-null    float64
 16  PQ4          726 non-null    float64
 17  HQ4          726 non-null    float64
 18  familiarity  726 non-null    bool   
dtypes: bool(

In [4]:
# Preview the first rows of the unlabeled results.
results.head()

Unnamed: 0,respondent,task,interval,time,company,purpose,detail,description,PQ1,ATT1,PQ2,HQ1,PQ3,HQ2,HQ3,ATT2,PQ4,HQ4,familiarity
0,3,1,2s,94.0,Whistles,Clothing,Panda bear,There were pictures of people of different age...,7.0,7.0,7.0,6.0,6.0,5.0,6.0,7.0,7.0,7.0,False
1,3,2,2s,425.0,Glasses 4 U,"Glasses, contact lenses, sunglasses",1,"Pictures of people smiling, some with sunglasses.",6.0,6.0,7.0,6.0,6.0,6.0,6.0,7.0,7.0,7.0,False
2,3,3,2s,73.0,Hillsport,Sport gear,2022,There was a picture of a white leopard. There ...,6.0,7.0,7.0,6.0,5.0,6.0,6.0,7.0,7.0,6.0,False
3,3,4,2s,85.0,Weathery,Information about the weather,Monday,There were lots of different weather forecasts...,7.0,7.0,7.0,6.0,7.0,5.0,6.0,7.0,7.0,6.0,False
4,3,5,2s,413.0,Mega Zoo,Pet supplies,Paw prints,There were different pictures of items for pets,5.0,5.0,6.0,4.0,5.0,4.0,5.0,6.0,6.0,4.0,False


## Automatic labeling

In [5]:
# Cast the free-text answer columns to str so .strip()/.lower() work on every
# row; missing answers become the literal string 'nan'.
for text_column in ('company', 'purpose', 'detail'):
    results[text_column] = results[text_column].astype(str)

# Scores take the values 0, 0.5 and 1, so the columns are created as float
# (0.0) up front instead of int: writing 0.5 into an int column relies on
# implicit upcasting, which newer pandas versions deprecate.
for score_column in ('company_eval', 'detail_eval'):
    results[score_column] = 0.0

# purpose_eval is placed at position 6, directly after `purpose`.
# DataFrame.insert raises if the column already exists, so reset it in place
# when the cell is re-run.
if 'purpose_eval' in results.columns:
    results['purpose_eval'] = 0.0
else:
    results.insert(6, 'purpose_eval', 0.0)

**Company's name labeling.**

In [6]:
# Spellings accepted as a correct company name (score 1).
COMPANY_CORRECT = {
    'whistles', 'whistle', 'whistless', 'whistels', 'wistles', 'whistlers', 'whilstles',
    'i-care', 'i care', 'icare', 'i -care', 'i- care', 'i - care',
    'hillsport', 'hill sport', 'hilsport', 'hill-sport', 'hills sport', 'hillspot',
    'hill sports', 'hill spot',
    'weathery', 'weatherly', 'weather', 'wethery', 'weathry', 'weatherby', 'wearthy',
    'mega zoo', 'megazoo', 'mega-zoo', 'meg zoo', 'mego zoo', 'maga zoo', 'mega zoom',
    'tech live', 'tech-live', 'techlive', 'tech - live', 'techlife', 'tech-life',
    'tech life', 'tech live (i think)', 'live tech',
}

# Answers accepted as partially correct (score 0.5).
COMPANY_PARTIAL = {
    'eye care', 'care', 'ice care', 'i-clear',
    'hillwear', 'sports', 'hillstart', 'hill sort', 'hill',
    'zoo', 'mini zoo', 'repti zoo', 'mega vets', 'moo zoo', 'zoofood',
    'weather office',
    'tech', 'tech - something', 'tech page', 'tech lite',
}


def label_company(answer: str) -> float:
    """Score a company-name answer.

    The answer is stripped and lower-cased, then compared by exact match.

    Returns:
        1.0 if the answer is in COMPANY_CORRECT, 0.5 if it is in
        COMPANY_PARTIAL, otherwise 0.0.
    """
    normalized = answer.strip().lower()
    if normalized in COMPANY_CORRECT:
        return 1.0
    if normalized in COMPANY_PARTIAL:
        return 0.5
    return 0.0


# NOTE(review): matching does not look at the row's task, so any accepted name
# scores on any task. Confirm that this is intended.
# The whole column is assigned at once, as floats, instead of writing
# cell-by-cell into an int column inside an iterrows() loop.
results['company_eval'] = results['company'].map(label_company)

**Company's purpose labeling.**

In [7]:
# Per task: keywords whose presence anywhere in the answer scores 1.
PURPOSE_CORRECT_KEYWORDS = {
    1: ['clothing', 'clothes', 'sustainable fashion', 'cloths'],
    2: [
        'glasses', 'eyewear', 'eye wear', 'eye products', 'eye care products', 'eye services',
        'gkasses', 'vision related services', 'eyeware', 'opticians', 'lenses',
    ],
    # A comma was missing after 'sports attire', which silently fused it with
    # 'sportwear' into one unusable keyword; both are now separate entries.
    3: [
        'clothes', 'clothing', 'sportswear', 'outdoor', 'mountain sports', 'activity equipment',
        'sports attire', 'sportwear', 'hiking', 'sports wear', 'sporting equipment', 'gear',
        'activewear', 'sports ware', 'active wear',
    ],
    4: ['forecast', 'weather', 'wheather'],
    5: [
        'pet food', 'pet supplies', 'for animals', 'pet products', 'pet related products',
        'animal food', 'animal products', 'pet items', 'pet supples', 'related to pet-keeping',
        'supplies for pets', 'animal accessories', 'animal supplies', 'stuff for pets',
        'pet related', 'pet accessories', 'pet needs', 'pet-related',
        'products for a variety of pets', 'pet care products', 'varoety of things for pets',
        'items for pets', 'they sell food, and toys for pets',
    ],
    6: [
        'tech news', 'technology news', 'about tech', 'tech information', 'tech industry',
        'gadgets and technology', 'latest tecnological', 'on tech', 'of tech', 'tech related',
        'modern technologies', 'technology related', 'tech blog', 'technology community',
        'for tech', 'tech knowledge', 'relating to technology', 'news about itech', 'tech info',
        'technology sector', 'technology info', 'tech articles',
    ],
}

# Per task: whole answers (exact match) that score 0.5.
PURPOSE_PARTIAL_EXACT = {
    2: {'eye things and holiday things', 'eyesight'},
    3: {'extreme winter sport services', 'backpacks and running shoes', "it's about trekking."},
}

# Per task: keywords whose presence anywhere in the answer scores 0.5.
PURPOSE_PARTIAL_KEYWORDS = {
    5: ['animal feed', 'animals', 'animal stuff', 'petfood', 'bio sand'],
    6: ['tech', 'news', 'articles', 'online magazine'],
}


def label_purpose(task: int, answer: str) -> float:
    """Score a company-purpose answer for the given task.

    The answer is stripped and lower-cased first. Full-credit keywords are
    checked before partial-credit rules, so an answer matching both scores 1.

    Returns:
        1.0 if any full-credit keyword for the task occurs in the answer,
        0.5 if the answer matches a partial-credit rule for the task,
        otherwise 0.0.
    """
    normalized = answer.strip().lower()
    if any(keyword in normalized for keyword in PURPOSE_CORRECT_KEYWORDS.get(task, ())):
        return 1.0
    if normalized in PURPOSE_PARTIAL_EXACT.get(task, ()):
        return 0.5
    if any(keyword in normalized for keyword in PURPOSE_PARTIAL_KEYWORDS.get(task, ())):
        return 0.5
    return 0.0


# Assign the whole column at once, as floats, rather than writing 0.5 into an
# int column cell-by-cell inside an iterrows() loop.
results['purpose_eval'] = [
    label_purpose(task, answer)
    for task, answer in zip(results['task'], results['purpose'])
]

**Web page's detail labeling.**

In [8]:
# Per task: whole answers (exact match after strip + lower) that score 1.
# Every entry must be lowercase, because it is compared with lower-cased text.
DETAIL_CORRECT = {
    1: {'hedgehog', 'hedghog', 'badger/hedgehog'},
    2: {'2', '2?', 'i think there were two children.', '2 ?', 'two', '2 kids two adults i think'},
    # Was '2020 I think', which could never equal a lower-cased answer.
    3: {'2020', '2020 summer/fall', '2020 i think'},
    4: {
        'thursday',
        "i think maybe thursday? but i'm not sure why i think that.",
        'thursday?',
    },
    5: {
        'paw prints', 'dog treats', 'dog paw?', 'bones', 'paws', 'a paw', 'dog bones', 'dogbone?',
        'dog bone', 'paw prints?', 'dog paws', 'paw and circle', 'dog paw',
        'i think i saw a paw print.', 'maybe a paw print?', 'paw prints, bones and leaves?',
        'i think it was paw prints. it (as best as i can remember) was a white and gold box with black pawprints.',
        'bones (for dogs)', 'dog treats such as bones etc?', 'bones and the company logo',
        'a paw print', 'bone', 'animal footprints', 'cat paw/print',
    },
    6: {
        'landscape', 'i think there was some sort of power plant', 'wind turbine', 'wind turbines',
        'power plant', 'electricity plant', 'alternative power source', 'tanks with a windmill',
        'windmill',
        "i'm not sure i think it was a view of something like a power plant or such but could be wrong. it appeared to be outdoors and industrial in nature but again i could be wrong.",
        'it looked like a battery farm but i am not sure if this is the correct term. it looks like it is in a remote location with a turbine in the background',
    },
}

# Per task: whole answers (exact match after strip + lower) that score 0.5.
DETAIL_PARTIAL = {
    2: {'1 or 2', '2-3?', 'unsure - 2 or 3', "1 or 2,  but i can't recall."},
    3: {'202-'},
    4: {'not sure but tuesday or thursday', 'all days of the week', 'everyday'},
    5: {
        'i think it was the characters of the brand name mega zoo', 'circle?', 'circle',
        'the mega zoo logo i think', 'mega zoo',
    },
    # Was 'A power station', which could never equal a lower-cased answer.
    6: {
        'battery farm', 'a power station', 'power station',
        'an industrial plant - refinery or something similar',
    },
}


def label_detail(task: int, answer: str) -> float:
    """Score a page-detail answer for the given task.

    The answer is stripped and lower-cased, then compared by exact match.

    Returns:
        1.0 if the answer is in DETAIL_CORRECT for the task, 0.5 if it is in
        DETAIL_PARTIAL for the task, otherwise 0.0.
    """
    normalized = answer.strip().lower()
    if normalized in DETAIL_CORRECT.get(task, ()):
        return 1.0
    if normalized in DETAIL_PARTIAL.get(task, ()):
        return 0.5
    return 0.0


# Assign the whole column at once, as floats, rather than writing 0.5 into an
# int column cell-by-cell inside an iterrows() loop.
results['detail_eval'] = [
    label_detail(task, answer)
    for task, answer in zip(results['task'], results['detail'])
]

In [9]:
# Inspect the column order after the *_eval columns were added.
results.columns

Index(['respondent', 'task', 'interval', 'time', 'company', 'purpose',
       'purpose_eval', 'detail', 'description', 'PQ1', 'ATT1', 'PQ2', 'HQ1',
       'PQ3', 'HQ2', 'HQ3', 'ATT2', 'PQ4', 'HQ4', 'familiarity',
       'company_eval', 'detail_eval'],
      dtype='object')

**Remove unnecessary columns.**

In [10]:
# Replace the raw company/detail answers with their numeric scores.
results['company'] = results['company_eval']
results['detail'] = results['detail_eval']

In [11]:
# Drop the company/detail score helper columns from the frame.
results = results.drop(columns=['company_eval', 'detail_eval'])

In [12]:
# Preview the first rows after automatic labeling.
results.head()

Unnamed: 0,respondent,task,interval,time,company,purpose,purpose_eval,detail,description,PQ1,ATT1,PQ2,HQ1,PQ3,HQ2,HQ3,ATT2,PQ4,HQ4,familiarity
0,3,1,2s,94.0,1.0,Clothing,1.0,0.0,There were pictures of people of different age...,7.0,7.0,7.0,6.0,6.0,5.0,6.0,7.0,7.0,7.0,False
1,3,2,2s,425.0,0.0,"Glasses, contact lenses, sunglasses",1.0,0.0,"Pictures of people smiling, some with sunglasses.",6.0,6.0,7.0,6.0,6.0,6.0,6.0,7.0,7.0,7.0,False
2,3,3,2s,73.0,1.0,Sport gear,1.0,0.0,There was a picture of a white leopard. There ...,6.0,7.0,7.0,6.0,5.0,6.0,6.0,7.0,7.0,6.0,False
3,3,4,2s,85.0,1.0,Information about the weather,1.0,0.0,There were lots of different weather forecasts...,7.0,7.0,7.0,6.0,7.0,5.0,6.0,7.0,7.0,6.0,False
4,3,5,2s,413.0,1.0,Pet supplies,1.0,1.0,There were different pictures of items for pets,5.0,5.0,6.0,4.0,5.0,4.0,5.0,6.0,6.0,4.0,False


## Preparation for manual labeling

In [13]:
# Add '-' placeholder columns for the manual labels, directly after
# `description`. Every insert uses the same position, so the columns are
# listed in reverse of their final order (comp, elem, char).
# Existing columns are skipped so that re-running the cell does not raise.
insert_at = results.columns.get_loc('description') + 1
for column_name in ('desc_eval_char', 'desc_eval_elem', 'desc_eval_comp'):
    if column_name not in results.columns:
        results.insert(insert_at, column_name, '-')

In [14]:
# Preview the first rows with the manual-label placeholder columns in place.
results.head()

Unnamed: 0,respondent,task,interval,time,company,purpose,purpose_eval,detail,description,desc_eval_comp,...,ATT1,PQ2,HQ1,PQ3,HQ2,HQ3,ATT2,PQ4,HQ4,familiarity
0,3,1,2s,94.0,1.0,Clothing,1.0,0.0,There were pictures of people of different age...,-,...,7.0,7.0,6.0,6.0,5.0,6.0,7.0,7.0,7.0,False
1,3,2,2s,425.0,0.0,"Glasses, contact lenses, sunglasses",1.0,0.0,"Pictures of people smiling, some with sunglasses.",-,...,6.0,7.0,6.0,6.0,6.0,6.0,7.0,7.0,7.0,False
2,3,3,2s,73.0,1.0,Sport gear,1.0,0.0,There was a picture of a white leopard. There ...,-,...,7.0,7.0,6.0,5.0,6.0,6.0,7.0,7.0,6.0,False
3,3,4,2s,85.0,1.0,Information about the weather,1.0,0.0,There were lots of different weather forecasts...,-,...,7.0,7.0,6.0,7.0,5.0,6.0,7.0,7.0,6.0,False
4,3,5,2s,413.0,1.0,Pet supplies,1.0,1.0,There were different pictures of items for pets,-,...,5.0,6.0,4.0,5.0,4.0,5.0,6.0,6.0,4.0,False


**Export.**

In [15]:
# Write the partially labeled results to CSV for the manual labeling step.
to_label_path = "../data/results_to_label.csv"
results.to_csv(to_label_path)