# Python Session 1
## We will practice cleaning some Food choice task data

We are going to generate data from 20 individuals to practice our skills. In the task, participants rate 50 foods for healthiness, tastiness and choice. We are simulating this data below.



In [2]:
import pandas as pd
import numpy as np
import random

In [37]:
import random
import pandas as pd

# Define blocks and trial structure
blocks = ['health', 'taste', 'choice']
trials_per_block = 75
participants = range(1, 21)

# Generate 50 unique foods
base_foods = [
    'apple', 'banana', 'burger', 'carrot', 'donut', 'eggs', 'fries', 'grapes', 'ice cream', 'kale',
    'pizza', 'yogurt', 'spinach', 'steak', 'candy', 'popcorn', 'mango', 'nuts', 'cheese', 'chicken',
    'broccoli', 'chocolate', 'granola', 'lettuce', 'pasta', 'salmon', 'tofu', 'soda', 'rice', 'beans',
    'cucumber', 'peach', 'bacon', 'cereal', 'toast', 'avocado', 'beef', 'peanut butter', 'cake', 'milk',
    'watermelon', 'pear', 'turkey', 'onion rings', 'oatmeal', 'cranberries', 'syrup', 'waffles', 'cookie', 'shrimp'
]
assert len(base_foods) == 50

# Assign fat and sugar levels randomly
food_properties = {}
for food in base_foods:
    fat = random.choices(['high', 'low'], weights=[0.4, 0.6])[0]
    sugar = random.choices(['high', 'low'], weights=[0.5, 0.5])[0]
    food_properties[food] = {'fat': fat, 'sugar': sugar}

# Generate trials
all_trials = []

for participant in participants:
    for block in blocks:
        for trial_num in range(1, trials_per_block + 1):
            food = random.choice(base_foods)
            rt_missing = random.random() < 0.02  # 2% chance of missing RT
            reaction_time = None if rt_missing else round(random.uniform(0.5, 4.0), 2)
            rating = None if reaction_time is None else random.randint(1, 10)

            fat = food_properties[food]['fat']
            sugar = food_properties[food]['sugar']

            trial = {
                'participant': participant,
                'block': block,
                'trial_number': trial_num,
                'food': food,
                'reaction_time': reaction_time,
                'rating': rating,
                'fat': fat,
                'sugar': sugar
            }
            all_trials.append(trial)

# Create DataFrame
df = pd.DataFrame(all_trials)

# Validate logic: rating is only missing if RT is missing
assert all(df[df['rating'].isna()]['reaction_time'].isna())

# The data are stored in a dataframe object, which we have called df
To access items in the dataframe, we need to type "df"

In [14]:
#If we want to see the data, we can just type
df

Unnamed: 0,participant,block,trial_number,food,reaction_time,rating,fat,sugar
0,1,health,1,cake,0.88,3.0,high,low
1,1,health,2,waffles,1.69,4.0,low,high
2,1,health,3,grapes,0.74,9.0,low,high
3,1,health,4,grapes,0.79,8.0,low,high
4,1,health,5,toast,1.66,6.0,low,low
...,...,...,...,...,...,...,...,...
4495,20,choice,71,popcorn,2.15,7.0,low,high
4496,20,choice,72,toast,1.86,7.0,low,low
4497,20,choice,73,onion rings,2.10,9.0,low,high
4498,20,choice,74,cookie,2.61,6.0,high,low


In [38]:
# To see anything in df we will need to reference df first
df.columns

Index(['participant', 'block', 'trial_number', 'food', 'reaction_time',
       'rating', 'fat', 'sugar'],
      dtype='object')

In [21]:
# We can also look at the values of columns
# All of these will access the food column
df.food
df['food']
df.iloc[:,3]

0              cake
1           waffles
2            grapes
3            grapes
4             toast
           ...     
4495        popcorn
4496          toast
4497    onion rings
4498         cookie
4499          pizza
Name: food, Length: 4500, dtype: object

In [20]:
# Try here with RT

0       0.88
1       1.69
2       0.74
3       0.79
4       1.66
        ... 
4495    2.15
4496    1.86
4497    2.10
4498    2.61
4499    2.76
Name: reaction_time, Length: 4500, dtype: float64

In [39]:
# To analyze this data, we will first need to remove any missing trials
# let's find the missing values
df.reaction_time[df.reaction_time.isna()==True]

292    NaN
363    NaN
455    NaN
464    NaN
469    NaN
        ..
4281   NaN
4323   NaN
4348   NaN
4436   NaN
4455   NaN
Name: reaction_time, Length: 105, dtype: float64

In [40]:
df.reaction_time[df.reaction_time > 3]

# What would we change to see RTs < 2 only?

8       3.18
10      3.64
11      3.46
19      3.91
25      3.40
        ... 
4484    3.86
4486    3.95
4488    3.52
4490    3.15
4492    3.61
Name: reaction_time, Length: 1211, dtype: float64

In [42]:
# make a new data frame with no missing values
df1 = df[df.reaction_time.isna()==True]

In [None]:
# Now we want to perform some calculations on this data-set
# let's start by summarizing, for one person the health rating

# Filter for participant 1 and the 'health' block
participant_id = 1
health_block = df[(df['participant'] == participant_id) & (df['block'] == 'health')]

# Remove missing ratings (i.e., where RT was missing)
valid_ratings = health_block['rating'].dropna()

# Calculate the average health rating
average_health_rating = valid_ratings.mean()

print(f"Participant {participant_id}'s average health rating: {average_health_rating:.2f}")


In [None]:
#Try for health only for low and high-fat

In [43]:
#Now let's create a new dataframe and store each persons average RT and rating for high and low fat foods

# Group by participant, block, and fat level
summary_df = (
    df
    .dropna(subset=['rating', 'reaction_time'])  # Exclude trials with missing values
    .groupby(['participant', 'block', 'fat'])
    .agg(
        average_rating=('rating', 'mean'),
        average_reaction_time=('reaction_time', 'mean'),
        trial_count=('rating', 'count')  # Optional: to see how many valid trials per group
    )
    .reset_index()
)

print(summary_df.head())


   participant   block   fat  average_rating  average_reaction_time  \
0            1  choice  high        5.083333               2.010000   
1            1  choice   low        5.650794               2.090000   
2            1  health  high        4.444444               2.417778   
3            1  health   low        5.625000               2.231042   
4            1   taste  high        5.600000               2.165333   

   trial_count  
0           12  
1           63  
2           27  
3           48  
4           15  


In [44]:
# Pivot to wide format
wide_df = summary_df.pivot_table(
    index='participant',
    columns=['block', 'fat'],
    values=['average_rating', 'average_reaction_time']
)




   participant  average_rating_choice_high  average_rating_choice_low  \
0            1                    5.083333                   5.650794   
1            2                    5.666667                   5.425926   
2            3                    5.739130                   5.875000   
3            4                    5.074074                   6.111111   
4            5                    5.333333                   5.196078   

   average_rating_health_high  average_rating_health_low  \
0                    4.444444                   5.625000   
1                    5.884615                   4.875000   
2                    5.241379                   6.116279   
3                    6.086957                   5.423077   
4                    5.137931                   5.913043   

   average_rating_taste_high  average_rating_taste_low  \
0                   5.600000                  6.116667   
1                   5.695652                  5.215686   
2                   5.3000

In [None]:
# Step 3: Flatten column names
wide_df.columns = [f'{stat}_{block}_{fat}' for stat, block, fat in wide_df.columns]
wide_df = wide_df.reset_index()

print(wide_df.head())

In [None]:
# Here try and simulate a different dataset - a monetary choice task where the participant
# selects between an immediate vs delayed reward. Compare the RT between when the participant
# chooses the immediate vs delayed option

In [48]:
# navigate to the directory
import os
wd="/Users/emilylloyd/Documents/Coding Course"
os.chdir(wd)
#read the dataset in
data=pd.read_csv("DelayDisc_example.csv")

In [61]:
# figure out whether left or right column is delayed (1 is left, 2 is right)
data['delayed_opt']= "none"
data.loc[data['delay_left'] < data['delay_right'],'delayed_opt'] ==2
data.loc[data['delay_left'] > data['delay_right'],'delayed_opt'] ==1

0      False
1      False
2      False
3      False
4      False
       ...  
109    False
111    False
112    False
114    False
117    False
Name: delayed_opt, Length: 62, dtype: bool

In [60]:
# Now summarize the RT for each person when they chose delayed vs chose sooner reward



KeyError: 'delayed_opt'

# This is the homework exercise

In [49]:
# Here calculate the average earnings per person and the number of times they chose delayed vs sooner

## Upload solution to Github

Unnamed: 0,onset durati,on,choice,agent,money_left,delay_left,money_right,delay_right,participant
0,17.009420,5,1,1,23.21,131,10.99,51,1
1,28.013655,5,1,1,16.43,32,9.99,19,1
2,43.017407,5,1,1,38.44,33,32.02,12,1
3,56.021820,5,1,1,38.66,100,26.57,24,1
4,71.024792,5,1,1,29.54,142,27.76,6,1
...,...,...,...,...,...,...,...,...,...
115,344.112863,5,1,0,30.54,38,37.25,132,2
116,357.117812,5,2,0,23.07,82,29.27,140,2
117,368.121754,5,1,0,33.18,165,27.61,104,2
118,383.125799,5,1,0,6.15,51,14.76,177,2


# Extra
## These are hard exercises - not homework, for extra practice

In [None]:
# Hard

# Here simulate your own Delay Discounting Task and calculate some average metrics

In [None]:
# Very hard
# One outcome of interest is the discount rate, k, which denotes extent to which someone discounts
# value of delayed rewards (higher values = less patient)

# Here you can use chatGPT to get the formula for k - see whether you can calculate for each person
# in your dataset