In [1]:
###UPF Brain structure brain project University of Helsinki 6.02.2024
###Arsene Kanyamibwa, Daniel Fängström 
rm(list=ls())

#install packages (if needed)
#install.packages("tidyverse")
#install.packages("ggplot2")
#install.packages("Hmisc")

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
#Load the necessary libraries 
  library(tidyverse)
  library(ggplot2)
  library(ggpubr)
  library(Hmisc)
  library(readxl)
 
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::


── [1mAttaching core tidyverse packages[22m ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.2     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.2     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.1     
── [1mConflicts[22m ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::

In [4]:
UKBB_participant_diet_answers=read.table('/dagher/dagher11/filip/UPF/data/coalesced_intake_all_timepoints.csv', sep=',', quote='"', header=T)
#colnames(UKBB_participant_diet_answers)=gsub('_coalesced','',colnames(UKBB_participant_diet_answers)) #remove coalesced from column names
UKBB_participant_diet_answers$Participant=UKBB_participant_diet_answers$eid
mastersheet=readRDS('/dagher/dagher11/filip/UPF/data/mastersheet_clean.rds')

# MANUALLY change column names to correspond to the mastersheet

In [62]:
change_to=c('pure_fruit_vegetable_juice_intake',
'sugar_added_to_coffee_intake',
'sugar_added_to_tea_intake',
'beer/cider_intake',
'whole-wheat_cereal_intake',
"goat's_cheese_intake",
'number_of_bread_slices_with_butter/margarine_intake',
'number_of_baguettes_with_butter/margarine_intake',
'number_of_baps_with_butter/margarine_intake',
'number_of_bread_rolls_with_butter/margarine_intake',
'number_of_crackers/crispbreads_with_butter/margarine_intake',
'number_of_oatcakes_with_butter/margarine_intake',
'number_of_other_bread_types_with_butter/margarine_intake',
'ice-cream_intake',
'milk-based_pudding_intake',
'other_milk-based_pudding_intake',
'chocolate-covered_raisin_intake',
'chocolate-covered_biscuits_intake',
'powdered/instant_soup_intake',
'crumbed_or_deep-fried_poultry_intake',
'lobster/crab_intake',
'vegetarian_sausages/burgers_intake',
'boiled/baked_potatoes_intake',
'cabbage/kale_intake',
'turnip/swede_intake',
'peach/nectarine_intake',
'other_coffee_type_intake')

to_change=c('pure_fruitvegetable_juice_intake','intake_of_sugar_added_to_coffee','intake_of_sugar_added_to_tea',
            'beercider_intake','whole.wheat_cereal_intake','goats_cheese_intake', 'number_of_bread_slices_with_buttermargarine',
            'number_of_baguettes_with_buttermargarine','number_of_baps_with__buttermargarine',
            'number_of_bread_rolls_with__buttermargarine','number_of_crackerscrispbreads_with_buttermargarine',
            'number_of_oatcakes_with__buttermargarine','number_of_other_bread_types_with__buttermargarine',
            'ice.cream_intake','milk.based_pudding_intake','other_milk.based_pudding_intake','chocolate.covered_raisin_intake',
            'chocolate.covered_biscuits_intake','powderedinstant_soup_intake','crumbed_or_deep.fried_poultry_intake',
            'lobstercrab_intake','vegetarian_sausagesburgers_intake','boiledbaked_potatoes_intake','cabbagekale_intake',
            'turnipswede_intake','peachnectarine_intake','other_coffee_type')

for (i in 1:length(change_to)){

    names(UKBB_participant_diet_answers)[names(UKBB_participant_diet_answers) == to_change[i]] <- change_to[i]
    
}

In [63]:
# Function to calculate consumption for each participant
calculate_consumption <- function(response, serving_size, energy_per_100g_or_ml) {
  # Calculate total amount consumed per item by participant
  total_consumed_g_or_ml <- response * serving_size
  
  # Calculate energy per serving
  energy_per_serving_KJ <- serving_size * (energy_per_100g_or_ml/100)
  
  # Calculate energy intake for each participant by item
  total_energy_consumed <- energy_per_serving_KJ * response
  
  # Return total amount consumed and energy per serving
  return(list(total_consumed_g_or_ml = total_consumed_g_or_ml, energy_per_serving_KJ = energy_per_serving_KJ, total_energy_consumed = total_energy_consumed))
}

In [6]:
final_UKBB_results_table <- data.frame(participant=vector("character", length(UKBB_participant_diet_answers$Participant)))
names=matrix(ncol=1,nrow=nrow(mastersheet))

pb = txtProgressBar(min = 0, max = nrow(UKBB_participant_diet_answers), initial = 0) 

# Loop through each row in UKBB_participant_diet_asnwers

for (i in 1:nrow(mastersheet)) {  #look into all rows of the dataframe
  current_item_name <- mastersheet[i,2] #look into item names and create a variable
  current_serving_g <- mastersheet[i,8] #look serving size in grams column names and create a variable
  current_serving_ml <- mastersheet[i,9] #look serving size in ml column and create a variable
  current_Energy <- mastersheet[i,10] #look energy in 100 ml or grams column and create a variable
  if (is.na(current_serving_g)){
    current_unit <- current_serving_ml
  }  else {
      current_unit <- current_serving_g #check if item has a NA value for serving size in ml or gr if not use the column with a value
  }
  
  current_item_name <- gsub(" ","_",current_item_name, fixed = TRUE) #eliminate space from item names
  current_item_name_intake <- paste(current_item_name, "_intake") #add intake at the end of item names
  current_item_name_intake <- gsub(" ","",current_item_name_intake, fixed = TRUE) #too much of cognitive power for 3 lines
  current_item_name_intake <- tolower(current_item_name_intake) #we made the initial letter non capital
  column_item_total_consumed_g_or_ml <-  paste(current_item_name,"_total_consumed_g_or_ml", sep = "")
  final_UKBB_results_table[, column_item_total_consumed_g_or_ml] <- NA
  column_item_energy_per_serving_KJ <-  paste(current_item_name,"_energy_per_serving_KJ", sep = "")
  final_UKBB_results_table[, column_item_energy_per_serving_KJ] <- NA
  column_item_total_energy_consumed <-  paste(current_item_name,"_total_energy_consumed", sep = "")
  final_UKBB_results_table[, column_item_total_energy_consumed] <- NA
  
  for (j in 1:nrow(UKBB_participant_diet_answers)){
    Participant <- as.character(UKBB_participant_diet_answers$Participant[j]) #Go through dataframe and find participant numbers final_UKBB_results_table[j, "participant"] <- as.character(UKBB_participant_diet_answers$Participant[j])
    #print(Participant)
    if (is.null(final_UKBB_results_table[j, Participant])){  #m
      final_UKBB_results_table[j, "participant"] <- Participant  #take participant variable value and insert in empty dataframe
    }
    
    indx <- grep(paste('^',current_item_name_intake, sep=''), colnames(UKBB_participant_diet_answers))  #change from grepl to grep to give us columns number instead of logical index
    #print(UKBB_participant_diet_answers[j,indx]) #then prints the value of dataframe in j= participant and column found
    #print(current_item_name_intake)
    response <- UKBB_participant_diet_answers[j,indx] #making it a variable
    setTxtProgressBar(pb,j)
    if (!length(indx)==0){  # Added this - sometimes current_item_name_intake is not found (because there is not _intake at the end), so this needs a new condition 
      if (!is.na(response) ) { #If the response is not NA, go through the rest of the logic
          if (is.character(response)) { # If the response is  character, check what type and reformat it
            #print(endsWith(response,"+"))
            if (endsWith(response,"+")) {
              response <- gsub("+", "", response, fixed = TRUE)
              #print(response)
            } else {
              response <- gsub("varied", NA, response, fixed = TRUE) #the number remains a string
              response <- gsub("half", "0.5", response, fixed = TRUE) #the number remains a string
              response <- gsub("quarter", "0.25", response, fixed = TRUE) #the number remains a string

            }
            #print(response)
              response <- ifelse(is.na(response), NA, as.numeric(response)) #making sure it is a number not string. 0.5 is not included as an integer 
            #print(response)
          }
          UKBB_total_consumption <- calculate_consumption(response,current_unit,current_Energy)
          final_UKBB_results_table[j, column_item_energy_per_serving_KJ] <- UKBB_total_consumption$energy_per_serving_KJ
          final_UKBB_results_table[j,column_item_total_energy_consumed] <-  UKBB_total_consumption$total_energy_consumed
          final_UKBB_results_table[j, column_item_total_consumed_g_or_ml] <- UKBB_total_consumption$total_consumed_g_or_ml
          #print(final_UKBB_results_table)
        }
    }
      #else{names[i]=current_item_name_intake} # check which variables from mastersheet are still missing
  }

    close(pb)
        
}



    # Store results
write.table(final_UKBB_results_table,'/dagher/dagher11/filip/UPF/data/Participants_consumption.csv', quote=T, row.names=F)





In [64]:
# Function to create results table for one timepoint
process_timepoint <- function(UKBB_data, mastersheet, timepoint) {
  # Initialize results table
  final_UKBB_results_table <- data.frame(participant=UKBB_data$eid)
  
  # More informative progress tracking
  total_items <- nrow(mastersheet)
  
  cat(sprintf("\nProcessing timepoint %d.0 (%d items to process)\n", timepoint, total_items))
  
  # Loop through each item in mastersheet
  for (i in 1:nrow(mastersheet)) {
    # Progress update every 10 items
    if(i %% 10 == 0) {
      cat(sprintf("\rProcessing item %d of %d (%.1f%%)", i, total_items, i/total_items*100))
    }
    
    current_item_name <- mastersheet[i,2]
    current_serving_g <- mastersheet[i,8]
    current_serving_ml <- mastersheet[i,9]
    current_Energy <- mastersheet[i,10]
    
    current_unit <- ifelse(is.na(current_serving_g), current_serving_ml, current_serving_g)

      # Process item name
    current_item_name <- gsub(" ","_",current_item_name, fixed = TRUE)
    current_item_name_intake <- tolower(paste0(current_item_name, "_intake"))
    
    # Initialize columns
    column_item_total_consumed_g_or_ml <- paste0(current_item_name,"_total_consumed_g_or_ml")
    column_item_energy_per_serving_KJ <- paste0(current_item_name,"_energy_per_serving_KJ")
    column_item_total_energy_consumed <- paste0(current_item_name,"_total_energy_consumed")
    
    final_UKBB_results_table[, column_item_total_consumed_g_or_ml] <- NA
    final_UKBB_results_table[, column_item_energy_per_serving_KJ] <- NA
    final_UKBB_results_table[, column_item_total_energy_consumed] <- NA
    
    # Find matching column in UKBB data (accounting for timepoint suffix)
    indx <- grep(paste0('^',current_item_name_intake, ".*\\.", timepoint, "\\.0$"), 
                colnames(UKBB_data))
    
    if(length(indx) > 0) {
      responses <- UKBB_data[,indx]
      
      # Process each response
      for(j in 1:length(responses)) {
        response <- responses[j]
        
        if(!is.na(response)) {
          if(is.character(response)) {
            response <- gsub("\\+$", "", response)
            response <- gsub("varied", NA, response, fixed = TRUE)
            response <- gsub("half", "0.5", response, fixed = TRUE)
            response <- gsub("quarter", "0.25", response, fixed = TRUE)
            response <- ifelse(is.na(response), NA, as.numeric(response))
          }
            

            
          UKBB_total_consumption <- calculate_consumption(as.numeric(response), as.numeric(current_unit), current_Energy)
          final_UKBB_results_table[j, column_item_energy_per_serving_KJ] <- 
            UKBB_total_consumption$energy_per_serving_KJ
          final_UKBB_results_table[j, column_item_total_energy_consumed] <- 
            UKBB_total_consumption$total_energy_consumed
          final_UKBB_results_table[j, column_item_total_consumed_g_or_ml] <- 
            UKBB_total_consumption$total_consumed_g_or_ml
        }
      }
    }
  }
  
  cat("\nCompleted processing timepoint", timepoint, ".0\n")
  return(final_UKBB_results_table)
}

# Main execution
process_all_timepoints <- function(UKBB_data, mastersheet) {
  # Process each timepoint
  results_list <- list()
  
  for(timepoint in 0:4) {
    results_list[[paste0("timepoint_", timepoint)]] <- 
      process_timepoint(UKBB_data, mastersheet, timepoint)
  }
  
  # Save results for each timepoint
  for(timepoint in 0:4) {
    filename <- sprintf("/dagher/dagher11/filip/UPF/data/Participants_consumption_t%d.csv", 
                       timepoint)
    write.table(results_list[[paste0("timepoint_", timepoint)]], 
                filename, quote=TRUE, row.names=FALSE)
  }
  
  return(results_list)
}



In [71]:
results <- process_all_timepoints(UKBB_participant_diet_answers, mastersheet)


Processing timepoint 0.0 (210 items to process)
Processing item 210 of 210 (100.0%)
Completed processing timepoint 0 .0

Processing timepoint 1.0 (210 items to process)
Processing item 210 of 210 (100.0%)
Completed processing timepoint 1 .0

Processing timepoint 2.0 (210 items to process)
Processing item 210 of 210 (100.0%)
Completed processing timepoint 2 .0

Processing timepoint 3.0 (210 items to process)
Processing item 210 of 210 (100.0%)
Completed processing timepoint 3 .0

Processing timepoint 4.0 (210 items to process)
Processing item 210 of 210 (100.0%)
Completed processing timepoint 4 .0


In [2]:
final_UKBB_results_table=read.table('/dagher/dagher11/filip/UPF/data/Participants_consumption.csv', header=T, sep=' ')

# NOVA calculations

In [5]:
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
#Calculations for total energy and NOVA precentage

#Get column indices that contain "participant" or "_total_energy_consumed"
columns_to_keep <- grepl("participant|_total_energy_consumed", colnames(final_UKBB_results_table))

# Keep only the columns containing "participant" or "_total_energy_consumed"
final_UKBB_total_consumption <- final_UKBB_results_table[, columns_to_keep]

#check dataframe
#str(final_UKBB_total_consumption)

#same procedure without trasposing first.
# Get the column names of the dataframe
col_names <- colnames(final_UKBB_total_consumption)
col_names <- gsub("_total_energy_consumed", "", col_names)
col_names <- gsub("_", " ", col_names)
# Assign the modified column names back to the dataframe
colnames(final_UKBB_total_consumption) <- col_names
#change to long format
final_UKBB_total_consumption <- gather(final_UKBB_total_consumption, 
                                       key = "Food_Item",
                                       value = "Energy_KJ", -1)

#Calculate total energy (i originally calculated the energy and wanted to add the NOVA but I tried another approach)
#final_UKBB_total_consumption <- final_UKBB_total_consumption %>%
#  mutate(Total_energy = rowSums(select(.,-1), na.rm=TRUE))

#load another dataframe to insert nova groups and merge
#Store results
#getwd()
#setwd("P:/h345/obrain_labspace/Projects/PhD_projects/MARVEL/MARVEL III_UPF/03_Experiment/03_Data")
#write.csv(final_UKBB_total_consumption, file = "Final_UKBB_total_consumption.csv")
#I transposed the document in excel just for the sake of speed, I was getting a little frustarted with the errors 24.04.2024

#load new document
final_UKBB_total_consumption2 <- read.csv("/dagher/dagher11/filip/UPF/data/Final_UKBB_total_consumption_v2.csv")

# Get the column names of the dataframe
col_names <- colnames(final_UKBB_total_consumption2)

# Remove "X" from the beginning of column names (if present)
col_names <- gsub("^X", "", col_names)

# Assign the modified column names back to the dataframe
colnames(final_UKBB_total_consumption2) <- col_names
#remove the end with total_energy_consumed
for (col in colnames(final_UKBB_total_consumption2)) {
  final_UKBB_total_consumption2[[col]] <- gsub("_total_energy_consumed", "", final_UKBB_total_consumption2[[col]])
}
# Remove "_" from values in the column named "Food_item"
final_UKBB_total_consumption2$Food_item <- gsub("_", " ", final_UKBB_total_consumption2$Food_item)
# Find matching values and extract corresponding NOVA values
final_UKBB_total_consumption2$NOVA <- mastersheet$`NOVA _code`[match(final_UKBB_total_consumption2$Food_item, mastersheet$Item_Name)]
#make NOVA column factor
final_UKBB_total_consumption2$NOVA <- as.factor(final_UKBB_total_consumption2$NOVA)
#remove columns in position 2 and 11
final_UKBB_total_consumption2 <- final_UKBB_total_consumption2 %>% 
  select(-2,-3,-4,-5,-6,-7,-8,-9,-10,-11) 
#Match names on food items
names(final_UKBB_total_consumption2)[1]= "Food_Item"
#merge two dataframes
main_df <- merge(final_UKBB_total_consumption, final_UKBB_total_consumption2, by= "Food_Item", all.x = TRUE)

#I tried this way because the most obvious ones did not work (see like 197)
summary_df <- aggregate(main_df$Energy_KJ, list(main_df$participant, main_df$NOVA), FUN=sum, na.rm = TRUE) 
#changed names
{
  names(summary_df)[1] = "Participant"
  names(summary_df)[2] = "NOVA_Group"
  names(summary_df)[3] = "Total_Energy"
  }

#print(summary_df)

# Step 2: Summarize the energy consumed per NOVA group
# Group the dataframe by participant ID and NOVA group, and calculate the sum of energy consumed
summary_df <- main_df %>%
  group_by(participant, NOVA) %>%
  summarise(total_energy_consumed = sum(Energy_KJ, na.rm=T)) #for some reason this did not work
#this error kept coming and I could not solve it
#Error in summarize(., Total_energy = sum(Energy_KJ)) : argument "by" is missing, with no default


[1m[22m`summarise()` has grouped output by 'participant'. You can override using the `.groups` argument.


In [6]:
write.table(summary_df,'/dagher/dagher11/filip/UPF/data/Participants_summary_NOVA.csv', quote=T, row.names=F)


In [8]:
summary_df_nona=na.omit(summary_df)

In [9]:
NOVA_energy=pivot_wider(
  summary_df_nona,
  id_cols = c(participant),
  names_from = NOVA,
  values_from = total_energy_consumed,
  names_prefix = "kJ_NOVA_"
)

In [10]:
write.table(NOVA_energy,'/dagher/dagher11/filip/UPF/data/NOVA_energy_values_wide.csv', quote=T, row.names=F)


# New NOVA calculations per time point and averaged

In [105]:
# 1. Modified process_nova_energy to include participation check
process_nova_energy <- function(results_table, mastersheet) {
  # First check who actually participated
  participated <- check_participation(results_table)
  results_table <- results_table[participated, ]
  
  # Rest remains the same
  columns_to_keep <- grepl("participant|_total_energy_consumed", colnames(results_table))
  energy_consumption <- results_table[, columns_to_keep]
  
  colnames(energy_consumption) <- gsub("_total_energy_consumed", "", colnames(energy_consumption))
  colnames(energy_consumption) <- gsub("_", " ", colnames(energy_consumption))
  
  energy_long <- gather(energy_consumption, 
                       key = "Food_Item",
                       value = "Energy_KJ", -participant)
  
  energy_long$NOVA <- mastersheet$`NOVA _code`[match(energy_long$Food_Item, 
                                                    mastersheet$Item_Name)]
  
  nova_summary <- energy_long %>%
    group_by(participant, NOVA) %>%
    summarise(Energy_KJ = sum(Energy_KJ, na.rm = TRUE), .groups = 'drop') %>%
    group_by(participant) %>%
    mutate(Total_Energy = sum(Energy_KJ, na.rm = TRUE),
           Percentage = (Energy_KJ / Total_Energy) * 100) %>%
    ungroup()
  
  nova_wide <- nova_summary %>%
    pivot_wider(
      id_cols = participant,
      names_from = NOVA,
      values_from = c(Energy_KJ, Percentage),
      names_prefix = "NOVA_",
      values_fill = 0
    ) %>%
    mutate(Total_Energy_KJ = rowSums(select(., starts_with("Energy_KJ")), na.rm = TRUE))
  
  return(nova_wide)
}

# Function to check participation
check_participation <- function(results_table) {
  energy_cols <- grep("_total_energy_consumed", colnames(results_table))
  has_data <- rowSums(results_table[, energy_cols] > 0, na.rm = TRUE) > 0
  return(has_data)
}

process_all_timepoints_nova <- function(results_list, mastersheet) {
  nova_results <- list()
  
  # Process each timepoint separately
  for(t in 0:4) {
    results_table <- results_list[[paste0("timepoint_", t)]]
    if(!is.null(results_table)) {
      # Process everyone but store participation info
      nova_results[[paste0("t", t)]] <- process_nova_energy(results_table, mastersheet)
    }
  }
  
  # Get all unique participants
  all_participants <- unique(unlist(lapply(nova_results, function(x) x$participant)))
  all_participants <- all_participants[!is.na(all_participants)]
  
  # Initialize columns we want to average
  cols_to_average <- grep("^(Energy_KJ|Percentage|Total_Energy)", 
                         colnames(nova_results[[1]]), value = TRUE)
  
  # Create empty dataframe for averages
  avg_results <- data.frame(
    participant = all_participants,
    matrix(0, 
           nrow = length(all_participants), 
           ncol = length(cols_to_average),
           dimnames = list(NULL, cols_to_average))
  )
  
  # Calculate averages
  for(i in seq_along(all_participants)) {
    p <- all_participants[i]
    sum_values <- as.numeric(rep(0, length(cols_to_average)))
    participated_timepoints <- 0
    
    # Sum values across timepoints
    for(t in names(nova_results)) {
      timepoint_data <- nova_results[[t]][nova_results[[t]]$participant == p, ]
      if(nrow(timepoint_data) > 0) {
        # Check if they have any non-zero values in this timepoint
        if(any(as.numeric(timepoint_data[1, grep("Energy_KJ", cols_to_average)]) > 0, na.rm = TRUE)) {
          participated_timepoints <- participated_timepoints + 1
          current_values <- as.numeric(timepoint_data[1, cols_to_average])
          sum_values <- sum_values + replace(current_values, is.na(current_values), 0)
        }
      }
    }
    
    # Calculate average only if they participated in any timepoints
    if(participated_timepoints > 0) {
      avg_results[i, cols_to_average] <- sum_values / participated_timepoints
    }
    
    # Store number of participated timepoints
    avg_results$participated_timepoints[i] <- participated_timepoints
  }
  
  # Save results
  for(t in 0:4) {
    if(!is.null(nova_results[[paste0("t", t)]])) {
      write.csv(nova_results[[paste0("t", t)]], 
                sprintf("/dagher/dagher11/filip/UPF/data/NOVA_energy_t%d.csv", t),
                row.names = FALSE)
    }
  }
  
  write.csv(avg_results, 
            "/dagher/dagher11/filip/UPF/data/NOVA_energy_averaged.csv",
            row.names = FALSE)
  
  return(list(
    timepoint_results = nova_results,
    averaged_results = avg_results
  ))
}

# 3. Function to create combined dataset
create_combined_df <- function(nova_results) {
  # Get all unique eids across all timepoints
  all_eids <- NULL
  for(t in 0:4) {
    if(!is.null(nova_results$timepoint_results[[paste0("t", t)]])) {
      all_eids <- union(all_eids, 
                       nova_results$timepoint_results[[paste0("t", t)]]$participant)
    }
  }
  all_eids <- all_eids[!is.na(all_eids)]  # Remove NA eids
  
  # Initialize combined_df with all possible eids
  combined_df <- data.frame(eid = all_eids)
  
  # Add timepoint-specific columns
  for(t in 0:4) {
    if(!is.null(nova_results$timepoint_results[[paste0("t", t)]])) {
      timepoint_data <- nova_results$timepoint_results[[paste0("t", t)]]
      timepoint_data <- timepoint_data[!is.na(timepoint_data$participant), ]
      
      # Create temporary dataframe for this timepoint
      temp_df <- timepoint_data
      names(temp_df)[names(temp_df) == "participant"] <- "eid"
      
      # Remove participant column and rename remaining columns
      temp_df$participant <- NULL
      names(temp_df)[-1] <- paste0(names(temp_df)[-1], "_t", t)
      
      # Merge with combined_df
      combined_df <- merge(combined_df, temp_df, by = "eid", all = TRUE)
    }
  }
  
  # Add averaged data
  averaged_data <- nova_results$averaged_results
  averaged_data <- averaged_data[!is.na(averaged_data$participant), ]
  names(averaged_data)[names(averaged_data) == "participant"] <- "eid"
  names(averaged_data)[-1] <- paste0(names(averaged_data)[-1], "_avg")
  
  # Merge with combined_df
  combined_df <- merge(combined_df, averaged_data, by = "eid", all = TRUE)
  
  # Sort by eid
  combined_df <- combined_df[order(combined_df$eid), ]
  
  # Save the combined dataset
  write.csv(combined_df, 
            "/dagher/dagher11/filip/UPF/data/NOVA_energy_all_timepoints_combined.csv",
            row.names = FALSE)
  
  return(combined_df)
}

get_most_recent_timepoint <- function(nova_results) {
  # Initialize empty dataframe for most recent data
  most_recent <- data.frame()
  
  # Get all unique participants
  all_participants <- unique(unlist(lapply(nova_results$timepoint_results, function(x) x$participant)))
  all_participants <- all_participants[!is.na(all_participants)]
  
  # For each participant
  for(p in all_participants) {
    # Check timepoints from newest (4) to oldest (0)
    for(t in 4:0) {
      timepoint_key <- paste0("t", t)
      if(!is.null(nova_results$timepoint_results[[timepoint_key]])) {
        participant_data <- nova_results$timepoint_results[[timepoint_key]][
          nova_results$timepoint_results[[timepoint_key]]$participant == p, ]
        
        # If participant has data at this timepoint and has any energy values
        if(nrow(participant_data) > 0 && 
           any(as.numeric(participant_data[1, grep("Energy_KJ", colnames(participant_data))]) > 0, 
               na.rm = TRUE)) {
          # Add timepoint number
          participant_data$timepoint <- t
          # Add to results and move to next participant
          most_recent <- rbind(most_recent, participant_data)
          break  # Exit loop once we found the most recent timepoint
        }
      }
    }
  }
  
  # Save the results
  write.csv(most_recent, 
            "/dagher/dagher11/filip/UPF/data/NOVA_energy_most_recent.csv",
            row.names = FALSE)
  
  return(most_recent)
}



In [98]:
nova_analysis <- process_all_timepoints_nova(results, mastersheet)

In [101]:
combined_results <- create_combined_df(nova_analysis)

# Compute intraclass correlation between NOVA 4 for all timepoints

In [1]:
combined_results=read.table("/dagher/dagher11/filip/UPF/data/NOVA_energy_all_timepoints_combined.csv", sep=',', header=T)

In [2]:
library(psych)  # for ICC
library(tidyr)

calculate_nova_icc <- function(combined_results) {
  # Define NOVA categories (1 through 4, exclude NA)
  nova_categories <- 1:4
  
  icc_results <- list()
  
  # First print data availability
  cat("\nChecking data availability:\n")
  for(nova in nova_categories) {
    for(t in 0:4) {
      col_name <- paste0("Percentage_NOVA_", nova, "_t", t)
      n_valid <- sum(!is.na(combined_results[[col_name]]))
      cat(sprintf("NOVA %d, Timepoint %d: %d valid measurements\n", 
                 nova, t, n_valid))
    }
  }
  
  for(nova in nova_categories) {
    # Create matrix for this NOVA category
    percentage_cols <- paste0("Percentage_NOVA_", nova, "_t", 0:4)
    
    # Extract data for this NOVA category across all timepoints
    nova_data <- combined_results[, percentage_cols, drop = FALSE]
    
    # Only include participants with at least 2 timepoints
    complete_cases <- rowSums(!is.na(nova_data)) >= 2
    nova_data_clean <- nova_data[complete_cases, ]
    
    # Calculate ICC if we have enough data
    if(nrow(nova_data_clean) > 1) {
      tryCatch({
        icc_result <- ICC(nova_data_clean)
        icc_results[[paste0("NOVA", nova)]] <- icc_result
        
        cat(sprintf("\nNOVA %d ICC calculation successful:\n", nova))
        cat(sprintf("Number of participants included: %d\n", nrow(nova_data_clean)))
      }, error = function(e) {
        message(paste("Could not calculate ICC for NOVA", nova, ":", e$message))
      })
    } else {
      message(paste("Not enough data for ICC calculation for NOVA", nova))
    }
  }
  
  # Create summary dataframe only if we have valid results
  if(length(icc_results) > 0) {
    # First check if we have all the required elements in the results
    valid_results <- sapply(icc_results, function(x) {
      !is.null(x) && !is.null(x$results) && 
        all(c("ICC", "F", "p", "ICC_low", "ICC_up") %in% colnames(x$results))
    })
    
    if(any(valid_results)) {
      # Only use valid results
      valid_icc_results <- icc_results[valid_results]
      
      icc_summary <- data.frame(
        NOVA_category = names(valid_icc_results),
        ICC_single = sapply(valid_icc_results, function(x) x$results[1, "ICC"]),
        ICC_average = sapply(valid_icc_results, function(x) x$results[2, "ICC"]),
        F_value = sapply(valid_icc_results, function(x) x$results[1, "F"]),
        p_value = sapply(valid_icc_results, function(x) x$results[1, "p"]),
        CI_lower = sapply(valid_icc_results, function(x) x$results[1, "ICC_low"]),
        CI_upper = sapply(valid_icc_results, function(x) x$results[1, "ICC_up"])
      )
      
      # Save results
      write.csv(icc_summary, 
                "/dagher/dagher11/filip/UPF/data/NOVA_ICC_results.csv",
                row.names = FALSE)
    } else {
      icc_summary <- data.frame()
      message("No valid ICC results to summarize")
    }
  } else {
    icc_summary <- data.frame()
    message("No ICC results were calculated")
  }
  
  # Return results
  return(list(
    summary = icc_summary,
    full_results = icc_results,
    data_used = if(exists("nova_data_clean")) nova_data_clean else NULL
  ))
}

# Usage:
icc_results <- calculate_nova_icc(combined_results)


Checking data availability:
NOVA 1, Timepoint 0: 8202 valid measurements
NOVA 1, Timepoint 1: 18465 valid measurements
NOVA 1, Timepoint 2: 15838 valid measurements
NOVA 1, Timepoint 3: 20134 valid measurements
NOVA 1, Timepoint 4: 19777 valid measurements
NOVA 2, Timepoint 0: 8202 valid measurements
NOVA 2, Timepoint 1: 18465 valid measurements
NOVA 2, Timepoint 2: 15838 valid measurements
NOVA 2, Timepoint 3: 20134 valid measurements
NOVA 2, Timepoint 4: 19777 valid measurements
NOVA 3, Timepoint 0: 8202 valid measurements
NOVA 3, Timepoint 1: 18465 valid measurements
NOVA 3, Timepoint 2: 15838 valid measurements
NOVA 3, Timepoint 3: 20134 valid measurements
NOVA 3, Timepoint 4: 19777 valid measurements
NOVA 4, Timepoint 0: 8202 valid measurements
NOVA 4, Timepoint 1: 18465 valid measurements
NOVA 4, Timepoint 2: 15838 valid measurements
NOVA 4, Timepoint 3: 20134 valid measurements
NOVA 4, Timepoint 4: 19777 valid measurements

NOVA 1 ICC calculation successful:
Number of participa

No valid ICC results to summarize



In [3]:
icc_results$full_results$NOVA4

Call: ICC(x = nova_data_clean)

Intraclass correlation coefficients 
                         type  ICC   F   df1   df2 p lower bound upper bound
Single_raters_absolute   ICC1 0.31 3.3 23905 95624 0        0.31        0.32
Single_random_raters     ICC2 0.31 3.3 23905 95620 0        0.31        0.32
Single_fixed_raters      ICC3 0.31 3.3 23905 95620 0        0.31        0.32
Average_raters_absolute ICC1k 0.70 3.3 23905 95624 0        0.69        0.70
Average_random_raters   ICC2k 0.70 3.3 23905 95620 0        0.69        0.70
Average_fixed_raters    ICC3k 0.70 3.3 23905 95620 0        0.69        0.70

 Number of subjects = 23906     Number of Judges =  5
See the help file for a discussion of the other 4 McGraw and Wong estimates,

# Check how many participants have how many data points

In [3]:
count_timepoints_per_participant <- function(combined_results) {
  # Initialize dataframe to store timepoint counts
  participant_counts <- data.frame(
    eid = combined_results$eid,
    timepoints = 0
  )
  
  # For each participant, count timepoints with data
  for(i in seq_along(participant_counts$eid)) {
    timepoints_with_data <- 0
    
    # Check each timepoint
    for(t in 0:4) {
      # Check if there's any NOVA data at this timepoint
      cols <- grep(paste0("_t", t, "$"), names(combined_results), value = TRUE)
      cols <- cols[grep("Energy_KJ_NOVA_[1-4]", cols)] # only check NOVA 1-4
      
      # If any NOVA category has data, count this timepoint
      if(any(!is.na(combined_results[i, cols]) & combined_results[i, cols] > 0)) {
        timepoints_with_data <- timepoints_with_data + 1
      }
    }
    
    participant_counts$timepoints[i] <- timepoints_with_data
  }
  
  # Create summary table
  summary_table <- table(participant_counts$timepoints)
  
  # Print results
  cat("\nNumber of participants with different numbers of timepoints:\n")
  for(i in 0:5) {
    n_participants <- sum(participant_counts$timepoints == i)
    percentage <- (n_participants / nrow(participant_counts)) * 100
    cat(sprintf("%d timepoints: %d participants (%.1f%%)\n", 
                i, n_participants, percentage))
  }
  
  # Save detailed results
  write.csv(participant_counts, 
            "/dagher/dagher11/filip/UPF/data/participant_timepoint_counts.csv",
            row.names = FALSE)
  
  return(participant_counts)
}

# Usage:
timepoint_counts <- count_timepoints_per_participant(combined_results)


Number of participants with different numbers of timepoints:
0 timepoints: 0 participants (0.0%)
1 timepoints: 9859 participants (29.2%)
2 timepoints: 8000 participants (23.7%)
3 timepoints: 8216 participants (24.3%)
4 timepoints: 6541 participants (19.4%)
5 timepoints: 1149 participants (3.4%)
