In [9]:
import pandas as pd
import numpy as np
from ergo_round__ import choose_prefix, correct_rounds_from_lookup, ColumnConfig, visit_order_lookup

# Notebook-wide pandas display settings so the demo frames below are shown in
# full instead of being truncated. Every option is addressed by its fully
# qualified name: pandas resolves abbreviated names by pattern matching, which
# can become ambiguous if a similarly named option is added later.
pd.set_option('display.max_rows', 500)
pd.set_option('display.min_rows', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 2000)
pd.set_option('display.max_colwidth', 400)

# Make IPython display the result of every top-level expression in a cell,
# not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Notebook Overview

This notebook demonstrates how to assign the correct **Rotterdam Study round (prefix)** to participant visits based on `visit_date`, `study_id` (cohort), and `visit_nr` (the position of a visit within each participant's date-ordered visits, derived below).


The notebook expects a DataFrame with the following columns:

- **`ergo_id`** *(int)*: Unique identifier for each participant  
- **`visit_date`** *(str in `'YYYY-MM-DD'` format)*: The date the participant attended the visit  
- **`study_id`** *(str)*: The Rotterdam Study cohort name (e.g., `'Rotterdam Study 1'`)

### Goal

Assign the appropriate **visit round prefix** (e.g., `'e1'`, `'e2'`, `'e3'`, etc.) to each participant visit, accounting for:

- Participants arriving **late** to one or more visits  
- Participants arriving **early** to one or more visits
- Participants who **missed** a visit entirely
- Participants who **missed** multiple visits 

The examples are based on the full Rotterdam Study (RS) dataset.

### test data

In [10]:
# One record per test participant: (ergo_id, study_id, visit dates as 'YYYY-MM-DD').
# The trailing comment on each record names the scenario it exercises.
participants = [
    (1, 'Rotterdam Study 1', ['1993-09-02', '1996-01-01', '2000-01-01', '2002-01-01']),  # Late for multiple visits
    (2, 'Rotterdam Study 1', ['1990-01-01', '1994-01-01', '1996-02-01']),                # Early for one visit
    (3, 'Rotterdam Study 1', ['1990-01-01', '1998-01-01']),                              # Missing one visit
    (4, 'Rotterdam Study 1', ['1996-01-01']),                                            # Missed baseline
    (5, 'Rotterdam Study 2', ['2002-01-01', '2004-08-01', '2012-08-01']),                # A few months late, on time, a few months late
    (6, 'Rotterdam Study 1', ['1993-03-08', '1999-09-08', '2001-06-10', '2004-05-11']),  # Multiple edge cases
    (7, 'Rotterdam Study 2', ['2016-03-16']),                                            # Baseline at ERGO 6
    (8, 'Rotterdam Study 2', ['2011-11-03']),                                            # Baseline at ERGO 5
]

# Flatten the records into one row per visit, keeping the three expected
# input columns in the order ergo_id, visit_date, study_id.
data = {
    'ergo_id': [pid for pid, cohort, dates in participants for visit in dates],
    'visit_date': [visit for pid, cohort, dates in participants for visit in dates],
    'study_id': [cohort for pid, cohort, dates in participants for visit in dates],
}

df = pd.DataFrame(data)

### visit_nr 

In [11]:
# Show the raw test data before any derived columns are added.
df

Unnamed: 0,ergo_id,visit_date,study_id
0,1,1993-09-02,Rotterdam Study 1
1,1,1996-01-01,Rotterdam Study 1
2,1,2000-01-01,Rotterdam Study 1
3,1,2002-01-01,Rotterdam Study 1
4,2,1990-01-01,Rotterdam Study 1
5,2,1994-01-01,Rotterdam Study 1
6,2,1996-02-01,Rotterdam Study 1
7,3,1990-01-01,Rotterdam Study 1
8,3,1998-01-01,Rotterdam Study 1
9,4,1996-01-01,Rotterdam Study 1


In [12]:
# `df` is the participant-visit DataFrame (one row per visit).
# Parse the 'YYYY-MM-DD' strings into datetimes; because of errors='coerce',
# values that cannot be parsed become NaT instead of raising.
df['visit_date'] = pd.to_datetime(df['visit_date'], format='%Y-%m-%d', errors='coerce')

# Order each participant's visits by ascending date, then derive:
#   visit_nr     - 1-based position of the visit within the participant's visits
#   total_visits - number of rows (visits) the participant has
df = (
    df
    .sort_values(by=['ergo_id', 'visit_date'])
    .assign(
        visit_nr=lambda visits: visits.groupby('ergo_id').cumcount() + 1,
        total_visits=lambda visits: visits.groupby('ergo_id')['ergo_id'].transform('count'),
    )
)

### round

In [13]:
# NOTE(review): this binds the ColumnConfig class itself, not an instance —
# confirm that choose_prefix / correct_rounds_from_lookup are meant to read
# class-level defaults.
config = ColumnConfig

# Call choose_prefix(row, config) once per visit row and store the result as `round`.
df['round'] = df.apply(choose_prefix, axis=1, args=(config,))

In [14]:
# Print the frame, now including the `round` column produced by choose_prefix.
print(df)

    ergo_id visit_date           study_id  visit_nr  total_visits round
0         1 1993-09-02  Rotterdam Study 1         1             4    e2
1         1 1996-01-01  Rotterdam Study 1         2             4    e2
2         1 2000-01-01  Rotterdam Study 1         3             4    e3
3         1 2002-01-01  Rotterdam Study 1         4             4    e4
4         2 1990-01-01  Rotterdam Study 1         1             3    e1
5         2 1994-01-01  Rotterdam Study 1         2             3    e2
6         2 1996-02-01  Rotterdam Study 1         3             3    e3
7         3 1990-01-01  Rotterdam Study 1         1             2    e1
8         3 1998-01-01  Rotterdam Study 1         2             2    e3
9         4 1996-01-01  Rotterdam Study 1         1             1    e2
10        5 2002-01-01  Rotterdam Study 2         1             3    ep
11        5 2004-08-01  Rotterdam Study 2         2             3    e4
12        5 2012-08-01  Rotterdam Study 2         3             

### correct round for edge cases

In [7]:
# Replace `df` with the frame returned by correct_rounds_from_lookup (from ergo_round__).
# In the recorded outputs, row 0 (ergo_id 1, 1993-09-02) changes from 'e2' to 'e1' after this step.
# NOTE(review): the correction rules are not visible in this notebook — presumably they
# use visit_nr / total_visits; confirm in ergo_round__.
df = correct_rounds_from_lookup(df, config)

In [8]:
# Print the frame again to compare `round` after correct_rounds_from_lookup.
print(df)

    ergo_id visit_date           study_id  visit_nr  total_visits round
0         1 1993-09-02  Rotterdam Study 1         1             4    e1
1         1 1996-01-01  Rotterdam Study 1         2             4    e2
2         1 2000-01-01  Rotterdam Study 1         3             4    e3
3         1 2002-01-01  Rotterdam Study 1         4             4    e4
4         2 1990-01-01  Rotterdam Study 1         1             3    e1
5         2 1994-01-01  Rotterdam Study 1         2             3    e2
6         2 1996-02-01  Rotterdam Study 1         3             3    e3
7         3 1990-01-01  Rotterdam Study 1         1             2    e1
8         3 1998-01-01  Rotterdam Study 1         2             2    e3
9         4 1996-01-01  Rotterdam Study 1         1             1    e2
10        5 2002-01-01  Rotterdam Study 2         1             3    ep
11        5 2004-08-01  Rotterdam Study 2         2             3    e4
12        5 2012-08-01  Rotterdam Study 2         3             

## check result