# Preparing High Erlotinib Dose Lung Cancer Treatment Group Data

## Import raw LXF A677 high erlotinib dose PKPD data

In [1]:
import os
import pandas as pd


# Import LXF A677 PK data.
# The raw file is looked up relative to the parent of the current working
# directory and uses ';' as the column separator.
path = os.path.dirname(os.getcwd())
data_raw = pd.read_csv(
    filepath_or_buffer=path + '/data/raw_data/PK_LXF_erlo.csv',
    delimiter=';')

# Print a caption and show the unfiltered table as the cell output
print('Raw PK Data Set for all dosing regimens:')
data_raw

Raw PK Data Set for all dosing regimens:


Unnamed: 0,#ID,TIME,DOSE,ADDL,II,Y,YTYPE,CENS,CELL LINE,DOSE GROUP,DRUG,EXPERIMENT,TUMOR SIZE,BW
0,6,0.0,.,.,.,.,.,.,1,100.00,1,2,68.7500,24.2
1,6,2.0,.,.,.,.,.,.,1,100.00,1,2,75.4290,24.7
2,6,3.0,2450,.,.,.,.,.,1,100.00,1,2,75.4290,24.7
3,6,4.0,.,.,.,.,.,.,1,100.00,1,2,115.3510,23.6
4,6,4.0,2350,2,1,.,.,.,1,100.00,1,2,115.3510,23.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467,162,21.0,.,.,.,.,.,.,1,6.25,1,2,490.3980,24.2
468,162,23.0,.,.,.,.,.,.,1,6.25,1,2,425.2700,24.5
469,162,25.0,.,.,.,.,.,.,1,6.25,1,2,432.6660,24.5
470,162,28.0,.,.,.,.,.,.,1,6.25,1,2,585.0000,25.3


## Cleaning the data

The keys of interest for our analysis are

- **#ID** indicating which mouse was measured,
- **TIME** indicating the time point of each measurement,
- **DOSE** indicating the dose amount administered at the time point,
- **DURATION** indicating the duration of a single dose administration (for a bolus injection, duration = 0/NaN),
- **PLASMA CONCENTRATION** indicating the concentration of erlotinib in the blood plasma,
- **TUMOUR VOLUME** indicating the measured tumour volume,
- **BODY WEIGHT** indicating the body weight of the mouse.

Following a discussion with the authors and a comparison with their study report in [1], the relevant columns in the dataset were identified as **#ID**, **TIME**, **DOSE** (**ADDL**, **II**), **Y**, **TUMOR SIZE** and **BW**, where **Y** encodes the **PLASMA CONCENTRATION**, **TUMOR SIZE** the **TUMOUR VOLUME**, and **BW** the **BODY WEIGHT**. The applied dose is encoded by **DOSE**, **ADDL** and **II**, where **DOSE** is the administered dose amount at the given time point, **ADDL** the number of additionally applied doses of the same amount, and **II** the time interval separating the doses. The remaining keys are partially Monolix-specific modelling keys, and partially inferred model parameters. We thus ignore those columns.

The raw datasets do not contain the units of the measured quantities. From [1] as well as Roche's study report, we may infer that

- **TIME**: is measured in $\text{day}$,
- **DOSE**: is measured in $\text{ng}$,
- **PLASMA CONCENTRATION**: is measured in $\text{ng} / \text{L}$,
- **TUMOUR VOLUME**: is measured in $\text{mm}^3$,
- **BODY WEIGHT**: is measured in $\text{g}$.

For reasons that will become clear later, we will choose to measure the dose in $\text{mg}$, the plasma concentration in $\text{mg}/\text{L}$ and the tumour volume in $\text{cm}^3$.

According to the study report, the high dose group was administered an oral dose of erlotinib of $100\, \text{mg}/\text{kg}/\text{L}$ per dose. For now we will assume that the doses recorded in the dataset are correct.

## Filter for high dose data

In [2]:
import numpy as np

# Filter high dose data (dose group 100).
# Non-numeric placeholders such as '.' are coerced to NaN first, so that all
# columns can be compared numerically.
data = data_raw.apply(pd.to_numeric, errors='coerce')
data = data[data['DOSE GROUP'] == 100]

# Forget columns that carry no relevant info. These columns must hold a
# single value within the selected dose group, otherwise dropping them would
# lose information.
assert data['CELL LINE'].nunique(dropna=False) == 1
assert data['DOSE GROUP'].nunique(dropna=False) == 1
assert data['EXPERIMENT'].nunique(dropna=False) == 1
assert data['DRUG'].nunique(dropna=False) == 1

# YTYPE must be either missing or equal to 1. The check does not rely on the
# order in which the values first appear in the column.
assert data['YTYPE'].isnull().any()
assert set(data['YTYPE'].dropna()) == {1}

data = data[['#ID', 'TIME', 'DOSE', 'ADDL', 'II', 'Y', 'CENS', 'TUMOR SIZE', 'BW']]

data

Unnamed: 0,#ID,TIME,DOSE,ADDL,II,Y,CENS,TUMOR SIZE,BW
0,6,0.0,,,,,,68.7500,24.2
1,6,2.0,,,,,,75.4290,24.7
2,6,3.0,2450.0,,,,,75.4290,24.7
3,6,4.0,,,,,,115.3510,23.6
4,6,4.0,2350.0,2.0,1.0,,,115.3510,23.6
...,...,...,...,...,...,...,...,...,...
104,134,21.0,,,,,,109.6515,22.9
105,134,23.0,,,,,,183.9965,23.2
106,134,25.0,,,,,,242.5920,24.5
107,134,28.0,,,,,,300.8000,23.8


## Save all dose events as separate rows

In [3]:
# Split data into rows with dose info and rows without dose info.
# The measurement rows are copied so that they can be modified below without
# writing into a view of the filtered frame.
dose_mask = pd.notnull(data['DOSE'])
doses = data[dose_mask]
data = data[~dose_mask].copy()

# Add all dose events as rows to data
for _, row in doses.iterrows():
    # Get id, time, dose amount, multiplicity, interval
    index = row['#ID']
    time = row['TIME']
    dose = row['DOSE']
    number = row['ADDL']
    interval = row['II']

    if np.isnan(number):
        # No additional doses recorded: only the dose at `time` is applied
        number = 1
        interval = 0
    else:
        # ADDL counts the additional doses, so the total is ADDL + 1
        number = int(number + 1)

    # Add dose entry to data
    for n in range(number):
        # Compute dose time (first iteration is at original time)
        new_time = time + n * interval

        # Create an id and a time mask. The combined mask is converted to a
        # plain boolean array, because appended rows share the index label 0
        # and label-based alignment is therefore ambiguous.
        mask_id = data['#ID'] == index
        mask_time = data['TIME'] == new_time
        mask_event = (mask_id & mask_time).to_numpy()

        if not mask_event.any():
            # This means there is no row entry for the individual at that
            # time point, so a new row holding only the dose is added.
            # (DataFrame.append was removed in pandas 2.0.)
            data = pd.concat([data, pd.DataFrame({
                '#ID': [index],
                'TIME': [new_time],
                'DOSE': [dose]})])
            continue

        # This means the individual was measured at that time point
        recorded = data.loc[mask_event, 'DOSE']
        if recorded.dropna().empty:
            # No dose registered yet: write the dose into the existing row(s)
            data.loc[mask_event, 'DOSE'] = dose
        elif not (recorded == dose).all():
            # A different dose is already registered: report the conflict
            # and leave the data untouched.
            print(recorded)
            print(dose)

# Remove the now unnecessary ADDL and II column
data = data[['#ID', 'TIME', 'DOSE', 'Y', 'CENS', 'TUMOR SIZE', 'BW']]

# Let IDs be integers
data = data.astype({'#ID': int})

data

Unnamed: 0,#ID,TIME,DOSE,Y,CENS,TUMOR SIZE,BW
0,6,0.0,,,,68.7500,24.2
1,6,2.0,,,,75.4290,24.7
3,6,4.0,2350.0,,,115.3510,23.6
5,6,7.0,2100.0,,,34.2225,21.2
7,6,9.0,,,,10.3125,19.0
...,...,...,...,...,...,...,...
0,134,3.0,2350.0,,,,
0,134,5.0,2250.0,,,,
0,134,6.0,2250.0,,,,
0,134,8.0,2150.0,,,,


## Remove Tumor size and BW entries for concentration measurement rows. 

According to the report, those values were not measured together with the concentration (we are not 100% sure about the body weight).

In [4]:
# Rows without a concentration measurement keep their tumour size and body
# weight; rows with a concentration measurement get NaN in both columns.
mask = data['Y'].isnull()

# Assign the masked columns back to the frame. Calling `where` with
# inplace=True on a column selection is chained assignment, which does not
# reliably modify `data` (and has no effect under pandas Copy-on-Write).
data = data.assign(**{
    'TUMOR SIZE': data['TUMOR SIZE'].where(mask),
    'BW': data['BW'].where(mask)})

data

Unnamed: 0,#ID,TIME,DOSE,Y,CENS,TUMOR SIZE,BW
0,6,0.0,,,,68.7500,24.2
1,6,2.0,,,,75.4290,24.7
3,6,4.0,2350.0,,,115.3510,23.6
5,6,7.0,2100.0,,,34.2225,21.2
7,6,9.0,,,,10.3125,19.0
...,...,...,...,...,...,...,...
0,134,3.0,2350.0,,,,
0,134,5.0,2250.0,,,,
0,134,6.0,2250.0,,,,
0,134,8.0,2150.0,,,,


## Remove censored data for now (can always insert it back once we support it in our inference)

In [5]:
# Mask where CENS is NaN. Any row with a non-missing CENS entry is treated as
# censored below.
mask = data['CENS'].isnull()

# Set censored concentration measurements to NaN. The result is assigned back
# to the frame, because `where` with inplace=True on a column selection is
# chained assignment and does not reliably modify `data`.
data = data.assign(Y=data['Y'].where(mask))

# Remove the now unnecessary CENS column
data = data[['#ID', 'TIME', 'DOSE', 'Y', 'TUMOR SIZE', 'BW']]

data

Unnamed: 0,#ID,TIME,DOSE,Y,TUMOR SIZE,BW
0,6,0.0,,,68.7500,24.2
1,6,2.0,,,75.4290,24.7
3,6,4.0,2350.0,,115.3510,23.6
5,6,7.0,2100.0,,34.2225,21.2
7,6,9.0,,,10.3125,19.0
...,...,...,...,...,...,...
0,134,3.0,2350.0,,,
0,134,5.0,2250.0,,,
0,134,6.0,2250.0,,,
0,134,8.0,2150.0,,,


## Rename columns and transform units

In [6]:
# Label every measured column with the unit assumed for the raw data
data = data.rename(columns={
    'TIME': 'TIME in day',
    'DOSE': 'DOSE in ng',
    'Y': 'PLASMA CONCENTRATION in ng/L',
    'TUMOR SIZE': 'TUMOUR VOLUME in mm^3',
    'BW': 'BODY WEIGHT in g'})

# Scale dose, plasma concentration and tumour volume by 1E-03 in one step,
# then relabel the three columns with their target units.
# NOTE(review): a factor of 1E-03 corresponds to ug -> mg rather than
# ng -> mg; confirm the units of the raw dose and concentration columns.
data[[
    'DOSE in ng',
    'PLASMA CONCENTRATION in ng/L',
    'TUMOUR VOLUME in mm^3']] *= 1E-03
data = data.rename(columns={
    'DOSE in ng': 'DOSE in mg',
    'PLASMA CONCENTRATION in ng/L': 'PLASMA CONCENTRATION in mg/L',
    'TUMOUR VOLUME in mm^3': 'TUMOUR VOLUME in cm^3'})

data

Unnamed: 0,#ID,TIME in day,DOSE in mg,PLASMA CONCENTRATION in mg/L,TUMOUR VOLUME in cm^3,BODY WEIGHT in g
0,6,0.0,,,0.068750,24.2
1,6,2.0,,,0.075429,24.7
3,6,4.0,2.35,,0.115351,23.6
5,6,7.0,2.10,,0.034222,21.2
7,6,9.0,,,0.010313,19.0
...,...,...,...,...,...,...
0,134,3.0,2.35,,,
0,134,5.0,2.25,,,
0,134,6.0,2.25,,,
0,134,8.0,2.15,,,


## Illustrate high erlotinib dose group data

In [7]:
import erlotinib.plots as eplt


# Build the PK figure: plasma concentration and dose events per individual
# over time, using the cleaned column names.
fig = eplt.PKTimeSeriesPlot()
fig.add_data(
    data,
    id_key='#ID',
    time_key='TIME in day',
    biom_key='PLASMA CONCENTRATION in mg/L',
    dose_key='DOSE in mg')
fig.set_axis_labels(
    time_label='Time in day',
    biom_label='Plasma conc. in mg/L',
    dose_label='Dose in mg')

# Render the figure in the notebook
fig.show()

**Figure 1:** Plasma concentration time series in lung cancer treatment group with high erlotinib dose ($100\, \text{mg}$ oral dose per $\text{g}$ body weight).

In [8]:
# Build the PD figure: tumour volume per individual over time, using the
# cleaned column names.
fig = eplt.PDTimeSeriesPlot()
fig.add_data(
    data,
    id_key='#ID',
    time_key='TIME in day',
    biom_key='TUMOUR VOLUME in cm^3')
fig.set_axis_labels(
    xlabel=r'$\text{Time in day}$',
    ylabel=r'$\text{Tumour volume in cm}^3$')

# Render the figure in the notebook
fig.show()

**Figure 2:** Tumour volume time series in lung cancer treatment group with high erlotinib dose ($100\, \text{mg}$ oral dose per $\text{g}$ body weight).

## Export data

In [9]:
# Write the cleaned data set to the data directory one level above the
# current working directory; the DataFrame index is not exported.
path = os.path.dirname(os.getcwd())
data.to_csv(
    path_or_buf=path + '/data/lxf_high_erlotinib_dose.csv',
    index=False)