# Preparing Medium Erlotinib Dose Lung Cancer Treatment Group Data

## Import raw LXF A677 medium erlotinib dose PKPD data

In [1]:
import os
import pandas as pd

# Import LXF A677 PK data.
# The file is located relative to the parent of the current working
# directory. The path is assembled with os.path.join so that it does not
# depend on a hard-coded separator.
path = os.path.dirname(os.getcwd())
data_raw = pd.read_csv(
    os.path.join(path, 'data', 'raw_data', 'PK_LXF_erlo.csv'), sep=';')

# Display data
print('Raw PK Data Set for all dosing regimens:')
data_raw

Raw PK Data Set for all dosing regimens:


Unnamed: 0,#ID,TIME,DOSE,ADDL,II,Y,YTYPE,CENS,CELL LINE,DOSE GROUP,DRUG,EXPERIMENT,TUMOR SIZE,BW
0,6,0.0,.,.,.,.,.,.,1,100.00,1,2,68.7500,24.2
1,6,2.0,.,.,.,.,.,.,1,100.00,1,2,75.4290,24.7
2,6,3.0,2450,.,.,.,.,.,1,100.00,1,2,75.4290,24.7
3,6,4.0,.,.,.,.,.,.,1,100.00,1,2,115.3510,23.6
4,6,4.0,2350,2,1,.,.,.,1,100.00,1,2,115.3510,23.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467,162,21.0,.,.,.,.,.,.,1,6.25,1,2,490.3980,24.2
468,162,23.0,.,.,.,.,.,.,1,6.25,1,2,425.2700,24.5
469,162,25.0,.,.,.,.,.,.,1,6.25,1,2,432.6660,24.5
470,162,28.0,.,.,.,.,.,.,1,6.25,1,2,585.0000,25.3


## Cleaning the data

The keys of interest for our analysis are

- **#ID** indicating which mouse was measured,
- **TIME** indicating the time point of each measurement,
- **DOSE** indicating the dose amount administered at the time point,
- **DURATION** indicating the duration of a single dose administration (for a bolus injection, duration = 0/NaN),
- **PLASMA CONCENTRATION** indicating the concentration of erlotinib in the blood plasma,
- **TUMOUR VOLUME** indicating the measured tumour volume,
- **BODY WEIGHT** indicating the body weight of the mouse.

Based on discussions with the authors and a comparison with their study report in [1], the relevant columns in the dataset were identified as **#ID**, **TIME**, **DOSE** (**ADDL**, **II**), **Y**, **TUMOR SIZE** and **BW**, where **Y** encodes the **PLASMA CONCENTRATION**, **TUMOR SIZE** the **TUMOUR VOLUME**, and **BW** the **BODY WEIGHT**. The applied dose is encoded by **DOSE**, **ADDL** and **II**, where **DOSE** is the administered dose amount at the given time point, **ADDL** the number of additionally applied doses of the same amount, and **II** the time interval separating the doses. The remaining keys are partially Monolix-specific modelling keys, and partially inferred model parameters. We thus ignore those columns.

The raw datasets do not contain the units of the measured quantities. From [1] as well as Roche's study report, we may infer that

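- **TIME** is measured in $\text{day}$,
- **DOSE** is measured in $\mu\text{g}$ (the recorded amounts, e.g. $675$ for a $26.9\, \text{g}$ mouse, match $25\, \text{mg}/\text{kg}$ only in this unit),
- **PLASMA CONCENTRATION** is measured in $\text{ng} / \text{L}$,
- **TUMOUR VOLUME** is measured in $\text{mm}^3$,
- **BODY WEIGHT** is measured in $\text{g}$.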

For reasons that will become clear later, we will choose to measure the dose in $\text{mg}$, the plasma concentration in $\text{mg}/\text{L}$ and the tumour volume in $\text{cm}^3$.

According to the study report, the medium dose group was administered an oral dose of erlotinib of $25\, \text{mg}/\text{kg}$ per dose. For now we will assume that the doses recorded in the dataset are correct.

## Filter for medium dose data

In [2]:
import numpy as np

# Coerce every column to numbers (non-numeric entries become NaN) and keep
# only the rows of the 25.0 dose group
numeric_data = data_raw.apply(pd.to_numeric, errors='coerce')
is_medium_dose = numeric_data['DOSE GROUP'] == 25.0
data = numeric_data[is_medium_dose]

# Columns that take a single value across the group carry no relevant info
for constant_key in ['CELL LINE', 'DOSE GROUP', 'EXPERIMENT', 'DRUG']:
    assert len(data[constant_key].unique()) == 1

# YTYPE is expected to take exactly two values: NaN first, then 1
ytypes = data['YTYPE'].unique()
assert len(ytypes) == 2
assert np.isnan(ytypes[0])
assert ytypes[1] == 1

# Keep only the columns that are used in the remainder of the notebook
keys_of_interest = [
    '#ID', 'TIME', 'DOSE', 'ADDL', 'II', 'Y', 'CENS', 'TUMOR SIZE', 'BW']
data = data[keys_of_interest]

data

Unnamed: 0,#ID,TIME,DOSE,ADDL,II,Y,CENS,TUMOR SIZE,BW
109,34,0.0,,,,,,165.292,27.7
110,34,2.0,,,,,,217.328,26.9
111,34,3.0,675.0,,,,,217.328,26.9
112,34,4.0,,,,,,292.500,28.0
113,34,4.0,700.0,2.0,1.0,,,292.500,28.0
...,...,...,...,...,...,...,...,...,...
288,167,21.0,,,,,,1094.400,28.7
289,167,23.0,,,,,,1284.400,28.5
290,167,25.0,,,,,,1428.768,29.2
291,167,28.0,,,,,,1414.485,29.6


## Save all dose events as separate rows

In [3]:
# Split data into rows with dose info and rows without dose info
dose_mask = pd.notnull(data['DOSE'])
doses = data[dose_mask]
# Work on an explicit copy, so the dose events can be written into the frame
# below without mutating a slice of the unsplit data
data = data[~dose_mask].copy()

# Add all dose events as rows to data
for _, row in doses.iterrows():
    # Get id, time, dose amount, multiplicity, interval
    index = row['#ID']
    time = row['TIME']
    dose = row['DOSE']
    number = row['ADDL']
    interval = row['II']

    if np.isnan(number):
        # No additional doses: only the dose at the recorded time point
        number = 1
        interval = 0
    else:
        # ADDL counts the doses in addition to the one at the recorded time
        number = int(number + 1)

    # Add dose entry to data
    for n in range(number):
        # Compute dose time (first iteration is at original time)
        new_time = time + n * interval

        # Rows of this individual at this time point. A positional (numpy)
        # mask is used because rows added below all carry the index label 0,
        # so the index of data is not unique.
        mask = (
            (data['#ID'] == index) & (data['TIME'] == new_time)).to_numpy()

        if not mask.any():
            # This means there is no row entry for the individual at that
            # time point. (pd.concat replaces DataFrame.append, which was
            # removed in pandas 2.0.)
            data = pd.concat([data, pd.DataFrame({
                '#ID': [index],
                'TIME': [new_time],
                'DOSE': [dose]})])
            continue

        # This means the individual was measured at that time point
        recorded = data.loc[mask, 'DOSE']
        if recorded.dropna().empty:
            # No dose registered yet: write the dose into the existing rows
            data.loc[mask, 'DOSE'] = dose
        elif not (recorded == dose).all():
            # A different dose is already registered for this time point.
            # Report the conflict and keep the registered value.
            print(recorded)
            print(dose)

# Remove the now unnecessary ADDL and II column
data = data[['#ID', 'TIME', 'DOSE', 'Y', 'CENS', 'TUMOR SIZE', 'BW']]

# Let IDs be integers (astype returns a new frame, so no column of a slice
# is assigned to)
data = data.astype({'#ID': int})

data

Unnamed: 0,#ID,TIME,DOSE,Y,CENS,TUMOR SIZE,BW
109,34,0.0,,,,165.2920,27.7
110,34,2.0,,,,217.3280,26.9
112,34,4.0,700.0,,,292.5000,28.0
114,34,7.0,700.0,,,389.2285,27.7
116,34,9.0,700.0,,,396.1175,27.9
...,...,...,...,...,...,...,...
0,167,8.0,700.0,,,,
0,167,10.0,700.0,,,,
0,167,12.0,675.0,,,,
0,167,13.0,675.0,,,,


## Remove Tumor size and BW entries for concentration measurement rows. 

According to the report, those values were not measured together with the concentration (not 100% sure about the body weight).

In [4]:
# Rows without a concentration measurement (Y is NaN) keep their tumour size
# and body weight; on all other rows those two entries are set to NaN.
mask = data['Y'].isnull()

# Assign the masked columns to a new frame instead of calling
# `Series.where(..., inplace=True)` on `data[...]`: the in-place call acts on
# a column selected from the frame and is not guaranteed to write back to
# `data` (it does not under pandas' Copy-on-Write).
data = data.assign(**{
    'TUMOR SIZE': data['TUMOR SIZE'].where(mask),
    'BW': data['BW'].where(mask)})

data

Unnamed: 0,#ID,TIME,DOSE,Y,CENS,TUMOR SIZE,BW
109,34,0.0,,,,165.2920,27.7
110,34,2.0,,,,217.3280,26.9
112,34,4.0,700.0,,,292.5000,28.0
114,34,7.0,700.0,,,389.2285,27.7
116,34,9.0,700.0,,,396.1175,27.9
...,...,...,...,...,...,...,...
0,167,8.0,700.0,,,,
0,167,10.0,700.0,,,,
0,167,12.0,675.0,,,,
0,167,13.0,675.0,,,,


## Remove censored data for now (can always insert it back once we support it in our inference)

In [5]:
# Mask where CENS is NaN
mask = data['CENS'].isnull()

# Set censored concentration measurements (rows where CENS is not NaN) to
# nan. The masked column is assigned to a new frame rather than modified via
# `data['Y'].where(..., inplace=True)`, which is not guaranteed to write back
# to `data` (it does not under pandas' Copy-on-Write).
data = data.assign(Y=data['Y'].where(mask))

# Remove the now unnecessary CENS column
data = data[['#ID', 'TIME', 'DOSE', 'Y', 'TUMOR SIZE', 'BW']]

data

Unnamed: 0,#ID,TIME,DOSE,Y,TUMOR SIZE,BW
109,34,0.0,,,165.2920,27.7
110,34,2.0,,,217.3280,26.9
112,34,4.0,700.0,,292.5000,28.0
114,34,7.0,700.0,,389.2285,27.7
116,34,9.0,700.0,,396.1175,27.9
...,...,...,...,...,...,...
0,167,8.0,700.0,,,
0,167,10.0,700.0,,,
0,167,12.0,675.0,,,
0,167,13.0,675.0,,,


## Rename columns and transform units

In [6]:
# Rename TIME, DOSE, PLASMA CONCENTRATION, TUMOUR VOLUME and BODY WEIGHT with
# units as they are.
# NOTE: The raw dose is labelled in ug (not ng): it is scaled by 1E-03 to mg
# below, and the recorded amounts (e.g. 700 for a 28 g mouse) match the
# 25 mg/kg of the medium dose group only in that unit.
data = data.rename(columns={
    '#ID': 'ID',
    'TIME': 'TIME in day',
    'DOSE': 'DOSE in ug',
    'Y': 'PLASMA CONCENTRATION in ng/L',
    'TUMOR SIZE': 'TUMOUR VOLUME in mm^3',
    'BW': 'BODY WEIGHT in g'})

# Convert units. Each conversion is a factor of 1E-03, after which the
# column is relabelled with the new unit.
conversions = [
    ('DOSE in ug', 'DOSE in mg'),
    ('PLASMA CONCENTRATION in ng/L', 'PLASMA CONCENTRATION in ng/mL'),
    ('TUMOUR VOLUME in mm^3', 'TUMOUR VOLUME in cm^3')]
for old_name, new_name in conversions:
    data[old_name] *= 1E-03
    data = data.rename(columns={old_name: new_name})

data

Unnamed: 0,ID,TIME in day,DOSE in mg,PLASMA CONCENTRATION in ng/mL,TUMOUR VOLUME in cm^3,BODY WEIGHT in g
109,34,0.0,,,0.165292,27.7
110,34,2.0,,,0.217328,26.9
112,34,4.0,0.700,,0.292500,28.0
114,34,7.0,0.700,,0.389228,27.7
116,34,9.0,0.700,,0.396118,27.9
...,...,...,...,...,...,...
0,167,8.0,0.700,,,
0,167,10.0,0.700,,,
0,167,12.0,0.675,,,
0,167,13.0,0.675,,,


## Format dataset into better generalisable form

ID | Time | Time unit | Biomarker | Measurement | Biomarker unit | Dose | Dose unit | Duration

In [7]:
def _to_long_format(wide, column, biomarker, unit):
    """
    Returns the measurements of one biomarker in long format.

    :param wide: Dataframe with the columns 'ID', 'TIME in day' and
        ``column``.
    :param column: Name of the column in ``wide`` that holds the measured
        values of the biomarker.
    :param biomarker: Label written to the 'Biomarker' column.
    :param unit: Label written to the 'Biomarker unit' column.
    :returns: Dataframe with the columns 'ID', 'Time', 'Measurement',
        'Time unit', 'Biomarker' and 'Biomarker unit', restricted to the rows
        where the measurement is not NaN.
    """
    measurements = wide[['ID', 'TIME in day', column]].rename(columns={
        'TIME in day': 'Time',
        column: 'Measurement'})

    # Filter NaNs. The filter has to act on the measured values: the
    # 'Biomarker' label is a constant and therefore never null.
    measurements = measurements[measurements['Measurement'].notnull()]

    return measurements.assign(**{
        'Time unit': 'd',
        'Biomarker': biomarker,
        'Biomarker unit': unit})


# Split tumour volume, body weight and concentration measurements
tumour_volume_measurements = _to_long_format(
    data, 'TUMOUR VOLUME in cm^3', 'Tumour volume', 'cm^3')
body_weight_measurements = _to_long_format(
    data, 'BODY WEIGHT in g', 'Body weight', 'g')
concentration_measurements = _to_long_format(
    data, 'PLASMA CONCENTRATION in ng/mL', 'Plasma concentration', 'ng/mL')

# Collect the dose events (rows with a non-NaN dose amount)
doses = data[['ID', 'TIME in day', 'DOSE in mg']].rename(columns={
    'TIME in day': 'Time',
    'DOSE in mg': 'Dose'})
doses = doses[doses['Dose'].notnull()]
doses = doses.assign(**{
    'Time unit': 'd',
    'Dose unit': 'mg',
    'Duration': np.nan})

# Merge to final dataset (pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0)
data = pd.concat([
    tumour_volume_measurements,
    body_weight_measurements,
    concentration_measurements,
    doses])

# Sort columns (for aesthetic reasons)
data = data[['ID', 'Time', 'Time unit', 'Biomarker', 'Measurement', 'Biomarker unit', 'Dose', 'Dose unit', 'Duration']]
data

Unnamed: 0,ID,Time,Time unit,Biomarker,Measurement,Biomarker unit,Dose,Dose unit,Duration
109,34,0.0,d,Tumour volume,0.165292,cm^3,,,
110,34,2.0,d,Tumour volume,0.217328,cm^3,,,
112,34,4.0,d,Tumour volume,0.292500,cm^3,,,
114,34,7.0,d,Tumour volume,0.389228,cm^3,,,
116,34,9.0,d,Tumour volume,0.396118,cm^3,,,
...,...,...,...,...,...,...,...,...,...
0,167,8.0,d,,,,0.700,mg,
0,167,10.0,d,,,,0.700,mg,
0,167,12.0,d,,,,0.675,mg,
0,167,13.0,d,,,,0.675,mg,


## Illustrate medium erlotinib dose group data

In [8]:
import erlotinib.plots as eplt


# Create the PK time series figure and add the plasma concentration data
fig = eplt.PKTimeSeriesPlot()
fig.add_data(data, biomarker='Plasma concentration')

# Label the axes
pk_axis_labels = {
    'time_label': 'Time in day',
    'biom_label': 'Plasma conc. in ng/mL',
    'dose_label': 'Dose in mg'}
fig.set_axis_labels(**pk_axis_labels)

# Show figure
fig.show()

**Figure 1:** Plasma concentration time series in lung cancer treatment group with medium erlotinib dose ($25\, \text{mg}$ oral dose per $\text{kg}$ body weight).

In [11]:
# Create the PD time series figure and add the tumour volume data
fig = eplt.PDTimeSeriesPlot()
fig.add_data(data, biomarker='Tumour volume')

# Label the axes
pd_axis_labels = {
    'xlabel': r'$\text{Time in day}$',
    'ylabel': r'$\text{Tumour volume in cm}^3$'}
fig.set_axis_labels(**pd_axis_labels)

# Show figure
fig.show()

**Figure 2:** Tumour volume time series in lung cancer treatment group with medium erlotinib dose ($25\, \text{mg}$ oral dose per $\text{kg}$ body weight).

## Export data

In [12]:
# Export cleaned LXF A677 medium erlotinib dose data.
# The file is written relative to the parent of the current working
# directory; the dataframe index is not exported.
path = os.path.dirname(os.getcwd())
data.to_csv(
    path_or_buf=os.path.join(path, 'data', 'lxf_medium_erlotinib_dose.csv'),
    index=False)