# Preparing Medium Erlotinib Dose Lung Cancer Treatment Group Data

## Import raw LXF A677 medium erlotinib dose PKPD data

In [99]:
import os
import pandas as pd


# Load the raw LXF A677 PK data set. The file is looked up relative to the
# parent of the current working directory and uses ';' as column separator.
path = os.path.dirname(os.getcwd())
raw_pk_file = path + '/data/raw_data/PK_LXF_erlo.csv'
data_raw = pd.read_csv(raw_pk_file, sep=';')

# Show the unfiltered table (all dose groups)
print('Raw PK Data Set for all dosing regimens:')
data_raw

Raw PK Data Set for all dosing regimens:


Unnamed: 0,#ID,TIME,DOSE,ADDL,II,Y,YTYPE,CENS,CELL LINE,DOSE GROUP,DRUG,EXPERIMENT,TUMOR SIZE,BW
0,6,0.0,.,.,.,.,.,.,1,100.00,1,2,68.7500,24.2
1,6,2.0,.,.,.,.,.,.,1,100.00,1,2,75.4290,24.7
2,6,3.0,2450,.,.,.,.,.,1,100.00,1,2,75.4290,24.7
3,6,4.0,.,.,.,.,.,.,1,100.00,1,2,115.3510,23.6
4,6,4.0,2350,2,1,.,.,.,1,100.00,1,2,115.3510,23.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467,162,21.0,.,.,.,.,.,.,1,6.25,1,2,490.3980,24.2
468,162,23.0,.,.,.,.,.,.,1,6.25,1,2,425.2700,24.5
469,162,25.0,.,.,.,.,.,.,1,6.25,1,2,432.6660,24.5
470,162,28.0,.,.,.,.,.,.,1,6.25,1,2,585.0000,25.3


## Cleaning the data

The keys of interest for our analysis are

- **#ID** indicating which mouse was measured,
- **TIME** indicating the time point of each measurement,
- **DOSE** indicating the dose amount administered at the time point,
- **DURATION** indicating the duration of a single dose administration (bolus injection, duration=0/NaN),
- **PLASMA CONCENTRATION** indicating the concentration of erlotinib in the blood plasma,
- **TUMOUR VOLUME** indicating the measured tumour volume,
- **BODY WEIGHT** indicating the body weight of the mouse.

In discussion with the authors and by comparison with their study report in [1], the relevant columns in the dataset were identified as **#ID**, **TIME**, **DOSE** (**ADDL**, **II**), **Y**, **TUMOR SIZE** and **BW**, where **Y** encodes the **PLASMA CONCENTRATION**, **TUMOR SIZE** the **TUMOUR VOLUME**, and **BW** the **BODY WEIGHT**. The applied dose is encoded by **DOSE**, **ADDL** and **II**, where **DOSE** is the administered dose amount at the given time point, **ADDL** the number of additionally applied doses of the same amount, and **II** the time interval separating the doses. The remaining keys are partially Monolix-specific modelling keys, and partially inferred model parameters. We thus ignore those columns.

The raw datasets do not contain the units of the measured quantities. From [1] as well as Roche's study report, we may infer that

- **TIME**: is measured in $\text{day}$,
- **DOSE**: is measured in $\mu\text{g}$,
- **PLASMA CONCENTRATION**: is measured in $\mu\text{g} / \text{L}$,
- **TUMOUR VOLUME**: is measured in $\text{mm}^3$,
- **BODY WEIGHT**: is measured in $\text{g}$.

For reasons that will become clear later, we will choose to measure the dose in $\text{mg}$, the plasma concentration in $\text{mg}/\text{L}$ and the tumour volume in $\text{cm}^3$.

According to the study report, the medium dose group was administered an oral dose of erlotinib of $25\, \text{mg}/\text{kg}/\text{L}$ per dose. For now we will assume that the doses recorded in the dataset are correct.

## Filter for medium dose data

In [100]:
import numpy as np

# Select the medium dose group (DOSE GROUP == 25.0) from the raw table.
# The selection is copied so that the cleaning steps below work on an
# independent frame instead of a slice of data_raw.
data = data_raw[data_raw['DOSE GROUP'] == 25.0].copy()

# Forget columns that carry no relevant info. Each of these columns must be
# constant within the selected group, otherwise dropping it would lose
# information.
assert len(data['CELL LINE'].unique()) == 1
assert len(data['DOSE GROUP'].unique()) == 1
assert len(data['EXPERIMENT'].unique()) == 1
assert len(data['DRUG'].unique()) == 1
# YTYPE may only be missing ('.') or '1'; compared as a set so the check does
# not depend on which value happens to appear first.
assert set(data['YTYPE'].unique()) == {'.', '1'}

data = data[['#ID', 'TIME', 'DOSE', 'ADDL', 'II', 'Y', 'CENS', 'TUMOR SIZE', 'BW']]

# '.' marks missing entries in the raw file: turn it into NaN, then convert
# every column to a numeric dtype.
data = data.replace(to_replace='.', value=np.nan)
data = data.apply(pd.to_numeric, errors='coerce')

data

Unnamed: 0,#ID,TIME,DOSE,ADDL,II,Y,CENS,TUMOR SIZE,BW
109,34,0.0,,,,,,165.292,27.7
110,34,2.0,,,,,,217.328,26.9
111,34,3.0,675.0,,,,,217.328,26.9
112,34,4.0,,,,,,292.500,28.0
113,34,4.0,700.0,2.0,1.0,,,292.500,28.0
...,...,...,...,...,...,...,...,...,...
288,167,21.0,,,,,,1094.400,28.7
289,167,23.0,,,,,,1284.400,28.5
290,167,25.0,,,,,,1428.768,29.2
291,167,28.0,,,,,,1414.485,29.6


## Save all dose events as separate rows

In [101]:
# Rows with an ADDL entry stand for a series of doses: the dose recorded in
# the row itself plus ADDL further doses of the same amount, spaced II time
# units apart. Only the first dose has a row of its own, so the remaining
# ones are materialised here.
implicit_doses = data[data['ADDL'].notnull()]

for _index, row in implicit_doses.iterrows():
    # Get id, time, dose amount, multiplicity, interval
    mouse_id = row['#ID']
    time = row['TIME']
    dose = row['DOSE']
    n_additional = int(row['ADDL'])
    interval = row['II']

    for _ in range(n_additional):
        # Time of the next implicit dose
        time += interval

        # Positions of rows that already exist for this mouse at that time.
        # Positions (not labels) are used because appended rows share the
        # index label 0.
        is_match = (data['#ID'] == mouse_id) & (data['TIME'] == time)
        positions = np.flatnonzero(is_match.to_numpy())

        if positions.size == 0:
            # There is no row for the individual at that time point, so a
            # dose-only row is appended (all other columns become NaN).
            data = pd.concat([
                data,
                pd.DataFrame({
                    '#ID': [mouse_id],
                    'TIME': [time],
                    'DOSE': [dose]})])
        else:
            # The individual was measured at that time point. None of the
            # existing rows may already carry a dose.
            dose_column = data.columns.get_loc('DOSE')
            assert data.iloc[positions, dose_column].isnull().all()

            # Record the dose exactly once, on the first matching row
            data.iloc[positions[0], dose_column] = dose

# Remove the now unnecessary ADDL and II columns
data = data[['#ID', 'TIME', 'DOSE', 'Y', 'CENS', 'TUMOR SIZE', 'BW']]

# Let IDs be integers (appended rows turned the column into floats)
data = data.astype({'#ID': int})

data

Unnamed: 0,#ID,TIME,DOSE,Y,CENS,TUMOR SIZE,BW
109,34,0.0,,,,165.292,27.7
110,34,2.0,,,,217.328,26.9
111,34,3.0,675.0,,,217.328,26.9
112,34,4.0,,,,292.500,28.0
113,34,4.0,700.0,,,292.500,28.0
...,...,...,...,...,...,...,...
0,167,8.0,700.0,,,,
0,167,10.0,700.0,,,,
0,167,12.0,675.0,,,,
0,167,13.0,675.0,,,,


## Remove censored data for now (can always insert it back once we support it in our inference)

In [102]:
# A non-missing CENS entry flags the concentration measurement as censored
censored = data['CENS'].notnull()

# Blank out censored concentration measurements. A new Y column is assigned
# to the frame, so the change actually lands in `data` (assigning through
# data[~mask]['Y'] would only modify a temporary copy).
data = data.assign(Y=data['Y'].mask(censored.to_numpy()))

# Remove the now unnecessary CENS column
data = data[['#ID', 'TIME', 'DOSE', 'Y', 'TUMOR SIZE', 'BW']]

data

Unnamed: 0,#ID,TIME,DOSE,Y,TUMOR SIZE,BW
109,34,0.0,,,165.292,27.7
110,34,2.0,,,217.328,26.9
111,34,3.0,675.0,,217.328,26.9
112,34,4.0,,,292.500,28.0
113,34,4.0,700.0,,292.500,28.0
...,...,...,...,...,...,...
0,167,8.0,700.0,,,
0,167,10.0,700.0,,,
0,167,12.0,675.0,,,
0,167,13.0,675.0,,,


## Rename columns and transform units

In [103]:
# Convert units before labelling the columns, so that no column ever carries
# a unit label that disagrees with its values.
# A factor of 1E-03 takes the dose to mg, the plasma concentration to mg/L
# and the tumour volume from mm^3 to cm^3.
# NOTE(review): this factor implies the raw dose and concentration are in
# micro-units (e.g. 675 for a ~27 g mouse at 25 mg/kg) - confirm against the
# study report.
data = data.copy()
data[['DOSE', 'Y', 'TUMOR SIZE']] *= 1E-03

# Rename the columns to descriptive names that include the final units.
# TIME and BW keep their raw units.
data = data.rename(columns={
    'TIME': 'TIME in day',
    'DOSE': 'DOSE in mg',
    'Y': 'PLASMA CONCENTRATION in mg/L',
    'TUMOR SIZE': 'TUMOUR VOLUME in cm^3',
    'BW': 'BODY WEIGHT in g'})

data

Unnamed: 0,#ID,TIME in day,DOSE in mg,PLASMA CONCENTRATION in mg/L,TUMOUR VOLUME in cm^3,BODY WEIGHT in g
109,34,0.0,,,0.165292,27.7
110,34,2.0,,,0.217328,26.9
111,34,3.0,0.675,,0.217328,26.9
112,34,4.0,,,0.292500,28.0
113,34,4.0,0.700,,0.292500,28.0
...,...,...,...,...,...,...
0,167,8.0,0.700,,,
0,167,10.0,0.700,,,
0,167,12.0,0.675,,,
0,167,13.0,0.675,,,


## Illustrate medium erlotinib dose group data

## Export data

In [105]:
# Export the cleaned LXF A677 medium erlotinib dose data as CSV (without the
# index column) into the data folder under the parent of the working directory
path = os.path.dirname(os.getcwd())
export_file = path + '/data/lxf_medium_erlotinib_dose.csv'
data.to_csv(path_or_buf=export_file, index=False)