# Preparing Lung Cancer Control Group Data

## Import raw LXF A677 control growth data

In [6]:
import os

import pandas as pd


# Locate the raw LXF A677 control growth file relative to the parent of the
# current working directory
path = os.path.dirname(os.getcwd())
raw_data_file = path + '/data/raw_data/Ctrl_Growth_LXF.csv'

# Import LXF A677 data
data_raw = pd.read_csv(filepath_or_buffer=raw_data_file)

# Display data
data_raw

Unnamed: 0,#ID,TIME,DOSE,ADDL,II,Y,YTYPE,CENS,CELL LINE,DOSE GROUP,DRUG,DRUGCAT,EXPERIMENT,BW,YTV,KA,V,KE,w0
0,40,0,.,.,.,191.808,2,.,1,0,2,0,2,26.8,.,55,1.11,3.98,191.8080
1,94,0,.,.,.,77.2475,2,.,1,0,2,0,2,18.3,.,55,1.11,3.98,77.2475
2,95,0,.,.,.,186.2,2,.,1,0,2,0,2,22.3,.,55,1.11,3.98,186.2000
3,40,3,0,.,.,.,.,.,1,0,2,0,2,26.1,.,55,1.11,3.98,191.8080
4,40,4,0,2,1,.,.,.,1,0,2,0,2,26.5,.,55,1.11,3.98,191.8080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,140,2,.,.,.,126.852,2,.,1,0,2,0,2,23.6,126.852,55,1.11,3.98,79.3305
154,94,4,.,.,.,125.316,2,.,1,0,2,0,2,18.5,125.316,55,1.11,3.98,77.2475
155,170,4,.,.,.,109.33,2,.,1,0,2,0,2,27.9,109.33,55,1.11,3.98,80.0565
156,170,2,.,.,.,94.221,2,.,1,0,2,0,2,27.7,94.221,55,1.11,3.98,80.0565


## Cleaning the data

The keys of interest for our analysis are

- **#ID** indicating which mouse was measured,
- **TIME** indicating the time point of each measurement,
- **TUMOUR VOLUME** indicating the measured tumour volume,
- **BODY WEIGHT** indicating the body weight of the mouse.

Based on discussions with the authors and a comparison with their study report [1], the relevant columns in the dataset were identified as **#ID**, **TIME**, **Y** and **BW**, where **Y** encodes the **TUMOUR VOLUME** and **BW** the **BODY WEIGHT**. The remaining keys are partly Monolix-specific modelling keys and partly inferred model parameters. We therefore ignore those columns.

The raw datasets do not contain the units of the measured quantities. From [1] as well as Roche's study report, we may infer that

- **TIME**: is measured in $\text{day}$,
- **TUMOUR VOLUME**: is measured in $\text{mm}^3$,
- **BODY WEIGHT**: is measured in $\text{g}$.

For reasons that will become clear later, we will choose to measure the tumour volume in $\text{cm}^3$.

## Cleaned LXF A677 control growth data

In [7]:
# Make sure that data is stored as numeric data. Entries that cannot be
# parsed as numbers are coerced to NaN.
data = data_raw.apply(pd.to_numeric, errors='coerce')

# Mask data for non-null Y rows, i.e. keep only rows with a Y measurement
data = data[data['Y'].notnull()]

# Rename #ID to ID, and add the units to the TIME, Y and BW column names
data = data.rename(columns={
    '#ID': 'ID',
    'TIME': 'TIME in day',
    'Y': 'TUMOUR VOLUME in mm^3',
    'BW': 'BODY WEIGHT in g'})

# Raise error if DOSE, ADDL, II, YTYPE, CENS, CELL LINE, DOSE GROUP, DRUG,
# EXPERIMENT or DRUGCAT are not uni-valued. These columns are dropped below,
# so they must not distinguish between the remaining rows. NaN counts as a
# value in unique(), so a column that is NaN throughout passes the check.
for column in [
        'DOSE', 'ADDL', 'II', 'YTYPE', 'CENS', 'CELL LINE', 'DOSE GROUP',
        'DRUG', 'EXPERIMENT', 'DRUGCAT']:
    if len(data[column].unique()) > 1:
        raise ValueError(
            'Column <' + column + '> is expected to be uni-valued, but '
            'contains more than one distinct value.')

# Keep only ID, TIME, TUMOUR VOLUME and BODY WEIGHT columns. The explicit copy
# makes sure that the modifications below act on an independent dataframe and
# not on a slice of the previous one.
data = data[
    ['ID', 'TIME in day', 'TUMOUR VOLUME in mm^3', 'BODY WEIGHT in g']].copy()

# Sort data such that time is increasing (for later convenience)
data = data.sort_values('TIME in day')

# Convert tumour measurements to cm^3
data['TUMOUR VOLUME in mm^3'] = data['TUMOUR VOLUME in mm^3'] * 1E-03
data = data.rename(columns={'TUMOUR VOLUME in mm^3': 'TUMOUR VOLUME in cm^3'})

# Delete raw data from memory (re-running this cell therefore requires
# re-running the import cell first)
del data_raw

# Display cleaned data set
data

Unnamed: 0,ID,TIME in day,TUMOUR VOLUME in cm^3,BODY WEIGHT in g
0,40,0,0.191808,26.8
1,94,0,0.077248,18.3
2,95,0,0.186200,22.3
59,136,0,0.118588,25.4
60,140,0,0.079330,22.7
...,...,...,...,...
77,136,30,1.459342,24.2
103,94,30,0.576240,19.2
90,169,30,0.746986,28.0
67,140,30,2.122582,24.1


## Format dataset into a more generalisable form

ID | Time | Time unit | Biomarker | Measurement | Biomarker unit

In [8]:
# Split tumour volume and body weight measurements
tumour_volume_measurements = data[['ID', 'TIME in day', 'TUMOUR VOLUME in cm^3']]
body_weight_measurements = data[['ID', 'TIME in day', 'BODY WEIGHT in g']]

# Rename column names, so that both dataframes share the same columns
tumour_volume_measurements = tumour_volume_measurements.rename(
    columns={'TIME in day': 'Time', 'TUMOUR VOLUME in cm^3': 'Measurement'})
body_weight_measurements = body_weight_measurements.rename(
    columns={'TIME in day': 'Time', 'BODY WEIGHT in g': 'Measurement'})

# Add unit columns
tumour_volume_measurements['Time unit'] = 'd'
tumour_volume_measurements['Biomarker'] = 'Tumour volume'
tumour_volume_measurements['Biomarker unit'] = 'cm^3'
body_weight_measurements['Time unit'] = 'd'
body_weight_measurements['Biomarker'] = 'Body weight'
body_weight_measurements['Biomarker unit'] = 'g'

# Filter NaNs. The mask is built from the measured values: the Biomarker
# column has just been set to a constant string and is never null, so masking
# on it would not remove any rows.
mask = tumour_volume_measurements['Measurement'].notnull()
tumour_volume_measurements = tumour_volume_measurements[mask]
mask = body_weight_measurements['Measurement'].notnull()
body_weight_measurements = body_weight_measurements[mask]

# Merge to final dataset (pd.concat replaces DataFrame.append, which is
# deprecated since pandas 1.4 and removed in pandas 2.0)
data = pd.concat([tumour_volume_measurements, body_weight_measurements])

# Sort columns (for aesthetic reasons)
data = data[['ID', 'Time', 'Time unit', 'Biomarker', 'Measurement', 'Biomarker unit']]
data

## Illustrate control growth data

In [10]:
import erlotinib.plots as eplt


# Axis labels of the figure
time_label = r'$\text{Time in day}$'
volume_label = r'$\text{Tumour volume in cm}^3$'

# Create scatter plot of the tumour volume measurements over time
fig = eplt.PDTimeSeriesPlot()
fig.add_data(data, biomarker='Tumour volume')
fig.set_axis_labels(xlabel=time_label, ylabel=volume_label)

# Show figure
fig.show()

**Figure 1:** Untreated tumour growth of patient-derived tumour explants LXF A677 (adenocarcinoma of the lung) implanted in mice.

## Export cleaned data

In [11]:
# Locate the output file relative to the parent of the current working
# directory
path = os.path.dirname(os.getcwd())
output_file = path + '/data/lxf_control_growth.csv'

# Export cleaned LXF A677 control growth data (without the dataframe index)
data.to_csv(path_or_buf=output_file, index=False)

## References

- <a name="ref1"> [1] </a> Eigenmann et al., Combining Nonclinical Experiments with Translational PKPD Modeling to Differentiate Erlotinib and Gefitinib, Mol Cancer Ther (2016)