In [41]:
from lightgbm import LGBMRegressor
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error

In [34]:
seed = 42

# Load data

In [17]:
meta = pd.read_csv("../data/E0022_P01-P05_sample_map.txt", sep='\t')
protein_raw = pd.read_csv("../data/E0022_P05_protein_intensities.txt", sep='\t')
ic50 = pd.read_csv("../data/DrugResponse_PANCANCER_GDSC1_GDSC2_IC_20191119.csv")

protein_raw = protein_raw.rename(columns={'Unnamed: 0': 'Automatic_MS_filename'})
protein_raw_merge = pd.merge(protein_raw, meta[['Automatic_MS_filename', 'Cell_line']])

protein_sample_avg = protein_raw_merge.drop(['Automatic_MS_filename'],
                                      axis=1).groupby(['Cell_line']).agg(np.nanmean).reset_index()

In [18]:
# shuffle so that we randomly pick data version
ic50_shuffle = ic50.sample(frac=1).reset_index(drop=True).drop_duplicates(
    ['Drug Id', 'Cell line name'])

# two drugs - 201 and 1001

## 201 - most effective

In [30]:
drug_id = 201
tmp_df = pd.merge(
    protein_sample_avg,
    ic50_shuffle[ic50_shuffle['Drug Id'] == drug_id][['Cell line name', 'IC50']],
    how='inner',
    left_on='Cell_line',
    right_on='Cell line name')

X = tmp_df.drop(['Cell_line', 'Cell line name', 'IC50'], axis=1)
y = tmp_df['IC50']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [38]:
reg = LGBMRegressor()
reg.fit(X_train, y_train)

y_pred = reg.predict(X_test)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [43]:
mean_squared_error(y_test, y_pred)

3.364149487082684

## 1001

In [45]:
drug_id = 1001
tmp_df = pd.merge(
    protein_sample_avg,
    ic50_shuffle[ic50_shuffle['Drug Id'] == drug_id][['Cell line name', 'IC50']],
    how='inner',
    left_on='Cell_line',
    right_on='Cell line name')

X = tmp_df.drop(['Cell_line', 'Cell line name', 'IC50'], axis=1)
y = tmp_df['IC50']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [46]:
reg = LGBMRegressor()
reg.fit(X_train, y_train)

y_pred = reg.predict(X_test)

In [47]:
mean_squared_error(y_test, y_pred)

1.2540908072460701