# Install Necessary Packages

In [1]:
! pip install iterative-stratification

Collecting iterative-stratification
  Downloading https://files.pythonhosted.org/packages/9d/79/9ba64c8c07b07b8b45d80725b2ebd7b7884701c1da34f70d4749f7b45f9a/iterative_stratification-0.1.6-py3-none-any.whl
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.6


# Import Necessary Packages

In [2]:
import pandas as pd
import numpy as np
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from google.colab import drive
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from xgboost import XGBClassifier

# Define Custom Functions

In [3]:
def df_pre_processing(raw_df, type='training', verbose=True):
    """One-hot encode the categorical columns and keep only 'trt_cp' rows.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Must contain the 'sig_id', 'cp_type' and 'cp_dose' columns, and at
        least one row whose 'cp_type' is 'trt_cp' (otherwise the
        'cp_type_trt_cp' dummy column does not exist and the lookup below
        raises KeyError). The frame itself is not modified.
    type : str, default 'training'
        Label interpolated into the summary message; it has no effect on the
        returned data.
    verbose : bool, default True
        Print the shape of the returned frame when true.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns followed by the 'cp_dose_*' and
        'cp_type_*' dummy columns, restricted to 'trt_cp' rows, with a fresh
        0..n-1 index and without 'sig_id'.
    """
    # Build one dummy column per observed category of each categorical feature.
    dose_dummies = pd.get_dummies(raw_df['cp_dose'], prefix='cp_dose')
    type_dummies = pd.get_dummies(raw_df['cp_type'], prefix='cp_type')

    # Append the dummies, then discard the two source columns they replace.
    encoded = pd.concat([raw_df, dose_dummies, type_dummies], axis=1)
    encoded = encoded.drop(columns=['cp_type', 'cp_dose'])

    # Keep only compound-treated samples and renumber the rows.
    treated_mask = encoded['cp_type_trt_cp'] == 1
    treated = encoded.loc[treated_mask].reset_index(drop=True)

    # The identifier column is not a model feature.
    processed_df = treated.drop(columns='sig_id')

    if verbose:
        print(f"Processed {type} dataset shape = {processed_df.shape}.")

    return processed_df

# Unzip and Preprocess the Data

In [4]:
#data_path = "./"
drive.mount('/content/drive')
!unzip /content/drive/Shareddrives/moa_data/lish-moa.zip
data_path = '/content/'

# data_path = "/content/drive/MyDrive/Colab Notebooks/MoA/lish-moa/"

# Load in data set
train_features_y1 = pd.read_csv(data_path + "train_targets_scored.csv")
train_features1 = pd.read_csv(data_path + "train_features.csv")


train_features1, test_features1, train_features_y1 , test_features_y1 = train_test_split(train_features1, train_features_y1, test_size = 0.1, random_state = 0)

# Preprocess training feature 
train_features1_processed = df_pre_processing(train_features1, type='training', verbose=True)
test_features1_processed = df_pre_processing(test_features1, type='training', verbose=True)

# Preprocess training labels
train_features_y1_processed  = train_features_y1.loc[train_features1['cp_type']=='trt_cp'].reset_index(drop=True)
train_features_y1_processed = train_features_y1_processed.drop("sig_id", axis= 1)

test_features_y1_processed  = test_features_y1.loc[test_features1['cp_type']=='trt_cp'].reset_index(drop=True)
test_features_y1_processed = test_features_y1_processed.drop("sig_id", axis= 1)

Mounted at /content/drive
Archive:  /content/drive/Shareddrives/moa_data/lish-moa.zip
  inflating: sample_submission.csv   
  inflating: test_features.csv       
  inflating: train_drug.csv          
  inflating: train_features.csv      
  inflating: train_targets_nonscored.csv  
  inflating: train_targets_scored.csv  
Processed training dataset shape = (19759, 877).
Processed training dataset shape = (2189, 877).


# Define Parameters Used for the Training

In [5]:
# Feature matrix and multi-label target frame built from the training partition.
X, Y = train_features1_processed, train_features_y1_processed

# Names of the target columns; each one is modelled as its own binary problem.
cols = Y.columns

# Train the Model

In [6]:
SEED = 42
total_loss = 0                 # running sum of the per-target mean log losses
feature_importance_list = []   # one importance vector per target, in `cols` order

# Hyperparameters shared by every per-target classifier.
XGB_PARAMS = dict(
    tree_method='gpu_hist',
    min_child_weight=30,
    learning_rate=0.05,
    colsample_bytree=0.65,
    gamma=3.705,
    max_depth=10,
    n_estimators=170,
    subsample=0.9,
    validate_parameters=True,
    predictor='gpu_predictor',
)

# The row split depends only on the number of rows and SEED, so it is the same
# for every target column. Splitting X and Y together once gives exactly the
# rows the per-column split would, without copying X once per target.
# NOTE: X_valid / Y_valid are not used below; the loss is measured on the
# separate hold-out set (test_features1_processed).
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X, Y, train_size=0.7, test_size=0.3, random_state=SEED)

for column in cols:
    y_train = Y_train[column]

    # Fit one binary classifier for this target.
    model = XGBClassifier(**XGB_PARAMS)
    model.fit(X_train, y_train)
    feature_importance_list.append(model.feature_importances_)

    # Probability of the positive class on the hold-out set.
    pred = model.predict_proba(test_features1_processed)[:, 1]

    # Binary cross-entropy; the 1e-5 offset keeps log() finite at 0 and 1.
    y_true = test_features_y1_processed[column].values
    cross_entropy = -(y_true * np.log(pred + 1e-5)
                      + (1 - y_true) * np.log(1 - pred + 1e-5))
    total_loss += np.mean(cross_entropy)

# Hard class predictions of the last fitted model (last target column only).
# Previously recomputed and overwritten on every iteration; computing it once
# after the loop leaves the same final value.
if len(cols) > 0:
    predictions = model.predict(test_features1_processed)

# Compute Mean Log Loss

In [7]:
# Mean of the per-target log losses; divide by the actual number of target
# columns instead of a hard-coded count.
total_loss / len(cols)

0.018503261830285063

# Compute Average Feature Importance

In [None]:
# Rows = targets (in training order), columns = features.
feature_importance_mat = np.array(feature_importance_list)

# A target whose importances do not sum to a positive number (all zeros, or NaN)
# carries no information and is left out of the average.
has_signal = [bool(np.sum(importances) > 0) for importances in feature_importance_list]
select_line = [i for i, keep in enumerate(has_signal) if keep]
drop_line_ind = [i for i, keep in enumerate(has_signal) if not keep]

# Index with the flat list of row numbers. The previous `[[select_line]]` form
# is treated by current NumPy as a 2-D integer index, which adds a leading axis
# of length 1 so that mean(axis=0) no longer averages across targets.
feature_importance_arr = np.mean(feature_importance_mat[select_line], axis=0)

# NOTE: the file name (including its spelling) is kept unchanged because other
# code may load the array by this exact name.
np.save('feature_importantce_arr.npy', feature_importance_arr)

# Illustrate Model Summary

In [9]:
# Display the repr of `model`, i.e. the classifier most recently assigned in
# the training loop (the one fitted for the last target column).
model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.65, gamma=3.705,
              learning_rate=0.05, max_delta_step=0, max_depth=10,
              min_child_weight=30, missing=None, n_estimators=170, n_jobs=1,
              nthread=None, objective='binary:logistic',
              predictor='gpu_predictor', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
              subsample=0.9, tree_method='gpu_hist', validate_parameters=True,
              verbosity=1)