In [2]:
import os
import pandas as pd
import numpy as np
from preprocess import *
import optuna
from sklearn.metrics import r2_score 
from optuna.samplers import TPESampler


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb
import xgboost as xgb

In [27]:
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

df_train = df_train.drop('Id',axis=1)
df_train = df_train.drop('parentspecies',axis=1)



X_train = df_train.loc[:,df_train.columns != 'pSat_Pa']
y_train = np.log10(df_train.loc[:, 'pSat_Pa'])
y_test = df_test.drop("Id", axis=1)
y_test = y_test.drop("parentspecies", axis=1)

In [4]:
RANDOM_SEED = 42
train_x, test_x, train_y, test_y = train_test_split(X_train, y_train, test_size=0.25, random_state=RANDOM_SEED)

In [5]:

train_x.dtypes

MW                              float64
NumOfAtoms                        int64
NumOfC                            int64
NumOfO                            int64
NumOfN                            int64
NumHBondDonors                    int64
NumOfConf                         int64
NumOfConfUsed                     int64
C.C..non.aromatic.                int64
C.C.C.O.in.non.aromatic.ring      int64
hydroxyl..alkyl.                  int64
aldehyde                          int64
ketone                            int64
carboxylic.acid                   int64
ester                             int64
ether..alicyclic.                 int64
nitrate                           int64
nitro                             int64
aromatic.hydroxyl                 int64
carbonylperoxynitrate             int64
peroxide                          int64
hydroperoxide                     int64
carbonylperoxyacid                int64
nitroester                        int64
dtype: object

In [6]:
print(f"The dimension of train_x is {train_x.shape}")
print(f"The dimension of train_y is {train_y.shape}")
print(f"The dimension of test_x is {test_x.shape}")
print(f"The dimension of train_x is {test_y.shape}")

The dimension of train_x is (20360, 24)
The dimension of train_y is (20360,)
The dimension of test_x is (6787, 24)
The dimension of train_x is (6787,)


In [7]:
optuna.logging.set_verbosity(optuna.logging.ERROR)

In [19]:

def objective(trial):
    
    params = {
        "n_estimators": 1000,
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
        "subsample": trial.suggest_float("subsample", 0.25, 0.75),
        "subsample_freq": 1,
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 0.5),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 50),
        "num_leaves": trial.suggest_int("num_leaves", 2, 2**10),
        "random_state": RANDOM_SEED,
        "verbosity": -1
        
    }
    datas = lgb.Dataset(train_x, label=train_y)
    model = lgb.train(params, datas)

    predictions = model.predict(test_x)
    r2 = r2_score(test_y, predictions)
    return r2
    

# Remember to assign "lgbm-wine-1" as the study_name to your Optuna study. 
study_name = "lgbm-svp-1"

# For this assignment, it is enough to use a simple sqlite3 database for persisting study history
storage = "sqlite:///optuna.sqlite3"

# Create (and run) the study and record the history in the SQlite3 database file

study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
    study_name=study_name,
    storage=storage,
    load_if_exists=True
)
study.optimize(objective, n_trials=100)


print("Best R2", study.best_value)
print("Best params:", study.best_trial.params)


Found `n_estimators` in params. Will use it instead of argument


Found `n_estimators` in params. Will use it instead of argument


Found `n_estimators` in params. Will use it instead of argument


Found `n_estimators` in params. Will use it instead of argument


Found `n_estimators` in params. Will use it instead of argument


Found `n_estimators` in params. Will use it instead of argument


Found `n_estimators` in params. Will use it instead of argument


Found `n_estimators` in params. Will use it instead of argument


Found `n_estimators` in params. Will use it instead of argument


Found `n_estimators` in params. Will use it instead of argument


Found `n_estimators` in params. Will use it instead of argument


Found `n_estimators` in params. Will use it instead of argument


Found `n_estimators` in params. Will use it instead of argument


Found `n_estimators` in params. Will use it instead of argument


Found `n_estimators` in params. Will use it instead of argument


Found `n_

Best R2 0.7363444503622264
Best params: {'learning_rate': 0.01267513058971133, 'subsample': 0.4664565157755191, 'colsample_bytree': 0.4147928366143744, 'min_child_samples': 50, 'num_leaves': 1005}


Only one study of a name can exist at a time so the previous study must be deleted first before the code is run again

In [18]:
optuna.delete_study(study_name="lgbm-svp-1", storage="sqlite:///optuna.sqlite3")

In [15]:
import plotly.io as pio

# Configure Jupyter Notebook to render plotly figures drawn by Optuna
pio.renderers.default = "notebook"

In [20]:
# Plot the hyperparameter importance
study_name = "lgbm-svp-1"
storage = "sqlite:///optuna.sqlite3"

### START CODE HERE
study = optuna.load_study(study_name=study_name, storage=storage)
fig = fig = optuna.visualization.plot_param_importances(study, evaluator=optuna.importance.FanovaImportanceEvaluator(seed=RANDOM_SEED))
fig.show()

In [21]:
fig = optuna.visualization.plot_slice(study, params=["colsample_bytree", "learning_rate"])
fig.show()

In [31]:
print(study.best_trial.params)


{'learning_rate': 0.01267513058971133, 'subsample': 0.4664565157755191, 'colsample_bytree': 0.4147928366143744, 'min_child_samples': 50, 'num_leaves': 1005}


In [22]:
datas = lgb.Dataset(X_train, label=y_train)
model = lgb.train(study.best_trial.params, datas)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.132885 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 659
[LightGBM] [Info] Number of data points in the train set: 27147, number of used features: 23
[LightGBM] [Info] Start training from score -3.854276


In [25]:
def write_to_file(y_test, y_pred, file_name):
    f = open(file_name, 'w')
    f.write("Id,target\n")
    for i in range(y_test.shape[0]):
        f.write("{},{}\n".format(df_test.loc[i,"Id"], y_pred[i]))
    f.close()

In [28]:
predictions = model.predict(y_test)
write_to_file(df_test, predictions, "predictions.csv")