In [1]:
import os
import pandas as pd
import numpy as np
from preprocess import *
import optuna
from sklearn.metrics import r2_score 
from optuna.samplers import TPESampler


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb
import xgboost as xgb

from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

In [2]:
# Single seed passed to every seeded component in this notebook (random forest,
# Optuna sampler / importance evaluator, XGBoost) so results are repeatable.
RANDOM_SEED = 42

In [19]:
# Read the raw training and test data.
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

# Set 'parentspecies' (categorical) aside so it is not passed through the
# numeric scaler, and drop 'Id' from the feature frames.
parent_species_train = df_train["parentspecies"]
df_train = df_train.drop(columns=["Id", "parentspecies"])

parent_species_test = df_test["parentspecies"]
df_test = df_test.drop(columns=["Id", "parentspecies"])

# Split the training data into features and target; the target is modelled as
# log10(pSat_Pa).
X_train = df_train.loc[:, df_train.columns != "pSat_Pa"]
y_train = np.log10(df_train.loc[:, "pSat_Pa"])

# Scale the numeric features. The scaler is fitted on the training set only and
# then re-used, unchanged, on the test set.
scaler = StandardScaler()
normalized_data = scaler.fit_transform(X_train)
X_train_norm = pd.DataFrame(normalized_data, columns=X_train.columns, index=X_train.index)

normalized_data = scaler.transform(df_test)
X_test_norm = pd.DataFrame(normalized_data, columns=df_test.columns, index=df_test.index)

# Re-attach 'parentspecies' (unscaled) and one-hot encode it.
X_train_norm["parentspecies"] = parent_species_train
X_test_norm["parentspecies"] = parent_species_test

X_train_norm = pd.get_dummies(X_train_norm, columns=["parentspecies"])
X_test_norm = pd.get_dummies(X_test_norm, columns=["parentspecies"])

# Give the test matrix exactly the training columns, in the training order.
# Categories that never occur in the test set become all-zero columns, and
# categories unseen during training are dropped. This replaces the previous
# approach of appending a hand-written fake row to the test set, which relied
# on the CSV column order and only covered one category.
X_test_norm = X_test_norm.reindex(columns=X_train_norm.columns, fill_value=0)


In [20]:
# Inspect the column dtypes of the scaled, one-hot-encoded training features.
X_train_norm.dtypes

MW                                   float64
NumOfAtoms                           float64
NumOfC                               float64
NumOfO                               float64
NumOfN                               float64
NumHBondDonors                       float64
NumOfConf                            float64
NumOfConfUsed                        float64
C.C..non.aromatic.                   float64
C.C.C.O.in.non.aromatic.ring         float64
hydroxyl..alkyl.                     float64
aldehyde                             float64
ketone                               float64
carboxylic.acid                      float64
ester                                float64
ether..alicyclic.                    float64
nitrate                              float64
nitro                                float64
aromatic.hydroxyl                    float64
carbonylperoxynitrate                float64
peroxide                             float64
hydroperoxide                        float64
carbonylpe

In [21]:
# Inspect the column dtypes of the scaled, one-hot-encoded test features.
X_test_norm.dtypes

MW                                   float64
NumOfAtoms                           float64
NumOfC                               float64
NumOfO                               float64
NumOfN                               float64
NumHBondDonors                       float64
NumOfConf                            float64
NumOfConfUsed                        float64
C.C..non.aromatic.                   float64
C.C.C.O.in.non.aromatic.ring         float64
hydroxyl..alkyl.                     float64
aldehyde                             float64
ketone                               float64
carboxylic.acid                      float64
ester                                float64
ether..alicyclic.                    float64
nitrate                              float64
nitro                                float64
aromatic.hydroxyl                    float64
carbonylperoxynitrate                float64
peroxide                             float64
hydroperoxide                        float64
carbonylpe

Find and remove outliers

In [22]:
# Rows whose absolute error under a random forest fitted on the full training
# set exceeds this threshold are treated as outliers. The error is measured on
# the scale of y_train, i.e. log10(pSat_Pa).
OUTLIER_ABS_ERROR_THRESHOLD = 1.4

rf_model = RandomForestRegressor(n_jobs=-1, n_estimators=500, random_state=RANDOM_SEED)
rf_model.fit(X_train_norm, y_train)

predicted = rf_model.predict(X_train_norm)
abs_error = np.abs(predicted - y_train)

# Positional boolean mask: row k of the mask refers to row k of both
# X_train_norm and y_train, exactly as the model saw them. Filtering with a
# mask (instead of dropping enumerate() positions as index labels, in place)
# stays correct when this cell is re-run after rows have already been removed.
is_outlier = np.asarray(abs_error > OUTLIER_ABS_ERROR_THRESHOLD)

X_train_norm = X_train_norm.loc[~is_outlier]
y_train = y_train.loc[~is_outlier]

In [7]:
# Limit Optuna's logger to ERROR-level messages.
optuna.logging.set_verbosity(optuna.logging.ERROR)

In [24]:
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated R2 of an XGBoost regressor.

    The trial samples learning_rate, subsample, colsample_bytree,
    min_child_weight and max_depth; n_estimators and random_state are fixed.
    The model is evaluated on the notebook-level X_train_norm / y_train.

    Args:
        trial: the optuna trial used to sample hyperparameters.

    Returns:
        The mean of the per-fold R2 test scores.
    """
    # Keyword arguments are evaluated in order, so parameters are suggested in
    # the same sequence on every trial.
    model_kwargs = dict(
        n_estimators=1000,
        learning_rate=trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
        subsample=trial.suggest_float("subsample", 0.25, 0.75),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.25, 0.75),
        min_child_weight=trial.suggest_int("min_child_weight", 1, 100),
        max_depth=trial.suggest_int("max_depth", 1, 10),
        random_state=RANDOM_SEED,
    )
    regressor = xgb.XGBRegressor(**model_kwargs)

    cv_results = cross_validate(
        regressor,
        X_train_norm,
        y_train,
        n_jobs=-1,
        scoring="r2",
        cv=5,
    )
    fold_scores = cv_results["test_score"]
    return np.mean(fold_scores)
    

 
# Name of the study and the SQLite database it is persisted in.
study_name = "xgb-svp-1"
storage = "sqlite:///optuna.sqlite3"

# Create the study, or resume it if one with this name already exists in the
# storage (load_if_exists=True); trials are then added on top of existing ones.
study = optuna.create_study(
    study_name=study_name,
    storage=storage,
    sampler=TPESampler(seed=RANDOM_SEED),
    direction="maximize",
    load_if_exists=True,
)
study.optimize(objective, n_trials=100)

print("Best R2", study.best_value)
print("Best params:", study.best_trial.params)

Parameters: { "subsample_freq" } are not used.

Parameters: { "subsample_freq" } are not used.

Parameters: { "subsample_freq" } are not used.

Parameters: { "subsample_freq" } are not used.

Parameters: { "subsample_freq" } are not used.

Parameters: { "subsample_freq" } are not used.

Parameters: { "subsample_freq" } are not used.

Parameters: { "subsample_freq" } are not used.

Parameters: { "subsample_freq" } are not used.

Parameters: { "subsample_freq" } are not used.

Parameters: { "subsample_freq" } are not used.

Parameters: { "subsample_freq" } are not used.

Parameters: { "subsample_freq" } are not used.

Parameters: { "subsample_freq" } are not used.

Parameters: { "subsample_freq" } are not used.

Parameters: { "subsample_freq" } are not used.

Parameters: { "subsample_freq" } are not used.

Parameters: { "subsample_freq" } are not used.

Parameters: { "subsample_freq" } are not used.

Parameters: { "subsample_freq" } are not used.

Parameters: { "subsample_freq" } are not

Best R2 0.751323509699539
Best params: {'learning_rate': 0.025164664397195367, 'subsample': 0.39135880063847506, 'colsample_bytree': 0.6167724082246673, 'min_child_weight': 3, 'max_depth': 6}


In [23]:
# Maintenance only: removes the persisted study so the search can start from
# scratch. It is guarded because, when the notebook is run top to bottom, an
# unconditional delete here would remove the study that was just optimised and
# make the later optuna.load_study(...) call fail. Set RESET_STUDY = True, run
# this cell, then re-run the optimisation cell to start over.
RESET_STUDY = False

if RESET_STUDY:
    optuna.delete_study(study_name="xgb-svp-1", storage="sqlite:///optuna.sqlite3")

In [10]:
import plotly.io as pio

# Configure Jupyter Notebook to render plotly figures drawn by Optuna
# (the optuna.visualization plots in the following cells are shown via fig.show()).
pio.renderers.default = "notebook"

In [25]:
# Reload the persisted study from storage and plot the hyperparameter
# importances, using a seeded fANOVA evaluator.
study_name = "xgb-svp-1"
storage = "sqlite:///optuna.sqlite3"

study = optuna.load_study(study_name=study_name, storage=storage)
importance_evaluator = optuna.importance.FanovaImportanceEvaluator(seed=RANDOM_SEED)
fig = optuna.visualization.plot_param_importances(
    study,
    evaluator=importance_evaluator,
)
fig.show()

In [26]:
# Slice plot of the study's trials for colsample_bytree and learning_rate.
fig = optuna.visualization.plot_slice(study, params=["colsample_bytree", "learning_rate"])
fig.show()

In [27]:
# Display the best trial's sampled hyperparameters (only the tuned values;
# the fixed n_estimators / random_state from objective() are not included).
study.best_params

{'learning_rate': 0.025164664397195367,
 'subsample': 0.39135880063847506,
 'colsample_bytree': 0.6167724082246673,
 'min_child_weight': 3,
 'max_depth': 6}

In [28]:
# study.best_params only contains the values Optuna sampled. The settings that
# objective() held fixed (n_estimators=1000 and the seed) must be passed again
# here, otherwise the final model is trained with the library's default number
# of trees and an unseeded state instead of the configuration that was scored.
# Keep these two values in sync with objective().
final_params = {
    "n_estimators": 1000,
    "random_state": RANDOM_SEED,
    **study.best_params,
}

bst = xgb.XGBRegressor(**final_params)
bst.fit(X_train_norm, y_train)
test_predictions = bst.predict(X_test_norm)

In [29]:
def write_to_file(y_test, y_pred, file_name):
    """Write predictions to a two-column CSV file with header ``Id,target``.

    Args:
        y_test: DataFrame with an "Id" column; one output row is written per
            row of this frame, in order.
        y_pred: sequence of predictions, positionally matched to ``y_test``.
        file_name: path of the file to create (overwritten if it exists).

    Raises:
        ValueError: if ``y_pred`` and ``y_test`` have different lengths.
    """
    # Take the ids from the frame that was passed in (not from a notebook
    # global) and pair them positionally, so a non-default index is harmless.
    ids = y_test["Id"].to_numpy()
    predictions = np.asarray(y_pred)
    if len(predictions) != len(ids):
        raise ValueError(
            "y_pred has {} entries but y_test has {} rows".format(len(predictions), len(ids))
        )

    # The context manager closes the file even if a write fails.
    with open(file_name, "w") as f:
        f.write("Id,target\n")
        for row_id, prediction in zip(ids, predictions):
            f.write("{},{}\n".format(row_id, prediction))

In [30]:
# Re-read the raw test set: the 'Id' column was dropped from df_test during
# preprocessing, and the submission file needs it.
df_test = pd.read_csv("../data/test.csv")
write_to_file(df_test, test_predictions, "predictions.csv")