### Machine learning using XGBoost

In [None]:
import covalent as ct
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, ParameterGrid
import pandas as pd
import xgboost as xgb
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Slurm executor configuration: login details, conda environment, remote
# working directory and the job options used for each submission.
# NOTE(review): no electron or lattice in the visible cells references
# `executor` yet.
executor = ct.executor.SlurmExecutor(
    username="venkat",
    address="beehive.agnostiq.ai",
    poll_freq=20,
    conda_env="covalent",
    ssh_key_file="~/.ssh/id_ed25519",
    remote_workdir="/federation/venkat",
    # Presumably forwarded to Slurm as job directives -- confirm against the
    # executor plugin documentation.
    options={
        "ntasks": 1,
        "cpus-per-task": 2,
        "partition": "debug",
        "nodelist": "beehive-debug-st-t2medium-1",
    },
)

In [None]:
def load_dataset():
    """Load the Boston housing data and separate features from the target.

    Returns:
        A ``(X, y)`` tuple where ``X`` is a DataFrame with every feature
        column and ``y`` is the ``PRICE`` column.

    NOTE(review): ``load_boston`` was deprecated in scikit-learn 1.0 and
    removed in 1.2, so this only runs on older scikit-learn releases.
    """
    raw = load_boston()
    frame = pd.DataFrame(raw.data)
    frame['PRICE'] = raw.target
    # PRICE was appended last, so every column before it is a feature.
    feature_columns = frame.iloc[:, :-1]
    price_column = frame.iloc[:, -1]
    return feature_columns, price_column

def split_dataset(features: pd.DataFrame, targets: pd.DataFrame):
    """Split features and targets into train and test partitions.

    Args:
        features: Feature rows to split.
        targets: Target values aligned row-wise with ``features``.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` with 20% of the rows held out
        for testing. ``random_state`` is fixed so the split is reproducible.
    """
    # Split the arguments that were passed in, not the notebook globals X and y.
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.2, random_state=123
    )
    return X_train, X_test, y_train, y_test

def get_model(params: dict):
    """Build an unfitted ``xgb.XGBRegressor`` from keyword hyperparameters.

    Args:
        params: Mapping expanded into the ``XGBRegressor`` constructor.
    """
    regressor = xgb.XGBRegressor(**params)
    return regressor

def train(model: xgb.XGBModel, features: np.ndarray, targets: np.ndarray) -> xgb.XGBModel:
    """Fit ``model`` on the given data and return that same object.

    Args:
        model: Estimator whose ``fit`` method is called.
        features: Training inputs passed to ``fit``.
        targets: Training targets passed to ``fit``.

    Returns:
        The ``model`` object that was passed in, after ``fit`` has run.
    """
    model.fit(features, targets)
    return model

def predict(model: xgb.XGBModel, features: np.ndarray):
    """Return the output of ``model.predict`` for ``features``."""
    predictions = model.predict(features)
    return predictions

def measure_model_performance(targets: np.ndarray, predictions: np.ndarray):
    """Return the root-mean-squared error between targets and predictions."""
    mse = mean_squared_error(targets, predictions)
    return np.sqrt(mse)

### Local execution without Covalent

In [None]:
# Baseline: run the full pipeline in-process so the workflow version can be
# compared against it.
X, y = load_dataset()
X_train, X_test, y_train, y_test = split_dataset(X, y)
model_to_train = get_model({
    "objective": "reg:squarederror",
    "colsample_bytree": 0.3,
    "learning_rate": 0.1,
    "max_depth": 5,
    "alpha": 10,
    "n_estimators": 10,
})
trained_model = train(model_to_train, X_train, y_train)
# Call the predict / measure_model_performance helpers rather than repeating
# their bodies inline; the computed values are the same.
predictions = predict(trained_model, X_test)
rmse = measure_model_performance(y_test, predictions)
print("RMSE: %f" % rmse)

### Convert to workflow

In [None]:
@ct.electron
def split_dataset(features: pd.DataFrame, targets: pd.DataFrame):
    """Electron: split features and targets into train and test partitions.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` with 20% of the rows held out
        and a fixed ``random_state`` for reproducibility.
    """
    partitions = train_test_split(features, targets, test_size=0.2, random_state=123)
    train_features, test_features, train_targets, test_targets = partitions
    return train_features, test_features, train_targets, test_targets

@ct.electron
def get_model(params: dict) -> xgb.XGBModel:
    """Electron: build an unfitted ``xgb.XGBRegressor`` from ``params``.

    The type of the created object is printed before it is returned.
    """
    regressor = xgb.XGBRegressor(**params)
    # Debug output: show the concrete type that was constructed.
    print(type(regressor))
    return regressor

@ct.electron
def train(model: xgb.XGBModel, features, targets) -> xgb.XGBModel:
    """Electron: fit ``model`` on the given data and return that same object.

    Args:
        model: Estimator whose ``fit`` method is called.
        features: Training inputs passed to ``fit``.
        targets: Training targets passed to ``fit``.

    Returns:
        The ``model`` object that was passed in, after ``fit`` has run.
    """
    # Debug output: show the type of the model this electron received.
    print(type(model))
    model.fit(features, targets)
    return model

@ct.electron
def predict(model: xgb.XGBModel, features: np.ndarray):
    """Electron: return the output of ``model.predict`` for ``features``."""
    predictions = model.predict(features)
    return predictions

@ct.electron
def measure_model_performance(targets: np.ndarray, predictions: np.ndarray):
    """Electron: return the root-mean-squared error of the predictions."""
    mse = mean_squared_error(targets, predictions)
    return np.sqrt(mse)

In [None]:
@ct.lattice
def workflow(X, y):
    """Lattice: split the data, train one XGBoost regressor and score it.

    Args:
        X: Feature data handed to ``split_dataset``.
        y: Target data handed to ``split_dataset``.

    Returns:
        The value produced by ``measure_model_performance`` on the test split.
    """
    hyperparameters = {
        "objective": "reg:squarederror",
        "colsample_bytree": 0.3,
        "learning_rate": 0.1,
        "max_depth": 5,
        "alpha": 10,
        "n_estimators": 10,
    }
    X_train, X_test, y_train, y_test = split_dataset(features=X, targets=y)
    untrained = get_model(hyperparameters)
    fitted = train(model=untrained, features=X_train, targets=y_train)
    test_predictions = predict(model=fitted, features=X_test)
    return measure_model_performance(targets=y_test, predictions=test_predictions)

In [None]:
# Load the data in the notebook process; it is passed to the lattice as input.
X, y = load_dataset()
# Submit the lattice and keep the identifier returned for this dispatch.
dispatch_id = ct.dispatch(workflow)(X, y)
print(dispatch_id)
# wait=True presumably blocks until the dispatched workflow has finished --
# confirm against the Covalent documentation.
result = ct.get_result(dispatch_id=dispatch_id, wait=True)
print(result)

#### Hyperparameter optimization

In [None]:
# Search space for the grid search. np.linspace(..., 1) yields a single value
# (the start point), so only alpha and n_estimators actually vary here.
params_to_tune = dict(
    colsample_bytree=np.linspace(0.1, 0.5, 1),
    learning_rate=np.linspace(0.01, 0.1, 1),
    max_depth=[5],
    alpha=np.arange(5, 11),
    n_estimators=np.arange(8, 11),
)

# Expand the search space into one parameter dict per combination; each one
# is used to build a model and cross-validate it.
grid = list(ParameterGrid(params_to_tune))
print(len(grid))

In [None]:
@ct.electron
def cross_validate_model(model_params: dict, features, targets, n_folds: int):
    """Electron: cross-validate one XGBoost regressor configuration.

    Args:
        model_params: Keyword arguments for ``xgb.XGBRegressor``.
        features: Feature data passed to ``cross_val_score``.
        targets: Target data passed to ``cross_val_score``.
        n_folds: Number of cross-validation folds (the ``cv`` argument).

    Returns:
        ``(mean, std)`` of the per-fold ``neg_root_mean_squared_error`` scores.
    """
    # The parameter is named `targets` so it matches both the body and the
    # keyword the grid-search lattice passes; it was previously `target`.
    model = xgb.XGBRegressor(**model_params)
    cv_scores = cross_val_score(
        model,
        features,
        targets,
        scoring='neg_root_mean_squared_error',
        cv=n_folds,
    )
    return np.mean(cv_scores), np.std(cv_scores)

In [None]:
@ct.lattice
def workflow(parameters, X, y):
    """Lattice: cross-validate every parameter set and collect the scores.

    Args:
        parameters: Iterable of parameter dicts, one per model to evaluate.
        X: Feature data forwarded to ``cross_validate_model``.
        y: Target data forwarded to ``cross_validate_model``.

    Returns:
        A list with one dict per parameter set, holding ``params``,
        ``avg_score`` and ``score_std``.
    """
    results = []
    for candidate in parameters:
        mean_score, std_score = cross_validate_model(model_params=candidate, features=X, targets=y, n_folds=3)
        results.append({'params': candidate, 'avg_score': mean_score, 'score_std': std_score})
    return results

In [None]:
# Dispatch the grid-search lattice on the training split created by the local
# execution cell (X_train, y_train).
dispatch_id = ct.dispatch(workflow)(grid, X_train, y_train)
print(dispatch_id)
# Result retrieval is currently disabled; uncomment the two lines below to
# fetch and print the outcome of this dispatch.
#result = ct.get_result(dispatch_id=dispatch_id, wait=True)
#print(result)

In [None]:
# Local grid search: cross-validate every parameter combination in-process.
# `scores` and `scores_std` stay aligned with the order of `grid`.
scores = []
scores_std = []
for candidate in grid:
    regressor = xgb.XGBRegressor(objective="reg:squarederror", **candidate)
    fold_scores = cross_val_score(
        regressor,
        X_train,
        y_train,
        scoring='neg_root_mean_squared_error',
        cv=3,
    )
    scores.append(np.mean(fold_scores))
    scores_std.append(np.std(fold_scores))

#### Use Covalent to dispatch to Slurm

#### Study model performance as a function of the tuned parameters

In [None]:
import matplotlib.pyplot as plt

In [None]:
# One list per hyperparameter, each in the same order as `grid`, so entry i
# lines up with the i-th grid entry.
colsample_bytree_values = [p['colsample_bytree'] for p in grid]
n_estimator_values = [p['n_estimators'] for p in grid]
learning_rate_values = [p['learning_rate'] for p in grid]
max_depth_values = [p['max_depth'] for p in grid]
alpha_values = [p['alpha'] for p in grid]

### Group scores with colsample_bytree parameter values

In [None]:
# Pair every grid entry's colsample_bytree value with its mean CV score.
df = pd.DataFrame(
    [
        {'colsample_bytree': value, 'score': score}
        for value, score in zip(colsample_bytree_values, scores)
    ]
)
# Average the scores over all grid entries sharing a colsample_bytree value.
df_colsample_bytree_vs_scores = pd.DataFrame(
    [
        {
            'colsample_bytree': level,
            'avg_score': np.mean(df[df['colsample_bytree'] == level]['score']),
        }
        for level in np.unique(colsample_bytree_values)
    ]
)

### Group scores with learning_rate values

In [None]:
# Pair every grid entry's learning_rate value with its mean CV score.
df = pd.DataFrame(
    [
        {'learning_rate': value, 'score': score}
        for value, score in zip(learning_rate_values, scores)
    ]
)
# Average the scores over all grid entries sharing a learning_rate value.
df_learning_rate_vs_scores = pd.DataFrame(
    [
        {
            'learning_rate': level,
            'avg_score': np.mean(df[df['learning_rate'] == level]['score']),
        }
        for level in np.unique(learning_rate_values)
    ]
)

### Group scores with alpha values

In [None]:
# Pair every grid entry's alpha value with its mean CV score.
df = pd.DataFrame(
    [
        {'alpha': value, 'score': score}
        for value, score in zip(alpha_values, scores)
    ]
)
# Average the scores over all grid entries sharing an alpha value.
df_alpha_vs_scores = pd.DataFrame(
    [
        {
            'alpha': level,
            'avg_score': np.mean(df[df['alpha'] == level]['score']),
        }
        for level in np.unique(alpha_values)
    ]
)

In [None]:
# Average cross-validation score for each colsample_bytree value tried.
plt.figure()
plt.plot(
    df_colsample_bytree_vs_scores['colsample_bytree'],
    df_colsample_bytree_vs_scores['avg_score'],
    'o-',
    ms=10,
)
# Label the axes so the figure can be read without the surrounding code; the
# scores were computed with the 'neg_root_mean_squared_error' scorer.
plt.xlabel('colsample_bytree')
plt.ylabel('Mean CV score (negative RMSE)')

In [None]:
# Average cross-validation score for each learning_rate value tried.
plt.figure()
plt.plot(
    df_learning_rate_vs_scores['learning_rate'],
    df_learning_rate_vs_scores['avg_score'],
    'o-',
    ms=10,
)
# Label the axes so the figure can be read without the surrounding code; the
# scores were computed with the 'neg_root_mean_squared_error' scorer.
plt.xlabel('learning_rate')
plt.ylabel('Mean CV score (negative RMSE)')

In [None]:
# Average cross-validation score for each alpha value tried.
plt.figure()
plt.plot(
    df_alpha_vs_scores['alpha'],
    df_alpha_vs_scores['avg_score'],
    'o-',
    ms=10,
)
# Label the axes so the figure can be read without the surrounding code; the
# scores were computed with the 'neg_root_mean_squared_error' scorer.
plt.xlabel('alpha')
plt.ylabel('Mean CV score (negative RMSE)')