In [1]:
import os

import pandas as pd

from sklearn.model_selection import RepeatedStratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

from xgboost.sklearn import XGBClassifier


In [12]:
class DataLoader:
    """Load one of the project's CSV datasets and split it into features/target.

    Attributes:
        name: Short dataset name the loader was created with.
        path: Full path of the CSV file that was read.
        df: The dataset as read by ``pandas.read_csv``.
    """

    # Directory used when no ``data_dir`` is given (the original hard-coded location).
    DEFAULT_DATA_DIR = "/home/agata/urban-mobility-project/thesis/data"

    # Short dataset name -> CSV file name inside the data directory.
    DATASET_FILES = {
        "kbr": "kbr_data.csv",
        "gadow_agg": "gadow_data_agg.csv",
        "gadow_not_agg": "gadow_data_not_agg.csv",
    }

    # Column that holds the label returned as ``y`` by ``get_X_y``.
    TARGET_COLUMN = "travel_mode"

    def __init__(self, name, data_dir=None) -> None:
        """Read the CSV file registered under ``name``.

        Args:
            name: One of the keys of ``DATASET_FILES``.
            data_dir: Directory containing the CSV files. Defaults to
                ``DEFAULT_DATA_DIR`` so existing ``DataLoader(name)`` calls
                read exactly the same files as before.

        Raises:
            KeyError: If ``name`` is not a registered dataset.
        """
        if name not in self.DATASET_FILES:
            raise KeyError(
                f"Unknown dataset {name!r}; expected one of {sorted(self.DATASET_FILES)}"
            )

        base_dir = self.DEFAULT_DATA_DIR if data_dir is None else data_dir
        self.name = name
        self.path = os.path.join(base_dir, self.DATASET_FILES[name])
        self.df = pd.read_csv(self.path)

    def get_X_y(self):
        """Return ``(X, y)``: every column except the target, and the target column."""
        y = self.df[self.TARGET_COLUMN]
        X = self.df.drop(columns=[self.TARGET_COLUMN])
        return X, y
        


In [15]:
import os

def train_model(model, X, y, outpath, **kwargs):
    """Run a randomized hyper-parameter search for ``model`` and save the results.

    The search is cross-validated with ``RepeatedStratifiedKFold`` and scored on
    accuracy, precision, recall and micro-F1. The full ``cv_results_`` table is
    written to ``<outpath>/<ModelClass>-random-grid-search-results-01.csv``.

    Args:
        model: Estimator to tune.
        X: Feature matrix passed to ``RandomizedSearchCV.fit``.
        y: Target labels passed to ``RandomizedSearchCV.fit``.
        outpath: Directory the results CSV is written to (created if missing).
        **kwargs: Search configuration.
            Required: ``n_splits``, ``n_repeats``, ``random_state``, ``params``
            (the parameter distributions) and ``n_iter``.
            Optional: ``refit`` (metric used to pick the best candidate,
            default ``"accuracy"``) and ``n_jobs`` (default ``4``).

    Returns:
        The fitted ``RandomizedSearchCV`` instance.

    Raises:
        KeyError: If a required keyword argument is missing.
        ValueError: If ``refit`` is not one of the scoring metrics.
    """
    n_splits = kwargs["n_splits"]
    n_iter = kwargs["n_iter"]
    random_state = kwargs["random_state"]

    # NOTE(review): "precision" and "recall" are scikit-learn's binary-average
    # scorers; confirm the target is binary or switch to e.g. "precision_micro".
    scoring = ["accuracy", "precision", "recall", "f1_micro"]

    # With several metrics scikit-learn cannot decide on its own which one
    # defines "best"; without an explicit refit metric fit() raises and the
    # best_* attributes used below do not exist.
    refit_metric = kwargs.get("refit", "accuracy")
    if refit_metric not in scoring:
        raise ValueError(
            f"refit must be one of {scoring}, got {refit_metric!r}"
        )

    # Create the output directory before the (expensive) search so a missing
    # directory cannot discard the results at the very end.
    if outpath:
        os.makedirs(outpath, exist_ok=True)

    rskf = RepeatedStratifiedKFold(
        n_splits=n_splits,
        n_repeats=kwargs["n_repeats"],
        random_state=random_state,
    )

    random_search = RandomizedSearchCV(
        model,
        param_distributions=kwargs["params"],
        n_iter=n_iter,
        scoring=scoring,
        refit=refit_metric,
        n_jobs=kwargs.get("n_jobs", 4),
        # Pass the splitter itself rather than a one-shot generator of splits.
        cv=rskf,
        # Seed the parameter sampling too, so the whole search is reproducible.
        random_state=random_state,
    )

    random_search.fit(X, y)

    print('\n All results:')
    print(random_search.cv_results_)
    print('\n Best estimator:')
    print(random_search.best_estimator_)
    # best_score_ is the mean cross-validated value of the refit metric. The
    # former "2 * score - 1" Gini normalisation only applies to ROC AUC, which
    # is not among the scorers, so the raw score is reported instead.
    print('\n Best %s score for %d-fold search with %d parameter combinations:' % (refit_metric, n_splits, n_iter))
    print(random_search.best_score_)
    print('\n Best hyperparameters:')
    print(random_search.best_params_)

    results = pd.DataFrame(random_search.cv_results_)
    results_file = os.path.join(
        outpath, f'{type(model).__name__}-random-grid-search-results-01.csv'
    )
    # index=False belongs to to_csv; passing it to os.path.join raised TypeError.
    results.to_csv(results_file, index=False)

    return random_search



In [None]:
# Candidate values sampled by the randomized hyper-parameter search.
# Both names are bound to the same dict so either can be used by other cells.
model_params = params = {
    "min_child_weight": [1, 5, 10],
    "gamma": [0.5, 1, 1.5, 2, 5],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "max_depth": [3, 4, 5],
}

# Base XGBoost classifier; the parameters listed above are tuned by the search.
# `verbosity=0` and `n_jobs=1` replace the deprecated `silent=True` and
# `nthread=1` arguments of the scikit-learn wrapper.
# NOTE(review): 'binary:logistic' assumes a two-class target - confirm against
# the values of the `travel_mode` column.
xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    verbosity=0, n_jobs=1)