In [1]:
import pandas as pd

from distributed import LocalCluster, Client
import xgboost as xgb
import dask.dataframe

In [2]:
# Read the raw training and test CSV files into pandas DataFrames.
# The tuple order below fixes which file lands in which variable.
training_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('data/training_set_VU_DM.csv', 'data/test_set_VU_DM.csv')
)

In [3]:
# Keep only the training rows whose booking_bool equals 1, then renumber
# the surviving rows from 0 so the old index is discarded.
booked_data = (
    training_data
    .loc[lambda frame: frame['booking_bool'] == 1]
    .reset_index(drop=True)
)

In [4]:
# Project both the booked training rows and the test rows onto the same
# eight columns; the column list is written once and applied to each frame.
model_traindata, model_testdata = (
    frame[[
        'srch_id',
        'visitor_location_country_id',
        'prop_country_id',
        'prop_id',
        'prop_location_score1',
        'prop_location_score2',
        'srch_length_of_stay',
        'srch_booking_window',
    ]]
    for frame in (booked_data, test_data)
)

In [5]:
# Remove every row that has a missing value in any selected column, then
# renumber the remaining rows from 0.
# NOTE(review): this also removes rows from the test frame, so those rows
# will never receive a prediction downstream - confirm this is intended.
model_traindata_drop = (
    model_traindata
    .dropna(axis=0, how='any')
    .reset_index(drop=True)
)
model_testdata_drop = (
    model_testdata
    .dropna(axis=0, how='any')
    .reset_index(drop=True)
)

In [6]:
# Feature matrices: the seven non-target columns, taken identically from the
# cleaned training and test frames.
X_train, X_test = (
    frame[[
        'srch_id',
        'visitor_location_country_id',
        'prop_country_id',
        'prop_location_score1',
        'prop_location_score2',
        'srch_length_of_stay',
        'srch_booking_window',
    ]]
    for frame in (model_traindata_drop, model_testdata_drop)
)

# Target: the prop_id column of the cleaned training rows.
Y_train = model_traindata_drop.loc[:, 'prop_id']

In [7]:
# Drop the names of the intermediate frames that are no longer referenced
# below; only X_train / Y_train / X_test (and the raw frames) stay bound.
# Re-running this cell without re-running the cells above raises NameError.
del (
    booked_data,
    model_traindata,
    model_testdata,
    model_traindata_drop,
    model_testdata_drop,
)

In [8]:
# Wrap each pandas object in a dask collection split into 100 partitions.
# The same partition count is used for features and target.
X_train_dask, Y_train_dask, X_test_dask = (
    dask.dataframe.from_pandas(pandas_obj, npartitions=100)
    for pandas_obj in (X_train, Y_train, X_test)
)

In [None]:
def build_label_codes(labels):
    """Map each distinct label to a dense integer code.

    Codes are assigned in ascending label order, so the returned mapping
    covers exactly ``0 .. n_distinct - 1`` and iterating over it yields the
    labels in code order.

    Args:
        labels: Iterable of hashable, mutually comparable labels; duplicates
            are allowed.

    Returns:
        dict mapping each distinct label to its integer code.
    """
    return {label: code for code, label in enumerate(sorted(set(labels)))}


def main(client: "Client"):
    """Train a distributed XGBoost classifier and predict class probabilities.

    Reads the notebook-level dask collections ``X_train_dask``,
    ``Y_train_dask`` and ``X_test_dask``.

    The target values are re-encoded to dense codes ``0 .. n_classes - 1``
    before fitting: XGBoost's multi-class objective expects labels in that
    range, and raw ``prop_id`` values are identifiers that are not guaranteed
    to satisfy it.

    Args:
        client: Connected dask ``Client`` used for training and prediction.

    Returns:
        ``(proba, classes)`` where ``proba`` is the (lazy) result of
        ``predict_proba`` on the test features and ``classes`` is the list of
        original target values, ``classes[k]`` being the label behind
        probability column ``k``.
    """
    # Collect the distinct target values locally and build label -> code.
    label_codes = build_label_codes(Y_train_dask.unique().compute().tolist())
    # Element-wise re-encoding; partitioning stays aligned with X_train_dask.
    y_encoded = Y_train_dask.map(label_codes, meta=(Y_train_dask.name, "int64"))

    clf = xgb.dask.DaskXGBClassifier(n_estimators=100, tree_method="hist")
    clf.client = client  # assign the client
    clf.fit(X_train_dask, y_encoded, eval_set=[(X_train_dask, y_encoded)])
    proba = clf.predict_proba(X_test_dask)

    # dict preserves insertion order, and codes were inserted ascending.
    classes = list(label_codes)
    return proba, classes


if __name__ == "__main__":
    with LocalCluster() as cluster:
        with Client(cluster) as client:
            lazy_proba, prop_id_classes = main(client)
            # Materialise while the cluster is still up; presumably the lazy
            # result cannot be evaluated once the client is closed.
            # NOTE(review): the result has one column per distinct prop_id;
            # if that is too large for local memory, reduce it (e.g. argmax
            # or top-k per row) before calling .compute().
            proba = lazy_proba.compute()