In [40]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [41]:
# imports and seperation
data_raw = pd.read_csv("../data/parkinsons.data")
features = data_raw.loc[:, data_raw.columns != "status"].values[:, 1:]
y = data_raw.loc[:, "status"].values

#  scale data
scaler = MinMaxScaler((-1, 1))
X = scaler.fit_transform(features)

In [42]:
data_raw.head()

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [43]:
params = {
    'max_depth':6,
    'min_child_weight': 1,
    'eta': 1,
    'subsample': 1,
    'colsample_bytree': 1,

}

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)
xgb = xgb

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

xgb.train(
    params,
    dtrain,
    num_boost_round=999,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10,
)

cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=999,
    seed=42,
    nfold=5,
    early_stopping_rounds=10
)

[0]	Test-rmse:0.38408
[1]	Test-rmse:0.37783
[2]	Test-rmse:0.37713
[3]	Test-rmse:0.37834
[4]	Test-rmse:0.37848
[5]	Test-rmse:0.37872
[6]	Test-rmse:0.37868
[7]	Test-rmse:0.37863
[8]	Test-rmse:0.37863
[9]	Test-rmse:0.37865
[10]	Test-rmse:0.37865
[11]	Test-rmse:0.37865
[12]	Test-rmse:0.37865


In [46]:
min_mae = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    params['eta'] = eta
    cv_results = xgb.cv(
            params,
            dtrain,
            num_boost_round=999,
            seed=42,
            nfold=2,
            metrics=['mae'],
            early_stopping_rounds=10,
          )
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()

    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = eta
print("Best params: {}, MAE: {}".format(best_params, min_mae))

CV with eta=0.3
	MAE 0.2070055 for 41 rounds

CV with eta=0.2
	MAE 0.207124 for 62 rounds

CV with eta=0.1
	MAE 0.200987 for 120 rounds

CV with eta=0.05
	MAE 0.20072050000000002 for 257 rounds

CV with eta=0.01
	MAE 0.1977915 for 997 rounds

CV with eta=0.005
	MAE 0.1993685 for 998 rounds

Best params: 0.01, MAE: 0.1977915
