In [1]:
import numpy as np 
import pandas as pd 
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.metrics import accuracy_score
import joblib

In [23]:
X_train = pd.read_csv("./X_train.csv")
X_test = pd.read_csv("./X_test.csv")
y_train = pd.read_csv("./y_train.csv")
y_test = pd.read_csv("./y_test.csv")

In [16]:
X_train.head()

Unnamed: 0.1,Unnamed: 0,urlLength,urlDepth,protocol,domain,domainAge,registrar,sslAge,PageRank
0,0,25,-1,1.0,81.0,2529.0,280.0,46.0,13004209.0
1,1,46,0,1.0,167.0,4749.752131,0.0,123.0,10000000.0
2,2,25,-1,1.0,331.0,10212.0,78.0,53.0,27764.0
3,3,22,-1,1.0,78.0,4521.0,421.0,4.0,9270.0
4,4,27,-1,1.0,230.0,2413.0,45.0,95.741924,47590709.0


In [24]:
X_train.drop("Unnamed: 0",axis = 1,inplace=True)
y_test.drop("Unnamed: 0",axis=1,inplace= True)
y_train.drop("Unnamed: 0",axis=1,inplace= True)

In [4]:
%%time
xgb1 = XGBClassifier(objective="binary:logistic")
params = {}
grid = GridSearchCV(estimator=xgb1, param_grid=params, scoring='accuracy', n_jobs=-1, cv=5, verbose=3 )
grid.fit(X_train,y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
CPU times: total: 984 ms
Wall time: 2.81 s


In [5]:
print('\n All results:')
print(grid.cv_results_)
print('\n Best estimator:')
print(grid.best_estimator_)
print('\n Best score:')
print(grid.best_score_ * 2 - 1)
print('\n Best parameters:')
print(grid.best_params_)


 All results:
{'mean_fit_time': array([1.43537946]), 'std_fit_time': array([0.0094241]), 'mean_score_time': array([0.01170921]), 'std_score_time': array([0.00116602]), 'params': [{}], 'split0_test_score': array([0.99883477]), 'split1_test_score': array([0.99930086]), 'split2_test_score': array([0.99860172]), 'split3_test_score': array([0.9988345]), 'split4_test_score': array([0.9986014]), 'mean_test_score': array([0.99883465]), 'std_test_score': array([0.00025535]), 'rank_test_score': array([1])}

 Best estimator:
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=N

In [6]:
#Defining the hyperparameter search space 
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 180,
        'seed': 0
    }


In [7]:
def objective(space):
    clf=XGBClassifier(
                    n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']),
                    colsample_bytree=int(space['colsample_bytree']), eval_metric="auc",early_stopping_rounds=10,verbose=0)
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation)
    

    pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, pred>0.5)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }

In [None]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

In [9]:
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

The best hyperparameters are :  

{'colsample_bytree': 0.5301738947353489, 'gamma': 8.898569179543443, 'max_depth': 16.0, 'min_child_weight': 10.0, 'reg_alpha': 41.0, 'reg_lambda': 0.09809576801400245}


In [25]:
len(X_train),len(y_train)

(17339, 17339)

In [26]:
best_hyperparams['max_depth'] = int(best_hyperparams['max_depth'])
best_hyperparams['min_child_weight'] = int(best_hyperparams['min_child_weight'])
best_hyperparams['reg_alpha'] = int(best_hyperparams['reg_alpha'])
xgb = XGBClassifier(objective="binary:logistic",**best_hyperparams)
xgb.fit(X_train,y_train)

In [29]:
X_test.drop("Unnamed: 0",axis = 1,inplace=True)


In [30]:
accuracy = xgb.score(X_test,y_test)
print(f"Model accuracy with best hyperparameters: {accuracy}")

Model accuracy with best hyperparameters: 0.9941275167785235


In [31]:
joblib.dump(xgb,'XGBModel.pkl')

['XGBModel.pkl']

In [32]:
X_train.head()

Unnamed: 0,urlLength,urlDepth,protocol,domain,domainAge,registrar,sslAge,PageRank
0,44,-1,1.0,100.0,4785.979623,0.0,17.0,10000000.0
1,20,-1,1.0,107.0,13802.0,0.0,11.0,69012.0
2,49,0,1.0,164.0,4785.979623,0.0,123.0,10000000.0
3,107,2,1.0,100.0,4785.979623,0.0,47.0,10000000.0
4,25,-1,1.0,84.0,7398.0,268.0,7.0,1012987.0
