***Optimization of Wafer dataset with XGBoost***

In [None]:
!pip install optuna
!pip install xgboost

In [284]:
import numpy as np
import pandas as pd 
import optuna
import xgboost as xgb 

In [285]:
from sklearn.model_selection import train_test_split

In [286]:
df = pd.read_csv('wafer_16012020_051629.csv')

In [287]:
x= df.drop(columns=['Unnamed: 0' , 'Good/Bad'])

In [289]:
for (columnName, columnData) in x.iteritems():
    x[columnName]= x[columnName].fillna(x[columnName].mean())

In [None]:
y = df['Good/Bad']
y

In [291]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [292]:
sta_sca = StandardScaler()
x = sta_sca.fit_transform(x)

In [340]:
def objective_classification(trial) :
    train_x , test_x , train_y , test_y= train_test_split(x , y , test_size = .20 )
    
    param  = {
        #'tree_method' :'approx',
        
        #'sketch_eps' :trial.suggest_float('sketch_eps',0.01,1.0), ## Only used for 'approx' tree method, consider setting low number for accuracy 
        
        'verbosity':3, ##print more info on tree buliding process
        
        'validate_parameters':'True', ## performs validation to check whether parameter exists or not.
        
        #'nthread' : 10, ## No. of parallel threads to run Xgboost
        
        'max_depth' :trial.suggest_int('max_depth',1,20), ## Higher the depth of value more complex and ovefitting could occur
        
        'booster':trial.suggest_categorical('booster',['dart','gbtree','gblinear']), ## Dart and gbree for tree based booster method. gblinear used for linear function.
        
        'base_score' :trial.suggest_float('base_score' , .1 , .9), ## initial prediction score of all instances
        
        #'eval_metric' :'logloss', ## Added as objective used as evaluation  metrics for validation data, multiple metrics can be passed in a list for evaluation.
        
        'seed' :5, ## Random number seed
        
        'objective' : 'binary:logistic' , ## Used to pass learning objective to be applied on model to get desired result
        
        'lambda' : trial.suggest_float('lambda' , 1e-4 , 1), ## Model will become more conservative with increase in value. L2 regularization term on weights
        
        'alpha' :trial.suggest_float('alpha' , 1e-4 , 1), ## Model will become more conservative with increase in value. L1 regularization term on weights
        
        'subsample' : trial.suggest_float('subsample' , .1,.5), ## Occurs once in every bossting itration. Tuning value to 0.5 means that XGBoost would randomly sample half of the training data
        
        'colsample_bytree' : trial.suggest_float('colsample_bytree' , .5 ,1.0), ## Subsample ratio of columns when constructing each tree
        
        'min_child_weight' :trial.suggest_int('min_child_weight',1,10), ## Minimum sum of weights of all observations required in a child.Too high values can lead to under-fitting hence, it should be tuned using CV
        
        'max_delta_step' :trial.suggest_int('max_delta_step',1,10), ## Used when in logistic regression when class is extremely imbalanced. Tuning value between 1-10 help control the update.
        
        'sampling_method':trial.suggest_categorical('sampling_method',['uniform','gradient_based']), ## used to sample the training instances. set subsample >= 0.5 for good results.
        
        #'updater':'grow_colmaker', ## An advanced parameter that is usually set automatically, depending on some other parameters, used to run sequence of tree updaters 
        
        'refresh_leaf':'True', ## When this flag is 1, tree leafs as well as tree nodes’ stats are updated. When it is 0, only node stats are updated.
        
        'grow_policy':trial.suggest_categorical('grow_policy',['depthwise','lossguide']), ## Supported only if tree_method is set to hist or gpu_hist. Controls a way new nodes are added to the tree.
        
        'max_leaves' :trial.suggest_int('max_leaves',1,10), ## Maximum number of nodes to be added. Only relevant when grow_policy=lossguide is set.
        
        'num_parallel_tree' :trial.suggest_int('num_parallel_tree',1,10) ## Support boosted random forest. Defines number of parallel trees during each iterations.
        
    }
   
    if param['booster'] in ['dart','gbtree']:
      
      param['gamma'] : trail.suggest_float('gamma',1e-3,4) ## Gamma specifies the minimum loss reduction required to make a split. The values can vary depending on the loss function and should be tuned.
      
      param['eta'] : trail.suggest_float('eta',.01,0.2) ##Makes the model more robust by shrinking the weights on each step. Typical final values to be used: 0.01-0.2
      

    if param['booster'] == 'dart':
      param['sample_type']:trail.suggest_categorical('sample_type',['uniform','weighted']) ## uniform: dropped trees are selected uniformly.weighted: dropped trees are selected in proportion to weight.

      param['normalize_type']:trail.suggest_categorical('normalize_type',['tree','forest']) ## tree: new trees have the same weight of each of dropped trees. forest: new trees have the same weight of sum of dropped trees (forest).

      param['rate_drop'] :trail.suggest_float('rate_drop',0.0,1.0) ## Dropout rate and has range from 0.0 to 1.0.

      param['one_drop'] :trail.suggest_int('one_drop',0) ## Least one tree is always dropped during the dropout if this enabled

      param['skip_drop'] :trail.suggest_float('skip_drop',0.0,1.0) ## Probability of skipping the dropout procedure and has range from 0.0 to 1.0.

    if param['booster'] == 'gblinear':
      param['updater']:trail.suggest_categorical('updater',['shotgun','coord_descent']) ## Defines various algo to fit linear based model.
      param['feature_selector']:trail.suggest_categorical('feature_selector',['cyclic','shuffle','random','greedy','thrifty']) ## Defines various feature selection and ordering method.
        
    xgb_classification = xgb.XGBClassifier(**param)
    xgb_classification.fit(train_x , train_y,eval_set  = [(test_x, test_y)],eval_metric='logloss')
 
    pred = xgb_classification.predict(test_x)
    accuracy  = xgb_classification.score(test_x,test_y) 
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    #return accuracy
        
        
    

In [341]:
find_param = optuna.create_study(direction='minimize')

[32m[I 2021-11-19 18:18:47,250][0m A new study created in memory with name: no-name-fb08f9b2-7fff-4c6a-a886-0f17c67959ef[0m


In [342]:
find_param.optimize(objective_classification  , n_trials=10)

[18:18:49] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 0 extra nodes, 0 pruned nodes, max_depth=0
[18:18:49] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 0 extra nodes, 0 pruned nodes, max_depth=0
[18:18:49] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 0 extra nodes, 0 pruned nodes, max_depth=0
[18:18:49] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 0 extra nodes, 0 pruned nodes, max_depth=0
[18:18:49] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 0 extra nodes, 0 pruned nodes, max_depth=0
[18:18:49] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 0 extra nodes, 0 pruned nodes, max_depth=0
[18:18:49] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 0 extra nodes, 0 pruned nodes, max_depth=0
[0]	validation_0-logloss:0.901224
[18:18:49] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning

[33m[W 2021-11-19 18:18:50,684][0m Trial 0 failed, because the value None could not be cast to float.[0m


Accuracy: 60.00%
[18:18:50] EvalOneIter: 0.104233s, 100 calls @ 1042us
[18:18:50] GetGradient: 0.00196864s, 100 calls @ 19us
[18:18:50] PredictRaw: 0.000426215s, 100 calls @ 4us
[18:18:50] UpdateOneIter: 0.932842s, 100 calls @ 9328us
[18:18:50] BoostNewTrees: 0.893569s, 100 calls @ 8935us
[18:18:50] CommitModel: 0.0361474s, 100 calls @ 361us
[0]	validation_0-logloss:0.741912
[1]	validation_0-logloss:0.71238
[2]	validation_0-logloss:0.690215
[3]	validation_0-logloss:0.671432
[4]	validation_0-logloss:0.654461
[5]	validation_0-logloss:0.638652
[6]	validation_0-logloss:0.623742
[7]	validation_0-logloss:0.609634
[8]	validation_0-logloss:0.596299
[9]	validation_0-logloss:0.583731
[10]	validation_0-logloss:0.571928
[11]	validation_0-logloss:0.560883
[12]	validation_0-logloss:0.550581
[13]	validation_0-logloss:0.541001
[14]	validation_0-logloss:0.532116
[15]	validation_0-logloss:0.523893
[16]	validation_0-logloss:0.516296
[17]	validation_0-logloss:0.509287
[18]	validation_0-logloss:0.50283
[19

[33m[W 2021-11-19 18:18:51,184][0m Trial 1 failed, because the value None could not be cast to float.[0m


Accuracy: 75.00%
[18:18:51] EvalOneIter: 0.108554s, 100 calls @ 1085us
[18:18:51] GetGradient: 0.00203126s, 100 calls @ 20us
[18:18:51] PredictRaw: 0.000751848s, 100 calls @ 7us
[18:18:51] UpdateOneIter: 0.125049s, 100 calls @ 1250us
[18:18:51] DoBoost: 0.121912s, 100 calls @ 1219us
[18:18:51] PredictBatch: 0.00100044s, 202 calls @ 4us
[18:18:51] PredictBatchInternal: 0.0662028s, 203 calls @ 326us
[18:18:51] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 0 extra nodes, 0 pruned nodes, max_depth=0
[18:18:51] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 0 extra nodes, 0 pruned nodes, max_depth=0
[0]	validation_0-logloss:0.713058
[18:18:51] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 0 extra nodes, 0 pruned nodes, max_depth=0
[18:18:51] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 0 extra nodes, 0 pruned nodes, max_depth=0
[1]	validation_0-logloss:0.721231
[18:18:51] INFO: /w

[33m[W 2021-11-19 18:18:51,992][0m Trial 2 failed, because the value None could not be cast to float.[0m


Accuracy: 50.00%
[18:18:51] EvalOneIter: 0.154885s, 100 calls @ 1548us
[18:18:51] GetGradient: 0.00206232s, 100 calls @ 20us
[18:18:51] PredictRaw: 0.000521336s, 100 calls @ 5us
[18:18:51] UpdateOneIter: 0.378444s, 100 calls @ 3784us
[18:18:51] BoostNewTrees: 0.331566s, 100 calls @ 3315us
[18:18:51] CommitModel: 0.0435272s, 100 calls @ 435us
[18:18:52] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[18:18:52] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 0 extra nodes, 0 pruned nodes, max_depth=0
[18:18:52] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 0 extra nodes, 0 pruned nodes, max_depth=0
[18:18:52] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 2 extra nodes, 0 pruned nodes, max_depth=1
[0]	validation_0-logloss:0.614371
[18:18:52] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 0 extra nodes, 0 pruned n

[33m[W 2021-11-19 18:18:53,113][0m Trial 3 failed, because the value None could not be cast to float.[0m


[18:18:53] EvalOneIter: 0.143332s, 100 calls @ 1433us
[18:18:53] GetGradient: 0.00269681s, 100 calls @ 26us
[18:18:53] PredictRaw: 0.00354559s, 100 calls @ 35us
[18:18:53] UpdateOneIter: 0.691305s, 100 calls @ 6913us
[18:18:53] BoostNewTrees: 0.637149s, 100 calls @ 6371us
[18:18:53] CommitModel: 0.0471508s, 100 calls @ 471us
[18:18:53] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 0 extra nodes, 0 pruned nodes, max_depth=0
[18:18:53] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 0 extra nodes, 0 pruned nodes, max_depth=0
[18:18:53] INFO: /workspace/src/gbm/gbtree.cc:479: drop 0 trees, weight = 1
[0]	validation_0-logloss:0.68553
[18:18:53] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 0 extra nodes, 0 pruned nodes, max_depth=0
[18:18:53] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 0 extra nodes, 0 pruned nodes, max_depth=0
[18:18:53] INFO: /workspace/src/gbm/gbtree.cc:479: d

[33m[W 2021-11-19 18:18:53,947][0m Trial 4 failed, because the value None could not be cast to float.[0m


[18:18:53] EvalOneIter: 0.181701s, 100 calls @ 1817us
[18:18:53] GetGradient: 0.00170595s, 100 calls @ 17us
[18:18:53] PredictRaw: 0.0641459s, 100 calls @ 641us
[18:18:53] UpdateOneIter: 0.313822s, 100 calls @ 3138us
[18:18:53] BoostNewTrees: 0.231788s, 100 calls @ 2317us
[18:18:53] CommitModel: 0.0156125s, 100 calls @ 156us
[0]	validation_0-logloss:0.533175
[1]	validation_0-logloss:0.523504
[2]	validation_0-logloss:0.521145
[3]	validation_0-logloss:0.521446
[4]	validation_0-logloss:0.522643
[5]	validation_0-logloss:0.524048
[6]	validation_0-logloss:0.525397
[7]	validation_0-logloss:0.526594
[8]	validation_0-logloss:0.527615
[9]	validation_0-logloss:0.528465
[10]	validation_0-logloss:0.529158
[11]	validation_0-logloss:0.529714
[12]	validation_0-logloss:0.530152
[13]	validation_0-logloss:0.530488
[14]	validation_0-logloss:0.53074
[15]	validation_0-logloss:0.530919
[16]	validation_0-logloss:0.531039
[17]	validation_0-logloss:0.531108
[18]	validation_0-logloss:0.531135
[19]	validation_0-l

[33m[W 2021-11-19 18:18:54,567][0m Trial 5 failed, because the value None could not be cast to float.[0m


Accuracy: 75.00%
[18:18:54] EvalOneIter: 0.134129s, 100 calls @ 1341us
[18:18:54] GetGradient: 0.00315604s, 100 calls @ 31us
[18:18:54] PredictRaw: 0.000575979s, 100 calls @ 5us
[18:18:54] UpdateOneIter: 0.141808s, 100 calls @ 1418us
[18:18:54] DoBoost: 0.133203s, 100 calls @ 1332us
[18:18:54] PredictBatch: 0.000809689s, 202 calls @ 4us
[18:18:54] PredictBatchInternal: 0.0822224s, 203 calls @ 405us
[0]	validation_0-logloss:0.536253
[1]	validation_0-logloss:0.52604
[2]	validation_0-logloss:0.522504
[3]	validation_0-logloss:0.521709
[4]	validation_0-logloss:0.521997
[5]	validation_0-logloss:0.522648
[6]	validation_0-logloss:0.52335
[7]	validation_0-logloss:0.523973
[8]	validation_0-logloss:0.524471
[9]	validation_0-logloss:0.524837
[10]	validation_0-logloss:0.525081
[11]	validation_0-logloss:0.525222
[12]	validation_0-logloss:0.525277
[13]	validation_0-logloss:0.525265
[14]	validation_0-logloss:0.525201
[15]	validation_0-logloss:0.525098
[16]	validation_0-logloss:0.524967
[17]	validation

[33m[W 2021-11-19 18:18:55,181][0m Trial 6 failed, because the value None could not be cast to float.[0m


[0]	validation_0-logloss:0.583543
[1]	validation_0-logloss:0.531086
[2]	validation_0-logloss:0.496046
[3]	validation_0-logloss:0.471463
[4]	validation_0-logloss:0.453553
[5]	validation_0-logloss:0.440114
[6]	validation_0-logloss:0.429795
[7]	validation_0-logloss:0.421723
[8]	validation_0-logloss:0.415315
[9]	validation_0-logloss:0.410167
[10]	validation_0-logloss:0.405991
[11]	validation_0-logloss:0.402576
[12]	validation_0-logloss:0.399766
[13]	validation_0-logloss:0.397441
[14]	validation_0-logloss:0.39551
[15]	validation_0-logloss:0.393901
[16]	validation_0-logloss:0.392557
[17]	validation_0-logloss:0.391432
[18]	validation_0-logloss:0.390489
[19]	validation_0-logloss:0.389698
[20]	validation_0-logloss:0.389035
[21]	validation_0-logloss:0.388479
[22]	validation_0-logloss:0.388013
[23]	validation_0-logloss:0.387623
[24]	validation_0-logloss:0.387298
[25]	validation_0-logloss:0.387027
[26]	validation_0-logloss:0.386802
[27]	validation_0-logloss:0.386616
[28]	validation_0-logloss:0.386

[33m[W 2021-11-19 18:18:55,729][0m Trial 7 failed, because the value None could not be cast to float.[0m



[18:18:55] GetGradient: 0.00206549s, 100 calls @ 20us
[18:18:55] PredictRaw: 0.000715824s, 100 calls @ 7us
[18:18:55] UpdateOneIter: 0.141687s, 100 calls @ 1416us
[18:18:55] DoBoost: 0.138396s, 100 calls @ 1383us
[18:18:55] PredictBatch: 0.000962491s, 202 calls @ 4us
[18:18:55] PredictBatchInternal: 0.0743536s, 203 calls @ 366us
[0]	validation_0-logloss:0.779649
[1]	validation_0-logloss:0.768514
[2]	validation_0-logloss:0.757742
[3]	validation_0-logloss:0.746112
[4]	validation_0-logloss:0.733665
[5]	validation_0-logloss:0.720738
[6]	validation_0-logloss:0.707688
[7]	validation_0-logloss:0.694813
[8]	validation_0-logloss:0.682337
[9]	validation_0-logloss:0.670414
[10]	validation_0-logloss:0.659145
[11]	validation_0-logloss:0.648582
[12]	validation_0-logloss:0.638747
[13]	validation_0-logloss:0.629636
[14]	validation_0-logloss:0.621232
[15]	validation_0-logloss:0.613503
[16]	validation_0-logloss:0.606411
[17]	validation_0-logloss:0.599917
[18]	validation_0-logloss:0.593977
[19]	validati

[33m[W 2021-11-19 18:18:56,387][0m Trial 8 failed, because the value None could not be cast to float.[0m


Accuracy: 70.00%
[18:18:56] EvalOneIter: 0.124352s, 100 calls @ 1243us
[18:18:56] GetGradient: 0.00223449s, 100 calls @ 22us
[18:18:56] PredictRaw: 0.000796002s, 100 calls @ 7us
[18:18:56] UpdateOneIter: 0.145549s, 100 calls @ 1455us
[18:18:56] DoBoost: 0.142092s, 100 calls @ 1420us
[18:18:56] PredictBatch: 0.001023s, 202 calls @ 5us
[18:18:56] PredictBatchInternal: 0.0842223s, 203 calls @ 414us
[18:18:56] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 0 extra nodes, 0 pruned nodes, max_depth=0
[18:18:56] INFO: /workspace/src/gbm/gbtree.cc:479: drop 0 trees, weight = 1
[0]	validation_0-logloss:1.14063
[18:18:56] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 0 extra nodes, 0 pruned nodes, max_depth=0
[18:18:56] INFO: /workspace/src/gbm/gbtree.cc:479: drop 0 trees, weight = 1
[1]	validation_0-logloss:1.01582
[18:18:56] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 0 extra nodes, 0 pruned nodes, max_depth=0

[33m[W 2021-11-19 18:18:57,204][0m Trial 9 failed, because the value None could not be cast to float.[0m


Accuracy: 65.00%
[18:18:57] EvalOneIter: 0.212643s, 100 calls @ 2126us
[18:18:57] GetGradient: 0.00192358s, 100 calls @ 19us
[18:18:57] PredictRaw: 0.0564067s, 100 calls @ 564us
[18:18:57] UpdateOneIter: 0.233054s, 100 calls @ 2330us
[18:18:57] BoostNewTrees: 0.149934s, 100 calls @ 1499us
[18:18:57] CommitModel: 0.0242123s, 100 calls @ 242us


In [332]:
best_fit=find_param.best_trial.params
best_fit

{'alpha': 0.7371941547839909,
 'base_score': 0.6841291413838743,
 'booster': 'gbtree',
 'colsample_bytree': 0.847083150673991,
 'grow_policy': 'lossguide',
 'lambda': 0.3935745521844469,
 'max_delta_step': 5,
 'max_depth': 6,
 'max_leaves': 7,
 'min_child_weight': 5,
 'num_parallel_tree': 4,
 'sampling_method': 'uniform',
 'subsample': 0.2836783565872184}