In [None]:
from google.colab import drive
drive.mount('/content/drive')
data_dir = '/content/drive/MyDrive/Project/us-patent-phrase-to-phrase-matching'

Mounted at /content/drive


In [None]:
!pip install optuna
!pip install scikit-optimize

In [None]:
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import optuna 
from optuna.samplers import TPESampler
from sklearn.model_selection import cross_val_score
import time
from sklearn.metrics import classification_report,precision_score, recall_score, f1_score, accuracy_score, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
#import cudf
#import cuml
#from cuml import RandomForestClassifier

In [None]:
train_data = pd.read_csv(data_dir + "/train_data.csv")
val_data = pd.read_csv(data_dir + "/val_data.csv")
test_data = pd.read_csv(data_dir + "/test_data.csv")

In [None]:
y_train = train_data["score"]
y_train = (y_train * 100).astype('int64')
X_train = train_data.drop(["score"], axis =1)

In [None]:
y_val = val_data["score"]
y_val = (y_val * 100).astype('int64')
X_val = val_data.drop(["score"], axis =1)

In [None]:
y_train.value_counts()

50     8912
25     8324
0      5342
75     2912
100     861
Name: score, dtype: int64

In [None]:
y_test = test_data['score']
y_test = (y_test * 100).astype('int64')
X_test = test_data.drop(['score'], axis =1)

In [None]:
def objective(trial):

    """return the f1-score"""

    # search space
    n_estimators =  trial.suggest_int('n_estimators', low=100, high=500, step=100)
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
    min_samples_split = trial.suggest_int('min_samples_split', low=2, high=4, step=1)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', low=1, high=5, step=1)
    max_depth = trial.suggest_int('max_depth', low=5, high=100, step=1)
    max_features = trial.suggest_categorical('max_features', ['auto', 'sqrt','log2'])

    # random forest classifier object
    rfc = RandomForestClassifier(n_estimators=n_estimators, 
                                 criterion=criterion,
                                 min_samples_split=min_samples_split,
                                 min_samples_leaf=min_samples_leaf,
                                 max_depth=max_depth,
                                 max_features=max_features,
                                 random_state=42)
    score =  cross_val_score(estimator=rfc, 
                             X=X_train, 
                             y=y_train, 
                             scoring='f1_micro',
                             cv=5,
                             n_jobs=-1).mean()
    return score

study = optuna.create_study(sampler=TPESampler(), direction='maximize')

# perform hyperparamter tuning (while timing the process)
time_start = time.time()
study.optimize(objective, n_trials=10)
time_bayesian = time.time() - time_start

# store result in a data frame 
values_bayesian = [100, study.best_trial.number, study.best_trial.value, time_bayesian]
columns = ['Number of iterations', 'Iteration Number of Optimal Hyperparamters', 'Score', 'Time Elapsed (s)']
results_bayesian = pd.DataFrame([values_bayesian], columns = columns)

[32m[I 2023-03-21 00:54:08,022][0m A new study created in memory with name: no-name-8a756134-77ea-4715-b4a8-88d0bbab4e40[0m
[32m[I 2023-03-21 01:01:49,823][0m Trial 0 finished with value: 0.5353116349997138 and parameters: {'n_estimators': 300, 'criterion': 'entropy', 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': 93, 'max_features': 'log2'}. Best is trial 0 with value: 0.5353116349997138.[0m
[32m[I 2023-03-21 01:22:33,643][0m Trial 1 finished with value: 0.5399036653602451 and parameters: {'n_estimators': 400, 'criterion': 'gini', 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_depth': 71, 'max_features': 'auto'}. Best is trial 1 with value: 0.5399036653602451.[0m
[32m[I 2023-03-21 01:27:28,077][0m Trial 2 finished with value: 0.5263933513258794 and parameters: {'n_estimators': 200, 'criterion': 'entropy', 'min_samples_split': 3, 'min_samples_leaf': 3, 'max_depth': 85, 'max_features': 'log2'}. Best is trial 1 with value: 0.5399036653602451.[0m
[32m[I 2023

KeyboardInterrupt: ignored

In [None]:
#Best Parameters
{'n_estimators': 200, 'criterion': 'gini', 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_depth': 52, 'max_features': 'auto'}

{'n_estimators': 200,
 'criterion': 'gini',
 'min_samples_split': 3,
 'min_samples_leaf': 2,
 'max_depth': 52,
 'max_features': 'auto'}

In [None]:
rf = RandomForestClassifier(n_estimators = 1000, 
                                criterion = 'gini',
                                min_samples_split = 3,
                                min_samples_leaf = 2,
                                max_depth = 52,
                                max_features = 'auto')
rf.fit(X_train,y_train)

  warn(


In [None]:
val_preds = rf.predict(X_val)

In [None]:
print("val_Precision:{}".format(precision_score(val_preds,y_val,pos_label='positive',average='micro')))
print("val_Recall:{}".format(recall_score(val_preds,y_val,pos_label='positive',average='micro')))
print("val_F1 Score:{}".format((f1_score(val_preds,y_val,pos_label='positive',average='micro'))))

val_Precision:0.07718770156955493
val_Recall:0.07718770156955493
val_F1 Score:0.07718770156955493




In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(val_preds,y_val)

0.07718770156955493

In [None]:
rf.classes_

array([  0,  25,  50,  75, 100])

In [None]:
from sklearn.metrics import multilabel_confusion_matrix
multilabel_confusion_matrix(y_val, val_preds, labels=["0", "25", "50", "75", "100"])

  mask &= (ar1 != a)


array([[[3564,   82],
        [ 646,  359]],

       [[2799, 1852],
        [   0,    0]],

       [[2454, 2197],
        [   0,    0]],

       [[4532,  119],
        [   0,    0]],

       [[4609,   42],
        [   0,    0]]])

In [None]:
test_preds = rf.predict(X_test)

In [None]:
print("test_Precision:{}".format(precision_score(test_preds,y_test,pos_label='positive',average='micro')))
print("test_Recall:{}".format(recall_score(test_preds,y_test,pos_label='positive',average='micro')))
print("test_F1 Score:{}".format((f1_score(test_preds,y_test,pos_label='positive',average='micro'))))

test_Precision:0.581612136720892
test_Recall:0.581612136720892
test_F1 Score:0.581612136720892




In [None]:
accuracy_score(test_preds,y_test)

0.581612136720892

In [None]:
rf.score(X_test, y_test)

0.581612136720892

In [None]:
test_data["score"].value_counts()

0.50    1829
0.25    1742
0.00    1124
0.75     619
1.00     157
Name: score, dtype: int64

In [None]:
multilabel_confusion_matrix(y_test, test_preds, labels=["0", "25", "50", "75", "100"])

  mask &= (ar1 != a)


array([[[4245,  102],
        [ 702,  422]],

       [[2774,  955],
        [ 504, 1238]],

       [[2464, 1178],
        [ 463, 1366]],

       [[4808,   44],
        [ 500,  119]],

       [[5304,   10],
        [ 120,   37]]])

**XGBoost Classifier**

In [None]:
from skopt import BayesSearchCV
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb

In [None]:
y_train = y_train.astype("category")

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)

In [None]:
# Classifier
bayes_cv_tuner = BayesSearchCV(
    estimator = xgb.XGBClassifier(
        n_jobs = 1,
        objective = 'multi:softmax',
        num_class = 5, 
        eval_metric = 'auc',
        silent=1,
        tree_method='gpu_hist'
    ),
    search_spaces = {
        'learning_rate': (0.01, 1.0, 'log-uniform'),
        'min_child_weight': (0, 10),
        'max_depth': (0, 50),
        'subsample': (0.01, 1.0, 'uniform'),
        'colsample_bytree': (0.01, 1.0, 'uniform'),
        'colsample_bylevel': (0.01, 1.0, 'uniform'),
        'n_estimators': (50, 500),
    },    
    cv = StratifiedKFold(
        n_splits=3,
        shuffle=True,
        random_state=42
    ),
    n_jobs = -1,
    n_iter = 5,   
    verbose = 0,
    refit = True,
    random_state = 42
)



def status_print(optim_result):
    """Status callback durring bayesian hyperparameter search"""
    
    # Get all the models tested so far in DataFrame format
    all_models = pd.DataFrame(bayes_cv_tuner.cv_results_)    
    
    # Get current parameters and the best parameters    
    best_params = pd.Series(bayes_cv_tuner.best_params_)
    print('Model #{}\nBest ROC-AUC: {}\nBest params: {}\n'.format(
        len(all_models),
        np.round(bayes_cv_tuner.best_score_, 4),
        bayes_cv_tuner.best_params_
    ))


In [None]:
result = bayes_cv_tuner.fit(X_train, y_train, callback=status_print)

AttributeError: ignored

In [None]:
# dict = {0:'A', 0.25:'B', 0.50:'C', 0.75:'D', 1:'E'}
# y_train['score'] = y_train.replace({"score":dict})
# y_val['score'] = y_val.replace({"score":dict})
model = XGBClassifier(random_state=42)
model.fit(X_train, y_train)
#y_train_hat = model.predict(X_train)
y_val_hat = model.predict(X_val)

In [None]:
print(model)
print('Train performance')
#print(classification_report(y_train, y_train_hat))

print('Validation performance')
print('-------------------------------------------------------')
print(classification_report(y_val, y_val_hat))

print('Roc_auc score')
print('-------------------------------------------------------')
print(roc_auc_score(y_val, y_val_hat))
print('')

print('Confusion matrix')
print('-------------------------------------------------------')
print(confusion_matrix(y_val, y_val_hat))

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              objective='multi:softprob', predictor=None, ...)
Train performance
Validation performance
-------------------------------------------------------
              precision    recall  f1-score   support

           0       0.67      0.50      0.57      1005
           1       0.00      0.00

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


ValueError: ignored