In [1]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import get_scorer, make_scorer, roc_auc_score
from sklearn.model_selection import GridSearchCV

from p2_estimator import p2_estimator

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset, TensorDataset, random_split
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from transformers import BertForSequenceClassification

In [3]:
# Load the training inputs and their labels from the data_reviews folder
x_train_df, y_train_df = (
    pd.read_csv(os.path.join('data_reviews', csv_name))
    for csv_name in ('x_train.csv', 'y_train.csv')
)

# Row-wise Python lists of the raw table contents
tr_text_list = x_train_df.values.tolist()
tr_y_list = y_train_df.values.tolist()

# Join the per-row label lists end to end into one flat 1-D array
tr_y = np.concatenate(np.array(tr_y_list), axis=0)

# Lower-cased contents of the second column of every x_train row
# (presumably the review text -- confirm against the CSV header)
reviews_list = [row[1].lower() for row in tr_text_list]

In [4]:
# Load the 'bert-base-uncased' checkpoint with a two-label classification
# head, then print the configuration object attached to the loaded model.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)
print(model.config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.38.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [5]:
# Scratch notes on settings that could be varied:
#   hidden_size = 64/128/32/...
#   num_heads   = 64/128/32/...
#
# Commented-out wider grid (kept for reference, not used):
#   hidden_size       : [12, 36, 72, 144]
#   num_hidden_layers : [1, 3, 5, 7]

# Grid actually searched: 3 hidden sizes x 2 depths = 6 candidate settings.
param_grid = dict(
    hidden_size=[72, 144, 288],
    num_hidden_layers=[5, 7],
)

In [6]:
# Estimator whose hyper-parameters are swept; GridSearchCV sets the keys of
# `param_grid` on clones of this object.
bert = p2_estimator()

# Use scikit-learn's built-in AUROC scorer rather than
# make_scorer(roc_auc_score, needs_proba=True): the `needs_proba` flag was
# deprecated in scikit-learn 1.4 and removed in 1.6, so the old call raises on
# current releases. `get_scorer` keeps `auroc_scorer` a callable scorer object.
# NOTE(review): the built-in "roc_auc" scorer uses `decision_function` when the
# estimator defines one and otherwise falls back to `predict_proba` -- confirm
# that p2_estimator exposes `predict_proba` (the original scorer required it).
auroc_scorer = get_scorer("roc_auc")

# 5-fold cross-validated search; the best setting is refit on all of the data
# and training-fold scores are recorded alongside the validation scores.
grid_search = GridSearchCV(
    bert,
    param_grid,
    scoring=auroc_scorer,
    cv=5,
    refit=True,
    return_train_score=True,
)

grid_search.fit(reviews_list, tr_y)



In [7]:
# Report the winning hyper-parameter combination and its mean
# cross-validated AUROC from the finished search.
best_params, best_auroc = grid_search.best_params_, grid_search.best_score_
print("Best parameters:", best_params)
print("Best AUROC score:", best_auroc)

Best parameters: {'hidden_size': 72, 'num_hidden_layers': 5}
Best AUROC score: 0.5903264322916667


In [None]:
# Keep a handle on the estimator that GridSearchCV refit with the best parameters
best_model = grid_search.best_estimator_

# Serialise that estimator to bert2.pkl in the current working directory
with open('bert2.pkl', 'wb') as pickle_file:
    pickle.dump(best_model, pickle_file)