In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import sklearn
from tqdm import tqdm




# New Section

In [2]:
from datasets import load_dataset
urldata = load_dataset('csv', data_files={'train': '../data/IEEE-dict2-train.csv', 'validation': '../data/IEEE-dict2-val.csv'}, sep=",", names=["text", "label"])

Found cached dataset csv (C:/Users/ennfl/.cache/huggingface/datasets/csv/default-a7f3109c22d5965f/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
from transformers import AutoModel
import torch
import torch.nn.functional as F
model_ckpt = "distilbert-base-uncased"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModel.from_pretrained(model_ckpt).to(device)

Downloading config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [5]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

In [6]:
urldata_encoded = urldata.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/76626 [00:00<?, ? examples/s]

Map:   0%|          | 0/19158 [00:00<?, ? examples/s]

In [7]:
print(urldata_encoded["train"].column_names)

['text', 'label', 'input_ids', 'attention_mask']


In [8]:
def extract_hidden_states(batch):
    # Place model inputs on the GPU
    inputs = {k:v.to(device) for k,v in batch.items()
              if k in tokenizer.model_input_names}
    # Extract last hidden states
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state
    # Return vector for [CLS] token
    return {"hidden_state": last_hidden_state[:,0].cpu().numpy()}

In [9]:
urldata_encoded.set_format("torch",
                            columns=["input_ids", "attention_mask", "label"], output_all_columns=True)

In [10]:
urldata_hidden = urldata_encoded.map(extract_hidden_states, batched=True, batch_size=1)

Map:   0%|          | 0/76626 [00:00<?, ? examples/s]

Map:   0%|          | 0/19158 [00:00<?, ? examples/s]

In [11]:
import numpy as np

X_train = np.array(urldata_hidden["train"]["hidden_state"])
X_valid = np.array(urldata_hidden["validation"]["hidden_state"])
y_train = np.array(urldata_hidden["train"]["label"])
y_valid = np.array(urldata_hidden["validation"]["label"])
X_train.shape, X_valid.shape

((76626, 768), (19158, 768))

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [13]:
#Random Forest Classifier
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
ypred = rf_model.predict(X_valid)
accuracy_score(y_valid, ypred)

0.9178933082785259

In [14]:
print(confusion_matrix(y_valid,ypred))

[[9151  436    0]
 [1137 8433    0]
 [   0    0    1]]
