In [1]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
import numpy as np
import pandas as pd

In [2]:
file_path = '/content/skills_and_name (2).csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure
data.head()

Unnamed: 0,Hadeel Almaylam,name
0,Lama Alshgrood,skils
1,Ability to manage multiple assignments and tas...,skils
2,Excellent written and verbal communication.,skils
3,Prowess in leading team & coordinating with th...,skils
4,Superlative negotiation skills,skils


In [3]:
data.columns = ["text", "label"]

# Display the updated dataset structure
data.head()

Unnamed: 0,text,label
0,Lama Alshgrood,skils
1,Ability to manage multiple assignments and tas...,skils
2,Excellent written and verbal communication.,skils
3,Prowess in leading team & coordinating with th...,skils
4,Superlative negotiation skills,skils


In [5]:
data.drop_duplicates(inplace=True)

# Model

In [6]:
# Step 1: Preprocessing
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
# Encode labels
label_encoder = LabelEncoder()
data['label_encoded'] = label_encoder.fit_transform(data['label'])
labels = to_categorical(data['label_encoded'])


In [8]:
# Tokenize inputs
texts = data['text'].tolist()
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="tf", max_length=128)

In [11]:
# Verify lengths of inputs and labels
if len(texts) != labels.shape[0]:
    raise ValueError(f"Mismatch in lengths: {len(texts)} texts vs {labels.shape[0]} labels")


In [12]:
# Train-test split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)


In [13]:
# Convert texts to tokenized inputs
train_inputs = tokenizer(train_texts, padding=True, truncation=True, return_tensors="tf", max_length=128)
test_inputs = tokenizer(test_texts, padding=True, truncation=True, return_tensors="tf", max_length=128)


In [14]:
# Step 2: Model Definition
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=labels.shape[1])


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Compile model
# Use the optimizer class name as a string:
model.compile(
    optimizer='adam',  # or tf.keras.optimizers.Adam.__name__
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"]
)

In [17]:
# Step 3: Training
history = model.fit(
    {key: tf.convert_to_tensor(train_inputs[key]) for key in train_inputs.keys()},
    tf.convert_to_tensor(train_labels),
    validation_data=({key: tf.convert_to_tensor(test_inputs[key]) for key in test_inputs.keys()}, tf.convert_to_tensor(test_labels)),
    epochs=3,
    batch_size=16
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [18]:
# Step 4: Save Model
model.save_pretrained("CV's-BERT")

In [19]:
# Step 5: Prediction Method
def classify_text(text):
    tokens = tokenizer(text, return_tensors="tf", padding=True, truncation=True, max_length=128)
    predictions = model(tokens)[0]
    predicted_label = tf.argmax(predictions, axis=1).numpy()
    return label_encoder.inverse_transform(predicted_label)[0]


In [26]:
# Example Usage
example_text = "Project Management	"
print("Predicted Label:", classify_text(example_text))

Predicted Label: name
