In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

# Data preparation: fetch the NLTK "names" corpus and read its two word lists.
import nltk
nltk.download('names')
from nltk.corpus import names

male_names = names.words('male.txt')
female_names = names.words('female.txt')

# Pair every name with the position of the list it came from:
# 0 for entries of male.txt, 1 for entries of female.txt.
# All male entries come first, followed by all female entries.
data = [
    (name, label)
    for label, name_list in enumerate((male_names, female_names))
    for name in name_list
]

# Tokenization and encoding: tokenizer of the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class NamesDataset(Dataset):
    """Map-style dataset that tokenizes ``(name, label)`` pairs on access.

    Args:
        data: Indexable collection of ``(name, label)`` pairs.
        tokenizer: Callable invoked as ``tokenizer(name, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')``;
            its result is indexed by ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length handed to the tokenizer for padding/truncation.
    """

    def __init__(self, data, tokenizer, max_length=32):
        self.data, self.tokenizer, self.max_length = data, tokenizer, max_length

    def __len__(self):
        """Return the number of ``(name, label)`` pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize the pair at ``idx`` and return its tensors in a dict.

        Returns:
            A dict with keys ``'input_ids'``, ``'attention_mask'`` (both
            flattened to 1-D) and ``'labels'`` (a ``torch.long`` scalar).
        """
        name, label = self.data[idx]
        encoded = self.tokenizer(
            name,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # flatten() collapses each field to 1-D (presumably removing the batch
        # axis that return_tensors='pt' adds for a single string).
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(label, dtype=torch.long)
        return sample

# Wrap the labelled names so a DataLoader can batch them.
dataset = NamesDataset(data, tokenizer)

# Model architecture: 'bert-base-uncased' with a 2-label classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Training: shuffled mini-batches of 32 names.
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Use PyTorch's own AdamW: the AdamW class shipped with transformers is
# deprecated and no longer available in recent releases. eps and weight_decay
# are set explicitly to the defaults of the implementation being replaced
# (torch's defaults are eps=1e-8, weight_decay=0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

model.train()
for epoch in range(3):
    total_loss = 0.0
    for batch in train_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Passing labels makes the model compute the loss itself.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    # Report the mean per-batch loss for this epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}")

# Inference
def classify_name(name, model, tokenizer):
    """Classify ``name`` as ``"Male"`` or ``"Female"``.

    Args:
        name: The name to classify.
        model: Classification model; it is called with the tokenizer output
            as keyword arguments and must return an object with ``logits``.
        tokenizer: Callable invoked as ``tokenizer(name, return_tensors='pt')``
            that returns a mapping of tensors.

    Returns:
        ``"Male"`` when class 0 has the highest logit, otherwise ``"Female"``.
    """
    encoding = tokenizer(name, return_tensors='pt')

    # Inputs have to live on the same device as the model's weights.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        encoding = {
            key: value.to(first_param.device) if torch.is_tensor(value) else value
            for key, value in encoding.items()
        }

    # Inference must run in eval mode: a model that is still in training mode
    # keeps dropout active, which makes predictions non-deterministic. The
    # caller's mode is restored afterwards so training can continue unchanged.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model(**encoding)
    finally:
        model.train(was_training)

    # softmax is monotonic, so the argmax of the raw logits is the same class.
    predicted_class = torch.argmax(output.logits, dim=1).item()
    return "Male" if predicted_class == 0 else "Female"

# Example usage: classify one sample English name and print the verdict.
name_to_classify = "John"
predicted_gender = classify_name(name_to_classify, model, tokenizer)
print(f"The name {name_to_classify} is classified as {predicted_gender}")

  torch.utils._pytree._register_pytree_node(
[nltk_data] Downloading package names to
[nltk_data]     C:\Users\flore\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\names.zip.


Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 0.3600789309146414
Epoch 2, Loss: 0.23510636371302318


KeyboardInterrupt: 