# Import Libraries

In [7]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments , Trainer
from sklearn.model_selection import train_test_split

# Split Data to Train & Validation

In [55]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
data_frame = pd.read_csv("/content/iranianNamesDataset.csv")
data_frame = shuffle(data_frame)
train_ = []
val_   = []
for row in data_frame.iterrows():
  if row[1]["Gender"] == "M":
      label = 0
  else:
      label = 1
  if np.random.rand () > 0.9:

    val_.append([row[1]["Names"],label])
  else:
    train_.append([row[1]["Names"],label])

# Save To CSV

In [57]:
train_frame = pd.DataFrame(train_,columns=["text","label"])
val_frame = pd.DataFrame(val_,columns=["text","label"])

train_frame.to_csv("train_names.csv",index=False)
val_frame.to_csv("val_names.csv",index=False)

# Load Dataset with datasets library

In [8]:
from datasets import load_dataset
geneder_classification = load_dataset("csv",data_files={
    "train":"/content/train_names.csv",
    "validation":"/content/val_names.csv"
})

In [9]:
geneder_classification

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 17881
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [10]:
print(geneder_classification["train"][0])
print(geneder_classification["train"][-1])
print(geneder_classification["validation"][0])
print(geneder_classification["validation"][-1])

{'text': 'بی بی راحله', 'label': 1}
{'text': 'محمدقاشم', 'label': 0}
{'text': 'صونا', 'label': 1}
{'text': 'سعد', 'label': 0}


# Load Tokenizer and Model

In [11]:
model_id = "distilbert-base-uncased"

In [12]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [13]:
def tokenize(batch):
  return tokenizer(batch["text"],padding=True,truncation=True)
names_encoded = geneder_classification.map(tokenize,batched=True,batch_size=None)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [14]:
print(names_encoded["train"][0])
print(names_encoded["validation"][0])

{'text': 'بی بی راحله', 'label': 1, 'input_ids': [101, 1271, 24830, 1271, 24830, 1280, 25573, 29820, 23673, 14157, 102, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]}
{'text': 'صونا', 'label': 1, 'input_ids': [101, 1284, 29836, 15915, 25573, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("selected device : " , device)

selected device :  cuda


In [16]:
num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(model_id,num_labels=num_labels).to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Train Arguments & Train Model

In [17]:
batch_size = 512
training_args = TrainingArguments(
    output_dir = "./output",
    num_train_epochs = 30,
    learning_rate = 0.00001,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    eval_strategy ="epoch",
    disable_tqdm=False,
    push_to_hub=False,
    report_to ="tensorboard",
)

In [18]:
from sklearn.metrics import accuracy_score,f1_score
def compute_metrics(pred):
  labels = pred.label_ids
  preds  = pred.predictions.argmax(-1)
  f1 = f1_score(labels, preds, average='weighted')
  acc = accuracy_score(labels, preds)
  return {"accuracy": acc,"f1": f1}
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset = names_encoded["train"],
    eval_dataset  = names_encoded["validation"],
    compute_metrics = compute_metrics,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [19]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.534171,0.677,0.557726
2,No log,0.412998,0.806,0.803763
3,No log,0.373958,0.8235,0.823669
4,No log,0.358795,0.8235,0.82456
5,No log,0.345871,0.835,0.833326
6,No log,0.340049,0.839,0.839062
7,No log,0.335762,0.8425,0.841499
8,No log,0.329054,0.855,0.854417
9,No log,0.326049,0.8525,0.853024
10,No log,0.321123,0.8585,0.858249


TrainOutput(global_step=1050, training_loss=0.324817084357852, metrics={'train_runtime': 749.0077, 'train_samples_per_second': 716.188, 'train_steps_per_second': 1.402, 'total_flos': 2220608958157440.0, 'train_loss': 0.324817084357852, 'epoch': 30.0})

In [20]:
idx_to_name = {
    0:"Male",
    1:"Women"
}
name ="کبری"
encoded_name = tokenizer(name,return_tensors="pt").to(device)
output = model(encoded_name["input_ids"])
print(idx_to_name[output.logits.argmax(-1).item()])

Women
