In [1]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset

In [2]:
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower().strip()
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

df = pd.read_csv('D:\\GDG_Hammad_ML\\dataset\\train.csv')
df['comment'] = df['comment'].apply(clean_text)

label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

In [3]:
hf_dataset = Dataset.from_pandas(df)

model_name = "google/muril-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(df['label'].unique()))

def tokenize(batch):
    return tokenizer(batch['comment'], padding=True, truncation=True, max_length=128)

hf_dataset = hf_dataset.map(tokenize, batched=True)

hf_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/213747 [00:00<?, ? examples/s]

In [4]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,                    #Make this 5 for optimal training
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    dataloader_num_workers=4,
    fp16 = True,
    save_steps=1000,
    save_total_limit=1,
)


train_test_split = hf_dataset.train_test_split(test_size=0.3, seed=42)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/18703 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


{'loss': 1.0875, 'grad_norm': 0.916339635848999, 'learning_rate': 5.344735435595938e-06, 'epoch': 0.03}
{'loss': 1.0015, 'grad_norm': 2.3318090438842773, 'learning_rate': 1.0668091929449493e-05, 'epoch': 0.05}
{'loss': 0.9215, 'grad_norm': 18.72220802307129, 'learning_rate': 1.601282736504543e-05, 'epoch': 0.08}
{'loss': 0.8502, 'grad_norm': 13.129029273986816, 'learning_rate': 1.9849096958174907e-05, 'epoch': 0.11}
{'loss': 0.7918, 'grad_norm': 7.887839317321777, 'learning_rate': 1.925499049429658e-05, 'epoch': 0.13}
{'loss': 0.761, 'grad_norm': 8.62987232208252, 'learning_rate': 1.866088403041825e-05, 'epoch': 0.16}
{'loss': 0.7594, 'grad_norm': 4.7552900314331055, 'learning_rate': 1.8066777566539926e-05, 'epoch': 0.19}
{'loss': 0.7384, 'grad_norm': 3.7340381145477295, 'learning_rate': 1.74726711026616e-05, 'epoch': 0.21}
{'loss': 0.7176, 'grad_norm': 5.63754415512085, 'learning_rate': 1.6879752851711028e-05, 'epoch': 0.24}
{'loss': 0.7018, 'grad_norm': 19.849838256835938, 'learning_

TrainOutput(global_step=18703, training_loss=0.7107006310205602, metrics={'train_runtime': 3517.084, 'train_samples_per_second': 42.541, 'train_steps_per_second': 5.318, 'total_flos': 9841884835677552.0, 'train_loss': 0.7107006310205602, 'epoch': 1.0})

In [5]:
results = trainer.evaluate()
print(results)

model.save_pretrained("fine_tuned_muril")
tokenizer.save_pretrained("fine_tuned_muril")


  0%|          | 0/2004 [00:00<?, ?it/s]

{'eval_loss': 0.6469289064407349, 'eval_runtime': 92.6172, 'eval_samples_per_second': 692.366, 'eval_steps_per_second': 21.637, 'epoch': 1.0}


('fine_tuned_muril\\tokenizer_config.json',
 'fine_tuned_muril\\special_tokens_map.json',
 'fine_tuned_muril\\vocab.txt',
 'fine_tuned_muril\\added_tokens.json',
 'fine_tuned_muril\\tokenizer.json')