In [1]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset

In [2]:
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower().strip()
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

df = pd.read_csv('D:\\GDG_Hammad_ML\\dataset\\train.csv')
df['comment'] = df['comment'].apply(clean_text)

label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

In [None]:
model_name = "google/muril-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(df['label'].unique()))

In [3]:
model_name = "fine_tuned_muril"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(df['label'].unique()))

In [4]:
hf_dataset = Dataset.from_pandas(df)

def tokenize(batch):
    return tokenizer(batch['comment'], padding=True, truncation=True, max_length=128)

hf_dataset = hf_dataset.map(tokenize, batched=True)

hf_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

Map:   0%|          | 0/213747 [00:00<?, ? examples/s]

In [5]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,                    #Make this 5 for optimal training
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    dataloader_num_workers=4,
    fp16 = True,
    save_steps=1000,
    save_total_limit=1,
)


train_test_split = hf_dataset.train_test_split(test_size=0.3, seed=42)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/37406 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


{'loss': 0.6561, 'grad_norm': 14.749953269958496, 'learning_rate': 2.667735899492115e-06, 'epoch': 0.03}
{'loss': 0.6364, 'grad_norm': 9.855474472045898, 'learning_rate': 5.33547179898423e-06, 'epoch': 0.05}
{'loss': 0.6547, 'grad_norm': 7.8750786781311035, 'learning_rate': 8.003207698476344e-06, 'epoch': 0.08}
{'loss': 0.6129, 'grad_norm': 4.7021894454956055, 'learning_rate': 1.0676289762095697e-05, 'epoch': 0.11}
{'loss': 0.586, 'grad_norm': 6.069854736328125, 'learning_rate': 1.334937182571505e-05, 'epoch': 0.13}
{'loss': 0.5851, 'grad_norm': 8.404452323913574, 'learning_rate': 1.6017107725207165e-05, 'epoch': 0.16}
{'loss': 0.5915, 'grad_norm': 11.278371810913086, 'learning_rate': 1.869018978882652e-05, 'epoch': 0.19}
{'loss': 0.5651, 'grad_norm': 2.629645824432373, 'learning_rate': 1.98485073518491e-05, 'epoch': 0.21}
{'loss': 0.5447, 'grad_norm': 8.691455841064453, 'learning_rate': 1.9551462943710086e-05, 'epoch': 0.24}
{'loss': 0.5387, 'grad_norm': 15.004962921142578, 'learning_

TrainOutput(global_step=37406, training_loss=0.5876096815105072, metrics={'train_runtime': 6237.3547, 'train_samples_per_second': 47.976, 'train_steps_per_second': 5.997, 'total_flos': 1.9683749115619344e+16, 'train_loss': 0.5876096815105072, 'epoch': 2.0})

In [6]:
results = trainer.evaluate()
print(results)

model.save_pretrained("fine_tuned_muril_3")
tokenizer.save_pretrained("fine_tuned_muril_3")


  0%|          | 0/2004 [00:00<?, ?it/s]

{'eval_loss': 0.6706209778785706, 'eval_runtime': 88.9678, 'eval_samples_per_second': 720.767, 'eval_steps_per_second': 22.525, 'epoch': 2.0}


('fine_tuned_muril_3\\tokenizer_config.json',
 'fine_tuned_muril_3\\special_tokens_map.json',
 'fine_tuned_muril_3\\vocab.txt',
 'fine_tuned_muril_3\\added_tokens.json',
 'fine_tuned_muril_3\\tokenizer.json')