<a href="https://colab.research.google.com/github/ARJUN108-verma/LLMs-Large-Language-Models-/blob/main/Text_Classification_with_XLNET.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Text Classification with XLNet:-

In [None]:
!pip install cleantext

Collecting cleantext
  Downloading cleantext-1.1.4-py3-none-any.whl.metadata (3.5 kB)
Downloading cleantext-1.1.4-py3-none-any.whl (4.9 kB)
Installing collected packages: cleantext
Successfully installed cleantext-1.1.4


In [None]:
# NOTE(review): no cell in this notebook imports a module from the `clean` distribution;
# the name `clean` used later is an alias for `cleantext`. Confirm whether this install is needed.
!pip install clean

Collecting clean
  Downloading clean-0.1.4-py3-none-any.whl.metadata (1.8 kB)
Downloading clean-0.1.4-py3-none-any.whl (13 kB)
Installing collected packages: clean
Successfully installed clean-0.1.4


In [None]:
import pandas as pd
import numpy as np
import cleantext as clean
import re
from transformers import XLNetTokenizer, XLNetForSequenceClassification, pipeline, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
!pip install datasets



In [None]:
!pip install evaluate

In [None]:
import datasets
import evaluate
import random

Preprocess the data:-

In [None]:
# Read the provided train, validation and test label files from /content.
data_train = pd.read_csv("/content/emotion-labels-train.csv")
data_val = pd.read_csv("/content/emotion-labels-val.csv")
data_test = pd.read_csv("/content/emotion-labels-test.csv")

In [None]:
# Preview the first rows of the validation file.
data_val.head()

In [None]:
# Preview the first rows of the training file.
data_train.head()

In [None]:
# Preview the first rows of the test file.
data_test.head()

In [None]:
# Pool the three files into a single frame with a fresh 0..n-1 index; the notebook re-splits it later.
all_splits = [data_train, data_val, data_test]
data = pd.concat(all_splits, ignore_index=True)

In [None]:
import cleantext as clean

In [None]:

# Run cleantext over every raw text with all cleaning steps enabled and store it in `text_clean`.
data['text_clean'] = data['text'].map(
    lambda raw_text: clean.clean(raw_text, clean_all=True, extra_spaces=True)
)

In [None]:

# Drop every token that starts with "http" (what is left of URLs) from the cleaned text.
url_pattern = re.compile(r'http\S+')
data['text_clean'] = data['text_clean'].map(lambda cleaned: url_pattern.sub('', cleaned))

In [None]:
# Show the first 27 rows to compare the raw `text` with the derived `text_clean` column.
data.head(27)

In [None]:
# Bar chart of how many rows each label has before balancing.
data['label'].value_counts().plot.bar()

In [None]:
# Group the rows by label; the grouping is reused by the balancing step that follows.
g = data.groupby('label')

In [None]:
# Downsample every label group to the size of the smallest group so all classes are equally represented.
# The smallest group size is computed once rather than inside the per-group callback, and the
# sampling is seeded (same seed as the train/test split) so a fresh run reproduces the same rows.
min_class_size = g.size().min()
data = pd.DataFrame(
    g.apply(lambda grp: grp.sample(min_class_size, random_state=42).reset_index(drop=True))
)

In [None]:
# Bar chart of rows per label after balancing.
data['label'].value_counts().plot.bar()

In [None]:

# Replace the label values with integer codes produced by a LabelEncoder fitted on this column.
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])

In [None]:
# Number of target classes passed to the model; it must match the four entries of the
# id2label mapping given when the model is loaded.
NUM_LABELS = 4

Split:-

In [None]:
# Hold out 20% of the rows as the test split; the fixed seed keeps the partition repeatable.
train_split, test_split = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [None]:

# Keep 90% of the training portion and set the remaining 10% aside as a validation split.
# random_state is fixed (same seed as the train/test split) so the partition is repeatable.
# The name `train_slit` (sic) is kept because later cells refer to it.
train_slit, val_split = train_test_split(train_split, train_size=0.9, random_state=42)

In [None]:
# Print the row count of the train, validation and test portions, one per line.
for portion in (train_slit, val_split, test_split):
    print(len(portion))

In [None]:
# List the column names available on the training portion.
print(train_split.columns)

In [None]:
# Two-column (label, text) frame used for training.
# It is built from `train_slit` (the 90% portion) rather than `train_split`, so the rows
# set aside in `val_split` are not also part of the training data.
train_df = pd.DataFrame({
    "label": train_slit["label"].values,
    "text": train_slit["text_clean"].values
})

In [None]:
# Add the integer category codes of `label` to the training portion as a `label_int` column.
# NOTE(review): `label` was already run through LabelEncoder in an earlier cell, so this looks redundant — confirm.
# NOTE(review): train_split is an output of train_test_split; assigning a column here may emit SettingWithCopyWarning — verify.
train_split["label_int"] = train_split["label"].astype('category').cat.codes

In [None]:
# Two-column (label, text) frame used for evaluation.
# It must come from the held-out `test_split`; building it from `train_split` would make the
# "test" set a copy of the training rows and the reported accuracy meaningless.
# `label` already holds the integer codes produced by LabelEncoder, matching train_df.
test_df = pd.DataFrame({
    "label": test_split["label"].values,
    "text": test_split["text_clean"].values
})


In [None]:
from datasets import Dataset

# Convert each frame to a column -> list mapping and wrap it in a Hugging Face Dataset.
train_columns = train_df.to_dict(orient="list")
test_columns = test_df.to_dict(orient="list")
train_dataset = Dataset.from_dict(train_columns)
test_dataset = Dataset.from_dict(test_columns)

In [None]:
# Bundle the two splits. A DatasetDict is meant to hold Dataset objects, so use the
# Dataset instances created above instead of the raw pandas frames.
dataset_dict = datasets.DatasetDict({"train": train_dataset, "test": test_dataset})

In [None]:
# Display the DatasetDict to inspect its splits.
dataset_dict


Tokenize the text (create model inputs):-

In [None]:
# Load the tokenizer that belongs to the "xlnet-base-cased" checkpoint (the same checkpoint the model is loaded from).
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")

In [None]:
def tokenize_function(examples):
    """Tokenize the "text" field of a batch with the notebook-level tokenizer.

    Every example is padded to, and truncated at, 128 tokens.

    Args:
        examples: a mapping with a "text" entry, as passed by `Dataset.map`.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    encoding_options = {"padding": "max_length", "max_length": 128, "truncation": True}
    return tokenizer(examples["text"], **encoding_options)

In [None]:
from datasets import Dataset, DatasetDict

In [None]:
# Rebuild the DatasetDict, this time converting each pandas frame directly into a Dataset.
frames_by_split = (("train", train_df), ("test", test_df))
dataset_dict = DatasetDict(
    {split_name: Dataset.from_pandas(frame) for split_name, frame in frames_by_split}
)

In [None]:
# Apply tokenize_function to every split, passing examples in batches.
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

In [None]:
# Display the tokenized DatasetDict to inspect its splits.
tokenized_datasets

In [None]:
# Text of the first training example. Indexing the row first avoids pulling the whole
# "text" column just to print one value.
print(tokenized_datasets['train'][0]['text'])

In [None]:
# Token ids of the first training example. Indexing the row first avoids pulling the whole
# "input_ids" column just to print one value.
print(tokenized_datasets['train'][0]['input_ids'])

In [None]:
# Decode token id 5 back to text to see which token it stands for.
# NOTE(review): presumably this is the padding id that fills the input_ids printed above — confirm.
tokenizer.decode(5)

In [None]:
# Token type ids of the first training example. Indexing the row first avoids pulling the
# whole "token_type_ids" column just to print one value.
print(tokenized_datasets['train'][0]['token_type_ids'])

In [None]:
# Attention mask of the first training example. Indexing the row first avoids pulling the
# whole "attention_mask" column just to print one value.
print(tokenized_datasets['train'][0]['attention_mask'])

In [None]:
# Shuffle each tokenized split with a fixed seed and keep the first 100 rows, giving small
# train and eval sets.
small_train_dataset, small_eval_dataset = (
    tokenized_datasets[split_name].shuffle(seed=42).select(range(100))
    for split_name in ("train", "test")
)

Fine-tune our model:-

In [None]:
# Load the xlnet-base-cased checkpoint as a sequence classifier with NUM_LABELS outputs
# and a readable name for each class id.
emotion_by_id = {0: 'anger', 1: 'fear', 2: 'joy', 3: 'sadness'}
model = XLNetForSequenceClassification.from_pretrained(
    'xlnet-base-cased',
    num_labels=NUM_LABELS,
    id2label=emotion_by_id,
)

Add `%load_ext cudf.pandas` before importing pandas to speed up operations using GPU

In [None]:
# Load the "accuracy" metric from the evaluate library; compute_metrics uses it.
metric = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level `metric`.

    Args:
        eval_pred: a pair that unpacks into (logits, labels).

    Returns:
        The result of `metric.compute` for the arg-max class of each row of logits
        against the reference labels.
    """
    scores, gold_labels = eval_pred
    # The predicted class is the position of the largest logit along the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=gold_labels)

In [None]:
# Write outputs to "test_trainer", evaluate at the end of every epoch, and train for three epochs.
training_args = TrainingArguments(
    output_dir="test_trainer",
    eval_strategy="epoch",
    num_train_epochs=3,
)

In [None]:
# Assemble the Trainer from the model, its arguments, the two small datasets and the metric callback.
trainer_settings = dict(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_settings)

In [None]:
# Run fine-tuning on small_train_dataset with the settings in training_args.
trainer.train()

Evaluate model:-



In [None]:
# Evaluate the trained model on small_eval_dataset; compute_metrics supplies the accuracy.
trainer.evaluate()


In [None]:
# Save the fine-tuned model to the local "fine_tuned_model" directory.
# Only the model is saved here; the tokenizer is not written to that directory.
model.save_pretrained("fine_tuned_model")

In [None]:

# Reload the model from the "fine_tuned_model" directory written by the previous cell.
fine_tuned_model = XLNetForSequenceClassification.from_pretrained("fine_tuned_model")

In [None]:


# Wrap the reloaded model and the tokenizer in a text-classification pipeline.
clf = pipeline(
    task="text-classification",
    model=fine_tuned_model,
    tokenizer=tokenizer,
)


In [None]:
# Classify one randomly chosen validation text and print the result.
# random.randint includes its upper bound, so randint(0, len(val_split)) could yield an
# out-of-range position; randrange(len(val_split)) stays within 0..len-1.
rand_int = random.randrange(len(val_split))
# val_split is a shuffled subset, so its index labels are not positions; .iloc selects by position.
sample_text = val_split['text_clean'].iloc[rand_int]
print(sample_text)
# top_k=None is passed so the pipeline is not limited to a single top label.
answer = clf(sample_text, top_k=None)
print(answer)

In [None]:
# Upgrade ipywidgets and jupyter in the runtime.
!pip install --upgrade ipywidgets jupyter

In [None]:
# Clear stored cell outputs of a notebook file in place.
# NOTE(review): `your_notebook.ipynb` looks like a placeholder — confirm the actual notebook filename before running.
!jupyter nbconvert --clear-output --inplace your_notebook.ipynb
