#### Import Libraries

In [3]:
!pip install accelerate -U
! pip install transformers
!pip install datasets

Collecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub (from accelerate)
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: huggingface-hub, accelerate
Successfully installed accelerate-0.24.1 huggingface-hub-0.19.0
Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m55.0

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import re
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score
from datasets import load_dataset
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns


#### Load the SST-2 dataset

In [6]:
dataset = load_dataset("glue", "sst2")
dataset

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [7]:
train = dataset['train']
val = dataset['validation']


#### Initialize the DistilBERT tokenizer

In [8]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

#### Data preprocessing function

In [9]:
def preprocess_text(text):
    # Remove special characters, links, and user mentions
    text = re.sub(r'\b@\w+\b', '', text)  # Remove user mentions
    text = re.sub(r'http\S+', '', text)   # Remove links
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters

    # Tokenize and join the text
    return ' '.join(tokenizer.tokenize(text))


#### Tokenize and preprocess text data

In [10]:
X_train = tokenizer(list(train['sentence']), truncation=True, padding=True, return_tensors='pt', max_length=100, return_attention_mask=True)
X_val = tokenizer(list(val['sentence']), truncation=True, padding=True, return_tensors='pt', max_length=100, return_attention_mask=True)

# Get labels
y_train = torch.tensor(list(train['label']))
y_val = torch.tensor(list(val['label']))


#### Define training arguments

In [11]:
batch_size = 32

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Sentiment_model", #change thw path whewe you want to save the training model
    evaluation_strategy="steps",
    save_total_limit=2,
    save_steps=500,
    eval_steps=500,
    report_to="tensorboard",
    per_device_train_batch_size=batch_size,
    num_train_epochs=1,
    learning_rate=2e-5,
    load_best_model_at_end=True,
)


#### Create a Trainer instance

In [12]:
pip install tensorboardX

Collecting tensorboardX
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/101.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━[0m [32m92.2/101.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorboardX
Successfully installed tensorboardX-2.6.2.2


In [13]:
trainer = Trainer(
    model=DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3),  # Three labels: positive, negative, neutral
    args=training_args,
    train_dataset=[{'input_ids': X_train['input_ids'][i], 'attention_mask': X_train['attention_mask'][i], 'labels': y_train[i]} for i in range(len(X_train['input_ids']))],
    eval_dataset=[{'input_ids': X_val['input_ids'][i], 'attention_mask': X_val['attention_mask'][i], 'labels': y_val[i]} for i in range(len(X_val['input_ids']))],
    compute_metrics=lambda p: {"accuracy": accuracy_score(p.label_ids, p.predictions.argmax(axis=1))}
)


Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Train the Model

In [14]:
train_results=trainer.train()

Step,Training Loss,Validation Loss,Accuracy
500,0.3351,0.245975,0.90711
1000,0.2276,0.249143,0.901376
1500,0.1964,0.294623,0.889908
2000,0.1868,0.257976,0.908257


#### Generate a classification report for the validation set

In [17]:
from sklearn.metrics import classification_report
from tabulate import tabulate

model = trainer.model
eval_dataloader = trainer.get_eval_dataloader()
model.eval()

val_predictions = []
with torch.no_grad():
    for batch in eval_dataloader:
        input_ids = batch['input_ids'].to(model.device)
        attention_mask = batch['attention_mask'].to(model.device)
        labels = batch['labels'].to(model.device)

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        # Get the predicted labels
        predicted_labels = outputs.logits.argmax(dim=1)
        val_predictions.extend(predicted_labels.tolist())

# Generate the classification report
class_report = classification_report(y_val, val_predictions, target_names=['positive',  'negative'])

# Split the classification report by newline and then by whitespace
table_data = [x.split() for x in class_report.split('\n')]

# Print the classification report as a table using the tabulate library
print(tabulate(table_data, headers='keys', tablefmt='fancy_grid'))



╒═══════════╤════════╤══════════╤═════════╕
│ 0         │ 1      │ 2        │ 3       │
╞═══════════╪════════╪══════════╪═════════╡
│ precision │ recall │ f1-score │ support │
├───────────┼────────┼──────────┼─────────┤
│           │        │          │         │
├───────────┼────────┼──────────┼─────────┤
│ positive  │ 0.89   │ 0.92     │ 0.91    │
├───────────┼────────┼──────────┼─────────┤
│ negative  │ 0.92   │ 0.89     │ 0.91    │
├───────────┼────────┼──────────┼─────────┤
│           │        │          │         │
├───────────┼────────┼──────────┼─────────┤
│ accuracy  │ 0.91   │ 872      │         │
├───────────┼────────┼──────────┼─────────┤
│ macro     │ avg    │ 0.91     │ 0.91    │
├───────────┼────────┼──────────┼─────────┤
│ weighted  │ avg    │ 0.91     │ 0.91    │
├───────────┼────────┼──────────┼─────────┤
│           │        │          │         │
╘═══════════╧════════╧══════════╧═════════╛


#### Model Saving

In [18]:
model.save_pretrained("/content/drive/MyDrive/Sentiment_model")
tokenizer.save_pretrained("/content/drive/MyDrive/Sentiment_tokenizer")


('/content/drive/MyDrive/Sentiment_tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/Sentiment_tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/Sentiment_tokenizer/vocab.txt',
 '/content/drive/MyDrive/Sentiment_tokenizer/added_tokens.json')