#### Import Libraries

In [1]:
!pip install accelerate -U
!pip install transformers
!pip install datasets

Collecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/261.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━[0m [32m194.6/261.4 kB[0m [31m5.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub (from accelerate)
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: huggingface-hub, accelerate
Successfully installed accelerate-0.24.1 huggingface-hub-0.19.0
Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m14.4 MB/s[0m eta [3

In [2]:

import re
import torch
import transformers
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score
from datasets import load_dataset

import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
dataset = load_dataset("glue", "sst2")

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [4]:
train = dataset['train']
val = dataset['validation']


In [5]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

#### Data preprocessing function

In [6]:
def preprocess_text(text):
    # Remove special characters, links, and user mentions
    text = re.sub(r'\b@\w+\b', '', text)  # Remove user mentions
    text = re.sub(r'http\S+', '', text)   # Remove links
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters

    # Tokenize and join the text
    return ' '.join(tokenizer.tokenize(text))


In [7]:
X_train = tokenizer(list(train['sentence']), truncation=True, padding=True, return_tensors='pt', max_length=100, return_attention_mask=True)
X_val = tokenizer(list(val['sentence']), truncation=True, padding=True, return_tensors='pt', max_length=100, return_attention_mask=True)

# Get labels
y_train = torch.tensor(list(train['label']))
y_val = torch.tensor(list(val['label']))


In [8]:
batch_size = 32

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/sentimental_analysis',
    evaluation_strategy="steps",
    save_total_limit=2,
    save_steps=500,
    eval_steps=500,
    report_to="tensorboard",
    per_device_train_batch_size=batch_size,
    num_train_epochs=1,
    learning_rate=2e-5,
    load_best_model_at_end=True,
)


In [9]:
pip install tensorboardX

Collecting tensorboardX
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/101.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/101.7 kB[0m [31m1.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorboardX
Successfully installed tensorboardX-2.6.2.2


In [10]:
trainer = Trainer(
    model=DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3),  # Three labels: positive, negative, neutral
    args=training_args,
    train_dataset=[{'input_ids': X_train['input_ids'][i], 'attention_mask': X_train['attention_mask'][i], 'labels': y_train[i]} for i in range(len(X_train['input_ids']))],
    eval_dataset=[{'input_ids': X_val['input_ids'][i], 'attention_mask': X_val['attention_mask'][i], 'labels': y_val[i]} for i in range(len(X_val['input_ids']))],
    compute_metrics=lambda p: {"accuracy": accuracy_score(p.label_ids, p.predictions.argmax(axis=1))}
)


Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
train_results=trainer.train()

Step,Training Loss,Validation Loss,Accuracy
500,0.3259,0.240954,0.912844
1000,0.2272,0.244091,0.90367
1500,0.1964,0.280504,0.899083
2000,0.1846,0.254702,0.908257


In [12]:
from sklearn.metrics import classification_report
import torch

# Assuming you've already trained your model and loaded it into the trainer
# You can get the model from the trainer
model = trainer.model

# Put your evaluation dataset into a DataLoader
eval_dataloader = trainer.get_eval_dataloader()

# Set the model in evaluation mode
model.eval()

val_predictions = []
with torch.no_grad():
    for batch in eval_dataloader:
        input_ids = batch['input_ids'].to(model.device)
        attention_mask = batch['attention_mask'].to(model.device)
        labels = batch['labels'].to(model.device)

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        # Get the predicted labels
        predicted_labels = outputs.logits.argmax(dim=1)
        val_predictions.extend(predicted_labels.tolist())

# Generate the classification report with the correct target names
class_report = classification_report(y_val, val_predictions, target_names=['positive',  'negative'])

# Print the classification report
print(class_report)


              precision    recall  f1-score   support

    positive       0.89      0.93      0.91       428
    negative       0.93      0.89      0.91       444

    accuracy                           0.91       872
   macro avg       0.91      0.91      0.91       872
weighted avg       0.91      0.91      0.91       872



In [13]:
model.save_pretrained("/content/drive/MyDrive/sentimental_analysis")
tokenizer.save_pretrained("/content/drive/MyDrive/sentimental_analysis")

('/content/drive/MyDrive/sentimental_analysis/tokenizer_config.json',
 '/content/drive/MyDrive/sentimental_analysis/special_tokens_map.json',
 '/content/drive/MyDrive/sentimental_analysis/vocab.txt',
 '/content/drive/MyDrive/sentimental_analysis/added_tokens.json')