<a href="https://colab.research.google.com/github/Faysal3010/Deep-Learning-/blob/main/NLP_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
!pip install torch transformers pandas scikit-learn



In [38]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import get_scheduler
from tqdm import tqdm

In [39]:
# Absolute location of the sentiment CSV that the rest of the notebook works on.
DATA_PATH = '/content/student_people_mass_uprising_public_sentiments_dataset.csv'
df = pd.read_csv(DATA_PATH)

In [40]:
import re
def clean_text(text):
    """Normalize one comment for tokenization.

    The input is converted with ``str()`` and lower-cased, every character
    that is not an ASCII letter, a digit, whitespace, or in the range
    U+0980-U+09FF is removed, and runs of whitespace are collapsed to a
    single space with leading/trailing whitespace stripped.

    Returns the cleaned string.
    """
    lowered = str(text).lower()
    # Deleting (not replacing) disallowed characters joins the text on either side.
    allowed_only = re.sub(r'[^a-zA-Z0-9\s\u0980-\u09FF]+', '', lowered)
    return re.sub(r'\s+', ' ', allowed_only).strip()

# Drop rows with a missing comment or label BEFORE cleaning: clean_text() calls
# str() on its input, which would turn a missing value into the literal text
# "nan" and hide that row from dropna().
# .copy() makes the column assignment below operate on an independent frame.
df = df[['comment', 'label']].dropna().copy()
df['comment'] = df['comment'].apply(clean_text)

In [42]:
# Replace the values in df['label'] with integer class ids.
# NOTE: this overwrites the column in place, so re-running the cell without
# reloading df re-fits the encoder on the integer ids instead of the original
# label values.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['label'])
df['label'] = encoded_labels
# Number of distinct classes; used to size the classification head.
num_labels = df['label'].nunique()

In [43]:
# Hold out 20% of the rows as a test split; random_state pins the shuffle.
all_comments = df['comment'].tolist()
all_labels = df['label'].tolist()
X_train, X_test, y_train, y_test = train_test_split(
    all_comments,
    all_labels,
    test_size=0.2,
    random_state=42,
)

In [44]:
# Token length that every comment is padded or truncated to.
max_len = 128
# Tokenizer loaded under the same checkpoint name as the model used later on.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

def encode_texts(texts):
    """Tokenize `texts` with the notebook-level `tokenizer`.

    Each sequence is truncated and padded to `max_len` tokens, and the
    tokenizer is asked for PyTorch tensors (``return_tensors='pt'``).

    Returns whatever the tokenizer call returns.
    """
    tokenizer_options = dict(
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    return tokenizer(texts, **tokenizer_options)

# Tokenize both splits once up front (train first, then test).
train_encodings, test_encodings = (
    encode_texts(split_texts) for split_texts in (X_train, X_test)
)

In [45]:
class SentimentDataset(Dataset):
    """Map-style dataset pairing pre-computed encodings with their labels.

    `encodings` must expose ``.items()`` yielding ``(name, indexable)`` pairs;
    `labels` must be indexable and support ``len()``.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """The dataset has one sample per label."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict holding row `idx` of every encoding plus a 'labels' tensor."""
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = values[idx]
        # The key is 'labels' so the batch can be unpacked straight into the model call.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels so a DataLoader can index them.
train_dataset = SentimentDataset(encodings=train_encodings, labels=y_train)
test_dataset = SentimentDataset(encodings=test_encodings, labels=y_test)

In [46]:
# Both loaders use the same batch size; only the training loader shuffles.
BATCH_SIZE = 16
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE)

In [47]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the multilingual BERT checkpoint with a classifier sized to num_labels.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased', num_labels=num_labels
)

# Last expression of the cell: moves the model and displays its module repr.
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [48]:
# Standalone cross-entropy criterion.
# NOTE(review): the training loop in this notebook reads the loss from
# `outputs.loss`, so `loss_fn` is not referenced by any visible cell — confirm
# whether it is still needed.
loss_fn = torch.nn.CrossEntropyLoss()

In [54]:
from torch.optim import AdamW

num_epochs = 3
optimizer = AdamW(model.parameters(), lr=3e-5)

# The scheduler is stepped once per batch, so its horizon is batches x epochs.
num_training_steps = len(train_loader) * num_epochs
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Fine-tune for num_epochs passes over train_loader.
for epoch in range(num_epochs):
    model.train()
    # Label the progress bar once per epoch instead of re-setting the same
    # description on every batch.
    loop = tqdm(train_loader, desc=f'Epoch {epoch+1}', leave=True)
    running_loss = 0.0
    for step, batch in enumerate(loop, start=1):
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        # The batch carries a 'labels' entry, so the model output exposes .loss.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        batch_loss = loss.item()
        running_loss += batch_loss
        # A single batch's loss is noisy; the running mean over the epoch so
        # far is shown next to it so the final bar reflects the whole epoch.
        loop.set_postfix(loss=batch_loss, avg_loss=running_loss / step)

Epoch 1: 100%|██████████| 210/210 [01:12<00:00,  2.89it/s, loss=0.265]
Epoch 2: 100%|██████████| 210/210 [01:13<00:00,  2.85it/s, loss=0.567]
Epoch 3: 100%|██████████| 210/210 [01:14<00:00,  2.81it/s, loss=0.398]


In [55]:
def predict_sentiment(text):
    """Predict the label of a single comment.

    The text is passed through `clean_text`, tokenized with the same length
    and padding settings used for the training data, and run through `model`
    in eval mode with gradients disabled. The highest-scoring class index is
    mapped back through `label_encoder`.

    Returns a tuple ``(text, sentiment)`` where `text` is the original,
    uncleaned input.
    """
    cleaned = clean_text(text)
    encoded = tokenizer(cleaned, max_length=max_len, padding='max_length', truncation=True, return_tensors='pt')
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
    model.eval()
    with torch.no_grad():
        logits = model(**encoded).logits
        class_id = logits.argmax(dim=1).item()
    predicted_label = label_encoder.inverse_transform([class_id])[0]
    return text, predicted_label

In [63]:
# Read one comment from the keyboard and print the (text, label) pair for it.
sample_text = input("লিখুন : ")
prediction = predict_sentiment(sample_text)
print("Sample prediction:", prediction)

লিখুন : পুলিশ অনেক খারাপ কাজ করসে 
Sample prediction: ('পুলিশ অনেক খারাপ কাজ করসে ', 'Positive')
