### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


### Load Train and Test data

In [None]:
# Load only the 'text' and 'sentiment' columns
df_train = pd.read_csv('/content/train.csv',encoding="unicode_escape", usecols=["text", "sentiment"])
df_test = pd.read_csv('/content/test.csv',encoding="unicode_escape", usecols=["text", "sentiment"])


In [4]:
# Display basic information about the dataset
print(f"Train Dataset shape: {df_train.shape}")
print("\nSample data:")
df_train.head()

Dataset shape: (27481, 2)

Sample data:


Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


In [None]:
print(f"Test Dataset shape: {df_train.shape}")
print("\nSample data:")
df_test.head()

Test Dataset shape: (27481, 2)

Sample data:


Unnamed: 0,text,sentiment
0,Last session of the day http://twitpic.com/67ezh,neutral
1,Shanghai is also really exciting (precisely -...,positive
2,"Recession hit Veronique Branquinho, she has to...",negative
3,happy bday!,positive
4,http://twitpic.com/4w75p - I like it!!,positive


### Check for Missing values

In [7]:
# Check for missing values
print("\nMissing values:")
print(df_train.isnull().sum())


Missing values:
text         1
sentiment    0
dtype: int64


### Dropping Rows with Missing values

In [None]:
df_train = df_train.dropna(subset=['text', 'sentiment'])
df_test = df_test.dropna(subset=['text', 'sentiment'])


In [10]:
# Check class distribution
print("\nClass distribution:")
df_train['sentiment'].value_counts()


Class distribution:


Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
neutral,11117
positive,8582
negative,7781


### Convert Sentiment Labels to Numeric Values

In [None]:
'positive', 'neutral', 'negative' as labels
label_map = {'positive': 0, 'neutral': 1, 'negative': 2}
df_train['sentiment_id'] = df_train['sentiment'].map(label_map)
df_test['sentiment_id'] = df_test['sentiment'].map(label_map)

In [None]:
print(f"\nTraining set size: {df_train.shape[0]}")
print(f"Testing set size: {df_test.shape[0]}")


Training set size: 27480
Testing set size: 3534


### Define SentimentDataset Class

In [14]:
# Create a PyTorch dataset
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

### Load Pre-trained Tokenizer and Model

In [None]:
# Load pre-trained tokenizer and model
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3  # positive, neutral, negative
)

### Move Model to Device

In [None]:
# Move model to device
model = model.to(device)

### Create PyTorch Datasets

In [17]:
# Create datasets
train_dataset = SentimentDataset(
    df_train['text'].values,
    df_train['sentiment_id'].values,
    tokenizer
)

test_dataset = SentimentDataset(
    df_test['text'].values,
    df_test['sentiment_id'].values,
    tokenizer
)

### Create Data Loaders and Set Up Optimizer and Scheduler

In [18]:
# Create data loaders
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Set up optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Number of training epochs
epochs = 5
total_steps = len(train_loader) * epochs

# Create scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

### Define Training Function

In [19]:
# Training function
def train_epoch(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0

    for batch in dataloader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

    return total_loss / len(dataloader)

### Define Evaluation Function

In [20]:
# Evaluation function
def evaluate(model, dataloader, device):
    model.eval()
    predictions = []
    actual_labels = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            _, preds = torch.max(outputs.logits, dim=1)

            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())

    return predictions, actual_labels

### Training Loop

In [None]:
# Training loop
print("\nTraining the model...")
for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")

    train_loss = train_epoch(model, train_loader, optimizer, scheduler, device)
    print(f"Training loss: {train_loss:.4f}")



Training the model...

Epoch 1/5
Training loss: 0.5876

Epoch 2/5
Training loss: 0.4239

Epoch 3/5
Training loss: 0.3083

Epoch 4/5
Training loss: 0.2257

Epoch 5/5
Training loss: 0.1690


### Final Evaluation and Saving the Model

In [22]:
# Final evaluation
print("\nFinal Evaluation:")
predictions, actual_labels = evaluate(model, test_loader, device)

# Generate final classification report
report = classification_report(actual_labels, predictions, target_names=label_map.keys())
print("\nFinal Classification Report:")
print(report)

# Save the model
model.save_pretrained("sentiment_analysis_transformer")
tokenizer.save_pretrained("sentiment_analysis_transformer")
print("\nModel saved successfully!")


Final Evaluation:

Final Classification Report:
              precision    recall  f1-score   support

    positive       0.85      0.82      0.84      1103
     neutral       0.75      0.75      0.75      1430
    negative       0.77      0.80      0.78      1001

    accuracy                           0.79      3534
   macro avg       0.79      0.79      0.79      3534
weighted avg       0.79      0.79      0.79      3534


Model saved successfully!
