In [4]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the Hinglish sentiment dataset (text column plus sentiment label column).
# Point `dataset_file` at your own CSV if it lives somewhere else.
dataset_file: str = "hinglish_sentiment_data.csv"
df: pd.DataFrame = pd.read_csv(dataset_file)

# Define a custom dataset class
class HinglishSentimentDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Every row must provide a "text" value and a "sentiment" value; the
    sentiment must be one of the keys of ``LABEL_MAPPING``.
    """

    # Built once for the class instead of on every __getitem__ call.
    LABEL_MAPPING = {"positive": 2, "neutral": 1, "negative": 0}

    def __init__(self, dataframe, tokenizer, max_length):
        """Store the data source and the tokenization settings.

        Args:
            dataframe: pandas DataFrame with "text" and "sentiment" columns.
            tokenizer: Callable invoked as ``tokenizer(text, padding=...,
                truncation=..., max_length=..., return_tensors="pt")`` that
                returns a mapping containing "input_ids" and
                "attention_mask" tensors.
            max_length: Length passed to the tokenizer for padding and
                truncation.
        """
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return its tensors together with its label.

        Args:
            idx: Positional row index (used with ``DataFrame.iloc``).

        Returns:
            Dict with "input_ids" and "attention_mask" (the tokenizer output
            with its leading dimension removed) and "label" (int class id).

        Raises:
            KeyError: If the row's sentiment is not in ``LABEL_MAPPING``.
        """
        # Fetch the row once rather than once per column.
        row = self.data.iloc[idx]
        text = row["text"]
        sentiment = row["sentiment"]

        inputs = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # squeeze(0) drops only the leading (batch) dimension. A bare
        # squeeze() would also collapse the sequence dimension when it has
        # size 1, producing a 0-d tensor.
        input_ids = inputs["input_ids"].squeeze(0)
        attention_mask = inputs["attention_mask"].squeeze(0)

        label = self.LABEL_MAPPING[sentiment]

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "label": label,
        }

# Define the model architecture
class SentimentClassifier(nn.Module):
    """Thin wrapper around a pretrained sequence-classification model.

    Calling the module returns only the logits of the wrapped model's output.
    """

    def __init__(self, pretrained_model_name, num_classes):
        """Load the pretrained model with a ``num_classes``-way head.

        Args:
            pretrained_model_name: Identifier handed to
                ``AutoModelForSequenceClassification.from_pretrained``.
            num_classes: Value forwarded as ``num_labels``.
        """
        super().__init__()
        # The attribute name is part of the state_dict keys; keep it stable.
        self.bert = AutoModelForSequenceClassification.from_pretrained(
            pretrained_model_name,
            num_labels=num_classes,
        )

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return its ``logits`` attribute."""
        return self.bert(input_ids, attention_mask=attention_mask).logits

# Training hyperparameters (read as module-level globals by the code below).
batch_size: int = 32          # examples per optimisation step
max_length: int = 128         # tokens per example after padding/truncation
learning_rate: float = 2e-5   # AdamW step size
epochs: int = 5               # full passes over the training data

def train_sentiment_model(df, pretrained_model_name="bert-base-multilingual-uncased"):
    """Fine-tune a 3-class sentiment classifier on ``df`` and checkpoint it.

    Reads the module-level hyperparameters ``batch_size``, ``max_length``,
    ``learning_rate`` and ``epochs``. The weights are written to
    "sentiment_model.pth" at the end of every epoch.

    Args:
        df: DataFrame handed to ``HinglishSentimentDataset`` (needs "text"
            and "sentiment" columns).
        pretrained_model_name: Model identifier used for both the tokenizer
            and the classifier.

    Returns:
        Tuple ``(model, tokenizer)``: the trained ``SentimentClassifier``
        (left on the device it was trained on) and the tokenizer used to
        build its inputs.

    Raises:
        ValueError: If ``df`` has no rows.
    """
    # Fail fast: with no rows there are no batches, and the average-loss
    # division below would otherwise blow up only after the model is loaded.
    if len(df) == 0:
        raise ValueError("df must contain at least one row to train on")

    # Use a GPU when one is available; otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Initialize the tokenizer and dataset
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
    train_dataset = HinglishSentimentDataset(df, tokenizer, max_length)

    # Create data loader for training
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Initialize the model and optimizer
    model = SentimentClassifier(pretrained_model_name, num_classes=3).to(device)
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
    loss_fn = nn.CrossEntropyLoss()

    # Training loop
    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        for batch in train_loader:
            # Batches are collated on the CPU; move them next to the model.
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            optimizer.zero_grad()

            logits = model(input_ids, attention_mask)
            loss = loss_fn(logits, labels)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

        # Checkpoint after every epoch so an interrupted run still leaves the
        # most recent weights on disk. Tensors are copied to the CPU first so
        # the file does not reference the training device.
        cpu_state = {name: tensor.cpu() for name, tensor in model.state_dict().items()}
        torch.save(cpu_state, "sentiment_model.pth")

    return model, tokenizer

# Kick off training on the DataFrame loaded above (uses the default base model).
train_sentiment_model(df=df)

# Function for predicting sentiment
def predict_sentiment(
    model,
    text,
    tokenizer=None,
    pretrained_model_name="bert-base-multilingual-uncased",
    sequence_length=None,
):
    """Classify one piece of text as "positive", "neutral" or "negative".

    Puts ``model`` into eval mode (and leaves it there) and runs a single
    forward pass without gradient tracking.

    Args:
        model: Callable module invoked as ``model(input_ids, attention_mask)``
            that returns logits with one row per input and one column per
            class id (0, 1, 2).
        text: The text to classify.
        tokenizer: Tokenizer used to encode ``text``. When omitted, one is
            loaded from ``pretrained_model_name`` on every call; pass a
            tokenizer explicitly when predicting repeatedly.
        pretrained_model_name: Identifier used to load the tokenizer when
            ``tokenizer`` is None.
        sequence_length: Padding/truncation length. Defaults to the
            module-level ``max_length``.

    Returns:
        The predicted sentiment label as a string.
    """
    # The tokenizer used during training is local to the training function,
    # so it has to be supplied here or loaded again.
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
    if sequence_length is None:
        sequence_length = max_length

    # Run on whatever device the model's weights live on (CPU when the model
    # has no parameters at all).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    with torch.no_grad():
        inputs = tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=sequence_length,
            return_tensors="pt",
        )
        # The encoded tensors are passed through as returned; squeezing them
        # and re-adding a dimension would mangle length-1 sequences.
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)

        logits = model(input_ids, attention_mask)
        predicted_label = torch.argmax(logits, dim=1).item()

    label_mapping = {2: "positive", 1: "neutral", 0: "negative"}
    return label_mapping[predicted_label]

# Rebuild the architecture, then restore the fine-tuned weights from disk.
model = SentimentClassifier("bert-base-multilingual-uncased", num_classes=3)
# map_location="cpu" lets the checkpoint load on a CPU-only machine even if
# its tensors were saved from a GPU.
model.load_state_dict(torch.load("sentiment_model.pth", map_location="cpu"))

# Example usage for predicting sentiment
test_text = "This song makes me super happy. I sing it for myself. Take! Khush raho abaad raho. Hemant Da blessings!"
predicted_sentiment = predict_sentiment(model, test_text)
print("Predicted Sentiment:", predicted_sentiment)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 