In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.2-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m43.0 MB/s[0m eta [36m0:00:0

In [4]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load and preprocess your Hinglish sentiment analysis dataset (X: text, y: sentiment labels)
# Replace "hinglish_Sentiment.csv" with your dataset file path
dataset_file = "/content/Book1.csv"
df = pd.read_csv(dataset_file)

# Define a custom dataset class
class HinglishSentimentDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]["text"]
        sentiment = self.data.iloc[idx]["sentiment"]

        inputs = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        input_ids = inputs["input_ids"].squeeze()
        attention_mask = inputs["attention_mask"].squeeze()

        label_mapping = {"positive": 2, "neutral": 1, "negative": 0}
        label = label_mapping[sentiment]

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "label": label
        }

# Define the model architecture
class SentimentClassifier(nn.Module):
    def __init__(self, pretrained_model_name, num_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name, num_labels=num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        return logits

# Hyperparameters
batch_size = 32
max_length = 128
learning_rate = 2e-5
epochs = 5

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-uncased")

def train_sentiment_model(df, pretrained_model_name="bert-base-multilingual-uncased"):
    # Initialize the tokenizer and dataset
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
    train_dataset = HinglishSentimentDataset(df, tokenizer, max_length)

    # Create data loader for training
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Initialize the model and optimizer
    model = SentimentClassifier(pretrained_model_name, num_classes=3)
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
    loss_fn = nn.CrossEntropyLoss()

    # Training loop
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            input_ids = batch["input_ids"]
            attention_mask = batch["attention_mask"]
            labels = batch["label"]

            optimizer.zero_grad()

            logits = model(input_ids, attention_mask)
            loss = loss_fn(logits, labels)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

        # After training your model, save it
        torch.save(model.state_dict(), "sentiment_model.pth")

# Example usage for training a sentiment model
train_sentiment_model(df)

# Function for predicting sentiment
def predict_sentiment(model, text):
    model.eval()
    with torch.no_grad():
        inputs = tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )
        input_ids = inputs["input_ids"].squeeze()
        attention_mask = inputs["attention_mask"].squeeze()

        logits = model(input_ids.unsqueeze(0), attention_mask.unsqueeze(0))
        predicted_label = torch.argmax(logits, dim=1).item()

        label_mapping = {2: "positive", 1: "neutral", 0: "negative"}
        predicted_sentiment = label_mapping[predicted_label]

        return predicted_sentiment

# Load the trained model
model = SentimentClassifier("bert-base-multilingual-uncased", num_classes=3)
model.load_state_dict(torch.load("sentiment_model.pth"))

# Example usage for predicting sentiment
test_text = "This song makes me super happy. I sing it for myself. Take! Khush raho abaad raho. Hemant Da blessings!"
predicted_sentiment = predict_sentiment(model, test_text)
print("Predicted Sentiment:", predicted_sentiment)


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/672M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 1.0809
Epoch 2/5, Loss: 1.0600
Epoch 3/5, Loss: 0.9268
Epoch 4/5, Loss: 0.7859
Epoch 5/5, Loss: 0.6889


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predicted Sentiment: positive


In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

def perform_ner(text):
    # Load pre-trained model and tokenizer (adjust the model name as needed)
    model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)

    # Tokenize the input text at the word level
    word_tokens = text.split()

    # Perform NER inference
    with torch.no_grad():
        tokens = tokenizer(word_tokens, return_tensors="pt", is_split_into_words=True)
        outputs = model(**tokens)

    # Get the predicted NER labels for each word
    predicted_labels = [model.config.id2label[label_id] for label_id in torch.argmax(outputs.logits, dim=2).tolist()[0]]

    return predicted_labels

# Example usage:
hinglish_text = "मेरा नाम John है मैं न्यूयॉर्क में रहता हूँ"
predicted_labels = perform_ner(hinglish_text)
print(predicted_labels)


Downloading (…)okenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


['O', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'O', 'O', 'O', 'O']


In [5]:
test_text = "This song makes me super happy. I sing it for myself. Take! Khush raho abaad raho. Hemant Da blessings!"
predicted_sentiment = predict_sentiment(model, test_text)
print("Predicted Sentiment:", predicted_sentiment)
predicted_labels = perform_ner(test_text)
print("Predicted NER:", predicted_labels)

Predicted Sentiment: positive


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Predicted NER: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O']
