3 epochs

Tamil mBERT

In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Load and preprocess dataset
df = pd.read_csv('/content/Tam-SA-train.csv')
df['Label'] = df['Label'].map({'Positive': 0, 'Negative': 1, 'unknown_state': 2, 'Mixed_feelings': 3})

# Hold out the validation set BEFORE oversampling. RandomOverSampler duplicates
# rows, so resampling first and splitting afterwards would place copies of the
# same text in both splits and inflate every validation metric.
train_texts_raw, val_texts, train_labels_raw, val_labels = train_test_split(
    df['Text'], df['Label'], test_size=0.2, stratify=df['Label'], random_state=42
)

# Handle class imbalance by oversampling the training portion only
ros = RandomOverSampler(random_state=42)
train_texts_resampled, train_labels_resampled = ros.fit_resample(
    pd.DataFrame(train_texts_raw), train_labels_raw
)

# Names consumed further down: the oversampled training split and the
# untouched (original class distribution) validation split.
train_texts = train_texts_resampled['Text']
train_labels = train_labels_resampled

# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

class TamilDataset(Dataset):
    """Map-style dataset that tokenizes one labelled text per lookup.

    Args:
        texts: Indexable collection of texts; each item is passed through ``str``.
        labels: Indexable collection of labels, read at the same index as ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors="pt")``.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        # The number of samples is taken from the texts container alone.
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``label`` tensors for item ``idx``."""
        raw_text = str(self.texts[idx])
        target = self.labels[idx]
        encoded = self.tokenizer(
            raw_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )
        # The tokenizer output is indexed per key and the leading dimension
        # (added by return_tensors="pt") is squeezed away.
        sample = {field: encoded[field].squeeze(0) for field in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

# Wrap each split in a dataset and a batched loader (only training is shuffled)
train_dataset = TamilDataset(train_texts.values, train_labels.values, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataset = TamilDataset(val_texts.values, val_labels.values, tokenizer)
val_loader = DataLoader(val_dataset, batch_size=16)

# mBERT with a 4-way classification head, moved to the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=4)
model.to(device)

# 'balanced' class weights for the weighted loss.
# NOTE(review): the weights are computed on the oversampled labels, which are
# presumably already balanced, so they likely come out close to 1.0 - confirm.
classes = np.arange(4)
class_weights = torch.tensor(
    compute_class_weight(class_weight='balanced', classes=classes, y=train_labels_resampled),
    dtype=torch.float,
).to(device)

# Loss function and optimizer
criterion = torch.nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

# Training loop: one full pass over train_loader per epoch, printing the mean
# batch loss at the end of each epoch.
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        # Score the logits with `criterion` so that the class-weighted loss
        # defined earlier in this cell is the one being optimized. Passing
        # `labels=` to the model would make it return its own built-in loss
        # and leave `criterion` unused.
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")

# Evaluation: gather arg-max predictions and gold labels over the validation loader
model.eval()
predictions, true_labels = [], []
with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = (
            batch[field].to(device) for field in ('input_ids', 'attention_mask', 'label')
        )
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = outputs.logits.argmax(dim=1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Overall accuracy followed by the per-class precision / recall / F1 table
accuracy = accuracy_score(true_labels, predictions)
print(f"\n\nAccuracy: {accuracy * 100:.4f}%")
report = classification_report(
    true_labels,
    predictions,
    target_names=['Positive', 'Negative', 'unknown_state', 'Mixed_feelings'],
)
print(report)

# Persist model weights and tokenizer files side by side in one directory
model.save_pretrained('/content/Tamil_model')
tokenizer.save_pretrained('/content/Tamil_model')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Loss: 1.1301
Epoch 2/3, Loss: 0.7050
Epoch 3/3, Loss: 0.4163


Accuracy: 86.3668%
                precision    recall  f1-score   support

      Positive       0.86      0.65      0.74      3629
      Negative       0.90      0.94      0.92      3629
 unknown_state       0.81      0.94      0.87      3629
Mixed_feelings       0.89      0.92      0.91      3629

      accuracy                           0.86     14516
     macro avg       0.87      0.86      0.86     14516
  weighted avg       0.87      0.86      0.86     14516



Testing

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Restore the fine-tuned checkpoint and its tokenizer from the save directory
model_path = '/content/Tamil_model'
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Class id -> sentiment name, in the same order as the training label encoding
label_map = {0: "Positive", 1: "Negative", 2: "unknown_state", 3: "Mixed_feelings"}

input_sentence = "Ippa than konjam jaathi veri illama konjam ellarum natpa palagi varanga,  athukulla jaathi veriya thundra mathiri oru padam.."

# Encode with the same length / padding settings the dataset classes use
inputs = tokenizer(
    input_sentence, max_length=128, padding='max_length', truncation=True, return_tensors="pt"
)
input_ids = inputs['input_ids'].to(device)
attention_mask = inputs['attention_mask'].to(device)

# Forward pass without gradient tracking; the arg-max logit is the predicted class
model.eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    prediction = outputs.logits.argmax(dim=1).item()

predicted_label = label_map[prediction]

print(f"Predicted Sentiment: {predicted_label}")

OSError: Can't load tokenizer for '/content/Tamil_model'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure '/content/Tamil_model' is the correct path to a directory containing all relevant files for a BertTokenizer tokenizer.

In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Unlabeled Tamil test set
test_df = pd.read_csv('/content/Tam-SA-test-without-labels.csv')

# Fine-tuned tokenizer and classifier, both read from the same save directory
tokenizer = BertTokenizer.from_pretrained('/content/Tamil_model')
model = BertForSequenceClassification.from_pretrained('/content/Tamil_model')

# Run on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

class TamilTestDataset(Dataset):
    """Map-style dataset of unlabeled texts, tokenized one item per lookup.

    Args:
        texts: Iterable of texts (list, NumPy array, pandas Series, ...).
            Each item is passed through ``str`` before tokenization.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors="pt")``.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, tokenizer, max_len=128):
        # Materialize to a plain list so that __getitem__ is strictly
        # positional. Indexing a pandas Series with [] looks up by index
        # label, which raises KeyError (or picks the wrong row) as soon as
        # the index is not exactly 0..n-1, e.g. after filtering or dropna().
        self.texts = list(texts)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``input_ids`` and ``attention_mask`` tensors for item ``idx``."""
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text, max_length=self.max_len, padding='max_length', truncation=True, return_tensors="pt"
        )
        # squeeze(0) drops the leading dimension of the tokenizer output so
        # the DataLoader can stack items into a batch.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0)
        }

# Batch the unlabeled test texts
test_dataset = TamilTestDataset(test_df['Text'], tokenizer)
test_loader = DataLoader(test_dataset, batch_size=16)

# Inference: collect the arg-max class id for every row of the test set
model.eval()
predictions = []
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask = (
            batch[field].to(device) for field in ('input_ids', 'attention_mask')
        )
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = outputs.logits.argmax(dim=1)
        predictions.extend(preds.cpu().numpy())

# Translate class ids back to label names (adjust here if the encoding changes)
label_map = dict(enumerate(["Positive", "Negative", "unknown_state", "Mixed_feelings"]))
predicted_labels = [label_map[class_id] for class_id in predictions]

# Store the predictions in a new 'label' column and write the frame to CSV
test_df['label'] = predicted_labels
test_df.to_csv('/content/new_tamil_test_with_labels.csv', index=False)

print("Prediction completed and saved to 'new_tamil_test_with_labels.csv'.")


OSError: Can't load tokenizer for '/content/Tamil_model'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure '/content/Tamil_model' is the correct path to a directory containing all relevant files for a BertTokenizer tokenizer.

Tulu mBERT

In [None]:
# Read the Tulu training CSV and drop every row that has a missing value in any column
df = pd.read_csv('/content/Tulu_SA_train.csv').dropna()

# Per-column count of remaining missing values (displayed as the cell output)
df.isna().sum()

Unnamed: 0,0
Text,0
Label,0


In [None]:
# Encode the five Tulu classes as integer ids
df['Label'] = df['Label'].map({'Positive': 0, 'Negative': 1, 'Neutral': 2, 'Mixed': 3, 'Not Tulu': 4})

# Hold out the validation set BEFORE oversampling. RandomOverSampler duplicates
# rows, so resampling first and splitting afterwards would place copies of the
# same text in both splits and inflate every validation metric.
train_texts_raw, val_texts, train_labels_raw, val_labels = train_test_split(
    df['Text'], df['Label'], test_size=0.2, stratify=df['Label'], random_state=42
)

# Handle class imbalance by oversampling the training portion only
ros = RandomOverSampler(random_state=42)
train_texts_resampled, train_labels_resampled = ros.fit_resample(
    pd.DataFrame(train_texts_raw), train_labels_raw
)

# Names consumed further down: the oversampled training split and the
# untouched (original class distribution) validation split.
train_texts = train_texts_resampled['Text']
train_labels = train_labels_resampled

# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

class TuluDataset(Dataset):
    """Labelled text dataset that tokenizes lazily, one item per ``__getitem__`` call.

    Args:
        texts: Indexable collection of texts; each item is passed through ``str``.
        labels: Indexable collection of labels, read at the same index as ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors="pt")``.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts = texts
        self.labels = labels

    def __len__(self):
        # Length follows the texts container.
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``label`` tensors for item ``idx``."""
        sentence, target = str(self.texts[idx]), self.labels[idx]
        tokens = self.tokenizer(
            sentence,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )
        # Drop the leading dimension added by return_tensors="pt".
        input_ids = tokens['input_ids'].squeeze(0)
        attention_mask = tokens['attention_mask'].squeeze(0)
        return dict(
            input_ids=input_ids,
            attention_mask=attention_mask,
            label=torch.tensor(target, dtype=torch.long),
        )

# Wrap each split in a dataset and a batched loader (only training is shuffled)
train_dataset = TuluDataset(train_texts.values, train_labels.values, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataset = TuluDataset(val_texts.values, val_labels.values, tokenizer)
val_loader = DataLoader(val_dataset, batch_size=16)

# mBERT with a 5-way classification head (one output per Tulu class),
# moved to the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=5)
model.to(device)

# 'balanced' class weights for the weighted loss.
# NOTE(review): the weights are computed on the oversampled labels, which are
# presumably already balanced, so they likely come out close to 1.0 - confirm.
classes = np.arange(5)
class_weights = torch.tensor(
    compute_class_weight(class_weight='balanced', classes=classes, y=train_labels_resampled),
    dtype=torch.float,
).to(device)

# Loss function and optimizer
criterion = torch.nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

# Training loop: one full pass over train_loader per epoch, printing the mean
# batch loss at the end of each epoch.
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        # Score the logits with `criterion` so that the class-weighted loss
        # defined earlier in this cell is the one being optimized. Passing
        # `labels=` to the model would make it return its own built-in loss
        # and leave `criterion` unused.
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")

# Evaluation: gather arg-max predictions and gold labels over the validation loader
model.eval()
predictions, true_labels = [], []
with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = (
            batch[field].to(device) for field in ('input_ids', 'attention_mask', 'label')
        )
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = outputs.logits.argmax(dim=1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Overall accuracy followed by the per-class precision / recall / F1 table
accuracy = accuracy_score(true_labels, predictions)
print(f"\n\nAccuracy: {accuracy * 100:.4f}%")
report = classification_report(
    true_labels,
    predictions,
    target_names=['Positive', 'Negative', 'Neutral', 'Mixed', 'Not Tulu'],
)
print(report)

# Persist model weights and tokenizer files side by side in one directory
model.save_pretrained('/content/Tulu_model')
tokenizer.save_pretrained('/content/Tulu_model')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Loss: 1.0529
Epoch 2/3, Loss: 0.5462
Epoch 3/3, Loss: 0.3015


Accuracy: 84.3864%
              precision    recall  f1-score   support

    Positive       0.77      0.82      0.79       880
    Negative       0.89      0.99      0.93       880
     Neutral       0.87      0.69      0.77       880
       Mixed       0.81      0.93      0.87       880
    Not Tulu       0.90      0.79      0.84       880

    accuracy                           0.84      4400
   macro avg       0.85      0.84      0.84      4400
weighted avg       0.85      0.84      0.84      4400



('/content/Tulu_model/tokenizer_config.json',
 '/content/Tulu_model/special_tokens_map.json',
 '/content/Tulu_model/vocab.txt',
 '/content/Tulu_model/added_tokens.json')

Testing

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Restore the fine-tuned checkpoint and its tokenizer from the save directory
model_path = '/content/Tulu_model'
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Class id -> sentiment name, in the same order as the training label encoding
label_map = {0 : 'Positive', 1 : 'Negative', 2 : 'Neutral', 3 : 'Mixed', 4 : 'Not Tulu'}

input_sentence = "Enchi pankda comedy"

# Encode with the same length / padding settings the dataset classes use
inputs = tokenizer(
    input_sentence, max_length=128, padding='max_length', truncation=True, return_tensors="pt"
)
input_ids = inputs['input_ids'].to(device)
attention_mask = inputs['attention_mask'].to(device)

# Forward pass without gradient tracking; the arg-max logit is the predicted class
model.eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    prediction = outputs.logits.argmax(dim=1).item()

predicted_label = label_map[prediction]

print(f"Predicted Sentiment: {predicted_label}")

Predicted Sentiment: Negative


In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Unlabeled Tulu test set
test_df = pd.read_csv('/content/Tulu_SA_test_without_label.csv')

# Fine-tuned tokenizer and classifier, both read from the same save directory
tokenizer = BertTokenizer.from_pretrained('/content/Tulu_model')
model = BertForSequenceClassification.from_pretrained('/content/Tulu_model')

# Run on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

class TuluTestDataset(Dataset):
    """Map-style dataset of unlabeled texts, tokenized one item per lookup.

    Args:
        texts: Iterable of texts (list, NumPy array, pandas Series, ...).
            Each item is passed through ``str`` before tokenization.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors="pt")``.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, tokenizer, max_len=128):
        # Materialize to a plain list so that __getitem__ is strictly
        # positional. Indexing a pandas Series with [] looks up by index
        # label, which raises KeyError (or picks the wrong row) as soon as
        # the index is not exactly 0..n-1, e.g. after filtering or dropna().
        self.texts = list(texts)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``input_ids`` and ``attention_mask`` tensors for item ``idx``."""
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text, max_length=self.max_len, padding='max_length', truncation=True, return_tensors="pt"
        )
        # squeeze(0) drops the leading dimension of the tokenizer output so
        # the DataLoader can stack items into a batch.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0)
        }

# Batch the unlabeled test texts
test_dataset = TuluTestDataset(test_df['Text'], tokenizer)
test_loader = DataLoader(test_dataset, batch_size=16)

# Inference: collect the arg-max class id for every row of the test set
model.eval()
predictions = []
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask = (
            batch[field].to(device) for field in ('input_ids', 'attention_mask')
        )
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = outputs.logits.argmax(dim=1)
        predictions.extend(preds.cpu().numpy())

# Translate class ids back to label names
label_map = dict(enumerate(['Positive', 'Negative', 'Neutral', 'Mixed', 'Not Tulu']))
predicted_labels = [label_map[class_id] for class_id in predictions]

# Store the predictions in a new 'label' column and write the frame to CSV
test_df['label'] = predicted_labels
test_df.to_csv('/content/new_tulu_test_with_labels.csv', index=False)

print("Prediction completed and saved to 'new_tulu_test_with_labels.csv'.")


Prediction completed and saved to 'tulu_test_with_labels.csv'.


Tulu IndicBERT

In [None]:
# %pip installs into the running kernel's environment; the version is pinned
# to the release this notebook was last executed with.
%pip install datasets==3.2.0

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from datasets import Dataset
import tensorflow as tf

In [None]:
# Column names used throughout this cell
text_column, label_column = "Text", "Label"

# Read the training CSV and keep only rows that have both a text and a label
df = pd.read_csv("/content/Tulu_SA_train.csv").dropna(subset=[text_column, label_column])

# Assign integer ids to the label names in order of first appearance
unique_labels = df[label_column].unique()
label_mapping = dict(zip(unique_labels, range(len(unique_labels))))
df[label_column] = df[label_column].map(label_mapping)

# Stratified 80/20 split with a fixed seed
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df[text_column],
    df[label_column],
    test_size=0.2,
    random_state=42,
    stratify=df[label_column],
)

# Convert to Hugging Face Dataset format
def create_dataset(texts, labels):
    """Build a ``Dataset`` with a ``"Text"`` column and a ``"Label"`` column.

    Args:
        texts: Values for the ``"Text"`` column.
        labels: Values for the ``"Label"`` column.

    Returns:
        The result of ``Dataset.from_dict`` on the two columns.
    """
    columns = {"Text": texts, "Label": labels}
    return Dataset.from_dict(columns)

# Turn both splits into datasets built from plain Python lists
train_dataset, test_dataset = (
    create_dataset(split_texts.tolist(), split_labels.tolist())
    for split_texts, split_labels in ((train_texts, train_labels), (test_texts, test_labels))
)

# IndicBERT tokenizer plus a TensorFlow classification model loaded from the
# PyTorch weights (from_pt=True), with one output per entry in label_mapping
model_name = "ai4bharat/indic-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_mapping),
    from_pt=True,
)

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the ``"Text"`` field of ``examples`` with the module-level tokenizer.

    Inputs are truncated and padded to a fixed ``max_length`` of 128.

    Args:
        examples: Mapping that holds the texts under the ``"Text"`` key.

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    encode_options = {"truncation": True, "padding": "max_length", "max_length": 128}
    return tokenizer(examples["Text"], **encode_options)

# Run the tokenizer over both splits; batched=True hands it examples in batches
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)


def _to_tf_batches(hf_dataset, shuffle):
    """Convert a tokenized split into a tf.data dataset with batch size 8.

    The model inputs are ``input_ids`` and ``attention_mask``; the target is
    the ``"Label"`` column.
    """
    return hf_dataset.to_tf_dataset(
        columns=["input_ids", "attention_mask"],
        label_cols="Label",
        shuffle=shuffle,
        batch_size=8,
    )


# Only the training split is shuffled; the test split keeps its row order
train_dataset = _to_tf_batches(train_dataset, shuffle=True)
test_dataset = _to_tf_batches(test_dataset, shuffle=False)

# Adam at 2e-5; from_logits=True because the loss is fed the model's raw logits
adam = tf.keras.optimizers.Adam(learning_rate=2e-5)
sparse_ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=adam, loss=sparse_ce, metrics=["accuracy"])

# Fine-tune for 3 epochs, using the held-out split as validation data
history = model.fit(train_dataset, validation_data=test_dataset, epochs=3)

# Predicted class id = index of the largest logit for each example
predictions = model.predict(test_dataset)
y_pred = tf.argmax(predictions.logits, axis=1).numpy()

# Gold labels, flattened from the (features, labels) batches in iteration order
y_true = [label for _, labels in test_dataset for label in labels.numpy()]

print("\nClassification Report:")
class_names = list(label_mapping)
print(classification_report(y_true, y_pred, target_names=class_names))

# Save to a dedicated directory. The Tulu mBERT section already writes its
# checkpoint and tokenizer to '/content/Tulu_model'; saving this TF IndicBERT
# model there as well would overwrite that checkpoint's config and mix the two
# tokenizers' files, breaking the mBERT loading cells on a re-run.
model.save_pretrained('/content/Tulu_indicbert_model')
tokenizer.save_pretrained('/content/Tulu_indicbert_model')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFAlbertForSequenceClassification: ['sop_classifier.classifier.weight', 'sop_classifier.classifier.bias']
- This IS expected if you are initializing TFAlbertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFAlbertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFAlbertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/10640 [00:00<?, ? examples/s]

Map:   0%|          | 0/2661 [00:00<?, ? examples/s]

Epoch 1/3
Epoch 2/3
Epoch 3/3

Classification Report:
              precision    recall  f1-score   support

    Not Tulu       0.90      0.72      0.80       880
    Positive       0.73      0.57      0.64       754
     Neutral       0.45      0.94      0.61       635
       Mixed       0.28      0.03      0.06       223
    Negative       0.00      0.00      0.00       169

    accuracy                           0.63      2661
   macro avg       0.47      0.45      0.42      2661
weighted avg       0.63      0.63      0.60      2661



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


('/content/Tulu_model/tokenizer_config.json',
 '/content/Tulu_model/special_tokens_map.json',
 '/content/Tulu_model/spiece.model',
 '/content/Tulu_model/added_tokens.json',
 '/content/Tulu_model/tokenizer.json')