In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification

# Load the dataset
data = pd.read_csv('Big_Dataset.csv')

# Fix column name if necessary (to ensure consistency)
if 'Langid' in data.columns:
    data.rename(columns={'Langid': 'Langid'}, inplace=True)

# Define features and labels
X = data['Sentences'].dropna()  # Sentences/Texts
y = data['Langid'].dropna()  # Language labels (ensure correct column name)

# Split into train and validation sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Create a mapping of unique language codes to integer labels
label_map = {label: idx for idx, label in enumerate(y.unique())}

# Convert labels to numerical format
y_train_encoded = y_train.map(label_map)
y_test_encoded = y_test.map(label_map)

# Load the pretrained multilingual BERT model
model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased", num_labels=len(label_map)
)

# Print label mapping for reference
print("Label Mapping:", label_map)

# Check the size of the training and test datasets
print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")


  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Label Mapping: {'te': 0, 'kn': 1, 'ml': 2, 'ta': 3, 'hi': 4, 'mr': 5, 'gu': 6, 'bn': 7, 'pa': 8, 'ur': 9, 'or': 10, 'sd': 11}
Training set size: 4199
Test set size: 1800


In [2]:
from transformers import BertTokenizer, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch

In [3]:
# 1. Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')


In [4]:
# Tokenize the sentences (both train and test sets)
train_encodings = tokenizer(list(X_train), padding=True, truncation=True, max_length=128, return_tensors="pt")
test_encodings = tokenizer(list(X_test), padding=True, truncation=True, max_length=128, return_tensors="pt")

In [5]:
# 2. Prepare DataLoader
# Convert the labels into tensors
train_labels = torch.tensor(y_train_encoded.values)
test_labels = torch.tensor(y_test_encoded.values)

In [6]:
# Create TensorDataset for both training and test datasets
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

In [7]:
# Create DataLoaders for batching
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

In [8]:
# 3. Set up the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)




In [9]:
# 4. Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [10]:
epochs = 30


In [None]:
from tqdm import tqdm  # Import tqdm for progress bar

for epoch in range(epochs):
    model.train()  # Set the model to training mode
    total_train_loss = 0

    # Wrap train_loader with tqdm to show progress
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=True)

    for batch in progress_bar:
        # Get the inputs and labels
        input_ids, attention_mask, labels = [b.to(device) for b in batch]

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        # Calculate the loss
        loss = outputs.loss
        total_train_loss += loss.item()

        # Backward pass (compute gradients)
        loss.backward()

        # Update model parameters
        optimizer.step()

        # Update tqdm progress bar with current loss
        progress_bar.set_postfix(loss=loss.item())

    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs} - Training loss: {avg_train_loss}")


Epoch 1/30:   1%|          | 3/263 [00:27<39:04,  9.02s/it, loss=2.47]

In [None]:
# 5. Save the model
model.save_pretrained("./language_model")

In [None]:
tokenizer.save_pretrained("./language_model")

('./language_model/tokenizer_config.json',
 './language_model/special_tokens_map.json',
 './language_model/vocab.txt',
 './language_model/added_tokens.json')

In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Load the trained model and tokenizer
model_path = "./language_model"  # Path where the model is saved
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# Move model to device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # Set model to evaluation mode

# Function to predict language code
def predict_language(sentence):
    # Tokenize the input sentence
    encoding = tokenizer(sentence, padding=True, truncation=True, max_length=128, return_tensors="pt")

    # Move input tensors to the correct device
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    # Make prediction
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    # Get predicted label
    predicted_label_id = torch.argmax(logits, dim=1).item()

    # Convert back to language code using label_map
    predicted_language = [lang for lang, idx in label_map.items() if idx == predicted_label_id][0]

    return predicted_language

# Predict for all test sentences
predicted_languages = [predict_language(sentence) for sentence in X_test]

# Create a DataFrame with actual and predicted language codes
output_df = pd.DataFrame({
    'Sentence': X_test.values,             # Original sentence
    'Actual Language Code': y_test.values,  # Original language label
    'Predicted Language Code': predicted_languages  # Predicted language code
})

# Save to CSV file
output_file = "predicted_language_codes.csv"
output_df.to_csv(output_file, index=False)

print(f"Predictions saved to {output_file}")


Predictions saved to predicted_language_codes.csv


In [None]:
data.shape

(580, 2)

In [None]:
#Calculate accuracy score

from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, predicted_languages)
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 97.70%


In [None]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch

# Path to the saved model and tokenizer
model_pathe = '/content/language_model'
tokenizer_pathe= '/content/language_model'

# Load the model and tokenizer
model = BertForSequenceClassification.from_pretrained(model_pathe)
tokenizer = BertTokenizer.from_pretrained(tokenizer_pathe)

In [None]:
input_text= "kozhikkodu bas marinju niravadhi perkku parukku; oralude nila gurutharam, bas uyarthaan shramam..."
# Tokenize the input text
inputs = tokenizer(input_text, return_tensors="pt")

In [None]:
# Perform inference (turn off gradients for inference)
with torch.no_grad():
    outputs = model(**inputs)

# Get the logits (raw predictions)
logits = outputs.logits

# If it's a classification task, you can get the predicted class
predicted_class = torch.argmax(logits, dim=-1)

# Print the predicted class (assuming you have class labels corresponding to languages)
# You will need to map the predicted class index to its corresponding language
languages = ['ml','hi','mr','gu','ta','te','or','bn','ur','sn','kn']  # Example language labels
predicted_language = languages[predicted_class.item()]

print(f"Predicted language: {predicted_language}")


NameError: name 'torch' is not defined