In [25]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn

In [26]:
try:
    # Load the dataset from CSV with tab as the separator
    df = pd.read_csv('SOLD.csv', sep='\t')

    # Convert labels to 0 and 1
    df['label'] = df['label'].map({'NOT': 0, 'HOF': 1})

    # Remove English text and special characters like @, #, !, ., etc.
    df['text'] = df['text'].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|#|([A-Za-z0-9])|(\.+)|\t.+", "", elem))

    # Split the dataset into train and test sets (80% train, 20% test)
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    # Save the preprocessed train and test datasets to new CSV files
    train_df.to_csv('train_dataset.csv', index=False)
    test_df.to_csv('test_dataset.csv', index=False)

    print("Preprocessing completed.")
    print("Train and test datasets saved to train_dataset.csv and test_dataset.csv")

except Exception as e:
    print("Error:", e)

Preprocessing completed.
Train and test datasets saved to train_dataset.csv and test_dataset.csv


In [27]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [28]:
texts = train_df['text'].tolist()
labels = train_df['label'].tolist()

In [29]:
print(len(texts), len(labels))

6000 6000


In [30]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [31]:
max_sequence_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

In [32]:
import torch
from transformers import BertTokenizer, AutoTokenizer
from transformers import BertForSequenceClassification, AdamW, AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoModelWithLMHead, pipeline

In [33]:
tokenizer = AutoTokenizer.from_pretrained("keshan/SinhalaBERTo")
model = AutoModelForSequenceClassification.from_pretrained("keshan/SinhalaBERTo",num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at keshan/SinhalaBERTo and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
from keras.utils import plot_model

In [35]:
from torch.utils.data import Dataset, DataLoader
class Dataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data.iloc[index]['text']
        label = self.data.iloc[index]['label']

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(label)
        }

In [36]:
df = pd.DataFrame(train_df)
df['max_token_count'] = df['text'].apply(lambda x: len(tokenizer.encode(x)))

# Find the maximum token count across all rows
max_tokens = df['max_token_count'].max()

print("Maximum token count:", max_tokens)
#=========================================================================================================

dataset = Dataset(train_df, tokenizer, max_length= max_tokens)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the pre-trained model
model.to(device)

Maximum token count: 392


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (

In [37]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import AdamW

# Define your CNN and LSTM model architecture
class CNNLSTMModel(nn.Module):
    def __init__(self, model, cnn_kernel_size, lstm_hidden_size, num_labels, dropout_rate):
        super(CNNLSTMModel, self).__init__()
        self.bert = model
        self.cnn = nn.Conv1d(in_channels=model.config.hidden_size, out_channels=64, kernel_size=cnn_kernel_size)
        self.lstm = nn.LSTM(input_size=64, hidden_size=lstm_hidden_size, batch_first=True)
        self.fc = nn.Linear(lstm_hidden_size, num_labels)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_hidden_states = outputs.last_hidden_state  # Use the last hidden state from BERT

        # Permute dimensions for CNN
        cnn_input = bert_hidden_states.permute(0, 2, 1)
        cnn_output = self.cnn(cnn_input)

        # Permute dimensions for LSTM
        lstm_input = cnn_output.permute(0, 2, 1)
        lstm_output, _ = self.lstm(lstm_input)

        # Take the final hidden state from LSTM for classification
        lstm_final_hidden = lstm_output[:, -1, :]

        lstm_final_hidden = self.dropout(lstm_final_hidden)

        logits = self.fc(lstm_final_hidden)
        return logits

# Set hyperparameters
cnn_kernel_size = 3
lstm_hidden_size = 128
num_labels = 2
dropout_rate = 0.5

# Initialize your CNNLSTMModel
modell = CNNLSTMModel(model, cnn_kernel_size, lstm_hidden_size, num_labels, dropout_rate)
modell.to(device)

# Set training parameters
batch_size = 16
learning_rate = 2e-5
epochs = 1

# Create data loader
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Create optimizer and loss function
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
loss_fn = nn.CrossEntropyLoss()

# Training loop
for epoch in range(epochs):
    total_loss = 0
    model.train()

    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask)
        logits = outputs.logits  # Use the logits attribute

        loss = loss_fn(logits, labels)

        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(data_loader)
    print(f'Epoch {epoch + 1}/{epochs} - Loss: {avg_loss:.4f}')

Epoch 1/1 - Loss: 0.5401


In [38]:
print(modell)

CNNLSTMModel(
  (bert): RobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(52000, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0-5): 6 x RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
                (dense): Line

In [39]:
save_directory = "/content/drive/MyDrive/sinhala_proj/ensemble"
model.save_pretrained(save_directory)

In [40]:
model_path = "/content/drive/MyDrive/sinhala_proj/ensemble"
model = AutoModelForSequenceClassification.from_pretrained(model_path)

In [41]:
texts = test_df['text'].tolist()
labels = test_df['label'].tolist()

In [42]:
print(len(texts), len(labels))

1500 1500


In [43]:
def get_accuracy(test_df, model, tokenizer, batch_size=16, device='cuda'):
    predicted_labels = []

    num_batches = (len(test_df) - 1) // batch_size + 1

    for i in range(num_batches):
        batch_texts = list(test_df['text'][i * batch_size: (i + 1) * batch_size])

        encoded_inputs = tokenizer.batch_encode_plus(batch_texts,
                                                     padding=True,
                                                     truncation=True,
                                                     max_length=508,
                                                     return_tensors='pt')

        with torch.no_grad():
            model.to(device)
            model_outputs = model(**encoded_inputs.to(device))

        batch_predicted_labels = torch.argmax(model_outputs.logits, dim=1)
        predicted_labels.extend(batch_predicted_labels.tolist())

    predicted_labels = predicted_labels[:len(test_df)]
    return np.array(predicted_labels)

In [44]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

predictions = get_accuracy(test_df, model, tokenizer)

y= np.array(test_df['label'])
F1_score = f1_score(y, predictions)
accuracy = accuracy_score(y, predictions)
print(f'Testing Accuracy: {accuracy:.4f}')
print(f'Testing F1 Score: {F1_score:.4f}')

Testing Accuracy: 0.8113
Testing F1 Score: 0.7244
