
# Import Libraries

In [1]:
!pip install pandas scikit-learn torch nltk

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
nltk.download('punkt')
nltk.download('punkt_tab')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

# Load Train and Test Data

In [7]:

train_df = pd.read_csv('final_augmented_data (tamil).csv')
test_df = pd.read_csv('tam_training_data_hum_ai.csv')

# Drop unnecessary columns and rename columns

In [8]:

train_df = train_df.drop("back_translated_DATA", axis=1)
train_df = train_df.rename(columns={"back_translated_DATA_tamil": "DATA"})


# Modify data

In [9]:
id_index = train_df.columns.get_loc("ID")
cols = list(train_df.columns)
cols.insert(id_index + 1, cols.pop(cols.index("DATA")))
train_df = train_df.loc[:, cols]

# Encode Labels

In [10]:

le = LabelEncoder()
train_df['LABEL'] = le.fit_transform(train_df['LABEL'])

# Tokenization and Vocabulary Building

In [11]:

def tokenize_and_build_vocab(data, max_vocab_size=10000):
    all_tokens = []
    for text in data:
        tokens = word_tokenize(text.lower())
        all_tokens.extend(tokens)

    vocab_counter = Counter(all_tokens)
    most_common = vocab_counter.most_common(max_vocab_size - 2)
    vocab = {word: idx + 2 for idx, (word, _) in enumerate(most_common)}
    vocab["<PAD>"] = 0
    vocab["<UNK>"] = 1
    return vocab

def encode_texts(data, vocab, max_len=100):
    encoded_texts = []
    for text in data:
        tokens = word_tokenize(text.lower())
        encoded = [vocab.get(token, vocab["<UNK>"]) for token in tokens]
        if len(encoded) < max_len:
            encoded += [vocab["<PAD>"]] * (max_len - len(encoded))
        else:
            encoded = encoded[:max_len]
        encoded_texts.append(encoded)
    return np.array(encoded_texts)

vocab = tokenize_and_build_vocab(train_df['DATA'])
train_encoded = encode_texts(train_df['DATA'], vocab)
train_labels = train_df['LABEL'].values

# Prepare Dataset and DataLoader

In [12]:

class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = torch.tensor(texts, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encoded, train_labels)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# RNN Model

In [30]:

class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        _, hidden = self.rnn(embedded)
        output = self.fc(hidden.squeeze(0))
        return output

# Model Configuration
vocab_size = len(vocab)
embed_dim = 100
hidden_dim = 128
output_dim = len(le.classes_)
model = RNNModel(vocab_size, embed_dim, hidden_dim, output_dim)

#  LSTM Model

In [22]:

class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded)
        output = self.fc(hidden.squeeze(0))
        return output

# Model Configuration
vocab_size = len(vocab)
embed_dim = 100
hidden_dim = 128
output_dim = len(le.classes_)
model = LSTMModel(vocab_size, embed_dim, hidden_dim, output_dim)

# GRU Model

In [37]:

class GRUModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(GRUModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.gru = nn.GRU(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        _, hidden = self.gru(embedded)
        output = self.fc(hidden.squeeze(0))
        return output

# Model Configuration
vocab_size = len(vocab)
embed_dim = 100
hidden_dim = 128
output_dim = len(le.classes_)
model = GRUModel(vocab_size, embed_dim, hidden_dim, output_dim)

# BiLSTM Model

In [43]:

class BiLSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(BiLSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.bilstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # Multiply hidden_dim by 2 for bidirectional

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.bilstm(embedded)
        # Use the output of the last time step from both directions
        output = self.fc(lstm_out[:, -1, :])
        return output

# Model Configuration
vocab_size = len(vocab)
embed_dim = 100
hidden_dim = 128
output_dim = len(le.classes_)
model = BiLSTMModel(vocab_size, embed_dim, hidden_dim, output_dim)

# Training Setup

In [44]:

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BiLSTMModel(
  (embedding): Embedding(3158, 100)
  (bilstm): LSTM(100, 128, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=2, bias=True)
)

# Train the Model

In [45]:

num_epochs = 500
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for texts, labels in train_loader:
        texts, labels = texts.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss:.4f}")


Epoch 1/500, Loss: 35.6072
Epoch 2/500, Loss: 35.3979
Epoch 3/500, Loss: 35.4961
Epoch 4/500, Loss: 35.4473
Epoch 5/500, Loss: 35.3714
Epoch 6/500, Loss: 35.3943
Epoch 7/500, Loss: 35.4244
Epoch 8/500, Loss: 35.3754
Epoch 9/500, Loss: 35.3672
Epoch 10/500, Loss: 35.3575
Epoch 11/500, Loss: 35.3583
Epoch 12/500, Loss: 35.3717
Epoch 13/500, Loss: 35.3646
Epoch 14/500, Loss: 35.3595
Epoch 15/500, Loss: 35.3580
Epoch 16/500, Loss: 35.3596
Epoch 17/500, Loss: 35.3600
Epoch 18/500, Loss: 35.3637
Epoch 19/500, Loss: 35.3712
Epoch 20/500, Loss: 35.3626
Epoch 21/500, Loss: 35.3572
Epoch 22/500, Loss: 35.3561
Epoch 23/500, Loss: 35.3753
Epoch 24/500, Loss: 35.3629
Epoch 25/500, Loss: 35.3544
Epoch 26/500, Loss: 35.3556
Epoch 27/500, Loss: 35.3634
Epoch 28/500, Loss: 35.3546
Epoch 29/500, Loss: 35.3602
Epoch 30/500, Loss: 35.3659
Epoch 31/500, Loss: 35.3566
Epoch 32/500, Loss: 35.3622
Epoch 33/500, Loss: 35.3597
Epoch 34/500, Loss: 35.3618
Epoch 35/500, Loss: 35.3590
Epoch 36/500, Loss: 35.3562
E

# Prepare Test Data

In [46]:

test_df.dropna(subset=['DATA'], inplace=True)
test_encoded = encode_texts(test_df['DATA'], vocab)
test_dataset = TextDataset(test_encoded, np.zeros(len(test_encoded)))
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


# Predict Labels

In [47]:

model.eval()
all_preds = []
with torch.no_grad():
    for texts, _ in test_loader:
        texts = texts.to(device)
        outputs = model(texts)
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        all_preds.extend(preds)

decoded_labels = le.inverse_transform(all_preds)
test_df['LABEL'] = decoded_labels

# **Using BiLSTM**

In [48]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the actual labels from the Excel file
actual_labels = pd.read_csv('tamil-test.xlsx - Sheet1.csv')['Label'].values

# Ensure both sets of labels are the same length
min_len = min(len(actual_labels), len(decoded_labels))
actual_labels = actual_labels[:min_len]
decoded_labels = decoded_labels[:min_len]


# Calculate the metrics
accuracy = accuracy_score(actual_labels, decoded_labels)
precision = precision_score(actual_labels, decoded_labels, average='weighted') # Use weighted for multiclass
recall = recall_score(actual_labels, decoded_labels, average='weighted') # Use weighted for multiclass
f1 = f1_score(actual_labels, decoded_labels, average='weighted') # Use weighted for multiclass

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.48
Precision: 0.2304
Recall: 0.48
F1 Score: 0.3113513513513514


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# **Using GRU**

In [42]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the actual labels from the Excel file
actual_labels = pd.read_csv('tamil-test.xlsx - Sheet1.csv')['Label'].values

# Ensure both sets of labels are the same length
min_len = min(len(actual_labels), len(decoded_labels))
actual_labels = actual_labels[:min_len]
decoded_labels = decoded_labels[:min_len]


# Calculate the metrics
accuracy = accuracy_score(actual_labels, decoded_labels)
precision = precision_score(actual_labels, decoded_labels, average='weighted') # Use weighted for multiclass
recall = recall_score(actual_labels, decoded_labels, average='weighted') # Use weighted for multiclass
f1 = f1_score(actual_labels, decoded_labels, average='weighted') # Use weighted for multiclass

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.51
Precision: 0.4602105263157895
Recall: 0.51
F1 Score: 0.3828930817610063


# **Using LSTM**

In [27]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the actual labels from the Excel file
actual_labels = pd.read_csv('tamil-test.xlsx - Sheet1.csv')['Label'].values

# Ensure both sets of labels are the same length
min_len = min(len(actual_labels), len(decoded_labels))
actual_labels = actual_labels[:min_len]
decoded_labels = decoded_labels[:min_len]


# Calculate the metrics
accuracy = accuracy_score(actual_labels, decoded_labels)
precision = precision_score(actual_labels, decoded_labels, average='weighted') # Use weighted for multiclass
recall = recall_score(actual_labels, decoded_labels, average='weighted') # Use weighted for multiclass
f1 = f1_score(actual_labels, decoded_labels, average='weighted') # Use weighted for multiclass

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.48
Precision: 0.2304
Recall: 0.48
F1 Score: 0.3113513513513514


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# **Using RNN Model**

In [35]:
# prompt: compare the labels of the mal_test.xlsx - Sheet1.csv with the predicted labels and Accuracy,Precision,Recall,F1 Score

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the actual labels from the Excel file
actual_labels = pd.read_csv('tamil-test.xlsx - Sheet1.csv')['Label'].values

# Ensure both sets of labels are the same length
min_len = min(len(actual_labels), len(decoded_labels))
actual_labels = actual_labels[:min_len]
decoded_labels = decoded_labels[:min_len]


# Calculate the metrics
accuracy = accuracy_score(actual_labels, decoded_labels)
precision = precision_score(actual_labels, decoded_labels, average='weighted') # Use weighted for multiclass
recall = recall_score(actual_labels, decoded_labels, average='weighted') # Use weighted for multiclass
f1 = f1_score(actual_labels, decoded_labels, average='weighted') # Use weighted for multiclass

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.52
Precision: 0.2704
Recall: 0.52
F1 Score: 0.35578947368421054


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Make tsv file

In [None]:

test_df.to_csv('Team_Absolute_Zero_mal_bilstm.tsv', sep='\t', index=False)
print("Predictions saved to 'Team_Absolute_Zero_mal_bilstm.tsv'")