IMPORTING LIBRARIES

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
import time
import random
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk import SnowballStemmer
import re
from nltk.tokenize import word_tokenize
from sklearn.metrics import precision_recall_fscore_support
from nltk.corpus import stopwords
# Fetch the NLTK resources this notebook relies on: the stop-word lists used
# by clean_tweet() and the Punkt models that word_tokenize() needs. Without
# 'punkt' a fresh environment fails with LookupError at the first tokenize call.
nltk.download('stopwords')
nltk.download('punkt')

# Ask cuDNN to pick deterministic algorithms so GPU runs are repeatable.
torch.backends.cudnn.deterministic = True

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\haliu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


DEFINING HYPERPARAMETERS

In [2]:
# Model and training hyperparameters.
EMBEDDING_DIM = 128
HIDDEN_DIM = 256
NUM_CLASSES = 3
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)

VOCABULARY_SIZE = 20000
LEARNING_RATE = 0.001
BATCH_SIZE = 128
NUM_EPOCHS = 15

# Choose the compute device. The second GPU ('cuda:1') is still preferred when
# it exists, but the index is only used if at least two GPUs are visible:
# on a single-GPU machine 'cuda:1' is an invalid device ordinal and every
# later .to(device) call would fail.
if torch.cuda.device_count() > 1:
    device = torch.device('cuda:1')
elif torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

IMPORTING DATASET

In [4]:
# Read the tab-separated training split of each language, in the order
# Hausa, Igbo, Pidgin.
hausa_df, igbo_df, pidgin_df = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        "C:/Users/haliu/OneDrive/Desktop/Project_Languages/hausa/train.tsv",
        "C:/Users/haliu/OneDrive/Desktop/Project_Languages/igbo/train.tsv",
        "C:/Users/haliu/OneDrive/Desktop/Project_Languages/pidgin/train.tsv",
    )
)

In [5]:
# Stack the three training frames row-wise; they share the 'tweet' and
# 'label' columns.
merged_df = pd.concat([hausa_df, igbo_df, pidgin_df], axis=0)

# Persist the merged data and read it straight back from the SAME path.
# Previously the file was written relative to the working directory but read
# from a hard-coded absolute folder, so a different working directory meant a
# stale or missing file. The round trip also replaces the repeated
# per-language row index with a fresh 0..n-1 index.
# Note: the file is comma-separated despite its .tsv extension.
MERGED_TRAIN_PATH = 'merged_dataset.tsv'
merged_df.to_csv(MERGED_TRAIN_PATH, index=False)
merged_df = pd.read_csv(MERGED_TRAIN_PATH, sep=',')


In [6]:
# Read the tab-separated validation (dev) split of each language, in the
# order Hausa, Igbo, Pidgin.
hausa_validate_df, igbo_validate_df, pidgin_validate_df = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        "C:/Users/haliu/OneDrive/Desktop/Project_Languages/hausa/dev.tsv",
        "C:/Users/haliu/OneDrive/Desktop/Project_Languages/igbo/dev.tsv",
        "C:/Users/haliu/OneDrive/Desktop/Project_Languages/pidgin/dev.tsv",
    )
)

In [7]:
# Stack the three validation frames row-wise; they share the 'tweet' and
# 'label' columns.
merged_validate_df = pd.concat([hausa_validate_df, igbo_validate_df, pidgin_validate_df], axis=0)

# Persist the merged data and read it straight back from the SAME path.
# Previously the file was written relative to the working directory but read
# from a hard-coded absolute folder, so a different working directory meant a
# stale or missing file. The round trip also replaces the repeated
# per-language row index with a fresh 0..n-1 index.
# Note: the file is comma-separated despite its .tsv extension.
MERGED_VALIDATE_PATH = 'merged_validate_dataset.tsv'
merged_validate_df.to_csv(MERGED_VALIDATE_PATH, index=False)
merged_validate_df = pd.read_csv(MERGED_VALIDATE_PATH, sep=',')


DATA CLEANING

In [8]:
# Stop words removed from every tweet: NLTK's English list plus the extra
# filler tokens hard-coded for this corpus. Built once, as a set, so that each
# membership test is O(1) instead of re-reading the stop-word corpus and
# scanning a list on every call to clean_tweet().
_STOP_WORDS = frozenset(stopwords.words('english')) | {
    'user', 'im', 'una', 'na', 'wer', 'dey', 'us', 'dem', 'dat', 'omo', 'wey'
}

# Runs of characters that are not letters, digits or underscore.
_NON_WORD_CHARS = re.compile(r'\W+')


def clean_tweet(tweet):
    """Normalise one tweet for modelling.

    Steps: lowercase, tokenize with NLTK, strip non-word characters from each
    token, drop stop words, drop tokens of length 0 or 1, and re-join the
    survivors with single spaces.

    Args:
        tweet: the raw tweet text. Non-string values (for example the float
            NaN pandas uses for a missing cell) are treated as an empty tweet
            instead of raising AttributeError.

    Returns:
        The cleaned tweet as a single space-separated string (possibly empty).
    """
    if not isinstance(tweet, str):
        return ''

    tokens = word_tokenize(tweet.lower())

    # Strip punctuation/symbols first so that stop-word matching sees the bare
    # token, exactly as in the original step order.
    stripped = (_NON_WORD_CHARS.sub('', token) for token in tokens)

    return ' '.join(
        token for token in stripped
        if len(token) > 1 and token not in _STOP_WORDS
    )


# Clean the "tweet" column of the training and validation frames.
merged_df["tweet"] = merged_df["tweet"].apply(clean_tweet)

merged_validate_df["tweet"] = merged_validate_df["tweet"].apply(clean_tweet)

PERFORMING STEMMING

In [9]:
# A single English Snowball stemmer, shared by every call to stem_text().
# (Only the English stemmer is created; it is applied to all tweets.)
stemmer = SnowballStemmer("english")


def stem_text(tweet):
    """Tokenize ``tweet``, stem each token, and join the stems with spaces."""
    stems = []
    for token in nltk.word_tokenize(tweet):
        stems.append(stemmer.stem(token))
    return " ".join(stems)


# Stem the training tweets, then the validation tweets.
merged_df["tweet"] = merged_df["tweet"].apply(stem_text)
merged_validate_df["tweet"] = merged_validate_df["tweet"].apply(stem_text)

# Write both cleaned frames to disk (comma-separated, no index column).
merged_df.to_csv("cleaned_tweets.tsv", index=False)
merged_validate_df.to_csv("cleaned_tweets_validate.tsv", index=False)

In [10]:
# Reload the cleaned training and validation frames from disk
# (comma-separated files).
merged_df, merged_validate_df = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in (
        "C:/Users/haliu/OneDrive/Desktop/CONTAINER/JUNIOR YEAR/SEM 1/NLP/cleaned_tweets.tsv",
        "C:/Users/haliu/OneDrive/Desktop/CONTAINER/JUNIOR YEAR/SEM 1/NLP/cleaned_tweets_validate.tsv",
    )
)

ENCODING THE LABELS

In [11]:
# Learn the label -> integer mapping from the TRAINING labels only.
label_encoder = LabelEncoder()
merged_df['label'] = label_encoder.fit_transform(merged_df['label'])

# Save the encoded training frame (comma-separated, no index column).
merged_df.to_csv("merged_encoded_dataset.tsv", index=False)

# Encode the validation labels with the mapping fitted above. Using
# transform() instead of fit_transform() guarantees the same label gets the
# same integer in both splits, and raises if validation contains a label that
# never occurred in training rather than silently renumbering the classes.
merged_validate_df['label'] = label_encoder.transform(merged_validate_df['label'])

# Save the encoded validation frame (comma-separated, no index column).
merged_validate_df.to_csv("merged_encoded_dataset_validate.tsv", index=False)

In [12]:
# Reload the label-encoded training and validation frames, then preview the
# first rows of the validation frame as the cell output.
dasaset_df, dasaset_validate_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "C:/Users/haliu/OneDrive/Desktop/CONTAINER/JUNIOR YEAR/SEM 1/NLP/merged_encoded_dataset.tsv",
        "C:/Users/haliu/OneDrive/Desktop/CONTAINER/JUNIOR YEAR/SEM 1/NLP/merged_encoded_dataset_validate.tsv",
    )
)
dasaset_validate_df.head()

Unnamed: 0,tweet,label
0,allah ya kai rahmarsa kabarin ta uwa ga gimbiy...,2
1,wannan kasa tamu allah ya kyauta wai lailai sa...,2
2,nan da zuwa shekara mai zuwa da yarda allah,2
3,hhm rahama allah ya shiry ki dinga daukan shaw...,2
4,innalillahi wainnailaihirrajiun allah swa ya k...,2


IMPORTING TEST DATASETS

In [13]:
# Read the tab-separated test split of each language, in the order
# Hausa, Igbo, Pidgin.
hausa_test_df, igbo_test_df, pidgin_test_df = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        "C:/Users/haliu/OneDrive/Desktop/Project_Languages/hausa/test.tsv",
        "C:/Users/haliu/OneDrive/Desktop/Project_Languages/igbo/test.tsv",
        "C:/Users/haliu/OneDrive/Desktop/Project_Languages/pidgin/test.tsv",
    )
)

In [14]:
# Encode the test labels with the mapping label_encoder has already learned.
# fit_transform() would re-fit the encoder on each test file separately, so a
# file that happened to lack one class would receive different integer codes
# from the data the model was trained on; transform() reuses the existing
# mapping and raises on an unknown label instead.
hausa_test_df['label'] = label_encoder.transform(hausa_test_df['label'])
igbo_test_df['label'] = label_encoder.transform(igbo_test_df['label'])
pidgin_test_df['label'] = label_encoder.transform(pidgin_test_df['label'])

# Save each encoded test frame (comma-separated, no index column).
hausa_test_df.to_csv("hausa_dataset.tsv", index=False)
igbo_test_df.to_csv("igbo_dataset.tsv", index=False)
pidgin_test_df.to_csv("pidgin_dataset.tsv", index=False)

In [15]:
# Reload the encoded per-language test frames (comma-separated files).
hausa_test_df = pd.read_csv("C:/Users/haliu/OneDrive/Desktop/CONTAINER/JUNIOR YEAR/SEM 1/NLP/hausa_dataset.tsv", sep=',')
igbo_test_df = pd.read_csv("C:/Users/haliu/OneDrive/Desktop/CONTAINER/JUNIOR YEAR/SEM 1/NLP/igbo_dataset.tsv", sep=',')
# Fixed copy-paste bug: the Pidgin file used to be assigned to hausa_test_df,
# which overwrote the Hausa frame and left pidgin_test_df un-reloaded.
pidgin_test_df = pd.read_csv("C:/Users/haliu/OneDrive/Desktop/CONTAINER/JUNIOR YEAR/SEM 1/NLP/pidgin_dataset.tsv", sep=',')

PREPARE DATASET WITH TORCHTEXT

In [3]:
# Field for the tweet text: a token sequence produced by spaCy's
# 'en_core_web_sm' tokenizer.
TEXT = torchtext.legacy.data.Field(
    sequential=True,
    tokenize='spacy',
    tokenizer_language='en_core_web_sm',
)

# Field for the class label, stored as int64.
LABEL = torchtext.legacy.data.LabelField(dtype=torch.long)

In [4]:
# (column name, field) pairs shared by every example and dataset built here.
_FIELDS = [('tweet', TEXT), ('label', LABEL)]


def _load_frame(csv_path):
    """Read the first two columns of ``csv_path`` as 'tweet' and 'label'.

    The file is read with ``header=None``, so its own header line arrives as
    data row 0; that row is dropped before the frame is returned.
    """
    frame = pd.read_csv(csv_path, usecols=[0, 1], names=['tweet', 'label'], header=None)
    return frame.drop(labels=0, axis=0)


def _to_examples(frame):
    """Turn every row of ``frame`` into a torchtext Example."""
    return [
        torchtext.legacy.data.Example.fromlist([row['tweet'], row['label']], fields=_FIELDS)
        for _, row in frame.iterrows()
    ]


# Load the CSV files (header row removed).
train_df = _load_frame('C:/Users/haliu/OneDrive/Desktop/CONTAINER/JUNIOR YEAR/SEM 1/NLP/merged_encoded_dataset.tsv')
validate_df = _load_frame('C:/Users/haliu/OneDrive/Desktop/CONTAINER/JUNIOR YEAR/SEM 1/NLP/merged_encoded_dataset_validate.tsv')

hausa_test_df = _load_frame('C:/Users/haliu/OneDrive/Desktop/CONTAINER/JUNIOR YEAR/SEM 1/NLP/hausa_dataset.tsv')
igbo_test_df = _load_frame('C:/Users/haliu/OneDrive/Desktop/CONTAINER/JUNIOR YEAR/SEM 1/NLP/igbo_dataset.tsv')
pidgin_test_df = _load_frame('C:/Users/haliu/OneDrive/Desktop/CONTAINER/JUNIOR YEAR/SEM 1/NLP/pidgin_dataset.tsv')

# Create the examples.
examples = _to_examples(train_df)
validate_examples = _to_examples(validate_df)

hausa_examples = _to_examples(hausa_test_df)
igbo_examples = _to_examples(igbo_test_df)
pidgin_examples = _to_examples(pidgin_test_df)

# Create the datasets.
dataset = torchtext.legacy.data.Dataset(examples, fields=_FIELDS)
validate_dataset = torchtext.legacy.data.Dataset(validate_examples, fields=_FIELDS)

hausa_test_set = torchtext.legacy.data.Dataset(hausa_examples, fields=_FIELDS)
igbo_test_set = torchtext.legacy.data.Dataset(igbo_examples, fields=_FIELDS)
pidgin_test_set = torchtext.legacy.data.Dataset(pidgin_examples, fields=_FIELDS)

In [5]:
# Aliases used by the rest of the notebook for the datasets built above.
train_data, valid_data = dataset, validate_dataset
hausa_test_data, igbo_test_data, pidgin_test_data = (
    hausa_test_set,
    igbo_test_set,
    pidgin_test_set,
)

# Report the split sizes; the three test lines are Hausa, Igbo, Pidgin.
print(f'Num Train: {len(train_data)}')
for test_split in (hausa_test_data, igbo_test_data, pidgin_test_data):
    print(f'Num Test: {len(test_split)}')

Num Train: 29485
Num Test: 5303
Num Test: 3682
Num Test: 4154


In [19]:
# Sizes of the training and validation splits, one per line.
print(
    f'Num Train: {len(train_data)}',
    f'Num Validation: {len(valid_data)}',
    sep='\n',
)

Num Train: 29485
Num Validation: 5799


Build the vocabulary based on the top "VOCABULARY_SIZE" words

In [6]:
# Both vocabularies are learned from the training dataset only; max_size
# caps how many distinct tokens the text vocabulary keeps.
TEXT.build_vocab(dataset, max_size=VOCABULARY_SIZE)
LABEL.build_vocab(dataset)

print(
    f'Vocabulary size: {len(TEXT.vocab)}',
    f'Number of classes: {len(LABEL.vocab)}',
    sep='\n',
)

Vocabulary size: 20002
Number of classes: 3


Define Data Loaders

In [7]:
# One BucketIterator per split, returned in the same order as the datasets
# are passed in. All iterators share the batch size, device and the
# length-based sort key.
(
    train_loader,
    valid_loader,
    hausa_test_loader,
    igbo_test_loader,
    pidgin_test_loader,
) = torchtext.legacy.data.BucketIterator.splits(
    (train_data, valid_data, hausa_test_data, igbo_test_data, pidgin_test_data),
    device=device,
    batch_size=BATCH_SIZE,
    shuffle=True,
    sort_within_batch=False,
    sort_key=lambda example: len(example.tweet),
)

In [22]:
# Sanity check: print the tensor shapes of the first batch from the train,
# validation and Hausa test loaders.
for heading, loader in (
    ('Train', train_loader),
    ('\nValid:', valid_loader),
    ('\nTest:', hausa_test_loader),
):
    print(heading)
    for batch in loader:
        print(f'Text matrix size: {batch.tweet.size()}')
        print(f'Target vector size: {batch.label.size()}')
        # Only the first batch is needed.
        break

Train
Text matrix size: torch.Size([48, 128])
Target vector size: torch.Size([128])

Valid:
Text matrix size: torch.Size([3, 128])
Target vector size: torch.Size([128])

Test:
Text matrix size: torch.Size([3, 128])
Target vector size: torch.Size([128])


BUILDING THE MODEL

In [8]:
class SentimentModel(nn.Module):
    """Embedding -> LSTM -> linear classifier that returns log-probabilities."""

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        """Build the layers.

        Args:
            input_dim: number of rows in the embedding table (vocabulary size).
            embedding_dim: size of each token embedding.
            hidden_dim: size of the LSTM hidden state.
            output_dim: number of output classes.
        """
        super().__init__()

        # Token id -> dense vector of size embedding_dim.
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        # NOTE: despite the attribute name this is an LSTM, not a CNN; the
        # name is kept so existing state_dict keys stay valid.
        self.cnn = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim)
        # Final hidden state -> one score per class.
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)
        # Normalises the class scores into log-probabilities.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, text):
        """Return per-class log-probabilities for a batch of token-id sequences.

        Args:
            text: integer token ids laid out as (sequence length, batch), the
                layout nn.LSTM expects with its default ``batch_first=False``.

        Returns:
            A (batch, output_dim) tensor of log-probabilities.
        """
        embedded = self.embedding(text)
        # Only the final hidden state is used; per-step outputs and the cell
        # state are discarded.
        _, (last_hidden, _) = self.cnn(embedded)
        # Drop the leading layer axis of size 1.
        class_scores = self.fc(last_hidden.squeeze(0))
        return self.softmax(class_scores)


In [9]:
# Re-seed so the weight initialisation below is repeatable.
torch.manual_seed(RANDOM_SEED)

# Build the classifier and move it to the selected device.
model = SentimentModel(
    input_dim=len(TEXT.vocab),
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=NUM_CLASSES,
).to(device)

# NOTE(review): the step size is hard-coded to 0.005 here; the LEARNING_RATE
# constant (0.001) from the hyperparameter cell is not used.
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

EVALUATING MODEL ACCURACY

In [10]:
def compute_accuracy(data_loader):
    """Return the accuracy, in percent, of the global ``model`` on a loader.

    Args:
        data_loader: iterable whose items unpack into ``(features, targets)``;
            ``features`` is fed to ``model`` and ``targets`` holds the integer
            class index of each example.

    Returns:
        A 0-dim float tensor on ``device`` holding the percentage of examples
        whose highest-scoring class equals the target. For a loader that
        yields no batches the result is 0.0 (the original raised
        AttributeError because the counter was still a plain int).
    """
    # Tensor accumulator so the return type is the same for empty loaders.
    correct_pred = torch.zeros((), device=device)
    num_examples = 0

    with torch.no_grad():
        for features, targets in data_loader:
            features = features.to(device)
            # Targets stay integer-typed; they are only compared for equality.
            targets = targets.to(device)

            logits = model(features)
            _, predicted_labels = torch.max(logits, 1)

            num_examples += targets.size(0)
            correct_pred += (predicted_labels == targets).sum()

    if num_examples == 0:
        return correct_pred
    return correct_pred / num_examples * 100

TRAINING THE MODEL

In [11]:
start_time = time.time()

# NUM_EPOCHS comes from the hyperparameter cell; it is no longer redefined
# here, so there is a single place to change it.

# Loss function. The model already returns log-probabilities; applying
# CrossEntropyLoss on top re-normalises values that are already normalised,
# which leaves them unchanged, so the loss is the usual negative
# log-likelihood.
criterion = nn.CrossEntropyLoss()

# Train the model.
for epoch in range(NUM_EPOCHS):
    # Explicitly (re-)enter training mode: the end-of-epoch evaluation below
    # and the later evaluation cells switch the model to eval mode.
    model.train()

    for i, (tweets, labels) in enumerate(train_loader):
        # Move tensors to the configured device.
        tweets = tweets.to(device)
        labels = labels.to(device)

        # Forward pass.
        outputs = model(tweets)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Log the loss of every 50th batch.
        if not i % 50:
            print (f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
                   f'Batch {i:03d}/{len(train_loader):03d} | '
                   f'Loss: {loss.item():.4f}')

    # End-of-epoch evaluation in eval mode with gradients disabled.
    model.eval()
    with torch.set_grad_enabled(False):
        print(f'training accuracy: '
              f'{compute_accuracy(train_loader):.2f}%'
              f'\nvalid accuracy: '
              f'{compute_accuracy(valid_loader):.2f}%')

    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')

print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')


Epoch: 001/015 | Batch 000/231 | Loss: 1.1185
Epoch: 001/015 | Batch 050/231 | Loss: 1.0928
Epoch: 001/015 | Batch 100/231 | Loss: 1.0919
Epoch: 001/015 | Batch 150/231 | Loss: 1.1002
Epoch: 001/015 | Batch 200/231 | Loss: 1.0917
training accuracy: 35.27%
valid accuracy: 35.89%
Time elapsed: 2.40 min
Epoch: 002/015 | Batch 000/231 | Loss: 1.0953
Epoch: 002/015 | Batch 050/231 | Loss: 1.1051
Epoch: 002/015 | Batch 100/231 | Loss: 1.0984
Epoch: 002/015 | Batch 150/231 | Loss: 1.0331
Epoch: 002/015 | Batch 200/231 | Loss: 0.8676
training accuracy: 58.59%
valid accuracy: 50.13%
Time elapsed: 4.65 min
Epoch: 003/015 | Batch 000/231 | Loss: 0.8047
Epoch: 003/015 | Batch 050/231 | Loss: 0.8184
Epoch: 003/015 | Batch 100/231 | Loss: 0.7416
Epoch: 003/015 | Batch 150/231 | Loss: 0.8100
Epoch: 003/015 | Batch 200/231 | Loss: 0.6477
training accuracy: 77.36%
valid accuracy: 62.15%
Time elapsed: 6.90 min
Epoch: 004/015 | Batch 000/231 | Loss: 0.6238
Epoch: 004/015 | Batch 050/231 | Loss: 0.6383
Ep

EVALUATING THE MODEL ON THE TEST SET

In [27]:
# Report the accuracy on each language's test loader.
for language, loader in (
    ('Hausa', hausa_test_loader),
    ('Igbo', igbo_test_loader),
    ('Pidgin', pidgin_test_loader),
):
    print(f'Test accuracy for {language}: {compute_accuracy(loader):.2f}%')

Test accuracy for Hausa: 64.55%
Test accuracy for Igbo: 66.51%
Test accuracy for Pidgin: 55.42%


In [49]:
# Define the testing function
def test(model, loss_fn, dataloader):
    model.eval()
    test_loss = 0.0
    test_acc = 0.0

    with torch.no_grad():
        for inputs, labels in dataloader:
            outputs = model(inputs)
            loss = loss_fn(outputs, labels)
            test_loss += loss.item()
            
            preds = torch.argmax(outputs, dim=1)
            test_acc += torch.sum(preds == labels)

    test_loss /= len(dataloader)
    test_acc /= len(dataloader.dataset)

    return test_loss, test_acc

In [78]:
# Mean batch loss and accuracy on the Igbo test loader (shown as cell output).
test(model=model, loss_fn=criterion, dataloader=igbo_test_loader)

(0.8648530512020506, tensor(0.6651))

In [29]:
def evaluate_model(model, test_loader):
    """Print and return macro-averaged precision, recall and F1 for ``model``.

    Args:
        model: module called on each batch's inputs; put into eval mode here.
        test_loader: iterable of ``(inputs, labels)`` batches.

    Returns:
        The tuple ``(precision, recall, f1_score)``.
    """
    model.eval()

    y_true, y_pred = [], []

    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            # Index of the highest score per example is the predicted class.
            predicted = torch.max(model(batch_x), dim=1)[1]

            y_true.extend(batch_y.tolist())
            y_pred.extend(predicted.tolist())

    # Macro average: each class contributes equally to the reported metrics.
    precision, recall, f1_score, _ = precision_recall_fscore_support(y_true, y_pred, average='macro')

    print("Precision: {:.2f}".format(precision))
    print("Recall: {:.2f}".format(recall))
    print("F-1 score: {:.2f}".format(f1_score))

    return precision, recall, f1_score



In [30]:
# Macro precision / recall / F1 on the Hausa test loader.
evaluate_model(model=model, test_loader=hausa_test_loader)

Precision: 0.64
Recall: 0.65
F-1 score: 0.64


(0.6432796429289854, 0.6461433602043055, 0.6443836276693253)

In [31]:
# Macro precision / recall / F1 on the Igbo test loader.
evaluate_model(model=model, test_loader=igbo_test_loader)

Precision: 0.66
Recall: 0.65
F-1 score: 0.66


(0.6644768418248478, 0.650734207688415, 0.6562284559719662)

In [32]:
# Macro precision / recall / F1 on the Pidgin test loader.
evaluate_model(model=model, test_loader=pidgin_test_loader)

Precision: 0.44
Recall: 0.43
F-1 score: 0.43


(0.4414327903540962, 0.4305022503608344, 0.43088680864047846)

TESTING FOR HAUSA

In [71]:
# Inference mode for the qualitative check below.
model.eval()

# For every Hausa test batch, show the first example next to the label the
# model predicts for it.
for batch in hausa_test_loader:
    tweet, label = batch.tweet, batch.label

    # Column 0 of the token-id matrix is the first tweet of the batch; map
    # its ids (and the label id) back to their vocabulary entries.
    text_string = ' '.join(TEXT.vocab.itos[token.item()] for token in tweet[:, 0])
    label_string = LABEL.vocab.itos[label[0].item()]
    print('Original tweet:', text_string)
    print('Original label:', label_string)

    # Score the whole batch, then report the prediction for example 0.
    with torch.no_grad():
        output = model(tweet)
    prediction = torch.argmax(output, dim=1)
    prediction_string = LABEL.vocab.itos[prediction[0].item()]
    print('Predicted label:', prediction_string)

    print('---------------------------------')


Original tweet: rahama yan mata
Original label: 2
Predicted label: 1
---------------------------------
Original tweet: saboda corona wai
Original label: 1
Predicted label: 0
---------------------------------
Original tweet: ubanka zama daram <pad>
Original label: 0
Predicted label: 0
---------------------------------
Original tweet: gaisuwa muke babban director
Original label: 2
Predicted label: 2
---------------------------------
Original tweet: yau ba face mask
Original label: 1
Predicted label: 1
---------------------------------
Original tweet: mutumin nan akwai <unk> <pad>
Original label: 0
Predicted label: 1
---------------------------------
Original tweet: amma kina da kyau fa
Original label: 2
Predicted label: 2
---------------------------------
Original tweet: shi kuma yayi kukan giwa
Original label: 1
Predicted label: 0
---------------------------------
Original tweet: ta <unk> <unk> zaa sawa
Original label: 1
Predicted label: 1
---------------------------------
Original twee

TESTING FOR IGBO

In [76]:
# Switch to inference mode here as well, so this cell does not depend on
# another cell having called model.eval() first.
model.eval()

# For every Igbo test batch (not a random sample), show the first example
# next to the label the model predicts for it.
for batch in igbo_test_loader:
    tweet = batch.tweet
    label = batch.label

    # Column 0 of the token-id matrix is the first tweet of the batch; map
    # its ids (and the label id) back to their vocabulary entries.
    text_string = ' '.join([TEXT.vocab.itos[token.item()] for token in tweet[:, 0]])
    label_string = LABEL.vocab.itos[label[0].item()]
    print('Original tweet:', text_string)
    print('Original label:', label_string)

    # Score the whole batch without tracking gradients, then report the
    # prediction for example 0.
    with torch.no_grad():
        output = model(tweet)
        prediction = torch.argmax(output, dim=1)
        prediction_string = LABEL.vocab.itos[prediction[0].item()]
        print('Predicted label:', prediction_string)

    print('---------------------------------')

Original tweet: <unk> <unk> oma
Original label: 2
Predicted label: 2
---------------------------------
Original tweet: bia <unk> egwu
Original label: 1
Predicted label: 1
---------------------------------
Original tweet: <unk> onu you <pad>
Original label: 0
Predicted label: 0
---------------------------------
Original tweet: nsogbu adiro <unk> m
Original label: 2
Predicted label: 2
---------------------------------
Original tweet: ndi facebook abiala nga
Original label: 1
Predicted label: 0
---------------------------------
Original tweet: hapu <unk> <unk> ahu
Original label: 0
Predicted label: 1
---------------------------------
Original tweet: hate speech gbakwa okwu <pad>
Original label: 0
Predicted label: 0
---------------------------------
Original tweet: chris <unk> gba gi ume
Original label: 2
Predicted label: 1
---------------------------------
Original tweet: <unk> gi gara ulo uka
Original label: 1
Predicted label: 1
---------------------------------
Original tweet: bịa buru 

TESTING FOR PIDGIN

In [77]:
# Switch to inference mode here as well, so this cell does not depend on
# another cell having called model.eval() first.
model.eval()

# For every Pidgin test batch (not a random sample), show the first example
# next to the label the model predicts for it.
for batch in pidgin_test_loader:
    tweet = batch.tweet
    label = batch.label

    # Column 0 of the token-id matrix is the first tweet of the batch; map
    # its ids (and the label id) back to their vocabulary entries.
    text_string = ' '.join([TEXT.vocab.itos[token.item()] for token in tweet[:, 0]])
    label_string = LABEL.vocab.itos[label[0].item()]
    print('Original tweet:', text_string)
    print('Original label:', label_string)

    # Score the whole batch without tracking gradients, then report the
    # prediction for example 0.
    with torch.no_grad():
        output = model(tweet)
        prediction = torch.argmax(output, dim=1)
        prediction_string = LABEL.vocab.itos[prediction[0].item()]
        print('Predicted label:', prediction_string)

    print('---------------------------------')


Original tweet: <unk> be <unk> <pad> <pad>
Original label: 2
Predicted label: 1
---------------------------------
Original tweet: one pikin wia get <unk> <pad>
Original label: 2
Predicted label: 2
---------------------------------
Original tweet: make <unk> come see <unk> <unk>
Original label: 2
Predicted label: 0
---------------------------------
Original tweet: <unk> <unk> <unk> even vex <unk> <pad>
Original label: 0
Predicted label: 0
---------------------------------
Original tweet: wetin <unk> <unk> find <unk> <unk> test <pad>
Original label: 1
Predicted label: 0
---------------------------------
Original tweet: cos <unk> <unk> <unk> you fit do <unk>
Original label: 2
Predicted label: 2
---------------------------------
Original tweet: <unk> weyrey say <unk> group baba <unk> group <pad>
Original label: 0
Predicted label: 0
---------------------------------
Original tweet: make <unk> month do finish abeg i <unk> tire <pad>
Original label: 0
Predicted label: 0
----------------------