In [None]:
import random
import re
import time

import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
from sklearn.preprocessing import LabelEncoder

# Ask cuDNN to use deterministic algorithms so repeated runs are comparable.
torch.backends.cudnn.deterministic = True

GENERAL SETTINGS

In [None]:
# ---- Hyperparameters --------------------------------------------------------
EMBEDDING_DIM = 128      # size of each token embedding vector
HIDDEN_DIM = 256         # size of the recurrent hidden state
NUM_CLASSES = 3          # number of output classes
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)

VOCABULARY_SIZE = 20000  # cap passed to TEXT.build_vocab(max_size=...)
LEARNING_RATE = 0.005
BATCH_SIZE = 128
NUM_EPOCHS = 15

# ---- Device -----------------------------------------------------------------
# Prefer the second GPU (index 1) when the machine has one, as before, but fall
# back to GPU 0 on single-GPU machines: requesting 'cuda:1' there fails with an
# invalid device ordinal as soon as a tensor is moved to the device.
if torch.cuda.is_available():
    _gpu_index = 1 if torch.cuda.device_count() > 1 else 0
    device = torch.device(f'cuda:{_gpu_index}')
else:
    device = torch.device('cpu')



IMPORTING DATASET

In [None]:
# Load the training split for each of the three languages (Hausa, Igbo, Pidgin).
# The source files are read as tab-separated values.
# NOTE(review): these are absolute paths on one user's machine; the notebook
# will not run elsewhere without editing them.
hausa_df = pd.read_csv("C:/Users/haliu/OneDrive/Desktop/Project_Languages/hausa/train.tsv",sep='\t')
igbo_df = pd.read_csv("C:/Users/haliu/OneDrive/Desktop/Project_Languages/igbo/train.tsv",sep='\t')
pidgin_df = pd.read_csv("C:/Users/haliu/OneDrive/Desktop/Project_Languages/pidgin/train.tsv",sep='\t')

In [None]:
# Stack the three per-language training dataframes row-wise into one dataframe.
merged_df = pd.concat([hausa_df, igbo_df, pidgin_df], axis=0)

# Save the merged dataset to a new file.
# NOTE: to_csv uses its default comma separator here, so the file is
# comma-separated despite the ".tsv" extension.
merged_df.to_csv('merged_dataset.tsv', index=False)

# Re-load the merged dataset (comma-separated, matching the write above).
# NOTE(review): the write uses a relative path and this read an absolute one;
# they refer to the same file only if the working directory is that folder — confirm.
merged_df = pd.read_csv("C:/Users/haliu/OneDrive/Desktop/CONTAINER/JUNIOR YEAR/SEM 1/NLP/merged_dataset.tsv", sep=',')


DATA CLEANING

In [None]:
# Define a function to clean the tweets

# Filler tokens removed in addition to the English stop words.
_EXTRA_STOP_WORDS = ('user', 'im', 'una', 'na', 'wer', 'dey', 'us', 'dem', 'dat', 'omo', 'wey')

# Matches runs of non-word characters; compiled once instead of per token.
_NON_WORD_RE = re.compile(r'\W+')

# Stop-word set shared by every clean_tweet call; built lazily on first use.
_STOP_WORDS = None


def _get_stop_words():
    """Return the combined stop-word set, building it the first time it is needed."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # NOTE(review): `stopwords` is not imported anywhere in this notebook;
        # presumably it is `nltk.corpus.stopwords` — add the import to the
        # imports cell.
        _STOP_WORDS = frozenset(stopwords.words('english')).union(_EXTRA_STOP_WORDS)
    return _STOP_WORDS


def clean_tweet(tweet):
    """Normalise one tweet into a space-separated string of kept tokens.

    Steps: lowercase, tokenize, strip non-word characters from each token,
    drop stop words, and drop tokens of length 0 or 1.

    Args:
        tweet: The raw tweet text. Non-string values (for example the NaN that
            pandas uses for an empty cell) are treated as an empty tweet
            instead of raising AttributeError.

    Returns:
        The cleaned tweet as a single string; '' when nothing is kept.
    """
    if not isinstance(tweet, str):
        return ''

    stop_words = _get_stop_words()

    # NOTE(review): `word_tokenize` is not imported anywhere in this notebook;
    # presumably it is `nltk.tokenize.word_tokenize` — add the import.
    stripped = (_NON_WORD_RE.sub('', token) for token in word_tokenize(tweet.lower()))

    # Same filters as before (stop words, then length), applied in one pass.
    kept = [token for token in stripped if len(token) > 1 and token not in stop_words]

    return ' '.join(kept)

# Apply the clean_tweet function to every value in the "tweet" column,
# overwriting the column with the cleaned text.
merged_df["tweet"] = merged_df["tweet"].apply(clean_tweet)

# Save the cleaned dataframe to a new file.
# NOTE: to_csv uses its default comma separator, so this ".tsv" file is
# actually comma-separated.
merged_df.to_csv("cleaned_tweets.tsv", index=False)


In [None]:
# Re-load the cleaned tweets (comma-separated) from disk.
# NOTE(review): absolute path; presumably the same file the previous cell wrote
# with a relative path — confirm the working directory.
merged_df = pd.read_csv("C:/Users/haliu/OneDrive/Desktop/CONTAINER/JUNIOR YEAR/SEM 1/NLP/cleaned_tweets.tsv", sep=',')

In [None]:
# Encode the label column as integers. The encoder is fitted on the merged
# training labels and the column is overwritten with the encoded values.
label_encoder = LabelEncoder()
merged_df['label'] = label_encoder.fit_transform(merged_df['label'])


# Save the label-encoded dataframe to a new file (comma-separated despite the
# ".tsv" extension, because to_csv is called with its default separator).
merged_df.to_csv("merged_encoded_dataset.tsv", index=False)

In [None]:
# Re-load the encoded dataset and preview its first rows.
# NOTE(review): `dasaset_df` looks like a typo for `dataset_df`; the name is not
# referenced again in this notebook.
dasaset_df = pd.read_csv("C:/Users/haliu/OneDrive/Desktop/CONTAINER/JUNIOR YEAR/SEM 1/NLP/merged_encoded_dataset.tsv")
dasaset_df.head()

In [None]:
# Load the per-language test splits (tab-separated source files).
# NOTE(review): absolute, machine-specific paths.
hausa_test_df = pd.read_csv("C:/Users/haliu/OneDrive/Desktop/Project_Languages/hausa/test.tsv",sep='\t')
igbo_test_df = pd.read_csv("C:/Users/haliu/OneDrive/Desktop/Project_Languages/igbo/test.tsv",sep='\t')
pidgin_test_df = pd.read_csv("C:/Users/haliu/OneDrive/Desktop/Project_Languages/pidgin/test.tsv",sep='\t')

In [None]:
# Encode the test labels with the encoder that was already fitted on the
# training labels. Re-fitting a fresh encoder on each test set (as was done
# before) derives the mapping from whichever labels happen to be present in that
# file, so a test set that lacks a class would be numbered differently from the
# training data. `transform` raises ValueError on a label never seen in training.
# NOTE(review): unlike the training tweets, the test tweets are not passed
# through clean_tweet here — confirm whether that is intended.
hausa_test_df['label'] = label_encoder.transform(hausa_test_df['label'])
igbo_test_df['label'] = label_encoder.transform(igbo_test_df['label'])
pidgin_test_df['label'] = label_encoder.transform(pidgin_test_df['label'])


# Save the encoded test dataframes (comma-separated despite the ".tsv"
# extension, because to_csv is called with its default separator).
hausa_test_df.to_csv("hausa_dataset.tsv", index=False)
igbo_test_df.to_csv("igbo_dataset.tsv", index=False)
pidgin_test_df.to_csv("pidgin_dataset.tsv", index=False)

In [None]:
# Re-load the encoded per-language test sets (comma-separated files).
hausa_test_df = pd.read_csv("C:/Users/haliu/OneDrive/Desktop/CONTAINER/JUNIOR YEAR/SEM 1/NLP/hausa_dataset.tsv", sep=',')
igbo_test_df = pd.read_csv("C:/Users/haliu/OneDrive/Desktop/CONTAINER/JUNIOR YEAR/SEM 1/NLP/igbo_dataset.tsv", sep=',')
# Fixed: the pidgin file was previously assigned to `hausa_test_df`, which
# overwrote the Hausa frame and left `pidgin_test_df` un-reloaded.
pidgin_test_df = pd.read_csv("C:/Users/haliu/OneDrive/Desktop/CONTAINER/JUNIOR YEAR/SEM 1/NLP/pidgin_dataset.tsv", sep=',')


Prepare Dataset with Torchtext

In [None]:
# Text processing: tweets are token sequences, tokenized with spaCy's
# 'en_core_web_sm' pipeline (without `tokenize=`, Field splits on whitespace).
TEXT = torchtext.legacy.data.Field(
    tokenize='spacy',
    sequential=True,
    tokenizer_language='en_core_web_sm'
)

# Label processing: each label is a single class value, not a token sequence,
# so use LabelField (consistent with the field definition used further down)
# rather than a plain sequential Field.
LABEL = torchtext.legacy.data.LabelField(dtype=torch.long)

Process the dataset:

In [None]:
# Map file columns to fields, in column order.
# NOTE(review): assumes the first two columns of the file are tweet and label —
# confirm against the saved file.
fields = [('tweet', TEXT), ('label', LABEL)]

# The file was written by DataFrame.to_csv with the default ',' separator, so
# it must be parsed as CSV even though its name ends in ".tsv"; parsing it as
# TSV would not split the columns.
dataset = torchtext.legacy.data.TabularDataset(
    path='C:/Users/haliu/OneDrive/Desktop/CONTAINER/JUNIOR YEAR/SEM 1/NLP/merged_encoded_dataset.tsv', format='csv',
    skip_header=True, fields=fields)

In [None]:
# Map file columns to fields, in column order.
# NOTE(review): assumes the first two columns of each file are tweet and label.
fields = [('tweet', TEXT), ('label', LABEL)]

# The per-language files were written by DataFrame.to_csv with the default ','
# separator, so they are parsed as CSV despite the ".tsv" extension.
# NOTE: these assignments rebind the `*_test_df` names from pandas dataframes
# to torchtext datasets.
hausa_test_df = torchtext.legacy.data.TabularDataset(
    path='C:/Users/haliu/OneDrive/Desktop/CONTAINER/JUNIOR YEAR/SEM 1/NLP/hausa_dataset.tsv', format='csv',
    skip_header=True, fields=fields)

igbo_test_df = torchtext.legacy.data.TabularDataset(
    path='C:/Users/haliu/OneDrive/Desktop/CONTAINER/JUNIOR YEAR/SEM 1/NLP/igbo_dataset.tsv', format='csv',
    skip_header=True, fields=fields)

pidgin_test_df = torchtext.legacy.data.TabularDataset(
    path='C:/Users/haliu/OneDrive/Desktop/CONTAINER/JUNIOR YEAR/SEM 1/NLP/pidgin_dataset.tsv', format='csv',
    skip_header=True, fields=fields)



In [None]:
# Build a dataset from a separate validation file, exposing its columns as
# `TWEET` and `SENTIMENT` (different attribute names from the other datasets).
# NOTE(review): no cell in this notebook writes merged_validate_dataset.tsv and
# `validate_dataset` is not used afterwards — confirm whether this cell is
# still needed, and whether the file really is tab-separated.
fields = [('TWEET', TEXT), ('SENTIMENT', LABEL)]

validate_dataset = torchtext.legacy.data.TabularDataset(
    path='C:/Users/haliu/OneDrive/Desktop/CONTAINER/JUNIOR YEAR/SEM 1/NLP/merged_validate_dataset.tsv', format='tsv',
    skip_header=True, fields=fields)

In [None]:
# Define the fields
TEXT = torchtext.legacy.data.Field(
    tokenize='spacy',
    sequential=True,
    tokenizer_language='en_core_web_sm'
)
LABEL = torchtext.legacy.data.LabelField(dtype=torch.long)

# (attribute name, field) pairs shared by every Example and Dataset below.
_EXAMPLE_FIELDS = [('tweet', TEXT), ('label', LABEL)]


def _read_tweet_label_frame(path):
    """Read the first two columns of a comma-separated file as tweet/label.

    The file is read without a header and its first row (the header line written
    by to_csv) is then dropped, so both columns keep the values exactly as they
    appear in the file.
    """
    frame = pd.read_csv(path, usecols=[0, 1], names=['tweet', 'label'], header=None)
    return frame.drop(labels=0, axis=0)


def _frame_to_examples(frame):
    """Turn each (tweet, label) row of `frame` into a torchtext Example."""
    return [
        torchtext.legacy.data.Example.fromlist([tweet, label], fields=_EXAMPLE_FIELDS)
        for tweet, label in zip(frame['tweet'], frame['label'])
    ]


# Load the CSV files
df = _read_tweet_label_frame('C:/Users/haliu/OneDrive/Desktop/CONTAINER/JUNIOR YEAR/SEM 1/NLP/merged_encoded_dataset.tsv')

hausa_test_df = _read_tweet_label_frame('C:/Users/haliu/OneDrive/Desktop/CONTAINER/JUNIOR YEAR/SEM 1/NLP/hausa_dataset.tsv')
igbo_test_df = _read_tweet_label_frame('C:/Users/haliu/OneDrive/Desktop/CONTAINER/JUNIOR YEAR/SEM 1/NLP/igbo_dataset.tsv')
pidgin_test_df = _read_tweet_label_frame('C:/Users/haliu/OneDrive/Desktop/CONTAINER/JUNIOR YEAR/SEM 1/NLP/pidgin_dataset.tsv')

# Create the examples
examples = _frame_to_examples(df)

hausa_examples = _frame_to_examples(hausa_test_df)
igbo_examples = _frame_to_examples(igbo_test_df)
pidgin_examples = _frame_to_examples(pidgin_test_df)

# Create the datasets
dataset = torchtext.legacy.data.Dataset(examples, fields=_EXAMPLE_FIELDS)

# Fixed: these three lines previously rebuilt `dataset` from the training
# `examples` three more times, so the per-language example lists were never
# turned into datasets.
hausa_test_dataset = torchtext.legacy.data.Dataset(hausa_examples, fields=_EXAMPLE_FIELDS)
igbo_test_dataset = torchtext.legacy.data.Dataset(igbo_examples, fields=_EXAMPLE_FIELDS)
pidgin_test_dataset = torchtext.legacy.data.Dataset(pidgin_examples, fields=_EXAMPLE_FIELDS)

In [None]:
# Show the attributes stored on the first example of the full dataset.
print(vars(dataset.examples[0]))

In [None]:
# Hold out 20% of the examples as the test set.
# `random.seed(...)` returns None, so `random_state` receives None; the visible
# effect of that expression is re-seeding Python's global RNG right before the
# split.
# NOTE(review): presumably torchtext then derives its shuffle from that global
# RNG state, which would make the split reproducible — confirm.
train_data, test_data = dataset.split(
    split_ratio=[0.8, 0.2],
    random_state=random.seed(RANDOM_SEED))

print(f'Num Train: {len(train_data)}')
print(f'Num Test: {len(test_data)}')

In [None]:
# Show the attributes stored on the fourth example of the test split.
print(vars(test_data.examples[3]))


In [None]:
# Carve 15% of the remaining training examples off as a validation set.
# NOTE: this rebinds `train_data`, so running the cell twice shrinks the
# training set again.
# `random.seed(...)` returns None (so `random_state` is None); its visible
# effect is re-seeding Python's global RNG right before the split.
train_data, valid_data = train_data.split(
    split_ratio=[0.85, 0.15],
    random_state=random.seed(RANDOM_SEED))

print(f'Num Train: {len(train_data)}')
print(f'Num Validation: {len(valid_data)}')

In [None]:
# Build the vocabularies from the training split only. Building them from the
# full `dataset` (as before) lets token statistics from the validation and test
# examples leak into the vocabulary the model is trained with.
TEXT.build_vocab(train_data, max_size=VOCABULARY_SIZE)
LABEL.build_vocab(train_data)

print(f'Vocabulary size: {len(TEXT.vocab)}')
print(f'Number of classes: {len(LABEL.vocab)}')

In [None]:
# Show the 20 most frequent tokens and their counts.
print(TEXT.vocab.freqs.most_common(20))


In [None]:
# Show the tokens assigned to the first ten vocabulary indices.
print(TEXT.vocab.itos[:10])

In [None]:
# Look up the integer index the vocabulary assigns to the token 'the'.
print(TEXT.vocab.stoi['the'])

In [None]:
# Show the mapping from label value to class index.
print(LABEL.vocab.stoi)

In [None]:
# Display how many examples carry each label.
LABEL.vocab.freqs

Define Data Loaders

In [None]:
# Create one batch iterator per split, all with the same batch size and with
# batches placed on `device`. `sort_key` ranks examples by tweet length (in
# tokens); examples are not re-sorted inside a batch.
# NOTE(review): presumably BucketIterator uses `sort_key` to group tweets of
# similar length and so reduce padding — confirm against the torchtext docs.
train_loader, valid_loader, test_loader = \
    torchtext.legacy.data.BucketIterator.splits(
        (train_data, valid_data, test_data),
         batch_size=BATCH_SIZE,
         sort_within_batch=False,
         sort_key=lambda x: len(x.tweet),
         device=device
    )

In [None]:
# Sanity check: print the tensor shapes of the first batch from each loader.
for heading, loader in (('Train', train_loader),
                        ('\nValid:', valid_loader),
                        ('\nTest:', test_loader)):
    print(heading)
    for batch in loader:
        print(f'Text matrix size: {batch.tweet.size()}')
        print(f'Target vector size: {batch.label.size()}')
        break  # only the first batch is needed

In [None]:
class CNN(torch.nn.Module):
    """Sequence classifier: embedding -> single-layer LSTM -> linear layer.

    Despite the class name (and the `cnn` attribute name), the sequence encoder
    is a `torch.nn.LSTM`; both names are kept so existing callers and saved
    state dicts continue to work.

    Args:
        input_dim: Number of rows in the embedding table (vocabulary size).
        embedding_dim: Size of each embedding vector.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output logits.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = torch.nn.Embedding(input_dim, embedding_dim)
        self.cnn = torch.nn.LSTM(embedding_dim, hidden_dim)
        self.fc = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Return logits of shape [batch size, output_dim].

        Args:
            text: Integer token indices of shape [sentence length, batch size].
        """
        # [sentence length, batch size] -> [sentence length, batch size, embedding dim]
        token_vectors = self.embedding(text)

        # Only the final hidden state is used: [1, batch size, hidden dim].
        _, (final_hidden, _) = self.cnn(token_vectors)

        # Drop the leading layer axis -> [batch size, hidden dim].
        sentence_vector = final_hidden.squeeze(0)

        return self.fc(sentence_vector)


In [None]:
# Re-seed so the weight initialisation is reproducible, then build the model.
torch.manual_seed(RANDOM_SEED)
model = CNN(input_dim=len(TEXT.vocab),
            embedding_dim=EMBEDDING_DIM,
            hidden_dim=HIDDEN_DIM,
            output_dim=NUM_CLASSES
)

model = model.to(device)
# Use the configured learning rate instead of repeating the literal 0.005, so
# changing LEARNING_RATE in the settings cell actually takes effect.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
def compute_accuracy(model, data_loader, device):
    """Return the classification accuracy of `model` on `data_loader`, in percent.

    Args:
        model: Module mapping a batch of features to logits of shape
            [batch size, number of classes].
        data_loader: Iterable yielding `(features, targets)` pairs, where
            `targets` holds one integer class index per example.
        device: Device the batches are moved to before the forward pass.

    Returns:
        A 0-dim float tensor with the accuracy in percent; NaN when the loader
        yields no examples (previously this case raised).

    The model is evaluated in eval mode without gradient tracking; its previous
    train/eval mode is restored afterwards, even if evaluation raises.
    """
    was_training = model.training
    model.eval()

    correct_pred = torch.zeros((), device=device)
    num_examples = 0

    try:
        with torch.no_grad():
            for features, targets in data_loader:
                features = features.to(device)
                # Compare integer predictions with integer targets directly;
                # the earlier float cast of the targets was unnecessary.
                targets = targets.to(device)

                logits = model(features)
                predicted_labels = logits.argmax(dim=1)

                num_examples += targets.size(0)
                correct_pred += (predicted_labels == targets).sum()
    finally:
        model.train(was_training)

    if num_examples == 0:
        return torch.tensor(float('nan'), device=device)
    return correct_pred / num_examples * 100

In [None]:
start_time = time.time()

# NUM_EPOCHS comes from the settings cell; it is no longer silently overridden
# here, so changing it there takes effect.

# Define the loss function (the optimizer is created together with the model)
criterion = nn.CrossEntropyLoss()

# Train the model
total_steps = len(train_loader)
for epoch in range(NUM_EPOCHS):
    # Make sure the model is in training mode at the start of every epoch.
    model.train()

    for batch_idx, (tweets, labels) in enumerate(train_loader):
        # Move tensors to the configured device
        tweets = tweets.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(tweets)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Print statistics every 50 batches
        if not batch_idx % 50:
            print (f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
                   f'Batch {batch_idx:03d}/{total_steps:03d} | '
                   f'Loss: {loss.item():.4f}')

    # End-of-epoch evaluation: eval mode, no gradient tracking.
    model.eval()
    with torch.no_grad():
        print(f'training accuracy: '
              f'{compute_accuracy(model, train_loader, device):.2f}%'
              f'\nvalid accuracy: '
              f'{compute_accuracy(model, valid_loader, device):.2f}%')

    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')

print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
model.eval()
print(f'Test accuracy: {compute_accuracy(model, test_loader, device):.2f}%')



In [None]:
# Evaluate the model: count correct predictions over the whole test loader.
correct, total = 0, 0
with torch.no_grad():
    for tweets, labels in test_loader:
        tweets, labels = tweets.to(device), labels.to(device)
        outputs = model(tweets)
        # Index of the largest logit per example is the predicted class.
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += torch.eq(predicted, labels).sum().item()

print('Accuracy of the model on the test tweets: {} %'.format(100 * correct / total))

In [None]:
# Take the first test batch.
dataiter = iter(test_loader)
tweets, labels = next(dataiter)

# Class names indexed by the class index stored in the label tensor. They are
# taken from the label vocabulary rather than hard-coded as ("0", "1", "2"),
# because the vocabulary decides which label value gets which index.
classes = tuple(str(name) for name in LABEL.vocab.itos)

# Show at most ten examples; the batch may hold fewer than ten.
num_shown = min(10, len(labels))
print('GroundTruth: ', ' '.join(f'{classes[j]:5s}' for j in labels[:num_shown].tolist()))

In [None]:
# Predict the classes for the batch taken above. No gradients are needed for
# inspection, so the forward pass runs under no_grad.
with torch.no_grad():
    outputs = model(tweets)
_, predicted = torch.max(outputs, 1)

# Show at most ten predictions; the batch may hold fewer than ten.
num_shown = min(10, len(predicted))
print('Predicted: ', ' '.join(f'{classes[j]:5s}'
                              for j in predicted[:num_shown].tolist()))