In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score, f1_score

# Loading the train, dev and test splits.
# All three files live under data/ibo/ in the afrisent-semeval-2023 GitHub
# repository and are fetched over HTTP as tab-separated text.
def _load_split(url):
    """Read one tab-separated split from *url*, keeping only the text and label columns."""
    frame = pd.read_csv(url, sep='\t')
    return frame[['tweet', 'label']]


train_data_url = 'https://github.com/afrisenti-semeval/afrisent-semeval-2023/blob/main/data/ibo/train.tsv?raw=true'
dev_data_url = 'https://github.com/afrisenti-semeval/afrisent-semeval-2023/blob/main/data/ibo/dev.tsv?raw=true'
test_data_url = 'https://github.com/afrisenti-semeval/afrisent-semeval-2023/blob/main/data/ibo/test.tsv?raw=true'

train_data = _load_split(train_data_url)
dev_data = _load_split(dev_data_url)
test_data = _load_split(test_data_url)

# Pre-processing the text data
def _clean_tweets(tweets):
    """Return *tweets* lower-cased, with punctuation and digits removed.

    `regex=True` is passed explicitly: since pandas 2.0 `Series.str.replace`
    treats the pattern as a literal string by default, in which case these
    patterns would match nothing and the text would be left uncleaned.
    Raw strings keep `\\w`, `\\s` and `\\d` from being read as (invalid)
    Python escape sequences.
    """
    tweets = tweets.str.lower()
    # Drop every character that is neither a word character nor whitespace.
    tweets = tweets.str.replace(r'[^\w\s]', '', regex=True)
    # Digits count as word characters, so they need their own pass.
    return tweets.str.replace(r'\d+', '', regex=True)


# Apply the identical cleaning to every split so the tokenizer sees
# consistently normalised text at train and inference time.
train_data['tweet'] = _clean_tweets(train_data['tweet'])
dev_data['tweet'] = _clean_tweets(dev_data['tweet'])
test_data['tweet'] = _clean_tweets(test_data['tweet'])

# Tokenizing and padding the sequences for train, dev, and test data
max_words = 5000  # vocabulary size cap passed to the tokenizer
max_len = 100     # every sequence is padded/truncated to this many tokens
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
# The vocabulary is fitted on the training split only; dev/test words that
# were not seen here map to the '<OOV>' token.
tokenizer.fit_on_texts(train_data['tweet'])
word_index = tokenizer.word_index


def _to_padded_sequences(texts):
    """Convert *texts* to integer sequences, post-padded to `max_len`."""
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=max_len, padding='post')


X_train = _to_padded_sequences(train_data['tweet'])
X_dev = _to_padded_sequences(dev_data['tweet'])
X_test = _to_padded_sequences(test_data['tweet'])

# Converting labels to categorical.
# The label set is fixed from the training split and reused for every split.
# Calling pd.get_dummies on each split independently would let the number and
# order of columns depend on which labels that split happens to contain, so a
# column index could mean different classes in different splits.
label_names = sorted(train_data['label'].dropna().unique())
label_dtype = pd.CategoricalDtype(categories=label_names)


def _one_hot(labels):
    """One-hot encode *labels* with one column per training label, in sorted order.

    Raises:
        ValueError: if *labels* contains a value absent from the training
            split (it would otherwise be encoded as an all-zero row).
    """
    unseen = set(labels.dropna().unique()) - set(label_names)
    if unseen:
        raise ValueError(
            f"labels not present in the training split: {sorted(unseen)}"
        )
    # Explicit float dtype so the targets do not depend on the pandas
    # version's default dummy dtype.
    return pd.get_dummies(labels.astype(label_dtype), dtype='float32')


y_train = _one_hot(train_data['label'])
y_dev = _one_hot(dev_data['label'])
y_test = _one_hot(test_data['label'])

# Building and training the CNN model.
# Layers, in order: an embedding over `max_words` token ids producing
# 100-dimensional vectors, one 1-D convolution (64 filters, width 5), a global
# max-pool over the sequence axis, and a 3-way softmax output.
cnn_layers = [
    Embedding(max_words, 100, input_length=max_len),
    Conv1D(64, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(3, activation='softmax'),
]
model = Sequential()
for layer in cnn_layers:
    model.add(layer)

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# The dev split is used for per-epoch validation only.
history = model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=64,
    validation_data=(X_dev, y_dev),
)

# Evaluating the model on test data

# Ground-truth class ids: position of the hot column in each one-hot row.
y_true = np.argmax(y_test.values, axis=1)

# Predicted class ids: position of the highest softmax output in each row.
class_probabilities = model.predict(X_test, batch_size=64)
y_pred = np.argmax(class_probabilities, axis=1)

# Per-class classification report
print(classification_report(y_true, y_pred))

# Support-weighted precision, recall and F1 across all classes
precision, recall, f1 = (
    scorer(y_true, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

for caption, value in (
    ("Precision: ", precision),
    ("Recall: ", recall),
    ("F1 Score: ", f1),
):
    print(caption, value)

# Loss and accuracy as computed by Keras on the one-hot test targets
loss, accuracy = model.evaluate(X_test, y_test, batch_size=64)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)


  train_data['tweet'] = train_data['tweet'].str.replace('[^\w\s]', '')  # Remove punctuation
  train_data['tweet'] = train_data['tweet'].str.replace('\d+', '')  # Remove digits
  dev_data['tweet'] = dev_data['tweet'].str.replace('[^\w\s]', '')  # Remove punctuation
  dev_data['tweet'] = dev_data['tweet'].str.replace('\d+', '')  # Remove digits
  test_data['tweet'] = test_data['tweet'].str.replace('[^\w\s]', '')  # Remove punctuation
  test_data['tweet'] = test_data['tweet'].str.replace('\d+', '')  # Remove digits


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

           0       0.76      0.65      0.70       943
           1       0.74      0.80      0.77      1621
           2       0.79      0.80      0.80      1118

    accuracy                           0.76      3682
   macro avg       0.76      0.75      0.75      3682
weighted avg       0.76      0.76      0.76      3682

Precision:  0.7615786354365133
Recall:  0.7609994568169474
F1 Score:  0.7596085745887863
Test Loss: 0.7082180976867676
Test Accuracy: 0.7609994411468506
