In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

# Load train data
train_data_url = 'https://github.com/afrisenti-semeval/afrisent-semeval-2023/blob/main/data/ibo/train.tsv?raw=true'
train_data = pd.read_csv(train_data_url, sep='\t')
train_data = train_data[['tweet', 'label']]  # Select the relevant columns

# Load dev data
dev_data_url = 'https://github.com/afrisenti-semeval/afrisent-semeval-2023/blob/main/data/ibo/dev.tsv?raw=true'
dev_data = pd.read_csv(dev_data_url, sep='\t')
dev_data = dev_data[['tweet', 'label']]  # Select the relevant columns

# Load test data
test_data_url = 'https://github.com/afrisenti-semeval/afrisent-semeval-2023/blob/main/data/ibo/test.tsv?raw=true'
test_data = pd.read_csv(test_data_url, sep='\t')
test_data = test_data[['tweet', 'label']]  # Select the relevant columns

# Pre-process the text data
train_data['tweet'] = train_data['tweet'].str.lower()  # Convert to lowercase
train_data['tweet'] = train_data['tweet'].str.replace('[^\w\s]', '')  # Remove punctuation
train_data['tweet'] = train_data['tweet'].str.replace('\d+', '')  # Remove digits

dev_data['tweet'] = dev_data['tweet'].str.lower()  # Convert to lowercase
dev_data['tweet'] = dev_data['tweet'].str.replace('[^\w\s]', '')  # Remove punctuation
dev_data['tweet'] = dev_data['tweet'].str.replace('\d+', '')  # Remove digits

test_data['tweet'] = test_data['tweet'].str.lower()  # Convert to lowercase
test_data['tweet'] = test_data['tweet'].str.replace('[^\w\s]', '')  # Remove punctuation
test_data['tweet'] = test_data['tweet'].str.replace('\d+', '')  # Remove digits

# Tokenize and pad the sequences for train, dev, and test data
max_words = 5000
max_len = 100
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(train_data['tweet'])
word_index = tokenizer.word_index

# Train data
X_train = tokenizer.texts_to_sequences(train_data['tweet'])
X_train = pad_sequences(X_train, maxlen=max_len, padding='post')

# Dev data
X_dev = tokenizer.texts_to_sequences(dev_data['tweet'])
X_dev = pad_sequences(X_dev, maxlen=max_len, padding='post')

# Test data
X_test = tokenizer.texts_to_sequences(test_data['tweet'])
X_test = pad_sequences(X_test, maxlen=max_len, padding='post')
y_test = pd.get_dummies(test_data['label'])


# Convert labels to categorical
y_train = pd.get_dummies(train_data['label'])
y_dev = pd.get_dummies(dev_data['label'])

# Build and train the CNN model
model = Sequential()
model.add(Embedding(max_words, 100, input_length=max_len))
model.add(Conv1D(64, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(3, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_dev, y_dev))
#Evaluate the model on test data

loss, accuracy = model.evaluate(X_test, y_test, batch_size=64)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)


  train_data['tweet'] = train_data['tweet'].str.replace('[^\w\s]', '')  # Remove punctuation
  train_data['tweet'] = train_data['tweet'].str.replace('\d+', '')  # Remove digits
  dev_data['tweet'] = dev_data['tweet'].str.replace('[^\w\s]', '')  # Remove punctuation
  dev_data['tweet'] = dev_data['tweet'].str.replace('\d+', '')  # Remove digits
  test_data['tweet'] = test_data['tweet'].str.replace('[^\w\s]', '')  # Remove punctuation
  test_data['tweet'] = test_data['tweet'].str.replace('\d+', '')  # Remove digits


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Loss: 0.7089181542396545
Test Accuracy: 0.7656165361404419
