This is a Natural Language Processing project: a Swahili sentiment analyzer. It uses the Swahili sentiment analysis dataset from GitHub (https://github.com/Neurotech-HQ/swahili-sentiment-analysis-dataset). The project also uses a custom Swahili stop-words CSV file from the research article "Enhancing text pre-processing for Swahili language: Datasets for common Swahili stop-words, slangs and typos with equivalent proper words" by Bernard Masua and Noel Masasi. The code preprocesses the data by removing noise (stop words and punctuation), then extracts features using CountVectorizer. It then trains a model using the Naive Bayes classifier algorithm to classify the texts into sentiment categories. Finally, it uses the Gradio library to deploy the NLP application: the code launches a Gradio interface where you can input Swahili text and get the sentiment analysis result as output.

In [1]:
import functools
import string

import gradio as gr
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

@functools.lru_cache(maxsize=None)
def _load_stopwords(path='Common Swahili Stop-words.csv'):
    """Read the stop-word CSV once and return the tokens to discard.

    The result is cached per path, so the file is parsed a single time no
    matter how many texts are preprocessed afterwards.

    Args:
        path: CSV file containing a ``StopWords`` column.

    Returns:
        A frozenset holding every stop word (stripped and lowercased) plus
        each character of ``string.punctuation``.
    """
    stopwords_df = pd.read_csv(path)
    # Tokens are lowercased before the membership test in preprocess(), so
    # the stop words must be lowercased too or capitalised entries never match.
    words = (
        stopwords_df['StopWords'].dropna().astype(str).str.strip().str.lower()
    )
    return frozenset(words) | frozenset(string.punctuation)


def preprocess(text):
    """Tokenize, lowercase and strip stop words / punctuation from a text.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving lowercased tokens joined by single spaces.
    """
    stopwords = _load_stopwords()
    # Tokenize the text and lowercase each token exactly once
    lowered = (token.lower() for token in word_tokenize(text))
    # Remove stop words and punctuation, then join the tokens back into a string
    return ' '.join(token for token in lowered if token not in stopwords)


def extract_features(texts):
    """Clean each text and fit a fresh CountVectorizer on the cleaned texts.

    Args:
        texts: Iterable of raw strings.

    Returns:
        A ``(features, vectorizer)`` tuple: the matrix produced by
        ``fit_transform`` and the fitted vectorizer that produced it.
    """
    # Run every text through preprocess() before counting tokens
    cleaned = list(map(preprocess, texts))
    # Fit the vocabulary and build the count matrix in one pass
    counter = CountVectorizer()
    features = counter.fit_transform(cleaned)
    return features, counter


def train_model():
    """Fit a Multinomial Naive Bayes classifier on the data in swahili.csv.

    Reads the ``text`` and ``sentiment`` columns of the CSV, vectorizes the
    texts with extract_features() and fits the classifier on the result.

    Returns:
        A ``(clf, vectorizer)`` tuple: the fitted classifier and the fitted
        vectorizer needed to transform new texts the same way.
    """
    # Load the training data and pick out the inputs and the labels
    frame = pd.read_csv('swahili.csv')
    texts, labels = frame['text'], frame['sentiment']
    # Turn the raw texts into token-count features
    features, fitted_vectorizer = extract_features(texts)
    # fit() returns the estimator itself, so construction and training chain
    classifier = MultinomialNB().fit(features, labels)
    return classifier, fitted_vectorizer


# Train the classifier and fit the vectorizer once, when this cell/module runs.
# Nothing is loaded from a saved model: train_model() fits from swahili.csv.
# predict_sentiment() reads these two module-level names.
clf, vectorizer = train_model()


def predict_sentiment(text):
    """Classify a text as 'negative' or 'positive'.

    The text is cleaned with preprocess(), transformed with the module-level
    ``vectorizer`` and scored with the module-level ``clf``.

    Args:
        text: The raw string to classify.

    Returns:
        'negative' when the first class probability is strictly greater than
        the second, otherwise 'positive' (so ties resolve to 'positive').
    """
    cleaned = preprocess(text)
    # predict_proba returns one row per sample; a single sample is passed
    probabilities = clf.predict_proba(vectorizer.transform([cleaned]))[0]
    # NOTE(review): assumes exactly two classes with the negative class
    # ordered first in the classifier's output -- TODO confirm against the
    # labels in swahili.csv.
    return 'negative' if probabilities[0] > probabilities[1] else 'positive'


# Define the input and output components.
# gr.Textbox is used for both: the gr.inputs / gr.outputs namespaces were
# deprecated in Gradio 3 and removed in Gradio 4.
input_text = gr.Textbox(label="Enter a Swahili text")
output_text = gr.Textbox(label="Sentiment")

# Backend callback handed to the Gradio interface
def predict(input):
    """Return the sentiment label that predict_sentiment() gives for `input`."""
    label = predict_sentiment(input)
    return label

# Wire the callback and the two text components into one interface
iface = gr.Interface(
    fn=predict,
    inputs=input_text,
    outputs=output_text,
    title="Swahili Sentiment Analysis",
)

# Start serving the interface
iface.launch()




Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.


