This notebook classifies Swahili news articles into one of five categories:

Kitaifa (National)


Kimataifa (International)


Biashara (Business)


Michezo (Sports)


Burudani (Entertainment)

In [3]:
# importing necessary libraries
import pandas as pd
import numpy as np
import tensorflow as tf

In [5]:
# import and use text as ktrain
import ktrain
from ktrain import text

In [8]:
# import the API for text
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.corpus import stopwords
import re

In [12]:
import nltk

# word_tokenize needs the Punkt sentence-tokenizer data. Older NLTK releases
# read the 'punkt' package, newer ones look up 'punkt_tab' instead, so fetch
# both; nltk.download reports (rather than raises) if an id is unavailable.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [9]:
# Read the labelled training articles into a DataFrame and preview the first five rows.
df = pd.read_csv("Train.csv")
df.head()

Unnamed: 0,id,content,category
0,SW0,SERIKALI imesema haitakuwa tayari kuona amani...,Kitaifa
1,SW1,"Mkuu wa Mkoa wa Tabora, Aggrey Mwanri amesiti...",Biashara
2,SW10,SERIKALI imetoa miezi sita kwa taasisi zote z...,Kitaifa
3,SW100,KAMPUNI ya mchezo wa kubahatisha ya M-bet ime...,michezo
4,SW1000,WATANZANIA wamekumbushwa kusherehekea sikukuu...,Kitaifa


In [31]:
# Read the articles to be classified (used for inference later in the notebook).
validation_set = pd.read_csv("Test.csv")

In [32]:
# Lower-case the training labels and the validation article text.
# (Only these two columns are touched here; the training text is lower-cased
# separately inside processText.)
for frame, column in ((df, 'category'), (validation_set, 'content')):
    frame[column] = frame[column].str.lower()

In [34]:
# Accumulator for the predicted class probabilities on the validation articles:
# one row per article, one column for each of the five categories.

valid_pred_ro = np.zeros(shape=(validation_set.shape[0], 5))

In [10]:
def processText(news):
    """Normalise one news article for modelling.

    The text is lower-cased, two garbled character sequences that occur
    frequently in the data (they look like mis-decoded curly quotes) are
    removed, and the result is split with NLTK's ``word_tokenize`` and
    re-joined with single spaces.

    Parameters
    ----------
    news : str
        Raw article text.

    Returns
    -------
    str
        The cleaned article as space-separated tokens.
    """
    cleaned = news.lower()
    # Neither sequence contains regex metacharacters, so a plain replace
    # removes exactly the same substrings.
    for artefact in ('â€˜', 'â€™'):
        cleaned = cleaned.replace(artefact, '')
    return " ".join(word_tokenize(cleaned))

In [13]:
# Clean every training article with processText.
df['content'] = df['content'].map(processText)

In [17]:
# Name of the pre-trained checkpoint that will be fine-tuned on the news data.

model_name = 'bert-base-multilingual-uncased'

# ktrain wrapper around the checkpoint; inputs are capped at 128 tokens.
transfer_model = text.Transformer(
    model_name,
    maxlen=128,
    class_names=['kitaifa', 'michezo', 'biashara', 'kimataifa', 'burudani'],
)

Downloading: 100%|██████████| 625/625 [00:00<00:00, 157kB/s]


In [20]:
# load early stopping from keras and model checkpoint which is used to load any best previous trained model and based on parameter of interest
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from sklearn.utils import class_weight

In [22]:
# Weight each class inversely to its frequency so that under-represented
# categories contribute a larger loss during training.
# compute_class_weight takes keyword-only arguments in current scikit-learn,
# so every argument is passed by name.
class_labels = np.unique(df['category'])
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=class_labels,
    y=df['category'],
)
# Keys are positions in the sorted label order returned by np.unique.
# NOTE(review): assumes the model's class indices follow that same sorted
# order; confirm against transfer_model.get_classes() after preprocessing.
class_weight_dict = dict(enumerate(class_weights))

In [35]:
from sklearn.utils import class_weight
from sklearn.linear_model import SGDClassifier

# NOTE(review): this classifier is not used in any cell visible here.
sgd = SGDClassifier(shuffle=False, random_state=42)

# Keep a handle on the label column (x_test is rebound inside the
# cross-validation loop), then display the frame without the article text.
# The drop is not assigned, so df itself is unchanged.
x_test = df['category']
df.drop(columns='content')


Unnamed: 0,id,category
0,SW0,kitaifa
1,SW1,biashara
2,SW10,kitaifa
3,SW100,michezo
4,SW1000,kitaifa
...,...,...
5146,SW993,kitaifa
5147,SW994,kitaifa
5148,SW996,kitaifa
5149,SW997,kitaifa


In [36]:
# Stop a fold's training once validation accuracy has not improved for three
# consecutive epochs, and restore the best weights seen in that fold.
es = EarlyStopping(monitor='val_accuracy', patience=3, verbose=1, restore_best_weights=True)

seed = 42     # kept as a notebook-level constant; not passed to StratifiedKFold (see below)
n_folds = 10  # number of cross-validation folds

from sklearn.model_selection import StratifiedKFold

# Stratified folds keep the class proportions of df['category'] in every split.
# random_state is omitted on purpose: it has no effect when shuffle=False, and
# current scikit-learn raises a ValueError if it is supplied without shuffling.
skf = StratifiedKFold(n_splits=n_folds, shuffle=False)

# Clean the articles to be classified with the same function that was applied
# to the training text, so the model sees identically prepared input at
# inference time. Done once here because it does not depend on the fold.
data = np.asarray(validation_set['content'].apply(processText))

# Start from an empty accumulator so that re-running this cell (for example
# after an interrupted run) does not add to stale sums.
valid_pred_ro = np.zeros((len(validation_set), 5))

n = 0  # number of folds whose model passed the accuracy threshold and contributed predictions

for train_index, test_index in skf.split(df['content'], df['category']):

    # split() yields positional indices, so select rows with .iloc.
    x_train = list(df['content'].iloc[train_index])
    x_test = list(df['content'].iloc[test_index])
    y_train = np.asarray(df['category'].iloc[train_index])
    y_test = np.asarray(df['category'].iloc[test_index])

    trn = transfer_model.preprocess_train(x_train, y_train)
    val = transfer_model.preprocess_test(x_test, y_test)

    # A fresh classifier per fold so no weights carry over between folds.
    model = transfer_model.get_classifier()
    learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)

    # NOTE(review): the recorded run reports "Epoch 1/1023" for these cycle
    # settings, so training length is effectively bounded by early stopping.
    history = learner.fit(
        1e-5,
        10,
        cycle_len=1,
        cycle_mult=2,
        class_weight=class_weight_dict,
        callbacks=[es],
        checkpoint_folder='/tmp',
    )

    learner.validate(class_names=transfer_model.get_classes())

    # Only folds whose best validation accuracy reached 80% contribute to the
    # averaged test predictions; weaker folds are skipped.
    if max(history.history['val_accuracy']) < 0.8:
        continue

    # The predictor needs the preprocessor that prepared the training data.
    predictor = ktrain.get_predictor(learner.model, preproc=transfer_model)
    print(predictor.get_classes())
    pred = predictor.predict(data, return_proba=True)
    n = n + 1

    valid_pred_ro += pred

# Average over the contributing folds; fail loudly instead of dividing by zero.
if n == 0:
    raise RuntimeError(
        "No fold reached a validation accuracy of 0.8, so there are no predictions to average."
    )
valid_pred_ro /= n

preprocessing train...
language: sw
train sequence lengths:
	mean : 346
	95percentile : 770
	99percentile : 1186


Downloading: 100%|██████████| 872k/872k [00:04<00:00, 214kB/s]
Downloading: 100%|██████████| 1.72M/1.72M [00:13<00:00, 128kB/s]
Downloading: 100%|██████████| 28.0/28.0 [00:00<00:00, 13.9kB/s]


Is Multi-Label? False
preprocessing test...
language: sw
test sequence lengths:
	mean : 357
	95percentile : 764
	99percentile : 1242


Downloading: 100%|██████████| 999M/999M [13:34<00:00, 1.23MB/s]


Epoch 1/1023
Epoch 2/1023
Epoch 3/1023
Epoch 4/1023
Epoch 5/1023
Epoch 6/1023

KeyboardInterrupt: 