In [1]:
# Show the GPU attached to this runtime (model, driver/CUDA versions, memory usage)
!nvidia-smi

Sun Feb 28 17:29:50 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.39       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Upgrade pip and install ktrain
!pip -qq install -U pip
!pip -qq install ktrain

[K     |████████████████████████████████| 1.5MB 7.6MB/s 
[K     |████████████████████████████████| 25.3 MB 73 kB/s 
[K     |████████████████████████████████| 6.8 MB 62.4 MB/s 
[K     |████████████████████████████████| 981 kB 60.1 MB/s 
[K     |████████████████████████████████| 263 kB 53.8 MB/s 
[K     |████████████████████████████████| 1.3 MB 31.5 MB/s 
[K     |████████████████████████████████| 1.2 MB 56.9 MB/s 
[K     |████████████████████████████████| 468 kB 53.4 MB/s 
[K     |████████████████████████████████| 1.1 MB 60.7 MB/s 
[K     |████████████████████████████████| 883 kB 61.2 MB/s 
[K     |████████████████████████████████| 2.9 MB 65.4 MB/s 
[?25h  Building wheel for ktrain (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Building wheel for keras-bert (setup.py) ... [?25l[?25hdone
  Building wheel for keras-transformer (setup.py) ... [?25l[?25hdone
  Building wheel for keras-embed-sim (setup.py) ... [?25l[?25hdone
  

In [3]:
# Download data
!gdown --id 1gLdjbkhHVZd2RVP7k1lCR2UyBu0YPcsK
!unzip -q '/content/aai4_data.zip'

Downloading...
From: https://drive.google.com/uc?id=1gLdjbkhHVZd2RVP7k1lCR2UyBu0YPcsK
To: /content/aai4_data.zip
23.6MB [00:00, 57.1MB/s]


In [4]:
# Import libraries
import numpy as np 
import pandas as pd
import random
import os
import re
import ktrain
from ktrain import text
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
import warnings
warnings.filterwarnings('ignore')

In [5]:
# Seed shared by every random number generator used in this notebook
SEED = 3031

def set_seeds(seed=SEED):
    """Seed the NumPy, TensorFlow and Python random number generators.

    Also writes the seed to the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to every generator (defaults to ``SEED``).
    """
    np.random.seed(seed)
    tf.random.set_seed(seed)
    random.seed(seed)
    os.environ.update(PYTHONHASHSEED=str(seed))

def set_global_determinism(seed=SEED):
    """Seed all generators and switch TensorFlow to its deterministic settings.

    Calls ``set_seeds``, sets the ``TF_DETERMINISTIC_OPS`` and
    ``TF_CUDNN_DETERMINISTIC`` environment flags to ``'1'`` and limits
    TensorFlow to one inter-op and one intra-op thread.

    Args:
        seed: Integer seed forwarded to ``set_seeds`` (defaults to ``SEED``).
    """
    set_seeds(seed=seed)

    # Environment flags asking TensorFlow / cuDNN for deterministic kernels
    for flag in ('TF_DETERMINISTIC_OPS', 'TF_CUDNN_DETERMINISTIC'):
        os.environ[flag] = '1'

    # Single-threaded op scheduling
    threading_cfg = tf.config.threading
    threading_cfg.set_inter_op_parallelism_threads(1)
    threading_cfg.set_intra_op_parallelism_threads(1)

# Apply the settings once, before any data loading or model code runs
set_global_determinism(seed=SEED)

In [6]:
# Read the train, test and sample-submission CSV files extracted earlier
train, test, sample = map(
    pd.read_csv,
    (
        '/content/aai4_data/train.csv',
        '/content/aai4_data/test.csv',
        '/content/aai4_data/sample_submission.csv',
    ),
)

In [7]:
# Show the final five rows of the test set
test.tail(n=5)

Unnamed: 0,id,content
7751,SW18887,\n\n \nNa Ibrahim Yassin-Nkasi\n \n\tMWANAFUNZ...
7752,SW23779,BAADA ya R. Kelly kukumbwa na\nkashfa ya unyan...
7753,SW20243,\n\tNa JUDITH NYANGE-MWANZA\n \n\n \n\tKAMPUNI...
7754,SW27943,"WAZIRI wa Ulinzi, Dk Hussein Mwinyi vijana wa..."
7755,SW22906,WAANDISHI WETU – DAR/MIKOANI KASI ya virusi vy...


In [8]:
# Collapse every run of whitespace (spaces, new lines, tabs) into a single space
# and strip leading/trailing whitespace from each article.
# The pattern is a raw string: '\s' inside a normal string literal is an invalid
# escape sequence. The vectorised `.str` methods replace the per-row lambda and
# leave missing values as missing instead of raising.
train['content'] = train['content'].str.replace(r'\s+', ' ', regex=True).str.strip()
test['content'] = test['content'].str.replace(r'\s+', ' ', regex=True).str.strip()
test.tail()

Unnamed: 0,id,content
7751,SW18887,Na Ibrahim Yassin-Nkasi MWANAFUNZI wa kidato c...
7752,SW23779,BAADA ya R. Kelly kukumbwa na kashfa ya unyany...
7753,SW20243,Na JUDITH NYANGE-MWANZA KAMPUNI ya Ujenzi wa N...
7754,SW27943,"WAZIRI wa Ulinzi, Dk Hussein Mwinyi vijana wan..."
7755,SW22906,WAANDISHI WETU – DAR/MIKOANI KASI ya virusi vy...


In [9]:
# Training configuration
MODEL_NAME = 'xlm-roberta-base'  # pretrained model identifier given to text.Transformer
MAX_LEN = 256                    # maxlen given to text.Transformer
BATCH_SIZE = 16
FOLDS = 3                        # number of cross-validation splits
LR = 3e-5                        # learning rate passed to learner.fit
EPOCHS = 2                       # passed to learner.fit as n_cycles

# Sorted unique labels: ['afya', 'burudani', 'kimataifa', 'kitaifa', 'michezo', 'uchumi']
CLASS_NAMES = train['category'].unique().tolist()
CLASS_NAMES.sort()

# Build the ktrain Transformer wrapper from the settings above
t = text.Transformer(
    model_name=MODEL_NAME,
    maxlen=MAX_LEN,
    class_names=CLASS_NAMES,
    batch_size=BATCH_SIZE,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=512.0, style=ProgressStyle(description_…




In [10]:
%%time
# Prepare test data
test_data = np.asarray(test.content)

# Set number of folds to 3
folds = StratifiedKFold(n_splits=FOLDS, random_state=SEED, shuffle=False)

# List to store predictions and loss-score per fold
oof_preds = []
oof_loss_score = []

for train_index, test_index in folds.split(train.content, train.category):
  X_train, X_test = list(train.loc[train_index, 'content']), list(train.loc[test_index, 'content'])
  y_train, y_test = np.asarray(train.loc[train_index, 'category']), np.asarray(train.loc[test_index, 'category'])

  # Preprocess training and validation data
  train_set = t.preprocess_train(X_train, y_train)
  val_set = t.preprocess_test(X_test, y_test)

  # Instantiate model
  model = t.get_classifier()
  learner = ktrain.get_learner(model, train_data=train_set, val_data=val_set, batch_size=BATCH_SIZE)

  # Train model
  history = learner.fit(LR, n_cycles=EPOCHS, checkpoint_folder='/tmp')
  learner.validate(class_names=t.get_classes())

  # Append score of each fold
  oof_loss_score.append(history.history['val_loss'][-1])

  # Make predictions
  preds = ktrain.get_predictor(learner.model, preproc=t).predict(test_data, return_proba=True)

  # Append preds to oof_preds list
  oof_preds.append(preds)

# Check cv score and prepare submission file
print(f'Mean Loss: {np.mean(oof_loss_score)}')
sub = pd.DataFrame(np.mean(oof_preds, axis=0), columns = t.get_classes())
sub['test_id'] = test.id
sub = sub[sample.columns]
sub.to_csv('Submission.csv', index = False)

preprocessing train...
language: sw
train sequence lengths:
	mean : 333
	95percentile : 792
	99percentile : 1268


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=512.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




Is Multi-Label? False
preprocessing test...
language: sw
test sequence lengths:
	mean : 331
	95percentile : 768
	99percentile : 1234


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1885418496.0, style=ProgressStyle(descr…


Epoch 1/2
Epoch 2/2
              precision    recall  f1-score   support

        afya       0.61      0.42      0.50       286
    burudani       0.92      0.89      0.90       743
   kimataifa       0.93      0.89      0.91       635
     kitaifa       0.90      0.94      0.92      3414
     michezo       0.95      0.97      0.96      2002
      uchumi       0.92      0.84      0.88       676

    accuracy                           0.91      7756
   macro avg       0.87      0.83      0.85      7756
weighted avg       0.91      0.91      0.91      7756

preprocessing train...
language: sw
train sequence lengths:
	mean : 332
	95percentile : 782
	99percentile : 1279


Is Multi-Label? False
preprocessing test...
language: sw
test sequence lengths:
	mean : 334
	95percentile : 787
	99percentile : 1239


Epoch 1/2
Epoch 2/2
              precision    recall  f1-score   support

        afya       0.58      0.50      0.53       286
    burudani       0.92      0.90      0.91       743
   kimataifa       0.90      0.87      0.89       636
     kitaifa       0.91      0.93      0.92      3414
     michezo       0.94      0.97      0.96      2001
      uchumi       0.92      0.83      0.88       676

    accuracy                           0.91      7756
   macro avg       0.86      0.83      0.85      7756
weighted avg       0.91      0.91      0.91      7756

preprocessing train...
language: sw
train sequence lengths:
	mean : 332
	95percentile : 778
	99percentile : 1238


Is Multi-Label? False
preprocessing test...
language: sw
test sequence lengths:
	mean : 333
	95percentile : 794
	99percentile : 1299


Epoch 1/2
Epoch 2/2
              precision    recall  f1-score   support

        afya       0.55      0.66      0.60       287
    burudani       0.88      0.95      0.91       743
   kimataifa       0.94      0.86      0.90       635
     kitaifa       0.93      0.90      0.91      3414
     michezo       0.93      0.99      0.96      2001
      uchumi       0.92      0.82      0.86       676

    accuracy                           0.91      7756
   macro avg       0.86      0.86      0.86      7756
weighted avg       0.91      0.91      0.91      7756

Mean Loss: 0.2963431775569916
CPU times: user 24min 1s, sys: 13min 2s, total: 37min 3s
Wall time: 2h 9min 20s
