### Important: Set up directories
Set `WORK_DIR` to the path to the repo in the cell below:

In [1]:
import os

# Root of the repository checkout. expanduser('~') is used rather than
# os.getenv("HOME") because getenv returns None when HOME is unset, which
# would make os.path.join raise a TypeError.
WORK_DIR = os.path.join(os.path.expanduser('~'), 'text-gnn')
# Relative paths used later in the notebook (e.g. the 'src' entry added to
# sys.path) are resolved against the repository root.
os.chdir(WORK_DIR)

In [2]:
import pandas as pd
%load_ext autoreload
%autoreload 2
import sys
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
import json
import jsonlines
import torch

sys.path.append('src')
from src.shared.utils import tokenize_prune_stem

## Helsinki Swahili Corpus
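Set the `DATASET_DIR_NAME` and `VOCAB_DIR_NAME` variables to the names of the directories under `results/` that hold the outputs of `create_dataset.py` and `download_stemming.py`, respectively.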

In [3]:
# Name of the directory under `results/` holding the output of the
# `create_dataset.py` script (`dataset.csv` is read from it)
DATASET_DIR_NAME = 'swahili-processed-v1'
# Name of the directory under `results/` holding the output of the
# `download_stemming.py` script (its `stemming/` subfolder is read)
VOCAB_DIR_NAME = 'hsc-dictionary'

### Documents
Explore document stats

In [4]:
# Locate the processed corpus and read its ';'-separated document table
dataset_dir = os.path.join(WORK_DIR, 'results', DATASET_DIR_NAME)
df = pd.read_csv(os.path.join(dataset_dir, 'dataset.csv'), delimiter=';')

In [5]:
# Number of distinct document classes, reported alongside the corpus size
n_classes = df['document_type'].nunique()
print(f'{df.shape[0]} total documents with {n_classes} classes')

457 total documents with 3 classes


In [6]:
# Documents per class; dropna=False also counts rows with a missing label, if any
df.document_type.value_counts(dropna=False)

news     221
bunge    199
books     37
Name: document_type, dtype: int64

In [7]:
# Whitespace-delimited word count of every document
df['n_words'] = df['document_content'].map(lambda text: len(text.split()))

In [8]:
# Summary statistics of the per-document word counts, rounded to one decimal
mean_words = round(df['n_words'].mean(), 1)
median_words = round(df['n_words'].median(), 1)

In [9]:
print(f'Mean words per document of {mean_words}')

Mean words per document of 52887.8


In [10]:
print(f'Median words per document of {median_words}')

Median words per document of 38732.0


In [11]:
# Corpus size in words, reported in millions
total_words = df['n_words'].sum()
print(f'{total_words / 1_000_000:.2f} million words total')

24.17 million words total


In [12]:
## File size is 160.1 MB

### Vocab
Explore vocabulary stats

In [13]:
stemming_dir = os.path.join(WORK_DIR, 'results', VOCAB_DIR_NAME, 'stemming')

In [3]:
def get_words_in_vocab(path: str, count_threshold: int) -> list:
    """Return the words of a JSON word-count file that reach a minimum count.

    Args:
        path: Path to a JSON file containing an object that maps each word
            to its count.
        count_threshold: Minimum count (inclusive) a word needs to be kept.

    Returns:
        The qualifying words, in the order they are stored in the file.
    """
    # JSON is UTF-8 by specification; name the encoding explicitly so the
    # result does not depend on the platform's default locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        vocab_counts = json.load(f)
    return [word for word, count in vocab_counts.items() if count >= count_threshold]

In [15]:
# Size of the raw vocabulary, first unfiltered and then restricted to words
# seen at least twice; `unstemmed_vocab` ends up holding the filtered list.
unstemmed_vocab_path = os.path.join(stemming_dir, 'vocab_counts.json')
for min_count, qualifier in ((1, ''), (2, ' which occur at least twice')):
    unstemmed_vocab = get_words_in_vocab(unstemmed_vocab_path, count_threshold=min_count)
    print(f'{len(unstemmed_vocab)} unique words in the unstemmed vocabulary{qualifier}')

434449 unique words in the unstemmed vocabulary
212920 unique words in the unstemmed vocabulary which occur at least twice


In [19]:
# TODO: Still need to generate the cleaned HSC vocab and stemming map
stemmed_vocab_path = os.path.join(stemming_dir, 'cleaned_vocab_counts.json')
stemmed_vocab = get_words_in_vocab(stemmed_vocab_path, count_threshold=1)
print(f'{len(stemmed_vocab)} unique words in the stemmed vocabulary')
stemmed_vocab_2 = get_words_in_vocab(stemmed_vocab_path, count_threshold=2)
# Report the thresholded list (stemmed_vocab_2), not stemmed_vocab a second time
print(f'{len(stemmed_vocab_2)} unique words in the stemmed vocabulary which occur at least twice')

As a sanity check, this count should match the number of distinct words obtained by applying stemming and cleaning to the vocabulary directly, which the next cell computes.

In [17]:
# TODO: Requires cell above
stemmer_path = os.path.join(stemming_dir, 'stemming_cleaned.json')
# JSON is UTF-8 by specification; do not rely on the platform's default encoding
with open(stemmer_path, 'r', encoding='utf-8') as f:
    stemming_map = json.load(f)

# tokenize_prune_stem returns an iterable of tokens for each word (behaviour
# defined in src.shared.utils); flatten the results into a single list.
stemmed_words = [
    token
    for word in stemmed_vocab_2
    for token in tokenize_prune_stem(word, stemming_map)
]
print(f'{len(set(stemmed_words))} words in vocab after applying stemming and pruning')

In [18]:
# stemmer_path = os.path.join(stemming_dir, 'stemming_cleaned.json')
# with open(stemmer_path,'r') as f:
#     stemming_map = json.load(f)

In [19]:
# stemmed_words = [stemming_map[word] for word in stemmed_vocab]
# print(f'{len(set(stemmed_words))} words in vocab after applying stemming')

## Zenodo Swahili News Corpus
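Set the `DATASET_DIR_NAME` and `VOCAB_DIR_NAME` variables to the names of the directories under `results/` that hold the outputs of `create_dataset.py` and `download_stemming.py`, respectively.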

In [4]:
# Name of the directory under `results/` holding the output of the
# `create_dataset.py` script (`dataset.csv` and the label files are read from it)
DATASET_DIR_NAME = 'zenodo-processed-data-v2'
# Name of the directory under `results/` holding the output of the
# `download_stemming.py` script (its `stemming/` subfolder is read)
VOCAB_DIR_NAME = 'z-news-dictionary-ct2-v2'

### Documents
Explore document stats

In [5]:
# Locate the processed corpus and read its ';'-separated document table
dataset_dir = os.path.join(WORK_DIR, 'results', DATASET_DIR_NAME)
df = pd.read_csv(os.path.join(dataset_dir, 'dataset.csv'), delimiter=';')

In [6]:
# Number of distinct document classes, reported alongside the corpus size
n_classes = df['document_type'].nunique()
print(f'{df.shape[0]} total documents with {n_classes} classes')

23266 total documents with 6 classes


In [7]:
# Documents per class; dropna=False also counts rows with a missing label, if any
df.document_type.value_counts(dropna=False)

kitaifa      10242
michezo       6003
burudani      2229
uchumi        2027
kimataifa     1906
afya           859
Name: document_type, dtype: int64

In [10]:
df

Unnamed: 0,id/path,document_content,document_type,document_idx,label_idx
0,SW4670,"bodi ya utalii tanzania (ttb) imesema, itafany...",uchumi,0,0
1,SW30826,"pendo fundisha-mbeya rais dk. john magufuri, a...",kitaifa,1,1
2,SW29725,mwandishi wetu -singida benki ya nmb imetoa ms...,uchumi,2,0
3,SW20901,"timu ya taifa ya tanzania, serengeti boys jana...",michezo,3,2
4,SW12560,na agatha charles - dar es salaam aliyekuwa ka...,kitaifa,4,1
...,...,...,...,...,...
23261,SW24920,alitoa pongezi hizo alipozindua rasmi hatua ya...,uchumi,23261,0
23262,SW4038,na nora damian-dar es salaam tekla (si jina l...,kitaifa,23262,1
23263,SW16649,"mkuu wa mkoa wa njombe, dk rehema nchimbi waka...",uchumi,23263,0
23264,SW23291,"mabingwa wa ligi kuu soka tanzania bara, simba...",michezo,23264,2


In [15]:
# Length of every document as given by len() on its content
df['doc_length'] = df['document_content'].map(len)

In [13]:
# Per-class means of the numeric columns. numeric_only=True is required on
# pandas >= 2.0, where averaging the string columns raises a TypeError
# instead of silently dropping them.
df.groupby(['document_type']).mean(numeric_only=True)

Unnamed: 0_level_0,document_idx,label_idx,doc_length
document_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
afya,11258.990687,5.0,3041.287544
burudani,11765.022432,4.0,1445.668013
kimataifa,11535.479538,3.0,1984.798006
kitaifa,11714.516696,1.0,2618.339582
michezo,11572.384641,2.0,1816.158921
uchumi,11499.905279,0.0,2070.51258


In [14]:
# Per-class medians of the numeric columns. numeric_only=True is required on
# pandas >= 2.0, where taking the median of the string columns raises a
# TypeError instead of silently dropping them.
df.groupby(['document_type']).median(numeric_only=True)

Unnamed: 0_level_0,document_idx,label_idx,doc_length
document_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
afya,10963.0,5.0,2252.0
burudani,11593.0,4.0,893.0
kimataifa,11401.5,3.0,1542.0
kitaifa,11814.0,1.0,2148.5
michezo,11566.0,2.0,1577.0
uchumi,11555.0,0.0,2075.0


In [8]:
# Load the label tensors of the three splits stored next to the dataset
train_labels, val_labels, test_labels = (
    torch.load(os.path.join(dataset_dir, filename))
    for filename in ('train-labels.pt', 'val-labels.pt', 'test-labels.pt')
)

print(f'Number of train samples: {train_labels.size(0)}')
print(f'Number of val samples: {val_labels.size(0)}')
print(f'Number of test samples: {test_labels.size(0)}')

# labels.json maps label name -> label index; build the inverse lookup
labels_map_path = os.path.join(dataset_dir, 'labels.json')
with open(labels_map_path, 'r') as f:
    labels_map = json.load(f)
reverse_labels_map = dict(zip(labels_map.values(), labels_map.keys()))

Number of train samples: 18612
Number of val samples: 2327
Number of test samples: 2327


In [10]:
# Number of training documents per class, keyed by class name
categories, counts = train_labels.unique(return_counts=True)
train_counts = dict(
    (reverse_labels_map[cat], count)
    for cat, count in zip(categories.tolist(), counts.tolist())
)
print(f'Train doc counts per class: {train_counts}')

Train doc counts per class: {'uchumi': 1622, 'kitaifa': 8193, 'michezo': 4802, 'kimataifa': 1525, 'burudani': 1783, 'afya': 687}


In [11]:
# Number of validation documents per class, keyed by class name
categories, counts = val_labels.unique(return_counts=True)
val_counts = {reverse_labels_map[cat]: count for cat, count in zip(categories.tolist(), counts.tolist())}
# A single labelled print, matching the train and test cells
print(f'Val doc counts per class: {val_counts}')

{'uchumi': 202, 'kitaifa': 1024, 'michezo': 601, 'kimataifa': 191, 'burudani': 223, 'afya': 86}
Val doc counts per class: {'uchumi': 202, 'kitaifa': 1024, 'michezo': 601, 'kimataifa': 191, 'burudani': 223, 'afya': 86}


In [12]:
# Number of test documents per class, keyed by class name
categories, counts = test_labels.unique(return_counts=True)
test_counts = dict(
    (reverse_labels_map[cat], count)
    for cat, count in zip(categories.tolist(), counts.tolist())
)
print(f'Test doc counts per class: {test_counts}')

Test doc counts per class: {'uchumi': 203, 'kitaifa': 1025, 'michezo': 600, 'kimataifa': 190, 'burudani': 223, 'afya': 86}


In [13]:
# Whitespace-delimited word count per document, plus its mean and median
# rounded to one decimal
df['n_words'] = df['document_content'].map(lambda text: len(text.split()))
mean_words = round(df['n_words'].mean(), 1)
median_words = round(df['n_words'].median(), 1)

In [14]:
print(f'Mean words per document of {mean_words}')

Mean words per document of 332.2


In [15]:
print(f'Median words per document of {median_words}')

Median words per document of 275.0


In [16]:
# Corpus size in words, reported in millions
total_words = df['n_words'].sum()
print(f'{total_words / 1_000_000:.2f} million words total')

7.73 million words total


In [28]:
## File size is 52.3 MB

### Vocab
Explore vocabulary stats


In [17]:
stemming_dir = os.path.join(WORK_DIR, 'results', VOCAB_DIR_NAME, 'stemming')

In [18]:
# Size of the raw vocabulary, first unfiltered and then restricted to words
# seen at least twice; `unstemmed_vocab` ends up holding the filtered list.
unstemmed_vocab_path = os.path.join(stemming_dir, 'vocab_counts.json')
for min_count, qualifier in ((1, ''), (2, ' which occur at least twice')):
    unstemmed_vocab = get_words_in_vocab(unstemmed_vocab_path, count_threshold=min_count)
    print(f'{len(unstemmed_vocab)} unique words in the unstemmed vocabulary{qualifier}')

197619 unique words in the unstemmed vocabulary
95504 unique words in the unstemmed vocabulary which occur at least twice


In [19]:
# Vocabulary read from cleaned_vocab_counts.json, unfiltered and then
# restricted to words that occur at least twice
stemmed_vocab_path = os.path.join(stemming_dir, 'cleaned_vocab_counts.json')
stemmed_vocab = get_words_in_vocab(stemmed_vocab_path, count_threshold=1)
print(f'{len(stemmed_vocab)} unique words in the stemmed vocabulary')
stemmed_vocab_2 = get_words_in_vocab(stemmed_vocab_path, count_threshold=2)
# Report the thresholded list (stemmed_vocab_2), not stemmed_vocab a second time
print(f'{len(stemmed_vocab_2)} unique words in the stemmed vocabulary which occur at least twice')

34467 unique words in the stemmed vocabulary
34467 unique words in the unstemmed vocabulary which occur at least twice


We now consider the words that remain after removing non-alphabetic tokens, single-character words, and stopwords.

In [20]:
stemmer_path = os.path.join(stemming_dir, 'stemming_cleaned.json')
# JSON is UTF-8 by specification; do not rely on the platform's default encoding
with open(stemmer_path, 'r', encoding='utf-8') as f:
    stemming_map = json.load(f)

# tokenize_prune_stem returns an iterable of tokens for each word (behaviour
# defined in src.shared.utils); flatten the results into a single list.
stemmed_words = [
    token
    for word in stemmed_vocab_2
    for token in tokenize_prune_stem(word, stemming_map)
]
print(f'{len(set(stemmed_words))} words in vocab after applying stemming and pruning')

31852 words in vocab after applying stemming and pruning
