In [58]:
import polars as pl
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from correlcon_ssdg_clf.helpers import multilabel_train_test_split
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score
import string

In [3]:
# Fetch the NLTK resources requested by this notebook; packages that are already
# present locally are reported as up-to-date rather than downloaded again.
nltk_resources = ["stopwords", "punkt_tab"]
nltk.download(nltk_resources)

[nltk_data] Downloading package stopwords to /home/jstet/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jstet/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
# Load the SDG Knowledge Hub articles. Per the preview below, each row carries the
# article text plus one 0/1 indicator column per goal (SDG-01 ... SDG-17).
sdg_csv_path = "../data/sdg_knowledge_hub.csv"
orig = pl.read_csv(sdg_csv_path)
orig.head(4)

url,title,type,text,date,sdgs,SDG-01,SDG-02,SDG-03,SDG-04,SDG-05,SDG-06,SDG-07,SDG-08,SDG-09,SDG-10,SDG-11,SDG-12,SDG-13,SDG-14,SDG-15,SDG-16,SDG-17
str,str,str,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""http://sdg.iisd.org/news/unece…","""UNECE Releases 44 Recommended …","""news""","""UNECE Releases 44 Recommended …","""10 September 2021""","""7 SDGs""",1,1,0,0,0,1,1,0,0,0,1,0,1,0,1,0,0
"""http://sdg.iisd.org/news/large…","""Largest Source of Lead Polluti…","""news""","""Largest Source of Lead Polluti…","""10 September 2021""","""2 SDGs""",0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
"""http://sdg.iisd.org/news/vnr-u…","""VNR Update: 15 Countries Plann…","""news""","""VNR Update: 15 Countries Plann…","""10 September 2021""","""Partnerships for the Goals""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
"""http://sdg.iisd.org/news/sdg-m…","""SDG Moment Will Provide Realit…","""news""","""SDG Moment Will Provide Realit…","""9 September 2021""","""17 SDGs""",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


### Splitting the data into a training and a testing set
- Common 80/20 split
- Shuffles the data based on a fixed random seed (`random_state=21`), so the split is reproducible
- Stratifies the split, meaning the labels will have similar shares in train and test, which is not trivial for multi label data (much complexity hidden in this function)

In [5]:
# Features: the raw article text. Targets: every column whose name matches ^SDG.*$.
text_frame = orig.select(pl.col("text"))
label_frame = orig.select(pl.col("^SDG.*$"))

# 80/20 split, stratified on the label columns. random_state is fixed
# (presumably seeding the shuffle inside the helper -- verify against its implementation).
X_train_raw, X_test_raw, y_train, y_test = multilabel_train_test_split(
    text_frame,
    label_frame,
    stratify=label_frame,
    test_size=0.2,
    train_size=0.8,
    random_state=21,
)

### Preprocessing and Tokenization
1. Remove punctuation, because it carries little information when word order is not considered:
    
    `"I love hiking in the mountains!"` $\rightarrow$ `"I love hiking in the mountains"`
2. Tokenizing, meaning splitting a text into units, in this case words separated by whitespace: 

    `"I love hiking in the mountains!"` $\rightarrow$ `[I,love,hiking,in,the,mountains]`
3. Reducing words to their stem, to unify different versions of a word:  

    `[I,love,hiking,in,the,mountains]` $\rightarrow$ `[I,lov,hik,in,the,mountain]`
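4. Removing stop words, meaning common words such as "the", which can be understood as noise in this context because they don't add much to the variance (note: the code below applies this filter *before* stemming and additionally drops tokens containing non-ASCII characters): 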

    `[I,lov,hik,in,the,mountain]` $\rightarrow$  `[lov,hik,mountain]`

In [6]:
nltk_stopwords = stopwords.words('english')
stemmer = PorterStemmer()

# Built once here instead of on every call / every token:
# - a frozenset gives constant-time stop-word lookups (the list required a linear scan per token)
# - the translation table deletes every character in string.punctuation
_STOPWORD_SET = frozenset(nltk_stopwords)
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def tokenizer(text):
    """Turn one document into a list of stemmed tokens.

    Steps, in the order they are applied:
      1. delete every character listed in ``string.punctuation``;
      2. split the remaining text on whitespace;
      3. drop tokens that are in the English stop-word list or contain
         non-ASCII characters;
      4. stem the surviving tokens with the Porter stemmer.

    Args:
        text: The document as a single string.

    Returns:
        A list of stemmed tokens in document order (empty if no token survives).
    """
    # NOTE(review): the stop-word test is an exact, case-sensitive match performed
    # after punctuation removal. It therefore relies on the caller lowercasing the
    # text beforehand, and any stop word that itself contains an apostrophe cannot
    # match its punctuation-stripped form -- confirm this is intended.
    stripped = text.translate(_PUNCT_TABLE)
    return [
        stemmer.stem(word)
        for word in stripped.split()
        if word not in _STOPWORD_SET and word.isascii()
    ]

### Feature Extraction
- The data in raw form was just text, but we want something we can do maths on
- Term Frequency – Inverse Document Frequency (TF-IDF) is a common approach to convert a tokenized text into a vector
    - It's a measure of the importance of a word to a document in a collection of texts (corpus)
    - Output vector has one value per unique word, or rather stem, contained in a corpus (limits to retained features can be set)
- The Vectorizer used here actually uses n-grams of stems instead of only single stems
    - n-grams are combinations of n consecutive tokens; a 2-gram in our case would be "(lov,hik)"
    - the parameter `ngram_range` takes a tuple, e.g. with (1,3) considering 1,2 and 3-grams as features
    - this aims to include some meaning of word combinations into the model

In [7]:
# Documentation https://scikit-learn.org/1.5/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
# token_pattern=None: the regex token pattern is not used when a custom tokenizer is
# supplied; passing None explicitly avoids scikit-learn's
# "token_pattern will not be used" warning.
vectorizer = TfidfVectorizer(
    stop_words=None,
    preprocessor=None,
    tokenizer=tokenizer,
    token_pattern=None,
    ngram_range=(1, 3),
    max_features=50000,
)

# Materialize the text columns once instead of calling .to_list() for every use.
train_texts = X_train_raw["text"].to_list()
test_texts = X_test_raw["text"].to_list()

print("Fitting..")
print("Extracting tf-idf vectors for Training data..")
# fit_transform learns vocabulary + idf weights and vectorizes the training texts in a
# single pass; a separate fit() followed by transform() runs the (expensive, stemming)
# tokenizer over the whole training corpus twice for the same result.
X_train = vectorizer.fit_transform(train_texts)
print("Extracting tf-idf vectors for testing data..")
# The test texts are only transformed, so vocabulary and idf come from the training data alone.
X_test = vectorizer.transform(test_texts)


Fitting..




Extracting tf-idf vectors for Training data..
Extracting tf-idf vectors for testing data..


### Binary Relevance with Logistic Regression
- The multi label classification task is transformed to one binary classifier per label, whose outputs are combined (problem transformation)
- Logistic Regression learns the relationship between the feature vector and the probability of the presence of label X.

In [8]:
# Binary Relevance: one independent LogisticRegression (default hyperparameters) per label.
# require_dense is [features, labels]. LogisticRegression accepts sparse feature
# matrices, so the tf-idf features are kept sparse instead of being expanded into a
# dense array with up to 50000 columns; the label vector is still passed dense.
clf = BinaryRelevance(
    classifier=LogisticRegression(),
    require_dense=[False, True],
)

In [9]:
# Train the Binary Relevance model on the tf-idf training features and the training labels.
clf.fit(X_train, y_train)

In [10]:
# Predict labels for the held-out test documents; used by the evaluation cells below.
y_pred = clf.predict(X_test)

### F1 Score
- The F1 score is the harmonic mean of precision and recall. 
- **Precision** measures the accuracy of positive predictions and is calculated as the ratio of true positives to the sum of true positives and false positives. 
- **Recall** measures the ability to identify all relevant instances and is calculated as the ratio of true positives to the sum of true positives and false negatives.

In [52]:
# Per-label precision / recall / F1 on the test set.
# zero_division=0 reports the same value as the default ("warn" also yields 0) but
# without emitting the undefined-metric warning when a score involves a division by zero.
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.80      0.47      0.59       513
           1       0.90      0.40      0.55       385
           2       0.85      0.38      0.52       390
           3       0.87      0.25      0.39       239
           4       0.92      0.36      0.52       361
           5       0.94      0.44      0.60       324
           6       0.91      0.48      0.63       373
           7       0.78      0.35      0.48       407
           8       0.80      0.24      0.36       318
           9       0.77      0.41      0.54       410
          10       0.91      0.30      0.45       305
          11       0.86      0.39      0.54       327
          12       0.88      0.69      0.77       740
          13       0.86      0.43      0.57       290
          14       0.90      0.47      0.62       440
          15       0.82      0.41      0.55       377
          16       0.80      0.74      0.77       897

   micro avg       0.85   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


The Macro F1 score evaluates the F1 score for each class independently and averages them, treating all classes equally.

In [60]:
# Macro average: unweighted mean of the per-label F1 scores.
macro_f1 = f1_score(y_test, y_pred, average="macro")
print(macro_f1)

0.5566252136529561
