In [7]:
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh


In [3]:
import sys
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "azure"

import my_secrets as sc
import settings as st

if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf


import os
if my_run == "azure":
    if not os.path.exists(cf.HUGGING_CACHE):
        os.mkdir(cf.HUGGING_CACHE)
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE


# setup environment GEITje-7B Finetuning
# - pip install torch
# - pip install datasets
# - pip install transformers
# - pip install trl
# - pip install accelerate (restart after)
# - switch device_map='auto' to avaoid memory error

# - pip install sentencepiece
# - pip install jupyter
# - pip install protobuf 



## Notebook overview
This notebook creates predictions for the baseline models. In total, five models are tried out.
- Training function. Given a baseline model, will return scores.
- Load Data. Load all the documents, and set parameters.
- save predictions


Kernel: Pytorch and Tensorflow

### Load file with training funcation


In [4]:
import sys
sys.path.append('../scripts/') 
import baseline as bf

### Load data

In [9]:
from collections import Counter
import pandas as pd

# df = pd.read_pickle(f"{cf.output_path}/txtfiles.pkl")
df = pd.read_pickle(f"{cf.output_path}/txtfiles_tokenizer.pkl")

print(Counter(df['2split']))
print(Counter(df['4split']))

#set  variables, same for each model
TRAIN_SET = 'train' # must be dev or train
TEST_SET = 'test' # must be val or test
SPLIT_COLUMN = '4split' #must be either 2split or 4split. 2split = data split into train and test. 4split = data split into train, test, dev and val. 
TEXT_COLUMN = 'text'
LABEL_COLUMN = 'label'
DATAFRAME = df.copy()
PATH = f"{cf.output_path}/predictions/baselinePredictions.pkl"
OVERVIEW_PATH = f"{cf.output_path}/overview/baselineOverview.pkl"

Counter({'train': 16445, 'test': 4373})
Counter({'train': 15613, 'test': 4164, 'dev': 832, 'val': 209})


### Baselines

##### Baseline 1: linear SVM+tf-idf

In [10]:
from sklearn.svm import LinearSVC
linear_svm = bf.run_baseline(LinearSVC(), 'LinearSVC' , DATAFRAME, SPLIT_COLUMN, TRAIN_SET, TEST_SET,TEXT_COLUMN, LABEL_COLUMN, PATH, OVERVIEW_PATH)

                      precision    recall  f1-score   support

         Actualiteit       0.98      0.84      0.90       152
              Agenda       0.98      0.98      0.98       528
             Besluit       0.96      0.94      0.95       113
               Brief       0.91      0.88      0.90       206
          Factsheets       0.91      0.47      0.62        45
               Motie       0.98      0.98      0.98      1545
   Onderzoeksrapport       0.80      0.97      0.88       222
          Raadsadres       0.91      0.98      0.94       313
        Raadsnotulen       1.00      1.00      1.00        42
Schriftelijke Vragen       0.99      0.95      0.97       603
          Voordracht       1.00      1.00      1.00       395

            accuracy                           0.96      4164
           macro avg       0.95      0.91      0.92      4164
        weighted avg       0.96      0.96      0.96      4164



##### Baseline 2: Naive Bayes+tf-idf

In [11]:
from sklearn.naive_bayes import MultinomialNB
naive_bayes = bf.run_baseline(MultinomialNB(), 'MultinomialNB' , DATAFRAME, SPLIT_COLUMN, TRAIN_SET, TEST_SET,TEXT_COLUMN, LABEL_COLUMN, PATH, OVERVIEW_PATH)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                      precision    recall  f1-score   support

         Actualiteit       0.00      0.00      0.00       152
              Agenda       0.98      0.92      0.95       528
             Besluit       0.00      0.00      0.00       113
               Brief       0.00      0.00      0.00       206
          Factsheets       0.00      0.00      0.00        45
               Motie       0.46      1.00      0.63      1545
   Onderzoeksrapport       0.38      0.06      0.11       222
          Raadsadres       1.00      0.00      0.01       313
        Raadsnotulen       0.00      0.00      0.00        42
Schriftelijke Vragen       1.00      0.05      0.09       603
          Voordracht       0.99      0.66      0.79       395

            accuracy                           0.56      4164
           macro avg       0.44      0.24      0.23      4164
        weighted avg       0.63      0.56      0.45      4164



  _warn_prf(average, modifier, msg_start, len(result))


precision, recall and f1-score equal to zero occurs if there are not True Positives. Meaning for those classes not one document is correctly predicted. 

##### Baseline 3: Logistic Regression + tf-idf

In [12]:
from sklearn.linear_model import LogisticRegression
log_reg = bf.run_baseline(LogisticRegression(), 'LogisticRegression' , DATAFRAME, SPLIT_COLUMN, TRAIN_SET, TEST_SET,TEXT_COLUMN, LABEL_COLUMN, PATH, OVERVIEW_PATH)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                      precision    recall  f1-score   support

         Actualiteit       0.99      0.73      0.84       152
              Agenda       0.98      0.98      0.98       528
             Besluit       0.95      0.87      0.91       113
               Brief       0.90      0.87      0.88       206
          Factsheets       0.92      0.49      0.64        45
               Motie       0.97      0.98      0.98      1545
   Onderzoeksrapport       0.78      0.92      0.85       222
          Raadsadres       0.87      0.96      0.91       313
        Raadsnotulen       1.00      0.98      0.99        42
Schriftelijke Vragen       0.98      0.95      0.96       603
          Voordracht       1.00      0.99      1.00       395

            accuracy                           0.95      4164
           macro avg       0.94      0.88      0.90      4164
        weighted avg       0.95      0.95      0.95      4164



##### Baseline 4: k Nearest Neigbors + tf-idf

In [13]:
from sklearn.neighbors import KNeighborsClassifier
knn = bf.run_baseline(KNeighborsClassifier(), 'KNeighborsClassifier' , DATAFRAME, SPLIT_COLUMN, TRAIN_SET, TEST_SET,TEXT_COLUMN, LABEL_COLUMN, PATH, OVERVIEW_PATH)

                      precision    recall  f1-score   support

         Actualiteit       0.73      0.39      0.51       152
              Agenda       0.89      0.98      0.93       528
             Besluit       0.81      0.79      0.80       113
               Brief       0.35      0.39      0.37       206
          Factsheets       0.68      0.38      0.49        45
               Motie       0.89      0.68      0.77      1545
   Onderzoeksrapport       0.23      0.86      0.36       222
          Raadsadres       0.87      0.22      0.35       313
        Raadsnotulen       0.15      1.00      0.25        42
Schriftelijke Vragen       0.71      0.49      0.58       603
          Voordracht       0.95      0.82      0.88       395

            accuracy                           0.66      4164
           macro avg       0.66      0.64      0.57      4164
        weighted avg       0.79      0.66      0.68      4164



##### Baseline 5: RandomForest + tf-idf

In [14]:
from sklearn.ensemble import RandomForestClassifier
random_forest = bf.run_baseline(RandomForestClassifier(), 'RandomForestClassifier' , DATAFRAME, SPLIT_COLUMN, TRAIN_SET, TEST_SET,TEXT_COLUMN, LABEL_COLUMN, PATH, OVERVIEW_PATH)

                      precision    recall  f1-score   support

         Actualiteit       1.00      0.62      0.76       152
              Agenda       0.96      0.98      0.97       528
             Besluit       0.99      0.83      0.90       113
               Brief       0.89      0.85      0.87       206
          Factsheets       1.00      0.27      0.42        45
               Motie       0.96      0.98      0.97      1545
   Onderzoeksrapport       0.80      0.91      0.85       222
          Raadsadres       0.80      0.94      0.87       313
        Raadsnotulen       1.00      0.98      0.99        42
Schriftelijke Vragen       0.97      0.95      0.96       603
          Voordracht       0.99      1.00      0.99       395

            accuracy                           0.94      4164
           macro avg       0.94      0.85      0.87      4164
        weighted avg       0.94      0.94      0.94      4164



### Overview of all runs

In [None]:
overview = pd.read_pickle(OVERVIEW_PATH)
display(overview)