In [1]:
# Run the blobfuse shell script for the "raadsinformatie" storage before anything else.
# NOTE(review): presumably this mounts the blob container that the data paths point at — confirm.
# The path is absolute and Azure-specific, and this cell runs regardless of the `my_run` setting below.
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh


In [1]:
import os
import sys

# Make the parent directory importable; presumably that is where my_secrets,
# settings and the config modules imported below live.
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "azure"

import my_secrets as sc
import settings as st

# Pick the configuration module that matches the run location. An unsupported
# value is rejected here; otherwise `cf` would stay undefined and the first
# later use of it would fail with a confusing NameError.
if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf
else:
    raise ValueError(f'my_run must be "azure" or "local", got {my_run!r}')

if my_run == "azure":
    # makedirs(exist_ok=True) also creates missing parent directories and does
    # not fail when the directory already exists, so re-running is safe.
    os.makedirs(cf.HUGGING_CACHE, exist_ok=True)
    # Point the TRANSFORMERS_CACHE environment variable at that directory.
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE


# setup environment GEITje-7B Finetuning
# - pip install torch
# - pip install datasets
# - pip install transformers
# - pip install trl
# - pip install accelerate (restart after)
# - switch device_map='auto' to avoid memory error

# - pip install sentencepiece
# - pip install jupyter
# - pip install protobuf 



## Notebook overview
This notebook creates predictions for the baseline models. In total, five models are tried out.
- Training function. Given a baseline model, it returns the scores.
- Load data. Load all the documents and set the parameters.
- Save predictions.


Kernel: Pytorch and Tensorflow

### Load file with training function


In [2]:
import sys
# Extend the import path with the sibling scripts directory. This has to happen
# before the import below; presumably baseline.py lives in ../scripts/ — confirm.
sys.path.append('../scripts/') 
# Module providing run_baseline, used by every baseline cell further down.
import baseline as bf

### Load data

In [3]:
import pandas as pd

# Read the document frame from the "txtfiles_tokenizer" pickle in the output directory.
# (An earlier variant of this cell read f"{cf.output_path}/txtfiles.pkl" instead.)
_tokenized_docs_path = f"{cf.output_path}/txtfiles_tokenizer.pkl"
df = pd.read_pickle(_tokenized_docs_path)

In [4]:
from collections import Counter

# Make ../scripts/ importable. The membership check keeps re-runs of this cell
# from appending the same entry to sys.path over and over.
if '../scripts/' not in sys.path:
    sys.path.append('../scripts/')
import baseline as bf
from truncation import add_truncation_column

# Number of documents per partition for each of the three split columns.
print(Counter(df['2split']))
print(Counter(df['4split']))
print(Counter(df['balanced_split']))

# Variables below are the same for each model.
#
# SPLIT_COLUMN is the column that stores the data split of every row. It must be
# '2split', '4split' or 'balanced_split'.
#   2split = data split into train and test.
#   4split = data split into train, test, dev and val.
# TRAIN_SET and TEST_SET are values of that column: every row whose SPLIT_COLUMN
# value equals TRAIN_SET is selected as the training set, and likewise every row
# whose value equals TEST_SET is selected as the test set.
SPLIT_COLUMN = 'balanced_split'
TRAIN_SET = 'train'  # must be dev or train
TEST_SET = 'test'  # must be val or test

TEXT_COLUMN = 'text'  # column where the text is
LABEL_COLUMN = 'label'  # column with truth label
DATAFRAME = df.copy()  # df where each row is a doc
PATH = f"{cf.output_path}/predictionsFinal/baselines/predictions.pkl"  # path where each individual prediction is saved
OVERVIEW_PATH = f"{cf.output_path}/predictionsFinal/baselines/overview.pkl"  # path where score and extra data about run is saved

# Needed for the truncation experiment on the baselines.
TRUNC_COLUMN = 'trunc_txt'
TOKENS_COL = 'LlamaTokens'
THRESHOLD_COMBINATIONS = [(100, 0), (200, 0), (100, 100)]  # (front_threshold, back_threshold) pairs

# Fail fast on a misconfigured run: a typo in a column or partition name would
# otherwise only show up later, e.g. as an empty train or test selection.
missing_columns = {SPLIT_COLUMN, TEXT_COLUMN, LABEL_COLUMN} - set(DATAFRAME.columns)
assert not missing_columns, f"DATAFRAME lacks required column(s): {missing_columns}"
available_partitions = set(DATAFRAME[SPLIT_COLUMN])
assert {TRAIN_SET, TEST_SET} <= available_partitions, (
    f"TRAIN_SET={TRAIN_SET!r} / TEST_SET={TEST_SET!r} must both occur in column "
    f"{SPLIT_COLUMN!r}; found partitions: {available_partitions}"
)

Counter({'train': 16445, 'test': 4373})
Counter({'train': 15613, 'test': 4164, 'dev': 832, 'val': 209})
Counter({'train': 9900, 'discard': 8718, 'test': 1100, 'val': 1100})


In [5]:
def run_truncation_on_baselines(baseline_function, model_name):
    """Run one baseline once for every pair in THRESHOLD_COMBINATIONS.

    For each (front, back) pair a frame is built from DATAFRAME with
    ``add_truncation_column`` and passed to ``bf.run_baseline``. TRUNC_COLUMN is
    supplied in the argument position where the full-text runs supply
    TEXT_COLUMN. Every run is tagged ``"<model_name>_first<front>_last<back>"``.
    NOTE(review): the thresholds presumably count tokens kept from the start and
    end of each document — confirm in truncation.py.

    Args:
        baseline_function: Estimator object forwarded unchanged to ``bf.run_baseline``.
        model_name: Name forwarded to ``bf.run_baseline`` and used as the run-id prefix.

    Returns:
        None. Whatever ``bf.run_baseline`` returns is discarded.
    """
    for front_threshold, back_threshold in THRESHOLD_COMBINATIONS:
        # Frame carrying the truncated text for this threshold pair.
        truncated_docs = add_truncation_column(
            DATAFRAME, TEXT_COLUMN, TOKENS_COL, front_threshold, back_threshold
        )

        # Train and get predictions on the truncated text.
        bf.run_baseline(
            baseline_function,
            model_name,
            truncated_docs,
            SPLIT_COLUMN,
            TRAIN_SET,
            TEST_SET,
            TRUNC_COLUMN,
            LABEL_COLUMN,
            PATH,
            OVERVIEW_PATH,
            f"{model_name}_first{front_threshold}_last{back_threshold}",
        )

### Baselines

##### Baseline 1: linear SVM+tf-idf

In [6]:
from sklearn.svm import LinearSVC

# Identify the model and the full-text run.
model_name = 'LinearSVC'
baseline_function = LinearSVC()
run_id = model_name + "_fulltext"

# Full-text run: the text is taken from TEXT_COLUMN.
linear_svm = bf.run_baseline(
    baseline_function, model_name, DATAFRAME,
    SPLIT_COLUMN, TRAIN_SET, TEST_SET,
    TEXT_COLUMN, LABEL_COLUMN,
    PATH, OVERVIEW_PATH, run_id,
)

# Repeat the same baseline for every threshold pair of the truncation experiment.
run_truncation_on_baselines(baseline_function, model_name)



                     precision    recall  f1-score   support

        Actualiteit       1.00      0.79      0.88       100
             Agenda       0.95      0.99      0.97       100
            Besluit       0.98      0.97      0.97       100
              Brief       0.95      0.98      0.97       100
          Factsheet       1.00      0.46      0.63       100
              Motie       0.97      0.95      0.96       100
  Onderzoeksrapport       0.64      0.94      0.76       100
         Raadsadres       0.81      1.00      0.89       100
       Raadsnotulen       1.00      1.00      1.00       100
Schriftelijke Vraag       0.96      0.93      0.94       100
         Voordracht       0.96      0.99      0.98       100

           accuracy                           0.91      1100
          macro avg       0.93      0.91      0.91      1100
       weighted avg       0.93      0.91      0.91      1100





                     precision    recall  f1-score   support

        Actualiteit       0.94      0.79      0.86       100
             Agenda       0.93      0.99      0.96       100
            Besluit       0.97      0.96      0.96       100
              Brief       0.97      0.96      0.96       100
          Factsheet       1.00      0.28      0.44       100
              Motie       0.94      0.94      0.94       100
  Onderzoeksrapport       0.56      0.95      0.71       100
         Raadsadres       0.76      0.94      0.84       100
       Raadsnotulen       1.00      0.98      0.99       100
Schriftelijke Vraag       1.00      0.94      0.97       100
         Voordracht       1.00      0.99      0.99       100

           accuracy                           0.88      1100
          macro avg       0.92      0.88      0.88      1100
       weighted avg       0.92      0.88      0.88      1100



##### Baseline 2: Naive Bayes+tf-idf

In [14]:
from sklearn.naive_bayes import MultinomialNB

# Full-text run of the Naive Bayes baseline. The run id is passed explicitly and
# follows the "<model_name>_fulltext" pattern of the LinearSVC baseline, so every
# call to bf.run_baseline in this notebook supplies the same set of arguments.
model_name = 'MultinomialNB'
run_id = f"{model_name}_fulltext"
naive_bayes = bf.run_baseline(
    MultinomialNB(), model_name, DATAFRAME,
    SPLIT_COLUMN, TRAIN_SET, TEST_SET,
    TEXT_COLUMN, LABEL_COLUMN,
    PATH, OVERVIEW_PATH, run_id,
)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                     precision    recall  f1-score   support

        Actualiteit       0.00      0.00      0.00       100
             Agenda       0.76      0.99      0.86       100
            Besluit       1.00      0.27      0.43       100
              Brief       1.00      0.11      0.20       100
          Factsheet       0.00      0.00      0.00       100
              Motie       0.83      0.89      0.86       100
  Onderzoeksrapport       0.40      0.87      0.55       100
         Raadsadres       0.43      0.92      0.59       100
       Raadsnotulen       0.00      0.00      0.00       100
Schriftelijke Vraag       0.34      0.96      0.51       100
         Voordracht       0.85      0.99      0.91       100

           accuracy                           0.55      1100
          macro avg       0.51      0.55      0.45      1100
       weighted avg       0.51      0.55      0.45      1100



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


##### Baseline 3: Logistic Regression + tf-idf

In [15]:
from sklearn.linear_model import LogisticRegression

# Full-text run of the Logistic Regression baseline. The run id is passed
# explicitly and follows the "<model_name>_fulltext" pattern of the LinearSVC
# baseline, so every call to bf.run_baseline supplies the same set of arguments.
model_name = 'LogisticRegression'
run_id = f"{model_name}_fulltext"
log_reg = bf.run_baseline(
    LogisticRegression(), model_name, DATAFRAME,
    SPLIT_COLUMN, TRAIN_SET, TEST_SET,
    TEXT_COLUMN, LABEL_COLUMN,
    PATH, OVERVIEW_PATH, run_id,
)

                     precision    recall  f1-score   support

        Actualiteit       0.96      0.70      0.81       100
             Agenda       0.94      0.99      0.97       100
            Besluit       0.98      0.94      0.96       100
              Brief       0.93      0.97      0.95       100
          Factsheet       1.00      0.29      0.45       100
              Motie       0.90      0.94      0.92       100
  Onderzoeksrapport       0.62      0.92      0.74       100
         Raadsadres       0.69      0.99      0.81       100
       Raadsnotulen       1.00      0.96      0.98       100
Schriftelijke Vraag       0.95      0.93      0.94       100
         Voordracht       0.96      0.99      0.98       100

           accuracy                           0.87      1100
          macro avg       0.90      0.87      0.86      1100
       weighted avg       0.90      0.87      0.86      1100



##### Baseline 4: k Nearest Neighbors + tf-idf

In [16]:
from sklearn.neighbors import KNeighborsClassifier

# Full-text run of the k-nearest-neighbours baseline. The run id is passed
# explicitly and follows the "<model_name>_fulltext" pattern of the LinearSVC
# baseline, so every call to bf.run_baseline supplies the same set of arguments.
model_name = 'KNeighborsClassifier'
run_id = f"{model_name}_fulltext"
knn = bf.run_baseline(
    KNeighborsClassifier(), model_name, DATAFRAME,
    SPLIT_COLUMN, TRAIN_SET, TEST_SET,
    TEXT_COLUMN, LABEL_COLUMN,
    PATH, OVERVIEW_PATH, run_id,
)

                     precision    recall  f1-score   support

        Actualiteit       0.90      0.45      0.60       100
             Agenda       0.89      0.98      0.93       100
            Besluit       0.84      0.81      0.82       100
              Brief       0.50      0.32      0.39       100
          Factsheet       1.00      0.35      0.52       100
              Motie       0.74      0.31      0.44       100
  Onderzoeksrapport       0.25      0.88      0.39       100
         Raadsadres       0.85      0.22      0.35       100
       Raadsnotulen       0.55      1.00      0.71       100
Schriftelijke Vraag       0.64      0.32      0.43       100
         Voordracht       0.92      0.87      0.89       100

           accuracy                           0.59      1100
          macro avg       0.73      0.59      0.59      1100
       weighted avg       0.73      0.59      0.59      1100



##### Baseline 5: RandomForest + tf-idf

In [17]:
from sklearn.ensemble import RandomForestClassifier

# Full-text run of the Random Forest baseline. The run id is passed explicitly
# and follows the "<model_name>_fulltext" pattern of the LinearSVC baseline, so
# every call to bf.run_baseline supplies the same set of arguments.
model_name = 'RandomForestClassifier'
run_id = f"{model_name}_fulltext"
# A random forest is stochastic; fixing random_state makes the reported scores
# reproducible across kernel restarts.
random_forest = bf.run_baseline(
    RandomForestClassifier(random_state=0), model_name, DATAFRAME,
    SPLIT_COLUMN, TRAIN_SET, TEST_SET,
    TEXT_COLUMN, LABEL_COLUMN,
    PATH, OVERVIEW_PATH, run_id,
)

                     precision    recall  f1-score   support

        Actualiteit       0.99      0.70      0.82       100
             Agenda       0.95      0.99      0.97       100
            Besluit       1.00      0.90      0.95       100
              Brief       0.94      0.95      0.95       100
          Factsheet       1.00      0.18      0.31       100
              Motie       0.94      0.90      0.92       100
  Onderzoeksrapport       0.65      0.95      0.77       100
         Raadsadres       0.54      1.00      0.70       100
       Raadsnotulen       1.00      0.94      0.97       100
Schriftelijke Vraag       0.99      0.91      0.95       100
         Voordracht       0.97      0.99      0.98       100

           accuracy                           0.86      1100
          macro avg       0.91      0.86      0.84      1100
       weighted avg       0.91      0.86      0.84      1100



### Overview of all runs

In [None]:
# Load the table stored at OVERVIEW_PATH (score and extra data about each run)
# and render it with the notebook's rich display.
overview = pd.read_pickle(OVERVIEW_PATH)
display(overview)