In [1]:
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh


In [2]:
import sys
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "azure"

import my_secrets as sc
import settings as st

if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf


import os
if my_run == "azure":
    if not os.path.exists(cf.HUGGING_CACHE):
        os.mkdir(cf.HUGGING_CACHE)
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE


# setup environment GEITje-7B Finetuning
# - pip install torch
# - pip install datasets
# - pip install transformers
# - pip install trl
# - pip install accelerate (restart after)
# - switch device_map='auto' to avaoid memory error

# - pip install sentencepiece
# - pip install jupyter
# - pip install protobuf 



## Notebook overview
This notebook creates predictions for the baseline models. In total, five model are tried out.
- Functions to split the data. One function save the split as a column in txtfiles, the other loads the split.
- Training function. Given a baseline model, will return scores.
- Load Data. Load all the documents, and set parameters.
- TODO: save predictions


Kernel: Pytorch and Tensorflow

### Data split functions

In [3]:
from sklearn.model_selection import train_test_split
import pandas as pd

"""Function takes a dataframe and splits the data into train, test, val and dev set and save it.
Only need to run it once.
"""
def save_split(df, save_to_path):
    train_df, temp_df = train_test_split(df, test_size=0.25, random_state=42)

    # Splitting temp into test (20%) and val_dev (5%)
    test_df, val_dev_df = train_test_split(temp_df, test_size=0.2, random_state=42)

    # Splitting val_dev into validation (1%) and development (4%)
    dev_df,val_df = train_test_split(val_dev_df, test_size=0.2, random_state=42)

    # set split into 4 ways: train, test, val and dev
    train_df['4split'] = 'train'
    test_df['4split'] = 'test'
    val_df['4split'] = 'val'
    dev_df['4split'] = 'dev'

    # set split into 2 ways: test and training
    train_df['2split'] = 'train'
    test_df['2split'] = 'test'
    val_df['2split'] = 'test'
    dev_df['2split'] = 'train'

    # Combining the DataFrames
    final_df = pd.concat([train_df, test_df, val_df, dev_df])
    final_df.to_pickle(save_to_path)

# txtfiles = pd.read_pickle(f"{cf.output_path}/txtfiles_tokenizer.pkl")
# save_split(txtfiles, f"{cf.output_path}/txtfiles_tokenizer.pkl")


### Load file with training funcation


In [5]:
import sys
sys.path.append('../scripts/') 
import baseline as bf

### Load data

In [7]:
from collections import Counter
import pandas as pd

df = pd.read_pickle(f"{cf.output_path}/txtfiles.pkl")
print(Counter(df['2split']))
print(Counter(df['4split']))

#set  variables, same for each model
TRAIN_SET = 'train' # must be dev or train
TEST_SET = 'test' # must be val or test
SPLIT_COLUMN = '4split' #must be either 2split or 4split. 2split = data split into train and test. 4split = data split into train, test, dev and val. 
TEXT_COLUMN = 'text'
LABEL_COLUMN = 'label'
DATAFRAME = df.copy()
PATH = f"{cf.output_path}/predictions/baselinePredictions.pkl"
OVERVIEW_PATH = f"{cf.output_path}/overview/baselineOverview.pkl"

Counter({'train': 21096, 'test': 5608})
Counter({'train': 20028, 'test': 5340, 'dev': 1068, 'val': 268})


### Baselines

##### Baseline 1: linear SVM+tf-idf

In [8]:
from sklearn.svm import LinearSVC
linear_svm = bf.run_baseline(LinearSVC(), 'LinearSVC' , DATAFRAME, SPLIT_COLUMN, TRAIN_SET, TEST_SET,TEXT_COLUMN, LABEL_COLUMN, PATH, OVERVIEW_PATH)



                      precision    recall  f1-score   support

         Actualiteit       0.93      0.82      0.87       205
              Agenda       0.86      0.97      0.91       696
             Besluit       0.97      0.96      0.97       162
               Brief       0.90      0.92      0.91       408
          Factsheets       0.53      0.41      0.47        41
               Motie       0.97      0.97      0.97      1713
   Onderzoeksrapport       0.85      0.94      0.89       258
          Raadsadres       0.94      0.99      0.96       408
        Raadsnotulen       0.98      1.00      0.99        58
Schriftelijke Vragen       0.99      0.96      0.97       557
       Termijnagenda       0.84      0.46      0.59       189
          Voordracht       1.00      1.00      1.00       645

            accuracy                           0.94      5340
           macro avg       0.90      0.87      0.88      5340
        weighted avg       0.94      0.94      0.94      5340



In [11]:
import pandas as pd
yeet = pd.read_pickle(OVERVIEW_PATH)
display(yeet)

Unnamed: 0,model,date,train_set,test_set,train_set_support,test_set_support,split_col,text_col,runtime,accuracy,macro_avg_precision,macro_avg_recall,macro_avg_f1,classification_report
0,LinearSVC,2024-04-18 16:17:03.392004+02:00,train,test,20028,5340,4split,text,32.994138,0.939513,0.897017,0.866095,0.875292,precision recall f1-...
0,LinearSVC,2024-04-18 16:18:02.569115+02:00,train,test,20028,5340,4split,text,33.834955,0.939513,0.897017,0.866095,0.875292,precision recall f1-...
0,MultinomialNB,2024-04-18 16:19:02.979849+02:00,train,test,20028,5340,4split,text,29.710517,0.55618,0.548531,0.249166,0.228989,precision recall f1-...
0,LogisticRegression,2024-04-18 16:21:07.518663+02:00,train,test,20028,5340,4split,text,124.518363,0.935768,0.918087,0.848801,0.871043,precision recall f1-...
0,KNeighborsClassifier,2024-04-18 16:35:37.475188+02:00,train,test,20028,5340,4split,text,869.899391,0.648689,0.635765,0.621493,0.569344,precision recall f1-...
0,LinearSVC,2024-04-22 09:03:24.250889+02:00,train,test,20028,5340,4split,text,32.862975,0.939513,0.897017,0.866095,0.875292,precision recall f1-...


##### Baseline 2: Naive Bayes+tf-idf

In [28]:
from sklearn.naive_bayes import MultinomialNB
naive_bayes = bf.run_baseline(MultinomialNB(), 'MultinomialNB' , DATAFRAME, SPLIT_COLUMN, TRAIN_SET, TEST_SET,TEXT_COLUMN, LABEL_COLUMN, PATH, OVERVIEW_PATH)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                      precision    recall  f1-score   support

         Actualiteit       0.00      0.00      0.00       205
              Agenda       0.79      0.92      0.85       696
             Besluit       1.00      0.02      0.05       162
               Brief       0.00      0.00      0.00       408
          Factsheets       0.00      0.00      0.00        41
               Motie       0.44      1.00      0.61      1713
   Onderzoeksrapport       0.37      0.05      0.09       258
          Raadsadres       1.00      0.00      0.01       408
        Raadsnotulen       0.00      0.00      0.00        58
Schriftelijke Vragen       1.00      0.03      0.06       557
       Termijnagenda       1.00      0.08      0.15       189
          Voordracht       0.98      0.87      0.92       645

            accuracy                           0.56      5340
           macro avg       0.55      0.25      0.23      5340
        weighted avg       0.63      0.56      0.44      5340



  _warn_prf(average, modifier, msg_start, len(result))


precision, recall and f1-score equal to zero occurs if there are not True Positives. Meaning for those classes not one document is correctly predicted. 

##### Baseline 3: Logistic Regression + tf-idf

In [29]:
from sklearn.linear_model import LogisticRegression
log_reg = bf.run_baseline(LogisticRegression(), 'LogisticRegression' , DATAFRAME, SPLIT_COLUMN, TRAIN_SET, TEST_SET,TEXT_COLUMN, LABEL_COLUMN, PATH, OVERVIEW_PATH)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                      precision    recall  f1-score   support

         Actualiteit       0.97      0.75      0.84       205
              Agenda       0.86      0.98      0.92       696
             Besluit       0.98      0.91      0.94       162
               Brief       0.89      0.91      0.90       408
          Factsheets       0.73      0.39      0.51        41
               Motie       0.96      0.98      0.97      1713
   Onderzoeksrapport       0.85      0.95      0.90       258
          Raadsadres       0.91      0.97      0.94       408
        Raadsnotulen       0.98      0.95      0.96        58
Schriftelijke Vragen       0.98      0.96      0.97       557
       Termijnagenda       0.91      0.46      0.61       189
          Voordracht       0.99      1.00      0.99       645

            accuracy                           0.94      5340
           macro avg       0.92      0.85      0.87      5340
        weighted avg       0.94      0.94      0.93      5340



##### Baseline 4: k Nearest Neigbors + tf-idf

In [30]:
from sklearn.neighbors import KNeighborsClassifier
knn = bf.run_baseline(KNeighborsClassifier(), 'KNeighborsClassifier' , DATAFRAME, SPLIT_COLUMN, TRAIN_SET, TEST_SET,TEXT_COLUMN, LABEL_COLUMN, PATH, OVERVIEW_PATH)

                      precision    recall  f1-score   support

         Actualiteit       0.62      0.39      0.47       205
              Agenda       0.74      0.95      0.83       696
             Besluit       0.74      0.86      0.80       162
               Brief       0.40      0.58      0.47       408
          Factsheets       0.64      0.39      0.48        41
               Motie       0.86      0.66      0.74      1713
   Onderzoeksrapport       0.25      0.67      0.36       258
          Raadsadres       0.85      0.26      0.40       408
        Raadsnotulen       0.17      0.98      0.30        58
Schriftelijke Vragen       0.72      0.44      0.55       557
       Termijnagenda       0.74      0.44      0.55       189
          Voordracht       0.89      0.84      0.87       645

            accuracy                           0.65      5340
           macro avg       0.64      0.62      0.57      5340
        weighted avg       0.74      0.65      0.66      5340



##### Baseline 5: RandomForest + tf-idf

In [13]:
from sklearn.ensemble import RandomForestClassifier
random_forest = bf.run_baseline(RandomForestClassifier(), 'RandomForestClassifier' , DATAFRAME, SPLIT_COLUMN, TRAIN_SET, TEST_SET,TEXT_COLUMN, LABEL_COLUMN, PATH, OVERVIEW_PATH)

                      precision    recall  f1-score   support

         Actualiteit       0.92      0.71      0.80       205
              Agenda       0.85      0.89      0.87       696
             Besluit       0.98      0.89      0.93       162
               Brief       0.90      0.90      0.90       408
          Factsheets       0.48      0.32      0.38        41
               Motie       0.96      0.97      0.96      1713
   Onderzoeksrapport       0.84      0.90      0.87       258
          Raadsadres       0.85      0.95      0.90       408
        Raadsnotulen       1.00      0.95      0.97        58
Schriftelijke Vragen       0.98      0.94      0.96       557
       Termijnagenda       0.55      0.46      0.50       189
          Voordracht       0.98      0.99      0.99       645

            accuracy                           0.91      5340
           macro avg       0.86      0.82      0.84      5340
        weighted avg       0.91      0.91      0.91      5340



### Overview of all runs

In [None]:
overview = pd.read_pickle(OVERVIEW_PATH)
display(overview)