In [1]:
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh


In [2]:
import os
import sys

import pandas as pd

# Make the parent directory importable so the local settings/config modules resolve.
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "azure"

import my_secrets as sc
import settings as st

# Pick the configuration module that matches the run environment. Fail fast on
# an unknown value: otherwise `cf` would stay undefined and the first cell that
# uses it would raise a confusing NameError far away from the real cause.
if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf
else:
    raise ValueError(f'my_run must be "azure" or "local", got {my_run!r}')

if my_run == "azure":
    # makedirs(exist_ok=True) also creates missing parent folders and does not
    # fail when the folder already exists (no check-then-create race).
    os.makedirs(cf.HUGGING_CACHE, exist_ok=True)
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE

## Notebook overview
This notebook creates predictions for the baseline models. In total, five models are tried out.
- Training function. Given a baseline model, it returns the scores.
- Load data. Load all the documents and set the parameters.
- Save predictions.


*Previous notebook: RepairMistralPredictions*

*Next notebook: plot*

### Load file with training function


In [3]:
import sys
sys.path.append('../scripts/') 
import baseline as bf

### Load data

In [4]:
import pandas as pd

# Load the pickled document DataFrame from the configured output folder.
df = pd.read_pickle(
    f"{cf.output_path}/txtfiles.pkl"
)

In [5]:
# Preview the first rows to check which columns the loaded DataFrame contains.
display(df.head())

Unnamed: 0,label,path,id,text,num_pages,4split,2split,MistralTokens,count_MistralTokens,LlamaTokens,count_LlamaTokens,md5_hash,balanced_split
0,Motie,/home/azureuser/cloudfiles/code/blobfuse/raads...,1874,x Gemeente Amsterdam R\nGemeenteraad\n% Gemeen...,1.0,train,train,"[▁x, ▁Geme, ente, ▁Amsterdam, ▁R, <0x0A>, G, e...",350,"[▁x, ▁Geme, ente, ▁Amsterdam, ▁R, <0x0A>, G, e...",346,2f09ba2c967bba0eecf71f846f258a78,discard
1,Motie,/home/azureuser/cloudfiles/code/blobfuse/raads...,230,X Gemeente Amsterdam R\nGemeenteraad\n% Gemeen...,2.0,train,train,"[▁X, ▁Geme, ente, ▁Amsterdam, ▁R, <0x0A>, G, e...",1130,"[▁X, ▁Geme, ente, ▁Amsterdam, ▁R, <0x0A>, G, e...",1082,d14b33c32ba1e1bcff16320891bdf158,discard
2,Raadsnotulen,/home/azureuser/cloudfiles/code/blobfuse/raads...,26304,Gemeente Amsterdam\n% Gemeenteraad R\n% Raadsn...,79.0,train,train,"[▁Geme, ente, ▁Amsterdam, <0x0A>, %, ▁Geme, en...",89050,"[▁Geme, ente, ▁Amsterdam, <0x0A>, %, ▁Geme, en...",85359,36964ae4a84926e2f825761d980d12f4,test
3,Besluit,/home/azureuser/cloudfiles/code/blobfuse/raads...,20677,3. Interne documenten - 5271\nx Gemeente Beslu...,2.0,train,train,"[▁, 3, ., ▁Inter, ne, ▁document, en, ▁-, ▁, 5,...",1094,"[▁, 3, ., ▁Inter, ne, ▁document, en, ▁-, ▁, 5,...",1071,f2f9203231ceba0504087b493b5ffd1d,train
4,Raadsadres,/home/azureuser/cloudfiles/code/blobfuse/raads...,24174,"|\nÍ\nAmsterdam, september 2016 |\nGeachte led...",2.0,train,train,"[▁|, <0x0A>, Í, <0x0A>, Am, sterdam, ,, ▁septe...",1839,"[▁|, <0x0A>, Í, <0x0A>, Am, sterdam, ,, ▁septe...",1775,cebc20ef3921faab5a377c1a637ed22d,train


In [6]:
from collections import Counter

# Make ../scripts importable without appending a duplicate entry each time
# this cell is re-run.
if '../scripts/' not in sys.path:
    sys.path.append('../scripts/')
import baseline as bf
from truncation import add_truncation_column

# Set variables, same for each model.
# SPLIT_COLUMN: column that has the data split saved. Must be either 2split,
# 4split or balanced_split. 2split = data split into train and test.
# 4split = data split into train, test, dev and val.
SPLIT_COLUMN = 'balanced_split'
TRAIN_SET = 'train' # must be dev or train
TEST_SET = 'test' # must be val or test
# How SPLIT_COLUMN, TRAIN_SET and TEST_SET work together: the split column
# holds, per row, the name of the split the row belongs to (here 'train',
# 'test', 'dev' or 'val'). TRAIN_SET indicates which rows are selected as the
# training set by filtering on the split column, so TRAIN_SET = 'train' selects
# all rows where the split column equals 'train'. The same goes for TEST_SET.

split_distribution = Counter(df[SPLIT_COLUMN])
print('Distribution of sets: ', split_distribution)

# Fail early when a selected split does not occur in the split column, instead
# of training or evaluating on an empty selection later on.
for split_name in (TRAIN_SET, TEST_SET):
    if split_name not in split_distribution:
        raise ValueError(
            f"Split {split_name!r} not found in column {SPLIT_COLUMN!r}; "
            f"available values: {sorted(split_distribution)}"
        )

TEXT_COLUMN = 'text' # column where the text is
LABEL_COLUMN = 'label' # column with truth label
DATAFRAME = df.copy() # df where each row is a doc.
FOLDER = f"{cf.output_path}/predictionsFinal/baselines" # folder where each individual prediction is saved
OVERVIEW_PATH = f"{FOLDER}/overview.pkl" # file where score and extra data about run is saved

# needed for truncation experiment on baselines
TRUNC_COLUMN = 'trunc_txt' # column with truncated text
TOKENS_COL = 'LlamaTokens' # column with text split into tokens using model tokenizer, in this case Llama, could also be MistralTokens
# Combinations of front and back truncation thresholds. First value in each
# tuple is the first N tokens, second value is the last N tokens.
THRESHOLD_COMBINATIONS = [(100, 0), (200, 0), (100, 100)]

Distribution of sets:  Counter({'train': 9900, 'discard': 8718, 'test': 1100, 'val': 1100})


In [7]:
# Function to run the baseline on each truncation threshold

def run_truncation_on_baselines(baseline_function, model_name, predictions_path, threshold_combinations=None):
    """Run one baseline model once per truncation threshold combination.

    For every (front, back) pair a truncated-text column is added with
    `add_truncation_column`, and `bf.run_baseline` is called on the result with
    TRUNC_COLUMN as the text column.

    Parameters
    ----------
    baseline_function : estimator instance handed on to `bf.run_baseline`.
    model_name : str
        Name of the model; used to build the run id
        ``{model_name}_first{front}_last{back}``.
    predictions_path : str
        Path handed on to `bf.run_baseline` for the predictions.
    threshold_combinations : iterable of (int, int), optional
        (first N tokens, last N tokens) pairs. Defaults to the notebook-level
        THRESHOLD_COMBINATIONS.

    Returns
    -------
    None. Results are persisted by `bf.run_baseline`.
    """
    if threshold_combinations is None:
        threshold_combinations = THRESHOLD_COMBINATIONS

    for front_threshold, back_threshold in threshold_combinations:
        run_id = f"{model_name}_first{front_threshold}_last{back_threshold}"

        # Work on a copy so the shared DATAFRAME is never modified.
        # NOTE(review): the overview shows later full-text runs recorded with a
        # truncation text_col, which suggests add_truncation_column alters the
        # frame it receives - confirm in truncation.py.
        trunc = add_truncation_column(
            DATAFRAME.copy(), TEXT_COLUMN, TOKENS_COL, front_threshold, back_threshold
        )

        # train and get predictions on the truncated text
        bf.run_baseline(
            baseline_function, model_name, trunc, SPLIT_COLUMN, TRAIN_SET, TEST_SET,
            TRUNC_COLUMN, LABEL_COLUMN, predictions_path, OVERVIEW_PATH, run_id,
        )

### Baselines

##### Baseline 1: linear SVM+tf-idf

In [8]:
from sklearn.svm import LinearSVC

# Baseline 1: linear SVM.
baseline_function = LinearSVC()
model_name = 'LinearSVC'
predictions_path = f"{FOLDER}/{model_name}predictions.pkl"
run_id = f"{model_name}_fulltext"

# Show where the overview and the predictions of this model end up.
print(OVERVIEW_PATH, predictions_path, sep="\n")

# Full-text run first, then one run per truncation threshold combination.
linear_svm = bf.run_baseline(
    baseline_function, model_name, DATAFRAME, SPLIT_COLUMN, TRAIN_SET, TEST_SET,
    TEXT_COLUMN, LABEL_COLUMN, predictions_path, OVERVIEW_PATH, run_id,
)

run_truncation_on_baselines(baseline_function, model_name, predictions_path)

/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/processed_data/woo_document_classification/predictionsFinal/baselines/overview.pkl
/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/processed_data/woo_document_classification/predictionsFinal/baselines/LinearSVCpredictions.pkl




                     precision    recall  f1-score   support

        Actualiteit       1.00      0.79      0.88       100
             Agenda       0.95      0.99      0.97       100
            Besluit       0.98      0.97      0.97       100
              Brief       0.95      0.98      0.97       100
          Factsheet       1.00      0.46      0.63       100
              Motie       0.97      0.95      0.96       100
  Onderzoeksrapport       0.64      0.94      0.76       100
         Raadsadres       0.81      1.00      0.89       100
       Raadsnotulen       1.00      1.00      1.00       100
Schriftelijke Vraag       0.96      0.93      0.94       100
         Voordracht       0.96      0.99      0.98       100

           accuracy                           0.91      1100
          macro avg       0.93      0.91      0.91      1100
       weighted avg       0.93      0.91      0.91      1100





                     precision    recall  f1-score   support

        Actualiteit       0.94      0.79      0.86       100
             Agenda       0.93      0.99      0.96       100
            Besluit       0.97      0.96      0.96       100
              Brief       0.97      0.96      0.96       100
          Factsheet       1.00      0.28      0.44       100
              Motie       0.94      0.94      0.94       100
  Onderzoeksrapport       0.56      0.95      0.71       100
         Raadsadres       0.76      0.94      0.84       100
       Raadsnotulen       1.00      0.98      0.99       100
Schriftelijke Vraag       1.00      0.94      0.97       100
         Voordracht       1.00      0.99      0.99       100

           accuracy                           0.88      1100
          macro avg       0.92      0.88      0.88      1100
       weighted avg       0.92      0.88      0.88      1100





                     precision    recall  f1-score   support

        Actualiteit       0.94      0.80      0.86       100
             Agenda       0.94      0.99      0.97       100
            Besluit       0.97      0.96      0.96       100
              Brief       0.96      0.95      0.95       100
          Factsheet       1.00      0.25      0.40       100
              Motie       0.94      0.94      0.94       100
  Onderzoeksrapport       0.55      0.94      0.69       100
         Raadsadres       0.79      0.96      0.87       100
       Raadsnotulen       1.00      0.98      0.99       100
Schriftelijke Vraag       1.00      0.94      0.97       100
         Voordracht       0.98      1.00      0.99       100

           accuracy                           0.88      1100
          macro avg       0.92      0.88      0.87      1100
       weighted avg       0.92      0.88      0.87      1100





                     precision    recall  f1-score   support

        Actualiteit       1.00      0.80      0.89       100
             Agenda       0.95      0.99      0.97       100
            Besluit       0.96      0.96      0.96       100
              Brief       0.98      0.93      0.95       100
          Factsheet       1.00      0.31      0.47       100
              Motie       0.92      0.95      0.94       100
  Onderzoeksrapport       0.56      0.95      0.70       100
         Raadsadres       0.79      0.99      0.88       100
       Raadsnotulen       1.00      0.98      0.99       100
Schriftelijke Vraag       1.00      0.93      0.96       100
         Voordracht       0.99      0.99      0.99       100

           accuracy                           0.89      1100
          macro avg       0.92      0.89      0.88      1100
       weighted avg       0.92      0.89      0.88      1100



##### Baseline 2: Naive Bayes+tf-idf

In [9]:
from sklearn.naive_bayes import MultinomialNB

# Baseline 2: multinomial Naive Bayes.
baseline_function = MultinomialNB()
model_name = 'MultinomialNB'
predictions_path = f"{FOLDER}/{model_name}predictions.pkl"
run_id = f"{model_name}_fulltext"

# Show where the overview and the predictions of this model end up.
print(OVERVIEW_PATH, predictions_path, sep="\n")

# Full-text run first, then one run per truncation threshold combination.
naive_bayes = bf.run_baseline(
    baseline_function, model_name, DATAFRAME, SPLIT_COLUMN, TRAIN_SET, TEST_SET,
    TEXT_COLUMN, LABEL_COLUMN, predictions_path, OVERVIEW_PATH, run_id,
)

run_truncation_on_baselines(baseline_function, model_name, predictions_path)

/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/processed_data/woo_document_classification/predictionsFinal/baselines/overview.pkl
/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/processed_data/woo_document_classification/predictionsFinal/baselines/MultinomialNBpredictions.pkl


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                     precision    recall  f1-score   support

        Actualiteit       0.00      0.00      0.00       100
             Agenda       0.76      0.99      0.86       100
            Besluit       1.00      0.27      0.43       100
              Brief       1.00      0.11      0.20       100
          Factsheet       0.00      0.00      0.00       100
              Motie       0.83      0.89      0.86       100
  Onderzoeksrapport       0.40      0.87      0.55       100
         Raadsadres       0.43      0.92      0.59       100
       Raadsnotulen       0.00      0.00      0.00       100
Schriftelijke Vraag       0.34      0.96      0.51       100
         Voordracht       0.85      0.99      0.91       100

           accuracy                           0.55      1100
          macro avg       0.51      0.55      0.45      1100
       weighted avg       0.51      0.55      0.45      1100



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                     precision    recall  f1-score   support

        Actualiteit       1.00      0.29      0.45       100
             Agenda       0.71      0.95      0.81       100
            Besluit       0.99      0.82      0.90       100
              Brief       0.94      0.93      0.93       100
          Factsheet       0.00      0.00      0.00       100
              Motie       0.55      0.89      0.68       100
  Onderzoeksrapport       0.49      0.74      0.59       100
         Raadsadres       0.52      0.92      0.67       100
       Raadsnotulen       0.00      0.00      0.00       100
Schriftelijke Vraag       0.64      0.92      0.75       100
         Voordracht       0.81      0.98      0.89       100

           accuracy                           0.68      1100
          macro avg       0.60      0.68      0.61      1100
       weighted avg       0.60      0.68      0.61      1100



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                     precision    recall  f1-score   support

        Actualiteit       1.00      0.07      0.13       100
             Agenda       0.74      0.95      0.83       100
            Besluit       1.00      0.86      0.92       100
              Brief       0.94      0.92      0.93       100
          Factsheet       0.00      0.00      0.00       100
              Motie       0.53      0.89      0.66       100
  Onderzoeksrapport       0.50      0.69      0.58       100
         Raadsadres       0.54      0.95      0.69       100
       Raadsnotulen       1.00      0.11      0.20       100
Schriftelijke Vraag       0.54      0.93      0.69       100
         Voordracht       0.84      0.99      0.91       100

           accuracy                           0.67      1100
          macro avg       0.69      0.67      0.60      1100
       weighted avg       0.69      0.67      0.60      1100

                     precision    recall  f1-score   support

        Actualiteit 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


##### Baseline 3: Logistic Regression + tf-idf

In [10]:
from sklearn.linear_model import LogisticRegression

# Baseline 3: logistic regression.
baseline_function = LogisticRegression()
model_name = 'LogisticRegression'
predictions_path = f"{FOLDER}/{model_name}predictions.pkl"
run_id = f"{model_name}_fulltext"

# Show where the overview and the predictions of this model end up.
print(OVERVIEW_PATH, predictions_path, sep="\n")

# Full-text run first, then one run per truncation threshold combination.
log_reg = bf.run_baseline(
    baseline_function, model_name, DATAFRAME, SPLIT_COLUMN, TRAIN_SET, TEST_SET,
    TEXT_COLUMN, LABEL_COLUMN, predictions_path, OVERVIEW_PATH, run_id,
)

run_truncation_on_baselines(baseline_function, model_name, predictions_path)

/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/processed_data/woo_document_classification/predictionsFinal/baselines/overview.pkl
/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/processed_data/woo_document_classification/predictionsFinal/baselines/LogisticRegressionpredictions.pkl
                     precision    recall  f1-score   support

        Actualiteit       0.96      0.70      0.81       100
             Agenda       0.94      0.99      0.97       100
            Besluit       0.98      0.94      0.96       100
              Brief       0.93      0.97      0.95       100
          Factsheet       1.00      0.29      0.45       100
              Motie       0.90      0.94      0.92       100
  Onderzoeksrapport       0.62      0.92      0.74       100
         Raadsadres       0.69      0.99      0.81       100
       Raadsnotulen       1.00      0.96      0.98       100
Schriftelijke Vraag       0.95      0.93      0.94       100
         Voordracht       0

##### Baseline 4: k Nearest Neighbors + tf-idf

In [11]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline 4: k nearest neighbours.
baseline_function = KNeighborsClassifier()
model_name = 'KNeighborsClassifier'
predictions_path = f"{FOLDER}/{model_name}predictions.pkl"
run_id = f"{model_name}_fulltext"

# Show where the overview and the predictions of this model end up.
print(OVERVIEW_PATH, predictions_path, sep="\n")

# Full-text run first, then one run per truncation threshold combination.
knn = bf.run_baseline(
    baseline_function, model_name, DATAFRAME, SPLIT_COLUMN, TRAIN_SET, TEST_SET,
    TEXT_COLUMN, LABEL_COLUMN, predictions_path, OVERVIEW_PATH, run_id,
)

run_truncation_on_baselines(baseline_function, model_name, predictions_path)

/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/processed_data/woo_document_classification/predictionsFinal/baselines/overview.pkl
/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/processed_data/woo_document_classification/predictionsFinal/baselines/KNeighborsClassifierpredictions.pkl
                     precision    recall  f1-score   support

        Actualiteit       0.90      0.45      0.60       100
             Agenda       0.89      0.98      0.93       100
            Besluit       0.84      0.81      0.82       100
              Brief       0.50      0.32      0.39       100
          Factsheet       1.00      0.35      0.52       100
              Motie       0.74      0.31      0.44       100
  Onderzoeksrapport       0.25      0.88      0.39       100
         Raadsadres       0.85      0.22      0.35       100
       Raadsnotulen       0.55      1.00      0.71       100
Schriftelijke Vraag       0.64      0.32      0.43       100
         Voordracht      

##### Baseline 5: RandomForest + tf-idf

In [12]:
from sklearn.ensemble import RandomForestClassifier

# Baseline 5: random forest.
model_name = 'RandomForestClassifier'
# A random forest is stochastic (bootstrap samples and random feature subsets).
# Fix the seed so the reported scores can be reproduced on a re-run.
baseline_function = RandomForestClassifier(random_state=42)
run_id = f"{model_name}_fulltext"
predictions_path = f"{FOLDER}/{model_name}predictions.pkl"

# Show where the overview and the predictions of this model end up.
print(OVERVIEW_PATH)
print(predictions_path)

# Full-text run first, then one run per truncation threshold combination.
random_forest = bf.run_baseline(
    baseline_function, model_name, DATAFRAME, SPLIT_COLUMN, TRAIN_SET, TEST_SET,
    TEXT_COLUMN, LABEL_COLUMN, predictions_path, OVERVIEW_PATH, run_id,
)

run_truncation_on_baselines(baseline_function, model_name, predictions_path)

/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/processed_data/woo_document_classification/predictionsFinal/baselines/overview.pkl
/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/processed_data/woo_document_classification/predictionsFinal/baselines/RandomForestClassifierpredictions.pkl
                     precision    recall  f1-score   support

        Actualiteit       0.99      0.72      0.83       100
             Agenda       0.93      0.99      0.96       100
            Besluit       1.00      0.91      0.95       100
              Brief       0.92      0.98      0.95       100
          Factsheet       1.00      0.20      0.33       100
              Motie       0.94      0.90      0.92       100
  Onderzoeksrapport       0.66      0.94      0.77       100
         Raadsadres       0.55      0.97      0.70       100
       Raadsnotulen       1.00      0.94      0.97       100
Schriftelijke Vraag       0.99      0.91      0.95       100
         Voordracht    

### Overview of all runs

In [13]:
# Load the overview of all runs. The stored frame carries the index label 0 on
# every row, so renumber the rows to make them individually addressable and the
# displayed table readable.
overview = pd.read_pickle(OVERVIEW_PATH).reset_index(drop=True)
display(overview)

Unnamed: 0,model,date,run_id,train_set,test_set,train_set_support,test_set_support,split_col,text_col,runtime,accuracy,macro_avg_precision,macro_avg_recall,macro_avg_f1,classification_report,weighted_avg_precision,weighted_avg_recall,weighted_avg_f1
0,LinearSVC,2024-06-10 16:52:54.640613+02:00,LinearSVC_fulltext,train,test,9900,1100,balanced_split,text,20.638674,0.909091,0.928945,0.909091,0.905174,precision recall f1-s...,,,
0,LinearSVC,2024-06-10 16:52:56.258414+02:00,LinearSVC_first100_last0,train,test,9900,1100,balanced_split,TruncationLlamaTokensFront100Back0,0.493109,0.883636,0.915587,0.883636,0.875062,precision recall f1-s...,0.915587,0.883636,0.875062
0,LinearSVC,2024-06-10 16:52:58.271834+02:00,LinearSVC_first200_last0,train,test,9900,1100,balanced_split,TruncationLlamaTokensFront200Back0,0.855985,0.882727,0.915784,0.882727,0.872667,precision recall f1-s...,0.915784,0.882727,0.872667
0,LinearSVC,2024-06-10 16:53:00.483228+02:00,LinearSVC_first100_last100,train,test,9900,1100,balanced_split,TruncationLlamaTokensFront100Back100,0.8707,0.889091,0.922796,0.889091,0.882482,precision recall f1-s...,0.922796,0.889091,0.882482
0,MultinomialNB,2024-06-10 16:53:17.843000+02:00,MultinomialNB_fulltext,train,test,9900,1100,balanced_split,TruncationLlamaTokensFront100Back100,17.313666,0.545455,0.510466,0.545455,0.445479,precision recall f1-s...,0.510466,0.545455,0.445479
0,MultinomialNB,2024-06-10 16:53:19.416178+02:00,MultinomialNB_first100_last0,train,test,9900,1100,balanced_split,TruncationLlamaTokensFront100Back0,0.385086,0.676364,0.604005,0.676364,0.606069,precision recall f1-s...,0.604005,0.676364,0.606069
0,MultinomialNB,2024-06-10 16:53:21.260909+02:00,MultinomialNB_first200_last0,train,test,9900,1100,balanced_split,TruncationLlamaTokensFront200Back0,0.690747,0.669091,0.694272,0.669091,0.595079,precision recall f1-s...,0.694272,0.669091,0.595079
0,MultinomialNB,2024-06-10 16:53:23.280924+02:00,MultinomialNB_first100_last100,train,test,9900,1100,balanced_split,TruncationLlamaTokensFront100Back100,0.697143,0.665455,0.60728,0.665455,0.587982,precision recall f1-s...,0.60728,0.665455,0.587982
0,LogisticRegression,2024-06-10 16:54:12.252235+02:00,LogisticRegression_fulltext,train,test,9900,1100,balanced_split,TruncationLlamaTokensFront100Back100,48.839654,0.874545,0.903397,0.874545,0.864052,precision recall f1-s...,0.903397,0.874545,0.864052
0,LogisticRegression,2024-06-10 16:54:21.097270+02:00,LogisticRegression_first100_last0,train,test,9900,1100,balanced_split,TruncationLlamaTokensFront100Back0,7.764363,0.854545,0.903185,0.854545,0.835388,precision recall f1-s...,0.903185,0.854545,0.835388
