In [1]:
# Run the project's blobfuse shell script before any other cell.
# NOTE(review): presumably this mounts the storage that the later cells read from — confirm.
# NOTE(review): absolute, user-specific path; this cell only works on the machine that has this home directory.
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh


In [1]:
import os
import sys

# Make the parent directory importable so the project-level modules below resolve.
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "azure"

import my_secrets as sc
import settings as st

# Pick the configuration module that matches the run target.
# Fail fast on any other value: without the `else`, `cf` would stay undefined
# and a later cell would die with a confusing NameError.
if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf
else:
    raise ValueError(f'my_run must be "azure" or "local", got {my_run!r}')

if my_run == "azure":
    # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it has no
    # check-then-create race and also creates missing parent directories.
    os.makedirs(cf.HUGGING_CACHE, exist_ok=True)
    # Exported before any later import that may read this variable.
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE


# setup environment GEITje-7B Finetuning
# - pip install torch
# - pip install datasets
# - pip install transformers
# - pip install trl
# - pip install accelerate (restart after)
# - switch device_map='auto' to avoid memory error

# - pip install sentencepiece
# - pip install jupyter
# - pip install protobuf 



## Notebook overview
This notebook creates predictions for the baseline models. In total, five models are tried out.
- Training function. Given a baseline model, will return scores.
- Load Data. Load all the documents, and set parameters.
- Save predictions.


Kernel: Pytorch and Tensorflow

### Load file with training function


In [2]:
import sys
sys.path.append('../scripts/') 
import baseline as bf

### Load data

In [3]:
import pandas as pd

# Load the document table from the configured output folder; later cells read
# the split columns from it and copy it into DATAFRAME.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling —
# only load files that this project produced itself.
# Alternative input file, kept for reference:
# df = pd.read_pickle(f"{cf.output_path}/txtfiles.pkl")
df = pd.read_pickle(f"{cf.output_path}/txtfiles_tokenizer.pkl")

In [11]:
from collections import Counter
sys.path.append('../scripts/') 
import baseline as bf
from truncation import add_truncation_column

# Show how many documents fall into each part of every available split column.
for split_name in ('2split', '4split', 'balanced_split'):
    print(Counter(df[split_name]))

# ---------------------------------------------------------------------------
# Shared settings, identical for every baseline model
# ---------------------------------------------------------------------------

# Column that stores which part of the split a row belongs to.
# Must be one of '2split', '4split' or 'balanced_split':
#   - 2split:         train / test
#   - 4split:         train / test / dev / val
#   - balanced_split: train / test / val / discard (per the counts printed above)
SPLIT_COLUMN = 'balanced_split'

# TRAIN_SET and TEST_SET are *values* of SPLIT_COLUMN, not column names.
# Rows whose SPLIT_COLUMN equals TRAIN_SET form the training set, and rows whose
# SPLIT_COLUMN equals TEST_SET form the evaluation set.
# Example: TRAIN_SET = 'train' selects every row where the split column is 'train'.
TRAIN_SET = 'train'  # e.g. 'train' or 'dev'
TEST_SET = 'test'  # e.g. 'test' or 'val'

# Column holding the document text.
TEXT_COLUMN = 'text'
# Column holding the ground-truth label.
LABEL_COLUMN = 'label'
# Working copy of the loaded frame; each row is one document.
DATAFRAME = df.copy()
# Folder in which the predictions of each individual model are saved.
FOLDER = f"{cf.output_path}/predictionsFinal/baselines"
# File in which the scores and extra data about every run are saved.
OVERVIEW_PATH = f"{cf.output_path}/predictionsFinal/baselines/overview.pkl"

# Settings for the truncation experiment on the baselines.
# TRUNC_COLUMN is passed as the text column for truncated runs, and TOKENS_COL is
# handed to add_truncation_column.
TRUNC_COLUMN = 'trunc_txt'
TOKENS_COL = 'LlamaTokens'
# Each pair is (front_threshold, back_threshold), as unpacked in
# run_truncation_on_baselines.
THRESHOLD_COMBINATIONS = [(100, 0), (200, 0), (100, 100)]

Counter({'train': 16445, 'test': 4373})
Counter({'train': 15613, 'test': 4164, 'dev': 832, 'val': 209})
Counter({'train': 9900, 'discard': 8718, 'test': 1100, 'val': 1100})


In [12]:
def run_truncation_on_baselines(baseline_function, model_name, predictions_path):
    """Run one baseline once per entry of THRESHOLD_COMBINATIONS.

    For every (front, back) pair in the module-level THRESHOLD_COMBINATIONS, the
    frame returned by add_truncation_column(DATAFRAME, TEXT_COLUMN, TOKENS_COL,
    front, back) is handed to bf.run_baseline, with TRUNC_COLUMN as the text
    column and a run id of the form "{model_name}_first{front}_last{back}".

    Args:
        baseline_function: forwarded unchanged as the first argument of
            bf.run_baseline.
        model_name: forwarded to bf.run_baseline and used as the run-id prefix.
        predictions_path: forwarded to bf.run_baseline; the same path is used
            for every combination.

    Returns:
        None. Whatever bf.run_baseline returns is discarded.

    NOTE(review): every combination receives the same predictions_path — confirm
    that bf.run_baseline does not overwrite earlier predictions stored there.
    """
    for combination in THRESHOLD_COMBINATIONS:
        # First element is the front threshold, second the back threshold.
        n_front, n_back = combination[0], combination[1]

        # Identifier under which this run is reported.
        truncation_run_id = f"{model_name}_first{n_front}_last{n_back}"

        # Frame that carries the truncated text column.
        truncated_frame = add_truncation_column(
            DATAFRAME, TEXT_COLUMN, TOKENS_COL, n_front, n_back
        )

        # Train and collect predictions on the truncated text.
        bf.run_baseline(
            baseline_function,
            model_name,
            truncated_frame,
            SPLIT_COLUMN,
            TRAIN_SET,
            TEST_SET,
            TRUNC_COLUMN,
            LABEL_COLUMN,
            predictions_path,
            OVERVIEW_PATH,
            truncation_run_id,
        )

### Baselines

##### Baseline 1: linear SVM+tf-idf

In [23]:
from sklearn.svm import LinearSVC

# Baseline 1: LinearSVC with its default hyperparameters.
baseline_function = LinearSVC()
model_name = 'LinearSVC'

# Per-model predictions file and the id of the untruncated (full text) run.
predictions_path = f"{FOLDER}/{model_name}predictions.pkl"
run_id = f"{model_name}_fulltext"

# Echo the two output locations before starting the run.
print(OVERVIEW_PATH, predictions_path, sep="\n")

# Full-text run first ...
linear_svm = bf.run_baseline(
    baseline_function, model_name, DATAFRAME, SPLIT_COLUMN, TRAIN_SET, TEST_SET,
    TEXT_COLUMN, LABEL_COLUMN, predictions_path, OVERVIEW_PATH, run_id,
)

# ... followed by the same model on every truncation setting.
run_truncation_on_baselines(baseline_function, model_name, predictions_path)

/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/processed_data/woo_document_classification/predictionsFinal/baselines/overview.pkl
/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/processed_data/woo_document_classification/predictionsFinal/baselines/LinearSVCpredictions.pkl


KeyboardInterrupt: 

##### Baseline 2: Naive Bayes+tf-idf

In [22]:
from sklearn.naive_bayes import MultinomialNB

# Baseline 2: MultinomialNB with its default hyperparameters.
baseline_function = MultinomialNB()
model_name = 'MultinomialNB'

# Per-model predictions file and the id of the untruncated (full text) run.
predictions_path = f"{FOLDER}/{model_name}predictions.pkl"
run_id = f"{model_name}_fulltext"

# Echo the two output locations.
print(OVERVIEW_PATH, predictions_path, sep="\n")

# The runs for this model are currently disabled; uncomment both calls to execute them.
# naive_bayes = bf.run_baseline(baseline_function, model_name , DATAFRAME, SPLIT_COLUMN, TRAIN_SET, TEST_SET,TEXT_COLUMN, LABEL_COLUMN, predictions_path, OVERVIEW_PATH, run_id)

# run_truncation_on_baselines(baseline_function, model_name, predictions_path)

/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/processed_data/woo_document_classification/predictionsFinal/baselines/overview.pkl
/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/processed_data/woo_document_classification/predictionsFinal/baselines/MultinomialNBpredictions.pkl


##### Baseline 3: Logistic Regression + tf-idf

In [24]:
from sklearn.linear_model import LogisticRegression

# Baseline 3: LogisticRegression with its default hyperparameters.
baseline_function = LogisticRegression()
model_name = 'LogisticRegression'

# Per-model predictions file and the id of the untruncated (full text) run.
predictions_path = f"{FOLDER}/{model_name}predictions.pkl"
run_id = f"{model_name}_fulltext"

# Echo the two output locations.
print(OVERVIEW_PATH, predictions_path, sep="\n")

# The runs for this model are currently disabled; uncomment both calls to execute them.
# log_reg = bf.run_baseline(baseline_function, model_name , DATAFRAME, SPLIT_COLUMN, TRAIN_SET, TEST_SET,TEXT_COLUMN, LABEL_COLUMN, predictions_path, OVERVIEW_PATH, run_id)

# run_truncation_on_baselines(baseline_function, model_name, predictions_path)

/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/processed_data/woo_document_classification/predictionsFinal/baselines/overview.pkl
/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/processed_data/woo_document_classification/predictionsFinal/baselines/LogisticRegressionpredictions.pkl


##### Baseline 4: k Nearest Neighbors + tf-idf

In [25]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline 4: KNeighborsClassifier with its default hyperparameters.
baseline_function = KNeighborsClassifier()
model_name = 'KNeighborsClassifier'

# Per-model predictions file and the id of the untruncated (full text) run.
predictions_path = f"{FOLDER}/{model_name}predictions.pkl"
run_id = f"{model_name}_fulltext"

# Echo the two output locations.
print(OVERVIEW_PATH, predictions_path, sep="\n")

# The runs for this model are currently disabled; uncomment both calls to execute them.
# knn = bf.run_baseline(baseline_function, model_name , DATAFRAME, SPLIT_COLUMN, TRAIN_SET, TEST_SET,TEXT_COLUMN, LABEL_COLUMN, predictions_path, OVERVIEW_PATH, run_id)
#
# run_truncation_on_baselines(baseline_function, model_name, predictions_path)

/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/processed_data/woo_document_classification/predictionsFinal/baselines/overview.pkl
/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/processed_data/woo_document_classification/predictionsFinal/baselines/KNeighborsClassifierpredictions.pkl


##### Baseline 5: RandomForest + tf-idf

In [27]:
from sklearn.ensemble import RandomForestClassifier

# Baseline 5: RandomForestClassifier with its default hyperparameters.
baseline_function = RandomForestClassifier()
model_name = 'RandomForestClassifier'

# Per-model predictions file and the id of the untruncated (full text) run.
predictions_path = f"{FOLDER}/{model_name}predictions.pkl"
run_id = f"{model_name}_fulltext"

# Echo the two output locations.
print(OVERVIEW_PATH, predictions_path, sep="\n")

# The runs for this model are currently disabled; uncomment both calls to execute them.
# random_forest = bf.run_baseline(baseline_function, model_name , DATAFRAME, SPLIT_COLUMN, TRAIN_SET, TEST_SET,TEXT_COLUMN, LABEL_COLUMN, predictions_path, OVERVIEW_PATH, run_id)

# run_truncation_on_baselines(baseline_function, model_name, predictions_path)

/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/processed_data/woo_document_classification/predictionsFinal/baselines/overview.pkl
/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/processed_data/woo_document_classification/predictionsFinal/baselines/RandomForestClassifierpredictions.pkl


### Overview of all runs

In [19]:
# Load the table stored at OVERVIEW_PATH and render it.
# NOTE(review): `display` is not imported here; it is the name IPython provides
# inside a notebook kernel, so this cell is not runnable as a plain script.
overview = pd.read_pickle(OVERVIEW_PATH)
display(overview)

Unnamed: 0,model,date,run_id,train_set,test_set,train_set_support,test_set_support,split_col,text_col,runtime,accuracy,macro_avg_precision,macro_avg_recall,macro_avg_f1,classification_report
0,LinearSVC,2024-05-15 15:04:05.242133+02:00,LinearSVC_fulltext,train,test,9900,1100,balanced_split,text,27.136113,0.909091,0.928945,0.909091,0.905174,precision recall f1-s...
0,LinearSVC,2024-05-15 15:04:13.555519+02:00,LinearSVC_first100_last0,train,test,9900,1100,balanced_split,TruncationLlamaTokensFront100Back0,17.967054,0.883636,0.915587,0.883636,0.875062,precision recall f1-s...
0,LinearSVC,2024-05-15 15:04:33.013663+02:00,LinearSVC_first200_last0,train,test,9900,1100,balanced_split,TruncationLlamaTokensFront200Back0,29.550975,0.882727,0.915784,0.882727,0.872667,precision recall f1-s...
0,LinearSVC,2024-05-15 15:05:03.913225+02:00,LinearSVC_first100_last100,train,test,9900,1100,balanced_split,TruncationLlamaTokensFront100Back100,38.544936,0.889091,0.922796,0.889091,0.882482,precision recall f1-s...
0,MultinomialNB,2024-05-15 15:07:25.298385+02:00,MultinomialNB_fulltext,train,test,9900,1100,balanced_split,TruncationLlamaTokensFront100Back100,24.37628,0.545455,0.510466,0.545455,0.445479,precision recall f1-s...
0,MultinomialNB,2024-05-15 15:07:33.363364+02:00,MultinomialNB_first100_last0,train,test,9900,1100,balanced_split,TruncationLlamaTokensFront100Back0,17.915161,0.676364,0.604005,0.676364,0.606069,precision recall f1-s...
0,MultinomialNB,2024-05-15 15:07:52.727659+02:00,MultinomialNB_first200_last0,train,test,9900,1100,balanced_split,TruncationLlamaTokensFront200Back0,29.898976,0.669091,0.694272,0.669091,0.595079,precision recall f1-s...
0,MultinomialNB,2024-05-15 15:08:23.938886+02:00,MultinomialNB_first100_last100,train,test,9900,1100,balanced_split,TruncationLlamaTokensFront100Back100,38.967012,0.665455,0.60728,0.665455,0.587982,precision recall f1-s...
0,LogisticRegression,2024-05-15 15:10:55.905981+02:00,LogisticRegression_fulltext,train,test,9900,1100,balanced_split,TruncationLlamaTokensFront100Back100,55.377151,0.874545,0.903397,0.874545,0.864052,precision recall f1-s...
0,LogisticRegression,2024-05-15 15:11:09.919198+02:00,LogisticRegression_first100_last0,train,test,9900,1100,balanced_split,TruncationLlamaTokensFront100Back0,23.8946,0.854545,0.903185,0.854545,0.835388,precision recall f1-s...
