In [16]:
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh


In [17]:
import sys
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "azure"

import my_secrets as sc
import settings as st

if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf


import os
if my_run == "azure":
    if not os.path.exists(cf.HUGGING_CACHE):
        os.mkdir(cf.HUGGING_CACHE)
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE


# setup environment GEITje-7B Finetuning
# - pip install torch
# - pip install datasets
# - pip install transformers
# - pip install trl
# - pip install accelerate (restart after)
# - switch device_map='auto' to avaoid memory error

# - pip install sentencepiece
# - pip install jupyter
# - pip install protobuf 



## Notebook overview
This notebook creates predictions for the baseline models. In total, five model are tried out.
- Functions to split the data. One function save the split as a column in txtfiles, the other loads the split.
- Training function. Given a baseline model, will return scores.
- Load Data. Load all the documents, and set parameters.
- TODO: save predictions

### Data split functions

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

"""Function takes a dataframe and splits the data into train, test, val and dev set and save it.
Only need to run it once, load_data_spli is used to get the right subsets.

"""
def save_split(df, save_to_path):
    train_df, temp_df = train_test_split(df, test_size=0.25, random_state=42)

    # Splitting temp into test (20%) and val_dev (5%)
    test_df, val_dev_df = train_test_split(temp_df, test_size=0.2, random_state=42)

    # Splitting val_dev into validation (1%) and development (4%)
    val_df, dev_df = train_test_split(val_dev_df, test_size=0.2, random_state=42)

    train_df['split'] = 'train'
    test_df['split'] = 'test'
    val_df['split'] = 'val'
    dev_df['split'] = 'dev'

    # Combining the DataFrames
    final_df = pd.concat([train_df, test_df, val_df, dev_df])

    final_df.to_pickle(save_to_path)

# txtfiles = pd.read_pickle(f"{cf.output_path}/txtfiles.pkl")
# save_split(txtfiles, f"{cf.output_path}/txtfiles.pkl")


In [None]:
"""Function returns X and y set for either the train, val, test or dev set."""
def load_data_split(df, split_col,subset, label_col):
    subdf = df.loc[df[split_col]==subset]
    X = subdf.drop(columns=[label_col])
    y = subdf[label_col]
    return X, y

### Training function


In [34]:
# from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
  
import sys
sys.path.append('../scripts/') 
import prediction_helperfunctions as ph


def run_baseline(baseline_function, dataframe,split_col, subset_train, subset_test, text_col, label_col):
    ph.check_data_split_input(subset_train, subset_test)
    X_train, y_train = load_data_split(dataframe,split_col,subset_train,label_col) 
    X_test, y_test = load_data_split(dataframe,split_col,subset_test,label_col) 


    vectorizer = TfidfVectorizer()
    X_train_tfidf_bin = vectorizer.fit_transform(X_train[text_col])
    X_test_tfidf_bin = vectorizer.transform(X_test[text_col])

    model = baseline_function

    # Train the classifier on the training data
    model.fit(X_train_tfidf_bin, y_train)

    y_pred = model.predict(X_test_tfidf_bin)

    # Calculate the accuracy of the classifier
    report = classification_report(y_test, y_pred)
    print(report)

    predictions = X_test.copy()
    predictions[label_col] = y_test
    predictions['prediction'] = y_pred
    return predictions

### Load data

In [18]:
df = pd.read_pickle(f"{cf.output_path}/txtfiles.pkl")

#set  variables, same for each model
TRAIN_SET = 'dev' # must be dev or train
TEST_SET = 'val' # must be val or test
SPLIT_COLUMN = 'split'
TEXT_COLUMN = 'text'
LABEL_COLUMN = 'label'
DATAFRAME = df.copy()

### Baselines

##### Baseline 1: linear SVM+tf-idf

In [19]:
from sklearn.svm import LinearSVC
linear_svm = run_baseline(LinearSVC(), DATAFRAME, SPLIT_COLUMN, TRAIN_SET, TEST_SET,TEXT_COLUMN, LABEL_COLUMN)

                      precision    recall  f1-score   support

         Actualiteit       1.00      0.06      0.11        35
              Agenda       0.79      0.87      0.83       134
             Besluit       0.93      0.56      0.70        25
               Brief       0.75      0.87      0.80        89
          Factsheets       1.00      0.14      0.25         7
               Motie       0.87      0.95      0.91       349
   Onderzoeksrapport       0.52      0.41      0.45        37
          Raadsadres       0.84      0.75      0.79        83
        Raadsnotulen       1.00      0.54      0.70        13
Schriftelijke Vragen       0.77      0.98      0.86       127
       Termijnagenda       0.50      0.24      0.32        42
          Voordracht       0.99      0.98      0.99       127

            accuracy                           0.83      1068
           macro avg       0.83      0.61      0.64      1068
        weighted avg       0.83      0.83      0.81      1068



##### Baseline 2: Naive Bayes+tf-idf

In [20]:
from sklearn.naive_bayes import MultinomialNB
naive_bayes = run_baseline(MultinomialNB(), DATAFRAME, SPLIT_COLUMN, TRAIN_SET, TEST_SET,TEXT_COLUMN, LABEL_COLUMN)

                      precision    recall  f1-score   support

         Actualiteit       0.00      0.00      0.00        35
              Agenda       0.80      0.71      0.75       134
             Besluit       0.00      0.00      0.00        25
               Brief       0.00      0.00      0.00        89
          Factsheets       0.00      0.00      0.00         7
               Motie       0.37      1.00      0.54       349
   Onderzoeksrapport       0.00      0.00      0.00        37
          Raadsadres       0.00      0.00      0.00        83
        Raadsnotulen       0.00      0.00      0.00        13
Schriftelijke Vragen       0.00      0.00      0.00       127
       Termijnagenda       0.00      0.00      0.00        42
          Voordracht       1.00      0.09      0.17       127

            accuracy                           0.43      1068
           macro avg       0.18      0.15      0.12      1068
        weighted avg       0.34      0.43      0.29      1068



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


precision, recall and f1-score equal to zero occurs if there are not True Positives. Meaning for those classes not one document is correctly predicted. 

##### Baseline 3: Logistic Regression + tf-idf

In [21]:
from sklearn.linear_model import LogisticRegression
log_reg = run_baseline(LogisticRegression(), DATAFRAME, SPLIT_COLUMN, TRAIN_SET, TEST_SET,TEXT_COLUMN, LABEL_COLUMN)

                      precision    recall  f1-score   support

         Actualiteit       0.00      0.00      0.00        35
              Agenda       0.82      0.89      0.85       134
             Besluit       0.00      0.00      0.00        25
               Brief       0.71      0.06      0.10        89
          Factsheets       0.00      0.00      0.00         7
               Motie       0.55      1.00      0.71       349
   Onderzoeksrapport       0.75      0.16      0.27        37
          Raadsadres       1.00      0.23      0.37        83
        Raadsnotulen       0.00      0.00      0.00        13
Schriftelijke Vragen       0.77      0.81      0.79       127
       Termijnagenda       0.00      0.00      0.00        42
          Voordracht       1.00      0.91      0.95       127

            accuracy                           0.67      1068
           macro avg       0.47      0.34      0.34      1068
        weighted avg       0.65      0.67      0.59      1068



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


##### Baseline 4: k Nearest Neigbors + tf-idf

In [22]:
from sklearn.neighbors import KNeighborsClassifier
knn = run_baseline(KNeighborsClassifier(), DATAFRAME, SPLIT_COLUMN, TRAIN_SET, TEST_SET,TEXT_COLUMN, LABEL_COLUMN)

                      precision    recall  f1-score   support

         Actualiteit       0.50      0.06      0.10        35
              Agenda       0.73      0.68      0.70       134
             Besluit       0.83      0.40      0.54        25
               Brief       0.33      0.76      0.46        89
          Factsheets       0.00      0.00      0.00         7
               Motie       0.90      0.18      0.30       349
   Onderzoeksrapport       0.15      0.73      0.24        37
          Raadsadres       1.00      0.07      0.13        83
        Raadsnotulen       0.00      0.00      0.00        13
Schriftelijke Vragen       0.37      0.94      0.53       127
       Termijnagenda       0.15      0.12      0.13        42
          Voordracht       0.98      0.77      0.86       127

            accuracy                           0.46      1068
           macro avg       0.49      0.39      0.33      1068
        weighted avg       0.70      0.46      0.43      1068



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


##### Baseline 5: RandomForest + tf-idf

In [23]:
from sklearn.ensemble import RandomForestClassifier
random_forest = run_baseline(RandomForestClassifier(), DATAFRAME, SPLIT_COLUMN, TRAIN_SET, TEST_SET,TEXT_COLUMN, LABEL_COLUMN)

                      precision    recall  f1-score   support

         Actualiteit       0.00      0.00      0.00        35
              Agenda       0.75      0.92      0.83       134
             Besluit       0.00      0.00      0.00        25
               Brief       0.76      0.87      0.81        89
          Factsheets       0.00      0.00      0.00         7
               Motie       0.80      0.95      0.87       349
   Onderzoeksrapport       0.72      0.70      0.71        37
          Raadsadres       0.74      0.73      0.74        83
        Raadsnotulen       0.00      0.00      0.00        13
Schriftelijke Vragen       0.87      0.93      0.90       127
       Termijnagenda       0.29      0.05      0.08        42
          Voordracht       0.96      0.98      0.97       127

            accuracy                           0.81      1068
           macro avg       0.49      0.51      0.49      1068
        weighted avg       0.73      0.81      0.76      1068



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
