In [9]:
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh


In [10]:
import sys
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "azure"

import my_secrets as sc
import settings as st

if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf


import os
if my_run == "azure":
    if not os.path.exists(cf.HUGGING_CACHE):
        os.mkdir(cf.HUGGING_CACHE)
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE


# setup environment GEITje-7B Finetuning
# - pip install torch
# - pip install datasets
# - pip install transformers
# - pip install trl
# - pip install accelerate (restart after)
# - switch device_map='auto' to avaoid memory error

# - pip install sentencepiece
# - pip install jupyter
# - pip install protobuf 



##### Load data 
Load validation set and split into val and dev set

In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_pickle(f"{cf.output_path}/txtfiles.pkl")
df = df.loc[df['set']=='val']
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.loc[df['label']=='Termijnagenda','label'] = 'Agenda'


In [31]:
from sklearn import preprocessing 

# encode labels to integers
label_encoder = preprocessing.LabelEncoder() 
df['encoded_label'] = label_encoder.fit_transform(df['label']) 


In [32]:
def combine_tokens(tokens):
    return ' '.join(tokens)

df['clean_text'] = df['clean_tokens'].apply(combine_tokens)


In [33]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=['encoded_label'])
y=df['encoded_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# select small portion to get code running
# X_train = X_train.iloc[0:50]
# y_train = y_train.iloc[0:50]
# X_test = X_test.iloc[0:10]
# y_test = y_test.iloc[0:10]


### Training function


In [34]:
# from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

def run_baseline(baseline_function, dataframe, text_col, label_col):
    X = dataframe.drop(columns=[label_col])
    y=dataframe[label_col]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    vectorizer = TfidfVectorizer()
    X_train_tfidf_bin = vectorizer.fit_transform(X_train[text_col])
    X_test_tfidf_bin = vectorizer.transform(X_test[text_col])

    model = baseline_function

    # Train the classifier on the training data
    model.fit(X_train_tfidf_bin, y_train)

    y_pred = model.predict(X_test_tfidf_bin)

    # Calculate the accuracy of the classifier
    report = classification_report(y_test, y_pred)
    print(report)

    predictions = X_test.copy()
    predictions[label_col] = y_test
    predictions['prediction'] = y_pred
    return predictions

### Baseline 1: linear SVM+tf-idf

In [35]:
from sklearn.svm import LinearSVC
linear_svm = run_baseline(LinearSVC(), df, 'clean_text', 'label')



                      precision    recall  f1-score   support

         Actualiteit       1.00      0.80      0.89        44
              Agenda       0.99      0.98      0.99       173
             Besluit       1.00      0.95      0.98        22
               Brief       0.92      0.90      0.91        68
          Factsheets       1.00      0.40      0.57         5
               Motie       0.96      0.98      0.97       366
   Onderzoeksrapport       0.84      0.96      0.90        54
          Raadsadres       0.89      0.95      0.92        84
        Raadsnotulen       1.00      1.00      1.00         6
Schriftelijke Vragen       0.96      0.94      0.95       122
          Voordracht       1.00      1.00      1.00       125

            accuracy                           0.96      1069
           macro avg       0.96      0.90      0.92      1069
        weighted avg       0.96      0.96      0.96      1069



### Baseline 2: Naive Bayes+tf-idf

In [36]:
from sklearn.naive_bayes import MultinomialNB
naive_bayes = run_baseline(MultinomialNB(), df, 'clean_text', 'label')

                      precision    recall  f1-score   support

         Actualiteit       0.00      0.00      0.00        44
              Agenda       0.96      0.99      0.97       173
             Besluit       0.00      0.00      0.00        22
               Brief       0.00      0.00      0.00        68
          Factsheets       0.00      0.00      0.00         5
               Motie       0.48      1.00      0.65       366
   Onderzoeksrapport       0.78      0.13      0.22        54
          Raadsadres       0.00      0.00      0.00        84
        Raadsnotulen       0.00      0.00      0.00         6
Schriftelijke Vragen       1.00      0.02      0.05       122
          Voordracht       1.00      0.95      0.98       125

            accuracy                           0.62      1069
           macro avg       0.38      0.28      0.26      1069
        weighted avg       0.59      0.62      0.51      1069



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


precision, recall and f1-score equal to zero occurs if there are not True Positives. Meaning for those classes not one document is correctly predicted. 

### Baseline 3: Logistic Regression + tf-idf

In [37]:
from sklearn.linear_model import LogisticRegression
log_reg = run_baseline(LogisticRegression(), df, 'clean_text', 'label')

                      precision    recall  f1-score   support

         Actualiteit       1.00      0.57      0.72        44
              Agenda       0.99      0.98      0.99       173
             Besluit       1.00      0.77      0.87        22
               Brief       0.92      0.81      0.86        68
          Factsheets       1.00      0.20      0.33         5
               Motie       0.92      0.98      0.95       366
   Onderzoeksrapport       0.87      0.89      0.88        54
          Raadsadres       0.81      0.94      0.87        84
        Raadsnotulen       1.00      1.00      1.00         6
Schriftelijke Vragen       0.95      0.94      0.95       122
          Voordracht       0.99      0.99      0.99       125

            accuracy                           0.94      1069
           macro avg       0.95      0.83      0.86      1069
        weighted avg       0.94      0.94      0.93      1069



### Baseline 4: k Nearest Neigbors + tf-idf

In [38]:
from sklearn.neighbors import KNeighborsClassifier
knn = run_baseline(KNeighborsClassifier(), df, 'clean_text', 'label')

                      precision    recall  f1-score   support

         Actualiteit       0.59      0.43      0.50        44
              Agenda       0.82      0.99      0.90       173
             Besluit       0.71      0.77      0.74        22
               Brief       0.36      0.40      0.38        68
          Factsheets       0.38      0.60      0.46         5
               Motie       0.80      0.81      0.80       366
   Onderzoeksrapport       0.44      0.50      0.47        54
          Raadsadres       0.75      0.32      0.45        84
        Raadsnotulen       0.35      1.00      0.52         6
Schriftelijke Vragen       0.72      0.58      0.64       122
          Voordracht       0.89      0.97      0.93       125

            accuracy                           0.73      1069
           macro avg       0.62      0.67      0.62      1069
        weighted avg       0.74      0.73      0.73      1069



### Baseline 5: RandomForest + tf-idf

In [39]:
from sklearn.ensemble import RandomForestClassifier
random_forest = run_baseline(RandomForestClassifier(), df, 'clean_text', 'label')

                      precision    recall  f1-score   support

         Actualiteit       1.00      0.61      0.76        44
              Agenda       0.97      0.99      0.98       173
             Besluit       1.00      0.86      0.93        22
               Brief       0.82      0.94      0.88        68
          Factsheets       0.00      0.00      0.00         5
               Motie       0.97      0.95      0.96       366
   Onderzoeksrapport       0.85      0.94      0.89        54
          Raadsadres       0.82      0.96      0.89        84
        Raadsnotulen       1.00      0.83      0.91         6
Schriftelijke Vragen       0.96      0.94      0.95       122
          Voordracht       1.00      0.99      1.00       125

            accuracy                           0.94      1069
           macro avg       0.85      0.82      0.83      1069
        weighted avg       0.94      0.94      0.94      1069



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
