In [16]:
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh


In [17]:
import sys
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "azure"

import my_secrets as sc
import settings as st

if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf


import os
if my_run == "azure":
    if not os.path.exists(cf.HUGGING_CACHE):
        os.mkdir(cf.HUGGING_CACHE)
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE


# setup environment GEITje-7B Finetuning
# - pip install torch
# - pip install datasets
# - pip install transformers
# - pip install trl
# - pip install accelerate (restart after)
# - switch device_map='auto' to avaoid memory error

# - pip install sentencepiece
# - pip install jupyter
# - pip install protobuf 



## Notebook overview
This notebook creates predictions for the baseline models. In total, five model are tried out.
- Functions to split the data. One function save the split as a column in txtfiles, the other loads the split.
- Training function. Given a baseline model, will return scores.
- Load Data. Load all the documents, and set parameters.
- TODO: save predictions


Kernel: Pytorch and Tensorflow

### Data split functions

In [18]:
from sklearn.model_selection import train_test_split
import pandas as pd

"""Function takes a dataframe and splits the data into train, test, val and dev set and save it.
Only need to run it once, load_data_spli is used to get the right subsets.

"""
def save_split(df, save_to_path):
    train_df, temp_df = train_test_split(df, test_size=0.25, random_state=42)

    # Splitting temp into test (20%) and val_dev (5%)
    test_df, val_dev_df = train_test_split(temp_df, test_size=0.2, random_state=42)

    # Splitting val_dev into validation (1%) and development (4%)
    dev_df,val_df = train_test_split(val_dev_df, test_size=0.2, random_state=42)

    # set split into 4 ways: train, test, val and dev
    train_df['4split'] = 'train'
    test_df['4split'] = 'test'
    val_df['4split'] = 'val'
    dev_df['4split'] = 'dev'

    # set split into 2 ways: test and training
    train_df['2split'] = 'train'
    test_df['2split'] = 'test'
    val_df['2split'] = 'test'
    dev_df['2split'] = 'train'

    # Combining the DataFrames
    final_df = pd.concat([train_df, test_df, val_df, dev_df])
    final_df.to_pickle(save_to_path)

# txtfiles = pd.read_pickle(f"{cf.output_path}/txtfiles_tokenizer.pkl")
# save_split(txtfiles, f"{cf.output_path}/txtfiles.pkl")


In [19]:
"""Function returns X and y set for either the train, val, test or dev set."""
def load_data_split(df, split_col,subset, label_col):
    subdf = df.loc[df[split_col]==subset]
    X = subdf.drop(columns=[label_col])
    y = subdf[label_col]
    return X, y

### Training function


In [22]:
# from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
  
import sys
sys.path.append('../scripts/') 
import prediction_helperfunctions as ph

import time


def run_baseline(baseline_function,model_name, dataframe,split_col, subset_train, subset_test, text_col, label_col, prediction_path, overview_path):
    start_time = time.time()

    ph.check_data_split_input(subset_train, subset_test)
    X_train, y_train = load_data_split(dataframe,split_col,subset_train,label_col) 
    X_test, y_test = load_data_split(dataframe,split_col,subset_test,label_col) 


    vectorizer = TfidfVectorizer()
    X_train_tfidf_bin = vectorizer.fit_transform(X_train[text_col])
    X_test_tfidf_bin = vectorizer.transform(X_test[text_col])

    model = baseline_function

    # Train the classifier on the training data
    model.fit(X_train_tfidf_bin, y_train)

    y_pred = model.predict(X_test_tfidf_bin)

    # Calculate the accuracy of the classifier
    report = classification_report(y_test, y_pred)
    print(report)

    date = ph.get_datetime()
    predictions = X_test.copy()
    predictions[label_col] = y_test
    predictions['prediction'] = y_pred
    # predictions['train_set'] = subset_train
    # predictions['test_set'] = subset_test
    predictions['model'] = model_name
    predictions['date'] = date

    # remove unneccary columns
    predictions = predictions.drop(columns=['set', 'text', 'tokens', 'token_count', 'clean_tokens', 'clean_tokens_count', 'pdf_path', 'clean_text', 'token_count_geitje', 'token_count_mistral', 'token_count_llama2_7b_hf', '4split', '2split'])

    # save predictions
    ph.combine_and_save_df(predictions, prediction_path)

    # save run -> scores + runtime
    overview = pd.DataFrame(
        [{
            'model':model_name,
            'date': date,
            'train_set': subset_train,
            'test_set': subset_test,
            'train_set_support':len(X_train),
            'test_set_support':len(X_test),
            'split_col':split_col,
            'text_col':text_col,
            'runtime':time.time()-start_time,
            'accuracy': accuracy_score(y_test, y_pred),
            'macro_avg_precision': precision_score(y_test, y_pred, average='macro'),
            'macro_avg_recall': recall_score(y_test, y_pred, average='macro'),
            'macro_avg_f1': f1_score(y_test, y_pred, average='macro'),
            'classification_report':report
        }   ]
    )
    ph.combine_and_save_df(overview, overview_path)

    return predictions

### Load data

In [21]:
from collections import Counter

df = pd.read_pickle(f"{cf.output_path}/txtfiles.pkl")
print(Counter(df['2split']))
print(Counter(df['4split']))

#set  variables, same for each model
TRAIN_SET = 'train' # must be dev or train
TEST_SET = 'test' # must be val or test
SPLIT_COLUMN = '4split' #must be either 2split or 4split. 2split = data split into train and test. 4split = data split into train, test, dev and val. 
TEXT_COLUMN = 'text'
LABEL_COLUMN = 'label'
DATAFRAME = df.copy()
PATH = f"{cf.output_path}/predictions/baselinePredictions.pkl"
OVERVIEW_PATH = f"{cf.output_path}/overview/baselineOverview.pkl"

Counter({'train': 21096, 'test': 5608})
Counter({'train': 20028, 'test': 5340, 'dev': 1068, 'val': 268})


### Baselines

##### Baseline 1: linear SVM+tf-idf

In [25]:
from sklearn.svm import LinearSVC
linear_svm = run_baseline(LinearSVC(), 'LinearSVC' , DATAFRAME, SPLIT_COLUMN, TRAIN_SET, TEST_SET,TEXT_COLUMN, LABEL_COLUMN, PATH, OVERVIEW_PATH)

                      precision    recall  f1-score   support

         Actualiteit       0.93      0.82      0.87       205
              Agenda       0.86      0.97      0.91       696
             Besluit       0.97      0.96      0.97       162
               Brief       0.90      0.92      0.91       408
          Factsheets       0.53      0.41      0.47        41
               Motie       0.97      0.97      0.97      1713
   Onderzoeksrapport       0.85      0.94      0.89       258
          Raadsadres       0.94      0.99      0.96       408
        Raadsnotulen       0.98      1.00      0.99        58
Schriftelijke Vragen       0.99      0.96      0.97       557
       Termijnagenda       0.84      0.46      0.59       189
          Voordracht       1.00      1.00      1.00       645

            accuracy                           0.94      5340
           macro avg       0.90      0.87      0.88      5340
        weighted avg       0.94      0.94      0.94      5340



##### Baseline 2: Naive Bayes+tf-idf

In [28]:
from sklearn.naive_bayes import MultinomialNB
naive_bayes = run_baseline(MultinomialNB(), 'MultinomialNB' , DATAFRAME, SPLIT_COLUMN, TRAIN_SET, TEST_SET,TEXT_COLUMN, LABEL_COLUMN, PATH, OVERVIEW_PATH)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                      precision    recall  f1-score   support

         Actualiteit       0.00      0.00      0.00       205
              Agenda       0.79      0.92      0.85       696
             Besluit       1.00      0.02      0.05       162
               Brief       0.00      0.00      0.00       408
          Factsheets       0.00      0.00      0.00        41
               Motie       0.44      1.00      0.61      1713
   Onderzoeksrapport       0.37      0.05      0.09       258
          Raadsadres       1.00      0.00      0.01       408
        Raadsnotulen       0.00      0.00      0.00        58
Schriftelijke Vragen       1.00      0.03      0.06       557
       Termijnagenda       1.00      0.08      0.15       189
          Voordracht       0.98      0.87      0.92       645

            accuracy                           0.56      5340
           macro avg       0.55      0.25      0.23      5340
        weighted avg       0.63      0.56      0.44      5340



  _warn_prf(average, modifier, msg_start, len(result))


precision, recall and f1-score equal to zero occurs if there are not True Positives. Meaning for those classes not one document is correctly predicted. 

##### Baseline 3: Logistic Regression + tf-idf

In [29]:
from sklearn.linear_model import LogisticRegression
log_reg = run_baseline(LogisticRegression(), 'LogisticRegression' , DATAFRAME, SPLIT_COLUMN, TRAIN_SET, TEST_SET,TEXT_COLUMN, LABEL_COLUMN, PATH, OVERVIEW_PATH)

##### Baseline 4: k Nearest Neigbors + tf-idf

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = run_baseline(KNeighborsClassifier(), 'KNeighborsClassifier' , DATAFRAME, SPLIT_COLUMN, TRAIN_SET, TEST_SET,TEXT_COLUMN, LABEL_COLUMN, PATH, OVERVIEW_PATH)

##### Baseline 5: RandomForest + tf-idf

In [None]:
from sklearn.ensemble import RandomForestClassifier
random_forest = run_baseline(RandomForestClassifier(), DATAFRAME, SPLIT_COLUMN, TRAIN_SET, TEST_SET,TEXT_COLUMN, LABEL_COLUMN)
random_forest = run_baseline(RandomForestClassifier(), 'RandomForestClassifier' , DATAFRAME, SPLIT_COLUMN, TRAIN_SET, TEST_SET,TEXT_COLUMN, LABEL_COLUMN, PATH, OVERVIEW_PATH)

                      precision    recall  f1-score   support

         Actualiteit       0.93      0.67      0.78       183
              Agenda       0.83      0.89      0.86       728
             Besluit       0.98      0.92      0.95       145
               Brief       0.91      0.91      0.91       396
          Factsheets       0.75      0.32      0.45        47
               Motie       0.95      0.97      0.96      1644
   Onderzoeksrapport       0.84      0.92      0.88       263
          Raadsadres       0.85      0.95      0.89       385
        Raadsnotulen       0.98      1.00      0.99        55
Schriftelijke Vragen       0.98      0.95      0.96       591
       Termijnagenda       0.52      0.37      0.43       207
          Voordracht       0.99      1.00      0.99       696

            accuracy                           0.91      5340
           macro avg       0.88      0.82      0.84      5340
        weighted avg       0.91      0.91      0.91      5340



### Overview of all runs

In [None]:
overview = pd.read_pickle(OVERVIEW_PATH)
display(overview)