In [None]:
# Runs the blobfuse shell script for the raadsinformatie storage
# (presumably this mounts the blob container — TODO confirm).
# NOTE(review): the absolute /home/azureuser/... path looks specific to the Azure
# machine; this cell is presumably not needed for a "local" demo run — confirm.
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh


In [None]:
import os
import sys

import pandas as pd

# The config modules live one directory up from this notebook.
sys.path.append("..")

# MAKE SURE TO SET-UP PATH -> use local to run with demo data; use azure to run with complete dataset (access required)
# Select where to run notebook: "azure" or "local"
my_run = "azure"

if my_run == "azure":
    import config_azure as cf
    running_demo = False

    # Point the Hugging Face cache at the configured directory. makedirs with
    # exist_ok=True also creates missing parent directories and does not fail
    # when the directory is already there.
    os.makedirs(cf.HUGGING_CACHE, exist_ok=True)
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE
elif my_run == "local":
    import config as cf
    running_demo = True
else:
    # Fail here instead of with a NameError on `cf` several cells later.
    raise ValueError(f'my_run must be "azure" or "local", got {my_run!r}')


## Notebook overview
This notebook creates predictions for the baseline models. In total, five models are tried out.
- Training function. Given a baseline model, will return scores.
- Load Data. Load all the documents, and set parameters.
- save predictions


*Previous notebook: GetPredictions*

*Next notebook: plot*

### Load file with training function


In [None]:
# Make the project's src/ directory importable and load the baseline helpers;
# the module is used below as bf.run_baseline(...).
# NOTE(review): the same path append and import are repeated in the settings cell further down.
import sys
sys.path.append('../src/') 
import baseline as bf

### Load data

In [7]:
import pandas as pd

# The documents are stored as one pickled DataFrame under the configured output path.
# NOTE: unpickling can execute arbitrary code, so only load this file from a trusted location.
txtfiles_pickle = f"{cf.output_path}/txtfiles.pkl"
df = pd.read_pickle(txtfiles_pickle)

In [None]:
from collections import Counter

# Make src/ importable; guarded so that re-running this cell does not keep
# appending the same entry to sys.path.
if '../src/' not in sys.path:
    sys.path.append('../src/')
import baseline as bf
from truncation import add_truncation_column

# --- Variables shared by every baseline model ---

# Column that has the data split saved. Must be either 2split, 4split or balanced_split.
#   2split = data split into train and test.
#   4split = data split into train, test, dev and val.
SPLIT_COLUMN = 'balanced_split'
print('Distribution of sets: ', Counter(df[SPLIT_COLUMN]))

# How the split settings work together: every row carries a split value in
# SPLIT_COLUMN (here 'train', 'test', 'dev' or 'val'). TRAIN_SET names the value
# whose rows form the training set, so TRAIN_SET = 'train' selects all rows where
# the split column is 'train'. TEST_SET works the same way for the test set.
TRAIN_SET = 'train'  # must be dev or train
TEST_SET = 'test'  # must be val or test

# Fail fast on a typo: a set name that never occurs in the split column would
# otherwise select no rows at all for training or testing.
_available_sets = set(df[SPLIT_COLUMN])
for _setting, _set_name in (("TRAIN_SET", TRAIN_SET), ("TEST_SET", TEST_SET)):
    if _set_name not in _available_sets:
        raise ValueError(
            f"{_setting}={_set_name!r} does not occur in column {SPLIT_COLUMN!r}; "
            f"available values: {sorted(map(str, _available_sets))}"
        )

TEXT_COLUMN = 'text'  # column where the text is
LABEL_COLUMN = 'label'  # column with truth label
DATAFRAME = df.copy()  # df where each row is a doc; copied so the loaded df stays untouched
FOLDER = f"{cf.output_path}/predictionsFinal/baselines"  # folder where each individual prediction is saved
OVERVIEW_PATH = f"{cf.output_path}/predictionsFinal/baselines/overview.pkl"  # file where score and extra data about run is saved

# Needed for the truncation experiment on the baselines.
TRUNC_COLUMN = 'trunc_txt'  # column with truncated text
TOKENS_COL = 'LlamaTokens'  # column with text split into tokens using model tokenizer, in this case Llama, could also be MistralTokens
# Combinations of front and back truncation thresholds.
# First value in each tuple is the first N tokens, second value is the last N tokens.
THRESHOLD_COMBINATIONS = [(100, 0), (200, 0), (100, 100)]

In [None]:
# Function to run the baseline on each truncation threshold

def run_truncation_on_baselines(baseline_function, model_name, predictions_path):
    """Run one baseline model once per truncation setting.

    For every entry of THRESHOLD_COMBINATIONS (first value = number of leading
    tokens, second value = number of trailing tokens) a DataFrame is built with
    add_truncation_column and handed to bf.run_baseline, which is told to read
    the text from TRUNC_COLUMN. Each run gets the id
    "<model_name>_first<front>_last<back>".

    Args:
        baseline_function: model object forwarded unchanged to bf.run_baseline.
        model_name: used to build the run id and forwarded to bf.run_baseline.
        predictions_path: forwarded unchanged to bf.run_baseline for every run.

    Relies on the notebook-level settings DATAFRAME, TEXT_COLUMN, TOKENS_COL,
    TRUNC_COLUMN, SPLIT_COLUMN, TRAIN_SET, TEST_SET, LABEL_COLUMN and
    OVERVIEW_PATH. Returns nothing.
    """
    for combo in THRESHOLD_COMBINATIONS:
        n_first, n_last = combo[0], combo[1]

        # Identifier under which this truncated run is reported.
        tag = f"{model_name}_first{n_first}_last{n_last}"

        # Frame that carries the truncated text for this threshold pair.
        truncated_df = add_truncation_column(
            DATAFRAME, TEXT_COLUMN, TOKENS_COL, n_first, n_last
        )

        # Train and predict on the truncated text instead of the full text.
        bf.run_baseline(
            baseline_function,
            model_name,
            truncated_df,
            SPLIT_COLUMN,
            TRAIN_SET,
            TEST_SET,
            TRUNC_COLUMN,
            LABEL_COLUMN,
            predictions_path,
            OVERVIEW_PATH,
            tag,
        )

### Baselines

##### Baseline 1: linear SVM+tf-idf

In [None]:
from sklearn.svm import LinearSVC

# Baseline 1 set-up: the estimator, its name, the id of the full-text run and
# the predictions file inside FOLDER.
baseline_function = LinearSVC()
model_name = 'LinearSVC'
predictions_path = f"{FOLDER}/{model_name}predictions.pkl"
run_id = f"{model_name}_fulltext"

# Show which overview file and predictions file this run uses.
print(OVERVIEW_PATH, predictions_path, sep="\n")

# Full-text run first ...
linear_svm = bf.run_baseline(
    baseline_function,
    model_name,
    DATAFRAME,
    SPLIT_COLUMN,
    TRAIN_SET,
    TEST_SET,
    TEXT_COLUMN,
    LABEL_COLUMN,
    predictions_path,
    OVERVIEW_PATH,
    run_id,
)

# ... then one run per truncation setting.
run_truncation_on_baselines(baseline_function, model_name, predictions_path)

##### Baseline 2: Naive Bayes+tf-idf

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Baseline 2 set-up: the estimator, its name, the id of the full-text run and
# the predictions file inside FOLDER.
baseline_function = MultinomialNB()
model_name = 'MultinomialNB'
predictions_path = f"{FOLDER}/{model_name}predictions.pkl"
run_id = f"{model_name}_fulltext"

# Show which overview file and predictions file this run uses.
print(OVERVIEW_PATH, predictions_path, sep="\n")

# Full-text run first ...
naive_bayes = bf.run_baseline(
    baseline_function,
    model_name,
    DATAFRAME,
    SPLIT_COLUMN,
    TRAIN_SET,
    TEST_SET,
    TEXT_COLUMN,
    LABEL_COLUMN,
    predictions_path,
    OVERVIEW_PATH,
    run_id,
)

# ... then one run per truncation setting.
run_truncation_on_baselines(baseline_function, model_name, predictions_path)

##### Baseline 3: Logistic Regression + tf-idf

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline 3 set-up: the estimator, its name, the id of the full-text run and
# the predictions file inside FOLDER.
baseline_function = LogisticRegression()
model_name = 'LogisticRegression'
predictions_path = f"{FOLDER}/{model_name}predictions.pkl"
run_id = f"{model_name}_fulltext"

# Show which overview file and predictions file this run uses.
print(OVERVIEW_PATH, predictions_path, sep="\n")

# Full-text run first ...
log_reg = bf.run_baseline(
    baseline_function,
    model_name,
    DATAFRAME,
    SPLIT_COLUMN,
    TRAIN_SET,
    TEST_SET,
    TEXT_COLUMN,
    LABEL_COLUMN,
    predictions_path,
    OVERVIEW_PATH,
    run_id,
)

# ... then one run per truncation setting.
run_truncation_on_baselines(baseline_function, model_name, predictions_path)

##### Baseline 4: k Nearest Neighbors + tf-idf

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline 4 set-up: the estimator, its name, the id of the full-text run and
# the predictions file inside FOLDER.
baseline_function = KNeighborsClassifier()
model_name = 'KNeighborsClassifier'
predictions_path = f"{FOLDER}/{model_name}predictions.pkl"
run_id = f"{model_name}_fulltext"

# Show which overview file and predictions file this run uses.
print(OVERVIEW_PATH, predictions_path, sep="\n")

# Full-text run first ...
knn = bf.run_baseline(
    baseline_function,
    model_name,
    DATAFRAME,
    SPLIT_COLUMN,
    TRAIN_SET,
    TEST_SET,
    TEXT_COLUMN,
    LABEL_COLUMN,
    predictions_path,
    OVERVIEW_PATH,
    run_id,
)

# ... then one run per truncation setting.
run_truncation_on_baselines(baseline_function, model_name, predictions_path)

##### Baseline 5: RandomForest + tf-idf

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline 5 set-up: the estimator, its name, the id of the full-text run and
# the predictions file inside FOLDER.
model_name = 'RandomForestClassifier'
# A random forest is stochastic (bootstrap samples, random feature subsets);
# a fixed seed makes the reported scores reproducible across notebook runs.
baseline_function = RandomForestClassifier(random_state=42)
run_id = f"{model_name}_fulltext"
predictions_path = f"{FOLDER}/{model_name}predictions.pkl"

# Show which overview file and predictions file this run uses.
print(OVERVIEW_PATH)
print(predictions_path)

# Full-text run first ...
random_forest = bf.run_baseline(
    baseline_function,
    model_name,
    DATAFRAME,
    SPLIT_COLUMN,
    TRAIN_SET,
    TEST_SET,
    TEXT_COLUMN,
    LABEL_COLUMN,
    predictions_path,
    OVERVIEW_PATH,
    run_id,
)

# ... then one run per truncation setting (the same seeded estimator is reused).
run_truncation_on_baselines(baseline_function, model_name, predictions_path)

### Overview of all runs

In [None]:
# Load the overview file (OVERVIEW_PATH is passed to every bf.run_baseline call above)
# and render it with the notebook's rich display.
overview = pd.read_pickle(OVERVIEW_PATH)
display(overview)