In [None]:
# Runs the blobfuse helper script in a shell; presumably this mounts the
# raadsinformatie blob storage that later cells read from — TODO confirm.
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh


In [1]:
import os
import sys

# Add the parent directory to the import path (presumably where the
# my_secrets / settings / config modules live — verify against the repo layout).
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "azure"

import my_secrets as sc
import settings as st

# Load the configuration module that matches the selected environment.
if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf
else:
    # Fail here instead of with a NameError on `cf` in a later cell.
    raise ValueError(f'my_run must be "azure" or "local", got {my_run!r}')

if my_run == "azure":
    # makedirs(exist_ok=True) creates missing parent directories as well and
    # avoids the race between checking for the directory and creating it.
    os.makedirs(cf.HUGGING_CACHE, exist_ok=True)
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE


# setup environment GEITje-7B Finetuning
# - pip install torch
# - pip install datasets
# - pip install transformers
# - pip install trl
# - pip install accelerate (restart after)
# - switch device_map='auto' to avoid memory error

# - pip install sentencepiece
# - pip install jupyter
# - pip install protobuf 



##### Load data 
Load validation set and split into val and dev set

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the pickled frame, keep only the rows whose 'set' column equals 'val',
# then shuffle them with a fixed seed and renumber the index from 0.
# NOTE: unpickling can execute arbitrary code — only load trusted files.
df = (
    pd.read_pickle(f"{cf.output_path}/txtfiles.pkl")
    .loc[lambda frame: frame['set'] == 'val']
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)


In [10]:
from sklearn import preprocessing 

# Fit a label encoder on the 'label' column and store the resulting integer
# codes in a new 'encoded_label' column; the fitted encoder stays available
# as `label_encoder`.
label_encoder = preprocessing.LabelEncoder()
df['encoded_label'] = label_encoder.fit_transform(
    y=df['label'],
)


In [11]:
def combine_tokens(tokens):
    """Return the string tokens in `tokens` joined by single spaces."""
    separator = ' '
    return separator.join(tokens)

# One space-joined string per row, built from that row's 'clean_tokens' value.
df['clean_text'] = df['clean_tokens'].apply(
    func=combine_tokens,
)


In [12]:
from sklearn.model_selection import train_test_split

# Target: the integer-encoded label. Features: every other column.
y = df.loc[:, 'encoded_label']
X = df.drop('encoded_label', axis=1)

# Seeded 80/20 split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# select small portion to get code running
# X_train = X_train.iloc[0:50]
# y_train = y_train.iloc[0:50]
# X_test = X_test.iloc[0:10]
# y_test = y_test.iloc[0:10]


### Training function


In [13]:
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer


def run_baseline(baseline_function, dataframe, text_col,
                 label_col='encoded_label', test_size=0.2, random_state=42):
    """Train and evaluate one baseline classifier on TF-IDF features.

    The frame is split into train and test parts, a TfidfVectorizer is fitted
    on the training text only, the classifier is fitted on the training
    vectors, and the accuracy on the test part is printed.

    Parameters
    ----------
    baseline_function : estimator instance
        Object exposing ``fit(X, y)`` and ``predict(X)``. Despite the name it
        is an instance, not a function; it is fitted in place.
    dataframe : pandas.DataFrame
        Must contain ``text_col`` and ``label_col``.
    text_col : str
        Name of the column holding the text to vectorize.
    label_col : str, default 'encoded_label'
        Name of the column holding the target.
    test_size : float or int, default 0.2
        Passed to ``train_test_split``.
    random_state : int or None, default 42
        Seed passed to ``train_test_split``.

    Returns
    -------
    pandas.DataFrame
        Copy of the test rows with the true label in ``label_col`` and the
        model output in a ``'prediction'`` column.

    Raises
    ------
    KeyError
        If ``text_col`` or ``label_col`` is not a column of ``dataframe``.
    """
    # Fail before any splitting or fitting if a required column is absent.
    missing = [col for col in (text_col, label_col) if col not in dataframe.columns]
    if missing:
        raise KeyError(f"column(s) not found in dataframe: {missing}")

    X = dataframe.drop(columns=[label_col])
    y = dataframe[label_col]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # The vocabulary and IDF weights come from the training text only; the
    # test text is merely transformed with them.
    vectorizer = TfidfVectorizer()
    train_tfidf = vectorizer.fit_transform(X_train[text_col])
    test_tfidf = vectorizer.transform(X_test[text_col])

    model = baseline_function

    # Train the classifier on the training data
    model.fit(train_tfidf, y_train)

    y_pred = model.predict(test_tfidf)

    # Calculate the accuracy of the classifier
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    # Return the held-out rows together with the true and predicted labels.
    predictions = X_test.copy()
    predictions[label_col] = y_test
    predictions['prediction'] = y_pred
    return predictions

### Baseline 1: linear SVM+tf-idf

In [14]:
from sklearn.svm import LinearSVC
# Linear SVM baseline on the TF-IDF vectors of 'clean_text'.
linear_svm = run_baseline(
    baseline_function=LinearSVC(),
    dataframe=df,
    text_col='clean_text',
)



Accuracy: 0.9326473339569691


### Baseline 2: Naive Bayes+tf-idf

In [15]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes baseline on the TF-IDF vectors of 'clean_text'.
naive_bayes = run_baseline(
    baseline_function=MultinomialNB(),
    dataframe=df,
    text_col='clean_text',
)

Accuracy: 0.5921421889616464


### Baseline 3: Logistic Regression + tf-idf

In [18]:
from sklearn.linear_model import LogisticRegression
# Logistic regression baseline on the TF-IDF vectors of 'clean_text'.
log_reg = run_baseline(
    baseline_function=LogisticRegression(),
    dataframe=df,
    text_col='clean_text',
)

Accuracy: 0.9148737137511693


### Baseline 4: k Nearest Neighbors + tf-idf

In [20]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours baseline on the TF-IDF vectors of 'clean_text'.
knn = run_baseline(
    baseline_function=KNeighborsClassifier(),
    dataframe=df,
    text_col='clean_text',
)

Accuracy: 0.7090739008419084


### Baseline 5: RandomForest + tf-idf

In [21]:
from sklearn.ensemble import RandomForestClassifier
# A random forest is stochastic, so seed it (42, as in the rest of the
# notebook) to make the reported accuracy reproducible between runs.
random_forest = run_baseline(RandomForestClassifier(random_state=42), df, 'clean_text')

Accuracy: 0.9195509822263798
