In [10]:
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh

In [11]:
import sys
sys.path.append("..")

# Select where to run notebook: "azure" or "local"
my_run = "azure"

import my_secrets as sc
import settings as st

if my_run == "azure":
    import config_azure as cf
elif my_run == "local":
    import config as cf


import os
if my_run == "azure":
    if not os.path.exists(cf.HUGGING_CACHE):
        os.mkdir(cf.HUGGING_CACHE)
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE

import pandas as pd

# set-up environment - GEITje-7b-chat InContextLearning:
# - install blobfuse -> sudo apt-get install blobfuse
# - pip install transformers
# - pip install torch
# - pip install accelerate
# - pip install jupyter
# - pip install ipywidgets

## Notebook overview
Goal: analyse the data. Understand why the baselines work so well.

In [18]:
base_pred = pd.read_pickle(f'{cf.output_path}/predictions/baselineTruncationPredictions.pkl')
predictions = base_pred.loc[base_pred['trunc_col']== 'TruncationLlamaTokensFront100Back0']

In [13]:
txtfiles = pd.read_pickle(f'{cf.output_path}/txtfiles_tokenizer.pkl')

#### Check txtfiles
- check for duplicates
- class distribution

In [23]:
confusion_matrices = pd.crosstab(predictions['label'], predictions['prediction'])
display(confusion_matrices)
print(sum(sum(confusion_matrices.values)))


prediction,Actualiteit,Agenda,Besluit,Brief,Factsheets,Motie,Onderzoeksrapport,Raadsadres,Raadsnotulen,Schriftelijke Vragen,Voordracht
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Actualiteit,116,9,3,0,0,9,2,11,0,1,1
Agenda,2,521,0,0,0,0,2,3,0,0,0
Besluit,1,0,105,0,0,1,2,4,0,0,0
Brief,0,0,0,181,0,19,3,3,0,0,0
Factsheets,0,1,0,0,16,0,25,3,0,0,0
Motie,0,0,0,15,0,1516,2,11,0,1,0
Onderzoeksrapport,1,3,0,0,1,0,210,5,0,2,0
Raadsadres,1,0,1,3,1,6,9,291,0,1,0
Raadsnotulen,0,0,0,0,0,0,0,0,42,0,0
Schriftelijke Vragen,0,0,0,0,0,6,12,10,0,575,0


4164


In [28]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(predictions['label'], predictions['prediction'])

num_classes = len(np.unique(predictions['label']))

# Iterate over each class
for i in range(num_classes):
    class_name = f"Class {i}"
    tp = cm[i, i]  # True Positives
    fp = np.sum(cm[:, i]) - tp  # False Positives
    fn = np.sum(cm[i, :]) - tp  # False Negatives
    tn = np.sum(cm) - tp - fp - fn  # True Negatives

    print(f"\nConfusion Matrix for {class_name}:")
    print(f"True Positives: {tp}")
    print(f"False Positives: {fp}")
    print(f"False Negatives: {fn}")
    print(f"True Negatives: {tn}")



Confusion Matrix for Class 0:
True Positives: 116
False Positives: 5
False Negatives: 36
True Negatives: 4007

Confusion Matrix for Class 1:
True Positives: 521
False Positives: 13
False Negatives: 7
True Negatives: 3623

Confusion Matrix for Class 2:
True Positives: 105
False Positives: 5
False Negatives: 8
True Negatives: 4046

Confusion Matrix for Class 3:
True Positives: 181
False Positives: 18
False Negatives: 25
True Negatives: 3940

Confusion Matrix for Class 4:
True Positives: 16
False Positives: 2
False Negatives: 29
True Negatives: 4117

Confusion Matrix for Class 5:
True Positives: 1516
False Positives: 41
False Negatives: 29
True Negatives: 2578

Confusion Matrix for Class 6:
True Positives: 210
False Positives: 57
False Negatives: 12
True Negatives: 3885

Confusion Matrix for Class 7:
True Positives: 291
False Positives: 51
False Negatives: 22
True Negatives: 3800

Confusion Matrix for Class 8:
True Positives: 42
False Positives: 0
False Negatives: 0
True Negatives: 4122


In [33]:

from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

import sys
sys.path.append('../scripts/') 
from baseline import load_data_split

# Assuming X_train and y_train are your training data
# Replace X_train and y_train with your actual data
X_train, y_train= load_data_split(txtfiles, '4split','train', 'label')

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training data and transform the text data to TF-IDF features
X_tfidf_train = tfidf_vectorizer.fit_transform(X_train['text'])

# Initialize SVM classifier
svm_model = SVC(kernel='linear')

# Perform cross-validation
# Specify the number of folds (e.g., 5-fold cross-validation)
num_folds = 5
cv_scores = cross_val_score(svm_model, X_tfidf_train, y_train, cv=num_folds)

# Print cross-validation scores
print("Cross-Validation Scores:")
for fold, score in enumerate(cv_scores, start=1):
    print(f"Fold {fold}: {score}")

# Calculate mean and standard deviation of cross-validation scores
mean_cv_score = np.mean(cv_scores)
std_cv_score = np.std(cv_scores)

print("\nMean Cross-Validation Score:", mean_cv_score)
print("Standard Deviation of Cross-Validation Scores:", std_cv_score)



Cross-Validation Scores:
Fold 1: 0.9554915145693244
Fold 2: 0.9535702849823887
Fold 3: 0.9644572526416907
Fold 4: 0.9586803331197951
Fold 5: 0.9596412556053812

Mean Cross-Validation Score: 0.958368128183716
Standard Deviation of Cross-Validation Scores: 0.003744061091639378
