# Load Libraries

In [1]:
# %pip install scikit-learn
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# %pip install tqdm
# %pip install transformers
#%pip install matplotlib
#%pip install seaborn

In [2]:
import os
import pandas as pd
import ast
import torch
from transformers import BertTokenizer, BertModel
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm
import My_Machine_Learning_Tools as mytools
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


# Load and Explore Data

In [3]:
df_train=pd.read_csv('ModApte_train.csv')
df_test=pd.read_csv('ModApte_test.csv')

In [4]:
def series_to_list(df,column_name):
    result=df[column_name].replace({' ':''},regex=True)
    result.replace({'\\n':''},regex=True,inplace=True)
    result.replace({'\'\'':'\',\''},regex=True,inplace=True)
    return result.apply(ast.literal_eval)

In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9603 entries, 0 to 9602
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   text         8816 non-null   object
 1   text_type    9603 non-null   object
 2   topics       9603 non-null   object
 3   lewis_split  9603 non-null   object
 4   cgis_split   9603 non-null   object
 5   old_id       9603 non-null   object
 6   new_id       9603 non-null   object
 7   places       9603 non-null   object
 8   people       9603 non-null   object
 9   orgs         9603 non-null   object
 10  exchanges    9603 non-null   object
 11  date         9603 non-null   object
 12  title        9549 non-null   object
dtypes: object(13)
memory usage: 975.4+ KB


In [6]:
df_train.dtypes

text           object
text_type      object
topics         object
lewis_split    object
cgis_split     object
old_id         object
new_id         object
places         object
people         object
orgs           object
exchanges      object
date           object
title          object
dtype: object

In [7]:
df_train.head()

Unnamed: 0,text,text_type,topics,lewis_split,cgis_split,old_id,new_id,places,people,orgs,exchanges,date,title
0,Showers continued throughout the week in\nthe ...,"""NORM""",['cocoa'],"""TRAIN""","""TRAINING-SET""","""5544""","""1""",['el-salvador' 'usa' 'uruguay'],[],[],[],26-FEB-1987 15:01:01.79,BAHIA COCOA REVIEW
1,The U.S. Agriculture Department\nreported the ...,"""NORM""",['grain' 'wheat' 'corn' 'barley' 'oat' 'sorghum'],"""TRAIN""","""TRAINING-SET""","""5548""","""5""",['usa'],[],[],[],26-FEB-1987 15:10:44.60,NATIONAL AVERAGE PRICES FOR FARMER-OWNED RESERVE
2,Argentine grain board figures show\ncrop regis...,"""NORM""",['veg-oil' 'linseed' 'lin-oil' 'soy-oil' 'sun-...,"""TRAIN""","""TRAINING-SET""","""5549""","""6""",['argentina'],[],[],[],26-FEB-1987 15:14:36.41,ARGENTINE 1986/87 GRAIN/OILSEED REGISTRATIONS
3,Moody's Investors Service Inc said it\nlowered...,"""NORM""",[],"""TRAIN""","""TRAINING-SET""","""5551""","""8""",['usa'],[],[],[],26-FEB-1987 15:15:40.12,USX &lt;X> DEBT DOWGRADED BY MOODY'S
4,Champion Products Inc said its\nboard of direc...,"""NORM""",['earn'],"""TRAIN""","""TRAINING-SET""","""5552""","""9""",['usa'],[],[],[],26-FEB-1987 15:17:11.20,CHAMPION PRODUCTS &lt;CH> APPROVES STOCK SPLIT


# Preprocessing

#### Define Parameters

In [26]:
fit_encoder=False
load_encoder=True
save_encoder=False

#### Define treatment of columns und topics

In [9]:
#define topics
topic_column = 'topics'
food = ['coconut', 'cotton-oil', 'sorghum', 'orange', 'rice', 'soybean', 'sun-meal', 
    'oilseed', 'sugar', 'hog', 'coffee', 'groundnut', 'sunseed', 'sun-oil', 'rye', 
    'lin-oil', 'copra-cake', 'potato', 'barley', 'tea', 'meal-feed', 'coconut-oil', 
    'palmkernel', 'cottonseed', 'castor-oil', 'l-cattle', 'livestock', 'soy-oil', 
    'rape-oil', 'palm-oil', 'cocoa', 'cotton', 'wheat', 'corn', 'f-cattle', 'grain', 
    'soy-meal', 'oat', 'groundnut-oil', 'veg-oil','rapeseed']
resource = ['platinum', 'lead', 'nickel', 'strategic-metal', 'copper', 'palladium', 'gold', 
    'zinc', 'tin', 'iron-steel', 'alum', 'silver', 'nat-gas', 'rubber', 'pet-chem', 'fuel', 'crude','lumber','propane','wool']
finance = ['money-supply', 'dlr', 'nkr', 'lei', 'yen', 'dfl', 'sfr', 'cpi', 'instal-debt', 
    'money-fx', 'gnp', 'interest', 'income', 'dmk', 'rand', 'bop', 'reserves', 'nzdlr','acq']
personal_finance = ['housing','jobs','earn']
transport = ['jet', 'ship']
topics=[[food,'food'],[resource,'resource'],[finance,'finance'],[personal_finance,'personal_finance'],[transport,'transport']]
topics_to_remove = ['gas', 'heat', 'trade', 'retail', 'carcass', 'cpu', 'wpi', 'naphtha', 'ipi','stg','inventories']

#columns with special treatment
list_column='places'
drop_columns=['text_type','people','orgs','exchanges','lewis_split','cgis_split','old_id','new_id']
notnan_columns=['text','topics']
date_columns=['date']
text_columns=['text','title']

#### Define functions for Prepeocessing

These may be turned into a library later.

In [10]:
def drop_row_notnan_columms(df,notnan_columns):
    df_copy = df.copy()
    
    for column in notnan_columns:
        df_copy[column].dropna(inplace=True)
    
    return df_copy


In [11]:
def format_listcolumns(df, column):
    """
    Wandelt eine Spalte mit Listen als Strings formatiert in echte Listen um und gibt ein DataFrame und die eindeutigen Werte zurück.

    Parameters:
    df (pd.DataFrame): Der DataFrame, der die Spalte enthält.
    column (str): Der Name der Spalte, die konvertiert werden soll.

    Returns:
    pd.DataFrame: Das DataFrame mit der umgewandelten Spalte.
    list: Eine Liste der eindeutigen Werte in der umgewandelten Spalte.

    Example:
    >>> df, unique_values = format_listcolumns(df_train, 'features')
    """
    # Kopie der Spalte erstellen, um die Originaldaten nicht zu ändern
    df_copy = df.copy()

    # Umwandlung der Spalte von einem String in eine Liste
    df_copy[column].replace({'\\n': ''}, regex=True, inplace=True)
    df_copy = mytools.df_string_to_list(df_copy, column, entry_delimiter="'", separator=' ')

    # Eindeutige Werte in der umgewandelten Spalte finden
    unique_values = mytools.df_unique_list_values(df_copy, column)

    return df_copy, unique_values

In [12]:
#funtion to reorganize a column of subtopics into  broader topics and removing some of them 
def categorize_topics(df,column,topics,remove):
    df_copy = df.copy()
    
    for topic in topics:
        for subtopic in topic[0]:
            df_copy[column] = df_copy[column].replace({'\'' + subtopic + '\'': '\'' + topic[1] + '\''}, regex=True)
    
    for subtopic in remove:
        df_copy[column] = df_copy[column].replace({'\'' + subtopic + '\'': ''}, regex=True)
    
    df_copy[column] = df_copy[column].replace({' ': ''}, regex=True)
    df_copy[column] = series_to_list(df_copy, column)
    df_copy = df_copy[df_copy[column].str.len() == 1]
    df_copy[column] = df_copy[column].apply(lambda x: x[0])
    
    return df_copy

In [13]:
def format_datecolumns(df,date_columns):
    # Kopie des DataFrame erstellen, um die Originaldaten nicht zu ändern
    df_copy = df.copy()

    for column in date_columns:
        # Die Zeichenkette in ein Datum konvertieren
        df_copy[column] = pd.to_datetime(df_copy[column].str.strip().str.split(' ').str.get(0))
        df_copy[column+'_month'] = df_copy[column].dt.month
        df_copy[column+'_month'] = (df_copy[column+'_month'] - df_copy[column+'_month'].mean()) / df_copy[column+'_month'].std()

        # Woche extrahieren (altes Verhalten, ab Pandas 1.1.0 ist isocalendar().week empfohlen)
        df_copy[column+'_day_month'] = df_copy[column].dt.day
        df_copy[column+'_day_month'] = (df_copy[column+'_day_month'] - df_copy[column+'_day_month'].mean()) / df_copy[column+'_day_month'].std()

        # Tag extrahieren
        df_copy[column+'_day_year'] = df_copy[column].dt.day_of_year
        df_copy[column+'_day_year'] = (df_copy[column+'_day_year'] - df_copy[column+'_day_year'].mean()) / df_copy[column+'_day_year'].std()

        # Wochentag extrahieren (Montag=0, Sonntag=6)
        df_copy[column+'_weekday'] = df_copy[column].dt.day_name('en')

        df_copy[column+'_quarter_year'] = df_copy[column].dt.quarter
        df_copy = pd.get_dummies(df_copy, columns=[column+'_weekday'])
        weekdays = ['weekday_Monday', 'weekday_Tuesday', 'weekday_Wednesday', 'weekday_Thursday', 'weekday_Friday', 'weekday_Saturday', 'weekday_Sunday']
        for weekday in weekdays:
            if not column+'_'+weekday in df_copy.columns:
                df_copy[column+'_'+weekday] = 0
            else:
                df_copy[column+'_'+weekday] = df_copy[column+'_'+weekday].astype(int)

        df_copy = df_copy.drop(columns=column)

    return df_copy

In [21]:
def format_textcolumns(df,text_columns):
    # Kopie des DataFrame erstellen, um die Originaldaten nicht zu ändern
    df_copy = df.copy()

    for column in text_columns:
        df_copy[column].replace({'&lt;': '<'}, regex=True, inplace=True)
        df_copy[column].replace({'\\n': ' '}, regex=True, inplace=True)
        df_copy[column] = df_copy[column].str.replace('\s+', ' ', regex=True)
        df_copy[column] = df_copy[column].str.lower()
        df_copy[column] = df_copy[column].fillna(value='')

    return df_copy

In [14]:
def handle_special_columns(df,list_column,list_possible_values,drop_columns,date_columns,notnan_columns,text_columns):
    # Kopie des DataFrame erstellen, um die Originaldaten nicht zu ändern
    df_copy = df.copy()

    # Spalten aus dem DataFrame entfernen
    df_copy = df_copy.drop(columns=drop_columns)

    # Spalte mit Listen explodieren und mögliche Werte festlegen
    df_copy = mytools.df_explode_listcolumn(df_copy, list_column, list_possible_values)

    # Datumsangaben formatieren
    df_copy = format_datecolumns(df_copy, date_columns)

    # Zeilen entfernen, die NaN-Werte in bestimmten Spalten enthalten
    df_copy = drop_row_notnan_columms(df_copy, notnan_columns)

    # Textspalten formatieren
    df_copy = format_textcolumns(df_copy, text_columns)

    return df_copy

In [18]:
#has to expanded to make it readeble by model
def preprocessing(df,topic_column,topics,topics_to_remove,list_column,list_possible_values,drop_columns,date_columns,notnan_columns,text_columns,encoder,fit_encoder=True):
    df_copy = df.copy()
    
    df_copy = categorize_topics(df_copy, topic_column, topics, topics_to_remove)
    df_copy = handle_special_columns(df_copy, list_column, list_possible_values, drop_columns, date_columns, notnan_columns, text_columns)
    
    additional_features = torch.tensor(df_copy.drop(columns=(text_columns + [topic_column])).values)
    additional_features = additional_features.float()
    
    if fit_encoder:
        labels = torch.tensor(encoder.extend_transform(df_copy[topic_column]))
    else:
        labels = torch.tensor(encoder.transform(df_copy[topic_column]))
    labels = labels.long()
    
    return df_copy[text_columns], additional_features, labels, encoder

#### Actual Preprocessing

In [16]:

df,unique_values_test = format_listcolumns(df_test,list_column)
df,unique_values_train = format_listcolumns(df_train,list_column)
unique_countries=unique_values_train

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_copy[column].replace({'\\n': ''}, regex=True, inplace=True)


In [30]:
label_encoder = mytools.LabelEncoder()
label_encoder.extend_transform

if load_encoder:
    label_encoder.load('encoder.pkl')

In [32]:

train_df_text,train_additional_features,train_labels,label_encoder = preprocessing(df_train,topic_column,topics,topics_to_remove,list_column,unique_countries,drop_columns,date_columns,notnan_columns,text_columns,label_encoder,fit_encoder=fit_encoder)

test_df_text,test_additional_features,test_labels,label_encoder = preprocessing(df_test,topic_column,topics,topics_to_remove,list_column,unique_countries,drop_columns,date_columns,notnan_columns,text_columns,label_encoder,fit_encoder=fit_encoder)

if save_encoder:
    label_encoder.save('encoder.pkl')

  df_copy[column] = pd.to_datetime(df_copy[column].str.strip().str.split(' ').str.get(0))
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_copy[column].replace({'&lt;': '<'}, regex=True, inplace=True)
  df_copy[column] = pd.to_datetime(df_copy[column].str.strip().str.split(' ').str.get(0))
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original

# Erstellung des Modell

#### Definieren der Parameter

In [30]:
model_name='bert-base-multilingual-uncased'
num_additional_features=139
num_classes=5
freeze_bert=True
num_epochs=5
batch_size=32
model_path_base='models/Bert_freeze'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")


Device: cpu


#### Initilisieren vom Tokennizer

In [24]:
tokenizer = BertTokenizer.from_pretrained(model_name)

## Definieren der benötigten Funktionen und Objekte

#### Tokenizen und Dataset

In [25]:
def tokenize_texts(text,length=128):
    tokenized_text = tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=length)
    return tokenized_text

def tokenize_inputs(df_text):
    tokenized_inputs1 = []
    tokenized_inputs2 = []
    for idx, row in df_text.iterrows():
        inputs1 = tokenize_texts(row['text'],256)
        inputs2 = tokenize_texts(row['title'],16)
        tokenized_inputs1.append(inputs1)
        tokenized_inputs2.append(inputs2)
    return tokenized_inputs1,tokenized_inputs2

In [26]:
class Text_Text_Feature_Dataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_inputs1, tokenized_inputs2, additional_features, labels):
        self.tokenized_inputs1 = tokenized_inputs1
        self.tokenized_inputs2 = tokenized_inputs2
        self.additional_features = additional_features
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        input_ids1 = self.tokenized_inputs1[idx]['input_ids'].squeeze()
        attention_mask1 = self.tokenized_inputs1[idx]['attention_mask'].squeeze()
        input_ids2 = self.tokenized_inputs2[idx]['input_ids'].squeeze()
        attention_mask2 = self.tokenized_inputs2[idx]['attention_mask'].squeeze()
        additional_features = self.additional_features[idx]
        label = self.labels[idx]
        return input_ids1, attention_mask1, input_ids2, attention_mask2, additional_features, label

#### Modell

In [27]:
class MultilingualBERTClassifier(nn.Module):
    def __init__(self, bert_model_name='bert-base-multilingual-uncased', num_additional_features=119, num_classes=5, freeze_bert=True):
        super(MultilingualBERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        
        # Einfrieren der BERT-Gewichte, falls angegeben
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        
        self.additional_features_layer = nn.Linear(num_additional_features, 128)
        self.classifier = nn.Linear(768 * 2 + 128, num_classes)
        self.dropout = nn.Dropout(0.1)
        
    def forward(self, input_ids1, attention_mask1, input_ids2, attention_mask2, additional_features):
        outputs1 = self.bert(input_ids1, attention_mask=attention_mask1)
        pooled_output1 = outputs1[1]  # [CLS] token representation
        
        outputs2 = self.bert(input_ids2, attention_mask=attention_mask2)
        pooled_output2 = outputs2[1]  # [CLS] token representation
        
        additional_features_output = self.additional_features_layer(additional_features)
        additional_features_output = torch.relu(additional_features_output)
        
        combined_output = torch.cat((pooled_output1, pooled_output2, additional_features_output), dim=1)
        combined_output = self.dropout(combined_output)
        
        logits = self.classifier(combined_output)
        
        return logits

## Training

#### Erstellen des Datasets

In [28]:
#Tokenize Text
train_tokenized_inputs1, train_tokenized_inputs2= tokenize_inputs(train_df_text)
test_tokenized_inputs1, test_tokenized_inputs2= tokenize_inputs(test_df_text)
#Create Datasets
train_dataset = Text_Text_Feature_Dataset(train_tokenized_inputs1, train_tokenized_inputs2,train_additional_features,train_labels)
test_dataset = Text_Text_Feature_Dataset(test_tokenized_inputs1, test_tokenized_inputs2,test_additional_features,test_labels)

#### Initialisieren von Modell

In [32]:
#Create modell or load previous one
model = MultilingualBERTClassifier(num_additional_features=num_additional_features, num_classes=num_classes)
#load_model(model,model_path_base)
mytools.modelversions_load_model(model,model_path_base)
# move modell to divice if possible
model.to(device)
print('Model loaded and on device')

No versions available.
Model loaded and on device


In [33]:
#create Dataloader
dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Loss-Funktion and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=1e-5)

#### Training

In [None]:
model.train()
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for batch in tqdm(dataloader, desc=f'Epoch {epoch + 1}'):
        input_ids1, attention_mask1, input_ids2, attention_mask2, additional_features, labels = batch
        input_ids1, attention_mask1 = input_ids1.to(device), attention_mask1.to(device)
        input_ids2, attention_mask2 = input_ids2.to(device), attention_mask2.to(device)
        additional_features, labels = additional_features.to(device), labels.to(device)

        optimizer.zero_grad()
        
        # Vorwärtsdurchlauf
        logits = model(input_ids1, attention_mask1, input_ids2, attention_mask2, additional_features)

        # Verlust berechnen
        loss = criterion(logits, labels)
        epoch_loss += loss.item()
        
        # Rückwärtsdurchlauf und Optimierung
        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch + 1} (BERT eingefroren), Loss: {epoch_loss / len(dataloader)}")

Epoch 1: 100%|██████████| 199/199 [00:59<00:00,  3.34it/s]


Epoch 1 (BERT eingefroren), Loss: 1.1001562727755638


Epoch 2: 100%|██████████| 199/199 [00:59<00:00,  3.33it/s]


Epoch 2 (BERT eingefroren), Loss: 1.0979603894391852


Epoch 3: 100%|██████████| 199/199 [01:01<00:00,  3.24it/s]


Epoch 3 (BERT eingefroren), Loss: 1.0939032968564248


Epoch 4: 100%|██████████| 199/199 [01:00<00:00,  3.28it/s]


Epoch 4 (BERT eingefroren), Loss: 1.092449508420187


Epoch 5: 100%|██████████| 199/199 [01:00<00:00,  3.30it/s]

Epoch 5 (BERT eingefroren), Loss: 1.0834619058436485





In [34]:
#test modell
model.eval()
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)
test_predictions = []
test_labels = []

with torch.no_grad():
    for batch in tqdm(test_dataloader):
        input_ids1, attention_mask1, input_ids2, attention_mask2, additional_features, labels = batch
        input_ids1, attention_mask1 = input_ids1.to(device), attention_mask1.to(device)
        input_ids2, attention_mask2 = input_ids2.to(device), attention_mask2.to(device)
        additional_features, labels = additional_features.to(device), labels.to(device)

        logits = model(input_ids1, attention_mask1, input_ids2, attention_mask2, additional_features)
        predictions = torch.argmax(logits, dim=1)
        
        test_predictions.extend(predictions.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

test_predictions = label_encoder.inverse_transform(test_predictions)
test_labels = label_encoder.inverse_transform(test_labels)

accuracy = sum(test_predictions == test_labels) / len(test_labels)
print(f'Accuracy: {accuracy}')

100%|██████████| 79/79 [21:20<00:00, 16.21s/it]

Accuracy: 0.08619313647246608





In [107]:
from sklearn.metrics import confusion_matrix
classes=[topic[1] for topic in topics]
def calculate_and_append_metrics(epoch, classes, labels, predictions,df=None):
    # Confusion Matrix erstellen
    cm = confusion_matrix(labels, predictions, labels=classes)

    # Spalten für das DataFrame definieren
    columns = ['Epoch', 'Class', 'TP abs', 'True Positive', 'TP %', 'FP abs', 'False Positive', 'FP %', 'FN abs','False Negative', 'FN %']
    data = []

    # Iteration über die Klassen
    for idx, class_name in enumerate(classes):
        total = sum(1 for label in labels if label == class_name)
        true_positive = cm[idx, idx]
        false_positive = cm[:, idx].sum() - true_positive
        false_negative = cm[idx,:].sum() - true_positive

        tp_percent = (true_positive / total) * 100 if total > 0 else 0
        fp_percent = (false_positive / total) * 100 if total > 0 else 0
        fn_percent = (false_negative / total) * 100 if total > 0 else 0

        tp_text = f"{round(tp_percent,2)}% ({true_positive})"
        fp_text = f"{round(fp_percent,2)}% ({false_positive})"
        fn_text = f"{round(fn_percent,2)}% ({false_negative})"


        data.append([epoch, class_name, true_positive, tp_text, tp_percent, false_positive, fp_text, fp_percent, false_negative, fn_text, fn_percent])

    # DataFrame für die aktuelle Epoche erstellen
    epoch_df = pd.DataFrame(data, columns=columns)
    epoch_df.set_index(['Epoch', 'Class'],inplace=True)
    if df is None:
        return epoch_df
    else:
        return pd.concat([df, epoch_df])
    
df=calculate_and_append_metrics(1, classes, test_labels, test_predictions)
#est labels und prediction vertauscht um unterschiedliche Daen zu haben
df=calculate_and_append_metrics(2, classes, test_labels, test_predictions, df)
print(df)


                        TP abs True Positive       TP %  FP abs  \
Epoch Class                                                       
1     food                  29   22.83% (29)  22.834646     556   
      resource             146  55.09% (146)  55.094340    1683   
      finance               40    4.09% (40)   4.085802      37   
      personal_finance       0      0.0% (0)   0.000000       0   
      transport              1      2.7% (1)   2.702703      14   
2     food                  29   22.83% (29)  22.834646     556   
      resource             146  55.09% (146)  55.094340    1683   
      finance               40    4.09% (40)   4.085802      37   
      personal_finance       0      0.0% (0)   0.000000       0   
      transport              1      2.7% (1)   2.702703      14   

                        False Positive        FP %  FN abs False Negative  \
Epoch Class                                                                 
1     food                437.8% (556)  4

In [108]:
metrics_df = df.loc[df.index.get_level_values('Epoch') >= (df.index.get_level_values('Epoch').max() - 2)]
columns = pd.MultiIndex.from_product([metrics_df.columns.get_level_values(0).unique(), ['True Positive', 'False Positive', 'False Negative']])
metrics_table = pd.DataFrame(index=metrics_df.index.levels[0], columns=columns)
metrics_df = metrics_df[['True Positive','False Positive','False Negative']]
metrics_df.stack().unstack(level=1)

Unnamed: 0_level_0,Class,finance,food,personal_finance,resource,transport
Epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,True Positive,4.09% (40),22.83% (29),0.0% (0),55.09% (146),2.7% (1)
1,False Positive,3.78% (37),437.8% (556),0.0% (0),635.09% (1683),37.84% (14)
1,False Negative,95.91% (939),77.17% (98),100.0% (1098),44.91% (119),97.3% (36)
2,True Positive,4.09% (40),22.83% (29),0.0% (0),55.09% (146),2.7% (1)
2,False Positive,3.78% (37),437.8% (556),0.0% (0),635.09% (1683),37.84% (14)
2,False Negative,95.91% (939),77.17% (98),100.0% (1098),44.91% (119),97.3% (36)


In [91]:
# Funktion zur Erstellung der geplotteten Tabelle
def plot_metrics_table(metrics_df):
    # Nur die letzten 3 Epochen anzeigen
    metrics_df = metrics_df.loc[metrics_df.index.get_level_values('Epoch') >= (metrics_df.index.get_level_values('Epoch').max() - 2)]
    
    # MultiIndex für Spalten erstellen
    columns = pd.MultiIndex.from_product([metrics_df.columns.get_level_values(0).unique(), ['True Positive', 'False Positive', 'False Negative']])
    metrics_table = pd.DataFrame(index=metrics_df.index.levels[0], columns=columns)

    # Werte formatieren und in die neue Tabelle einfügen
    for epoch in metrics_df.index.levels[0]:
        for class_name in metrics_df.columns.get_level_values(0).unique():
            tp = metrics_df.loc[(epoch, class_name), 'TP %']
            tp_abs = metrics_df.loc[(epoch, class_name), 'True Positive']
            fp = metrics_df.loc[(epoch, class_name), 'FP %']
            fp_abs = metrics_df.loc[(epoch, class_name), 'False Positive']
            fn = metrics_df.loc[(epoch, class_name), 'FN %']
            fn_abs = metrics_df.loc[(epoch, class_name), 'False Negative']

            metrics_table.loc[epoch, (class_name, 'True Positive')] = f"{tp:.1f}% ({tp_abs})"
            metrics_table.loc[epoch, (class_name, 'False Positive')] = f"{fp:.1f}% ({fp_abs})"
            metrics_table.loc[epoch, (class_name, 'False Negative')] = f"{fn:.1f}% ({fn_abs})"

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.axis('tight')
    ax.axis('off')

    table = ax.table(cellText=metrics_table.values,
                     colLabels=metrics_table.columns,
                     rowLabels=metrics_table.index,
                     cellLoc='center',
                     loc='center')

    plt.show()

# Plotte die Tabelle
plot_metrics_table(df)

KeyError: (1, 'True Positive')

In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Beispiel DataFrame (hier kannst du dein eigenes DataFrame verwenden)
data = {
    'Epoch': [1, 1, 2, 2, 3, 3, 4, 4, 5, 5],
    'Class': ['A', 'B', 'A', 'B', 'A', 'B', 'A', 'B', 'A', 'B'],
    'True Positive': [100, 120, 110, 130, 115, 135, 105, 125, 120, 140],
    'TP %': [80.0, 85.0, 82.5, 87.5, 78.0, 89.5, 75.0, 80.5, 82.0, 90.0],
    'False Positive': [20, 18, 22, 20, 24, 23, 21, 19, 18, 16],
    'FP %': [10.0, 9.0, 11.0, 10.0, 12.0, 11.5, 10.5, 9.5, 9.0, 8.0],
    'False Negative': [30, 28, 35, 32, 32, 30, 28, 25, 30, 27],
    'FN %': [15.0, 14.0, 17.5, 16.0, 16.0, 15.0, 14.0, 12.5, 15.0, 13.5]
}
df = pd.DataFrame(data)

# Die letzten drei Epochen auswählen
last_three_epochs = df[df['Epoch'].isin(df['Epoch'].unique()[-3:])]

# Spalten auswählen, die für den Gradienten relevant sind (TP %, FP %, FN %)
gradient_columns = ['TP %', 'FP %', 'FN %']

# Konvertiere die relevanten Spalten in ein Array
data_array = last_three_epochs.pivot(index='Class', columns='Epoch', values=gradient_columns).values

# Berechne die Änderungen zum vorherigen Eintrag
diffs = np.diff(data_array, axis=1)

# Setze die Farbpalette für den Gradienten
cmap = sns.diverging_palette(220, 20, as_cmap=True)



In [44]:
data_array

array([[78. , 75. , 82. , 12. , 10.5,  9. , 16. , 14. , 15. ],
       [89.5, 80.5, 90. , 11.5,  9.5,  8. , 15. , 12.5, 13.5]])

In [41]:
# Plot erstellen
plt.figure(figsize=(10, 6))
sns.heatmap(diffs[:, -1, :], annot=data_array[:, -1, :], fmt=".1f", cmap=cmap, cbar=True,
            xticklabels=gradient_columns, yticklabels=last_three_epochs['Class'].unique(),
            linewidths=.5, square=True, annot_kws={"size": 10})

plt.title('Gradient Plot der letzten drei Epochen')
plt.xlabel('Metrik')
plt.ylabel('Klasse')
plt.xticks(rotation=45)
plt.yticks(rotation=0)

plt.tight_layout()
plt.show()


IndexError: too many indices for array: array is 2-dimensional, but 3 were indexed

<Figure size 1000x600 with 0 Axes>

In [50]:
#define categories
categories = ['food', 'resource', 'finance', 'personal_finance', 'transport']
accuracy_data = [test_predictions==test_labels,test_labels]
#total datapoints
total = len(test_labels)
# DataFrame erstellen
accuracy_df = pd.DataFrame(accuracy_data).T
#group by and sum for each category and total
accuracy_df = accuracy_df.groupby(1).sum()
accuracy_df


Unnamed: 0_level_0,0
1,Unnamed: 1_level_1
finance,True
food,False
personal_finance,True
resource,False
transport,False


In [46]:
# Tabelle plotten
fig, ax = plt.subplots()
ax.axis('tight')
ax.axis('off')
table = ax.table(cellText=accuracy_df.values, colLabels=accuracy_df.columns, rowLabels=accuracy_df.index, loc='center', cellLoc='center')
table.auto_set_font_size(False)
table.set_fontsize(14)
table.scale(1.2, 1.2)

plt.title('Accuracy für verschiedene Kategorien')
plt.show()

NameError: name 'plt' is not defined

In [41]:
#print accuracy by topic
for topic in label_encoder.classes_:
    topic_mask = test_labels == topic
    topic_accuracy = sum(test_predictions[topic_mask] == test_labels[topic_mask]) / sum(topic_mask)
    print(f'Accuracy for {topic}: {topic_accuracy}')

Accuracy for finance: 0.6567926455566905
Accuracy for food: 0.0
Accuracy for personal_finance: 0.9052823315118397
Accuracy for resource: 0.0
Accuracy for transport: 0.0


#### Saving Modell

In [51]:
mytools.modelversions_save_model(model, model_path_base)

Model saved as models/Bert_freeze_v2.pt


# Testing

from sklearn.metrics import confusion_matrix
classes=[topic[1] for topic in topics]
def calculate_and_append_metrics(epoch, classes, labels, predictions,df=None):
    # Confusion Matrix erstellen
    cm = confusion_matrix(labels, predictions, labels=classes)

    # Spalten für das DataFrame definieren
    columns = ['Epoch', 'Class', 'TP abs', 'True Positive', 'TP %', 'FP abs', 'False Positive', 'FP %', 'FN abs','False Negative', 'FN %']
    data = []

    # Iteration über die Klassen
    for idx, class_name in enumerate(classes):
        total = sum(1 for label in labels if label == class_name)
        true_positive = cm[idx, idx]
        false_positive = cm[:, idx].sum() - true_positive
        false_negative = cm[idx,:].sum() - true_positive

        tp_percent = (true_positive / total) * 100 if total > 0 else 0
        fp_percent = (false_positive / total) * 100 if total > 0 else 0
        fn_percent = (false_negative / total) * 100 if total > 0 else 0

        tp_text = f"{round(tp_percent,2)}% ({true_positive})"
        fp_text = f"{round(fp_percent,2)}% ({false_positive})"
        fn_text = f"{round(fn_percent,2)}% ({false_negative})"


        data.append([epoch, class_name, true_positive, tp_text, tp_percent, false_positive, fp_text, fp_percent, false_negative, fn_text, fn_percent])

    # DataFrame für die aktuelle Epoche erstellen
    epoch_df = pd.DataFrame(data, columns=columns)
    epoch_df.set_index(['Epoch', 'Class'],inplace=True)
    if df is None:
        return epoch_df
    else:
        return pd.concat([df, epoch_df])
    
df=calculate_and_append_metrics(1, classes, test_labels, test_predictions)
#est labels und prediction vertauscht um unterschiedliche Daen zu haben
df=calculate_and_append_metrics(2, classes, test_labels, test_predictions, df)

metrics_df = df.loc[df.index.get_level_values('Epoch') >= (df.index.get_level_values('Epoch').max() - 2)]
columns = pd.MultiIndex.from_product([metrics_df.columns.get_level_values(0).unique(), ['True Positive', 'False Positive', 'False Negative']])
metrics_table = pd.DataFrame(index=metrics_df.index.levels[0], columns=columns)
metrics_df = metrics_df[['True Positive','False Positive','False Negative']]
metrics_df.stack().unstack(level=1)

print(df)

In [None]:
import torch
import numpy as np

def calculate_class_weights(labels, method='sqrt'):
    """
    Berechnet die Gewichte für die Klassen basierend auf der Häufigkeit der Labels und der angegebenen Methode.

    Args:
    labels (torch.Tensor): Tensor der Labels.
    method (str): Methode zur Berechnung der Gewichte ('sqrt', 'log', 'inverse').

    Returns:
    torch.Tensor: Tensor der Klassen-Gewichte.
    """
    
    # Berechne die Häufigkeit jeder Klasse
    unique_classes, class_counts = torch.unique(labels, return_counts=True)
    
    # Berechne die Gesamtzahl der Beispiele
    total_samples = labels.size(0)
    
    if method == 'sqrt':
        # Umgekehrt proportional zur Quadratwurzel der Klassenhäufigkeit
        weights = torch.sqrt(total_samples / class_counts.float())
    elif method == 'log':
        # Logarithmische Gewichtung
        weights = torch.log(1 + total_samples / class_counts.float())
    elif method == 'inverse':
        # Umgekehrt proportional zur Klassenhäufigkeit
        weights = total_samples / class_counts.float()
    else:
        raise ValueError("Method must be 'sqrt', 'log', or 'inverse'")
    
    # Normalisierung (optional)
    weights = weights / torch.sum(weights) * len(unique_classes)
    
    # Erstelle eine Gewichtungstabelle für alle möglichen Klassen
    class_weights = torch.zeros(len(unique_classes), dtype=torch.float)
    class_weights[unique_classes] = weights
    
    return class_weights

# Beispielaufruf
labels = torch.tensor([0] * 2895 + [1] * 2379 + [2] * 556 + [3] * 393 + [4] * 111)
class_weights_sqrt = calculate_class_weights(labels, method='sqrt')
class_weights_log = calculate_class_weights(labels, method='log')
class_weights_inverse = calculate_class_weights(labels, method='inverse')

print("Class Weights (sqrt):", class_weights_sqrt)
print("Class Weights (log):", class_weights_log)
print("Class Weights (inverse):", class_weights_inverse)