# Load Libraries

In [2]:
# %pip install scikit-learn
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# %pip install tqdm
# %pip install transformers
#%pip install matplotlib

In [3]:
import os
import pandas as pd
import ast
from sklearn.preprocessing import LabelEncoder
import torch
from transformers import BertTokenizer, BertModel
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
import joblib
from tqdm import tqdm
import My_Machine_Learning_Tools as mytools
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


# Load and Explore Data

In [4]:
df_train=pd.read_csv('ModApte_train.csv')
df_test=pd.read_csv('ModApte_test.csv')

In [5]:
def series_to_list(df,column_name):
    result=df[column_name].replace({' ':''},regex=True)
    result.replace({'\\n':''},regex=True,inplace=True)
    result.replace({'\'\'':'\',\''},regex=True,inplace=True)
    return result.apply(ast.literal_eval)

In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9603 entries, 0 to 9602
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   text         8816 non-null   object
 1   text_type    9603 non-null   object
 2   topics       9603 non-null   object
 3   lewis_split  9603 non-null   object
 4   cgis_split   9603 non-null   object
 5   old_id       9603 non-null   object
 6   new_id       9603 non-null   object
 7   places       9603 non-null   object
 8   people       9603 non-null   object
 9   orgs         9603 non-null   object
 10  exchanges    9603 non-null   object
 11  date         9603 non-null   object
 12  title        9549 non-null   object
dtypes: object(13)
memory usage: 975.4+ KB


In [7]:
df_train.dtypes

text           object
text_type      object
topics         object
lewis_split    object
cgis_split     object
old_id         object
new_id         object
places         object
people         object
orgs           object
exchanges      object
date           object
title          object
dtype: object

In [8]:
df_train.head()

Unnamed: 0,text,text_type,topics,lewis_split,cgis_split,old_id,new_id,places,people,orgs,exchanges,date,title
0,Showers continued throughout the week in\nthe ...,"""NORM""",['cocoa'],"""TRAIN""","""TRAINING-SET""","""5544""","""1""",['el-salvador' 'usa' 'uruguay'],[],[],[],26-FEB-1987 15:01:01.79,BAHIA COCOA REVIEW
1,The U.S. Agriculture Department\nreported the ...,"""NORM""",['grain' 'wheat' 'corn' 'barley' 'oat' 'sorghum'],"""TRAIN""","""TRAINING-SET""","""5548""","""5""",['usa'],[],[],[],26-FEB-1987 15:10:44.60,NATIONAL AVERAGE PRICES FOR FARMER-OWNED RESERVE
2,Argentine grain board figures show\ncrop regis...,"""NORM""",['veg-oil' 'linseed' 'lin-oil' 'soy-oil' 'sun-...,"""TRAIN""","""TRAINING-SET""","""5549""","""6""",['argentina'],[],[],[],26-FEB-1987 15:14:36.41,ARGENTINE 1986/87 GRAIN/OILSEED REGISTRATIONS
3,Moody's Investors Service Inc said it\nlowered...,"""NORM""",[],"""TRAIN""","""TRAINING-SET""","""5551""","""8""",['usa'],[],[],[],26-FEB-1987 15:15:40.12,USX &lt;X> DEBT DOWGRADED BY MOODY'S
4,Champion Products Inc said its\nboard of direc...,"""NORM""",['earn'],"""TRAIN""","""TRAINING-SET""","""5552""","""9""",['usa'],[],[],[],26-FEB-1987 15:17:11.20,CHAMPION PRODUCTS &lt;CH> APPROVES STOCK SPLIT


# Preprocessing

#### Define Parameters

In [9]:
load_encoder=False
fit_encoder=True

#### Define treatment of columns und topics

In [10]:
#define topics
topic_column = 'topics'
food = ['coconut', 'cotton-oil', 'sorghum', 'orange', 'rice', 'soybean', 'sun-meal', 
    'oilseed', 'sugar', 'hog', 'coffee', 'groundnut', 'sunseed', 'sun-oil', 'rye', 
    'lin-oil', 'copra-cake', 'potato', 'barley', 'tea', 'meal-feed', 'coconut-oil', 
    'palmkernel', 'cottonseed', 'castor-oil', 'l-cattle', 'livestock', 'soy-oil', 
    'rape-oil', 'palm-oil', 'cocoa', 'cotton', 'wheat', 'corn', 'f-cattle', 'grain', 
    'soy-meal', 'oat', 'groundnut-oil', 'veg-oil','rapeseed']
resource = ['platinum', 'lead', 'nickel', 'strategic-metal', 'copper', 'palladium', 'gold', 
    'zinc', 'tin', 'iron-steel', 'alum', 'silver', 'nat-gas', 'rubber', 'pet-chem', 'fuel', 'crude','lumber','propane','wool']
finance = ['money-supply', 'dlr', 'nkr', 'lei', 'yen', 'dfl', 'sfr', 'cpi', 'instal-debt', 
    'money-fx', 'gnp', 'interest', 'income', 'dmk', 'rand', 'bop', 'reserves', 'nzdlr','acq']
personal_finance = ['housing','jobs','earn']
transport = ['jet', 'ship']
topics=[[food,'food'],[resource,'resource'],[finance,'finance'],[personal_finance,'personal_finance'],[transport,'transport']]
topics_to_remove = ['gas', 'heat', 'trade', 'retail', 'carcass', 'cpu', 'wpi', 'naphtha', 'ipi','stg','inventories']

#columns with special treatment
list_column='places'
drop_columns=['text_type','people','orgs','exchanges','lewis_split','cgis_split','old_id','new_id']
notnan_columns=['text','topics']
date_columns=['date']
text_columns=['text','title']

#### Define functions for Prepeocessing

These may be turned into a library later.

In [11]:
def drop_row_notnan_columms(df,notnan_columns):
    df_copy = df.copy()
    
    for column in notnan_columns:
        df_copy[column].dropna(inplace=True)
    
    return df_copy


In [12]:
def format_listcolumns(df, column):
    """
    Wandelt eine Spalte mit Listen als Strings formatiert in echte Listen um und gibt ein DataFrame und die eindeutigen Werte zurück.

    Parameters:
    df (pd.DataFrame): Der DataFrame, der die Spalte enthält.
    column (str): Der Name der Spalte, die konvertiert werden soll.

    Returns:
    pd.DataFrame: Das DataFrame mit der umgewandelten Spalte.
    list: Eine Liste der eindeutigen Werte in der umgewandelten Spalte.

    Example:
    >>> df, unique_values = format_listcolumns(df_train, 'features')
    """
    # Kopie der Spalte erstellen, um die Originaldaten nicht zu ändern
    df_copy = df.copy()

    # Umwandlung der Spalte von einem String in eine Liste
    df_copy[column].replace({'\\n': ''}, regex=True, inplace=True)
    df_copy = mytools.df_string_to_list(df_copy, column, entry_delimiter="'", separator=' ')

    # Eindeutige Werte in der umgewandelten Spalte finden
    unique_values = mytools.df_unique_list_values(df_copy, column)

    return df_copy, unique_values

In [13]:
#funtion to reorganize a column of subtopics into  broader topics and removing some of them 
def categorize_topics(df,column,topics,remove):
    df_copy = df.copy()
    
    for topic in topics:
        for subtopic in topic[0]:
            df_copy[column] = df_copy[column].replace({'\'' + subtopic + '\'': '\'' + topic[1] + '\''}, regex=True)
    
    for subtopic in remove:
        df_copy[column] = df_copy[column].replace({'\'' + subtopic + '\'': ''}, regex=True)
    
    df_copy[column] = df_copy[column].replace({' ': ''}, regex=True)
    df_copy[column] = series_to_list(df_copy, column)
    df_copy = df_copy[df_copy[column].str.len() == 1]
    df_copy[column] = df_copy[column].apply(lambda x: x[0])
    
    return df_copy

In [14]:
def format_datecolumns(df,date_columns):
    # Kopie des DataFrame erstellen, um die Originaldaten nicht zu ändern
    df_copy = df.copy()

    for column in date_columns:
        # Die Zeichenkette in ein Datum konvertieren
        df_copy[column] = pd.to_datetime(df_copy[column].str.strip().str.split(' ').str.get(0))
        df_copy[column+'_month'] = df_copy[column].dt.month
        df_copy[column+'_month'] = (df_copy[column+'_month'] - df_copy[column+'_month'].mean()) / df_copy[column+'_month'].std()

        # Woche extrahieren (altes Verhalten, ab Pandas 1.1.0 ist isocalendar().week empfohlen)
        df_copy[column+'_day_month'] = df_copy[column].dt.day
        df_copy[column+'_day_month'] = (df_copy[column+'_day_month'] - df_copy[column+'_day_month'].mean()) / df_copy[column+'_day_month'].std()

        # Tag extrahieren
        df_copy[column+'_day_year'] = df_copy[column].dt.day_of_year
        df_copy[column+'_day_year'] = (df_copy[column+'_day_year'] - df_copy[column+'_day_year'].mean()) / df_copy[column+'_day_year'].std()

        # Wochentag extrahieren (Montag=0, Sonntag=6)
        df_copy[column+'_weekday'] = df_copy[column].dt.day_name('en')

        df_copy[column+'_quarter_year'] = df_copy[column].dt.quarter
        df_copy = pd.get_dummies(df_copy, columns=[column+'_weekday'])
        weekdays = ['weekday_Monday', 'weekday_Tuesday', 'weekday_Wednesday', 'weekday_Thursday', 'weekday_Friday', 'weekday_Saturday', 'weekday_Sunday']
        for weekday in weekdays:
            if not column+'_'+weekday in df_copy.columns:
                df_copy[column+'_'+weekday] = 0
            else:
                df_copy[column+'_'+weekday] = df_copy[column+'_'+weekday].astype(int)

        df_copy = df_copy.drop(columns=column)

    return df_copy

In [15]:
def format_textcolumns(df,text_columns):
    # Kopie des DataFrame erstellen, um die Originaldaten nicht zu ändern
    df_copy = df.copy()

    for column in text_columns:
        df_copy[column].replace({'&lt;': '<'}, regex=True, inplace=True)
        df_copy[column].replace({'\\n': ' '}, regex=True, inplace=True)
        df_copy[column] = df_copy[column].str.replace('\s+', ' ', regex=True)
        df_copy[column] = df_copy[column].str.lower()
        df_copy[column] = df_copy[column].fillna(value='')

    return df_copy

  df_copy[column] = df_copy[column].str.replace('\s+', ' ', regex=True)


In [16]:
def handle_special_columns(df,list_column,list_possible_values,drop_columns,date_columns,notnan_columns,text_columns):
    # Kopie des DataFrame erstellen, um die Originaldaten nicht zu ändern
    df_copy = df.copy()

    # Spalten aus dem DataFrame entfernen
    df_copy = df_copy.drop(columns=drop_columns)

    # Spalte mit Listen explodieren und mögliche Werte festlegen
    df_copy = mytools.df_explode_listcolumn(df_copy, list_column, list_possible_values)

    # Datumsangaben formatieren
    df_copy = format_datecolumns(df_copy, date_columns)

    # Zeilen entfernen, die NaN-Werte in bestimmten Spalten enthalten
    df_copy = drop_row_notnan_columms(df_copy, notnan_columns)

    # Textspalten formatieren
    df_copy = format_textcolumns(df_copy, text_columns)

    return df_copy

In [17]:
#has to expanded to make it readeble by model
def preprocessing(df,topic_column,topics,topics_to_remove,list_column,list_possible_values,drop_columns,date_columns,notnan_columns,text_columns,encoder,fit_encoder=True):
    df_copy = df.copy()
    
    df_copy = categorize_topics(df_copy, topic_column, topics, topics_to_remove)
    df_copy = handle_special_columns(df_copy, list_column, list_possible_values, drop_columns, date_columns, notnan_columns, text_columns)
    
    additional_features = torch.tensor(df_copy.drop(columns=(text_columns + [topic_column])).values)
    additional_features = additional_features.float()
    
    if fit_encoder:
        labels = torch.tensor(encoder.fit_transform(df_copy[topic_column]))
    else:
        labels = torch.tensor(encoder.transform(df_copy[topic_column]))
    labels = labels.long()
    
    return df_copy[text_columns], additional_features, labels, encoder

#### Actual Preprocessing

In [18]:
df,unique_values_test = format_listcolumns(df_test,list_column)
df,unique_values_train = format_listcolumns(df_train,list_column)
unique_countries=unique_values_train

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_copy[column].replace({'\\n': ''}, regex=True, inplace=True)


In [19]:
if load_encoder:
    label_encoder = joblib.load('label_encoder.joblib')
else:
   label_encoder = LabelEncoder() 

In [20]:

train_df_text,train_additional_features,train_labels,label_encoder = preprocessing(df_train,topic_column,topics,topics_to_remove,list_column,unique_countries,drop_columns,date_columns,notnan_columns,text_columns,label_encoder,fit_encoder=fit_encoder)

test_df_text,test_additional_features,test_labels,label_encoder = preprocessing(df_test,topic_column,topics,topics_to_remove,list_column,unique_countries,drop_columns,date_columns,notnan_columns,text_columns,label_encoder,fit_encoder=fit_encoder)

if not load_encoder:
    joblib.dump(label_encoder, 'label_encoder.joblib')

  df_copy[column] = pd.to_datetime(df_copy[column].str.strip().str.split(' ').str.get(0))
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_copy[column].replace({'&lt;': '<'}, regex=True, inplace=True)
  df_copy[column] = pd.to_datetime(df_copy[column].str.strip().str.split(' ').str.get(0))
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original

# Erstellung des Modell

#### Definieren der Parameter

In [21]:
model_name='bert-base-multilingual-uncased'
num_additional_features=139
num_classes=5
freeze_bert=True
num_epochs=5
batch_size=32
model_path_base='models/Bert_freeze'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


#### Initilisieren vom Tokennizer

In [22]:
tokenizer = BertTokenizer.from_pretrained(model_name)

## Definieren der benötigten Funktionen und Objekte

#### Tokenizen und Dataset

In [23]:
def tokenize_texts(text,length=128):
    tokenized_text = tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=length)
    return tokenized_text

def tokenize_inputs(df_text):
    tokenized_inputs1 = []
    tokenized_inputs2 = []
    for idx, row in df_text.iterrows():
        inputs1 = tokenize_texts(row['text'],256)
        inputs2 = tokenize_texts(row['title'],16)
        tokenized_inputs1.append(inputs1)
        tokenized_inputs2.append(inputs2)
    return tokenized_inputs1,tokenized_inputs2

In [24]:
class Text_Text_Feature_Dataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_inputs1, tokenized_inputs2, additional_features, labels):
        self.tokenized_inputs1 = tokenized_inputs1
        self.tokenized_inputs2 = tokenized_inputs2
        self.additional_features = additional_features
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        input_ids1 = self.tokenized_inputs1[idx]['input_ids'].squeeze()
        attention_mask1 = self.tokenized_inputs1[idx]['attention_mask'].squeeze()
        input_ids2 = self.tokenized_inputs2[idx]['input_ids'].squeeze()
        attention_mask2 = self.tokenized_inputs2[idx]['attention_mask'].squeeze()
        additional_features = self.additional_features[idx]
        label = self.labels[idx]
        return input_ids1, attention_mask1, input_ids2, attention_mask2, additional_features, label

#### Modell

In [25]:
class MultilingualBERTClassifier(nn.Module):
    def __init__(self, bert_model_name='bert-base-multilingual-uncased', num_additional_features=119, num_classes=5, freeze_bert=True):
        super(MultilingualBERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        
        # Einfrieren der BERT-Gewichte, falls angegeben
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        
        self.additional_features_layer = nn.Linear(num_additional_features, 128)
        self.classifier = nn.Linear(768 * 2 + 128, num_classes)
        self.dropout = nn.Dropout(0.1)
        
    def forward(self, input_ids1, attention_mask1, input_ids2, attention_mask2, additional_features):
        outputs1 = self.bert(input_ids1, attention_mask=attention_mask1)
        pooled_output1 = outputs1[1]  # [CLS] token representation
        
        outputs2 = self.bert(input_ids2, attention_mask=attention_mask2)
        pooled_output2 = outputs2[1]  # [CLS] token representation
        
        additional_features_output = self.additional_features_layer(additional_features)
        additional_features_output = torch.relu(additional_features_output)
        
        combined_output = torch.cat((pooled_output1, pooled_output2, additional_features_output), dim=1)
        combined_output = self.dropout(combined_output)
        
        logits = self.classifier(combined_output)
        
        return logits

## Training

#### Erstellen des Datasets

In [26]:
#Tokenize Text
train_tokenized_inputs1, train_tokenized_inputs2= tokenize_inputs(train_df_text)
test_tokenized_inputs1, test_tokenized_inputs2= tokenize_inputs(test_df_text)
#Create Datasets
train_dataset = Text_Text_Feature_Dataset(train_tokenized_inputs1, train_tokenized_inputs2,train_additional_features,train_labels)
test_dataset = Text_Text_Feature_Dataset(test_tokenized_inputs1, test_tokenized_inputs2,test_additional_features,test_labels)

#### Initialisieren von Modell

In [30]:
#Create modell or load previous one
model = MultilingualBERTClassifier(num_additional_features=num_additional_features, num_classes=num_classes)
#load_model(model,model_path_base)
mytools.modelversions_load_model(model,model_path_base)
# move modell to divice if possible
model.to(device)
print('Model loaded and on device')

Model loaded from models/Bert_freeze_v2.pt
model loaded


In [34]:
#create Dataloader
dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Loss-Funktion and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=1e-5)

#### Training

In [39]:
model.train()
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for batch in tqdm(dataloader, desc=f'Epoch {epoch + 1}'):
        input_ids1, attention_mask1, input_ids2, attention_mask2, additional_features, labels = batch
        input_ids1, attention_mask1 = input_ids1.to(device), attention_mask1.to(device)
        input_ids2, attention_mask2 = input_ids2.to(device), attention_mask2.to(device)
        additional_features, labels = additional_features.to(device), labels.to(device)

        optimizer.zero_grad()
        
        # Vorwärtsdurchlauf
        logits = model(input_ids1, attention_mask1, input_ids2, attention_mask2, additional_features)

        # Verlust berechnen
        loss = criterion(logits, labels)
        epoch_loss += loss.item()
        
        # Rückwärtsdurchlauf und Optimierung
        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch + 1} (BERT eingefroren), Loss: {epoch_loss / len(dataloader)}")

Epoch 1: 100%|██████████| 199/199 [00:59<00:00,  3.34it/s]


Epoch 1 (BERT eingefroren), Loss: 1.1001562727755638


Epoch 2: 100%|██████████| 199/199 [00:59<00:00,  3.33it/s]


Epoch 2 (BERT eingefroren), Loss: 1.0979603894391852


Epoch 3: 100%|██████████| 199/199 [01:01<00:00,  3.24it/s]


Epoch 3 (BERT eingefroren), Loss: 1.0939032968564248


Epoch 4: 100%|██████████| 199/199 [01:00<00:00,  3.28it/s]


Epoch 4 (BERT eingefroren), Loss: 1.092449508420187


Epoch 5: 100%|██████████| 199/199 [01:00<00:00,  3.30it/s]

Epoch 5 (BERT eingefroren), Loss: 1.0834619058436485





In [40]:
#test modell
model.eval()
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)
test_predictions = []
test_labels = []

with torch.no_grad():
    for batch in tqdm(test_dataloader):
        input_ids1, attention_mask1, input_ids2, attention_mask2, additional_features, labels = batch
        input_ids1, attention_mask1 = input_ids1.to(device), attention_mask1.to(device)
        input_ids2, attention_mask2 = input_ids2.to(device), attention_mask2.to(device)
        additional_features, labels = additional_features.to(device), labels.to(device)

        logits = model(input_ids1, attention_mask1, input_ids2, attention_mask2, additional_features)
        predictions = torch.argmax(logits, dim=1)
        
        test_predictions.extend(predictions.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

test_predictions = label_encoder.inverse_transform(test_predictions)
test_labels = label_encoder.inverse_transform(test_labels)

accuracy = sum(test_predictions == test_labels) / len(test_labels)
print(f'Accuracy: {accuracy}')

100%|██████████| 79/79 [00:23<00:00,  3.38it/s]

Accuracy: 0.6532322426177175





In [50]:
#define categories
categories = ['food', 'resource', 'finance', 'personal_finance', 'transport']
accuracy_data = [test_predictions==test_labels,test_labels]
#total datapoints
total = len(test_labels)
# DataFrame erstellen
accuracy_df = pd.DataFrame(accuracy_data).T
#group by and sum for each category and total
accuracy_df = accuracy_df.groupby(1).sum()
accuracy_df


Unnamed: 0_level_0,0
1,Unnamed: 1_level_1
finance,True
food,False
personal_finance,True
resource,False
transport,False


In [46]:
# Tabelle plotten
fig, ax = plt.subplots()
ax.axis('tight')
ax.axis('off')
table = ax.table(cellText=accuracy_df.values, colLabels=accuracy_df.columns, rowLabels=accuracy_df.index, loc='center', cellLoc='center')
table.auto_set_font_size(False)
table.set_fontsize(14)
table.scale(1.2, 1.2)

plt.title('Accuracy für verschiedene Kategorien')
plt.show()

NameError: name 'plt' is not defined

In [41]:
#print accuracy by topic
for topic in label_encoder.classes_:
    topic_mask = test_labels == topic
    topic_accuracy = sum(test_predictions[topic_mask] == test_labels[topic_mask]) / sum(topic_mask)
    print(f'Accuracy for {topic}: {topic_accuracy}')

Accuracy for finance: 0.6567926455566905
Accuracy for food: 0.0
Accuracy for personal_finance: 0.9052823315118397
Accuracy for resource: 0.0
Accuracy for transport: 0.0


#### Saving Modell

In [51]:
mytools.modelversions_save_model(model, model_path_base)

Model saved as models/Bert_freeze_v2.pt
