In [51]:
import pandas as pd
import ast
from sklearn.preprocessing import LabelEncoder
import torch
from transformers import BertTokenizer, BertModel
import torch.nn as nn
import torch.optim as optim


In [52]:
df_train=pd.read_csv('ModApte_train.csv')
df_test=pd.read_csv('ModApte_test.csv')

In [53]:
def series_to_list(df,column_name):
    result=df[column_name].replace({' ':''},regex=True)
    result.replace({'\\n':''},regex=True,inplace=True)
    result.replace({'\'\'':'\',\''},regex=True,inplace=True)
    return result.apply(ast.literal_eval)

In [54]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9603 entries, 0 to 9602
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   text         8816 non-null   object
 1   text_type    9603 non-null   object
 2   topics       9603 non-null   object
 3   lewis_split  9603 non-null   object
 4   cgis_split   9603 non-null   object
 5   old_id       9603 non-null   object
 6   new_id       9603 non-null   object
 7   places       9603 non-null   object
 8   people       9603 non-null   object
 9   orgs         9603 non-null   object
 10  exchanges    9603 non-null   object
 11  date         9603 non-null   object
 12  title        9549 non-null   object
dtypes: object(13)
memory usage: 975.4+ KB


In [55]:
df_train.dtypes

text           object
text_type      object
topics         object
lewis_split    object
cgis_split     object
old_id         object
new_id         object
places         object
people         object
orgs           object
exchanges      object
date           object
title          object
dtype: object

In [56]:
df_train.head()

Unnamed: 0,text,text_type,topics,lewis_split,cgis_split,old_id,new_id,places,people,orgs,exchanges,date,title
0,Showers continued throughout the week in\nthe ...,"""NORM""",['cocoa'],"""TRAIN""","""TRAINING-SET""","""5544""","""1""",['el-salvador' 'usa' 'uruguay'],[],[],[],26-FEB-1987 15:01:01.79,BAHIA COCOA REVIEW
1,The U.S. Agriculture Department\nreported the ...,"""NORM""",['grain' 'wheat' 'corn' 'barley' 'oat' 'sorghum'],"""TRAIN""","""TRAINING-SET""","""5548""","""5""",['usa'],[],[],[],26-FEB-1987 15:10:44.60,NATIONAL AVERAGE PRICES FOR FARMER-OWNED RESERVE
2,Argentine grain board figures show\ncrop regis...,"""NORM""",['veg-oil' 'linseed' 'lin-oil' 'soy-oil' 'sun-...,"""TRAIN""","""TRAINING-SET""","""5549""","""6""",['argentina'],[],[],[],26-FEB-1987 15:14:36.41,ARGENTINE 1986/87 GRAIN/OILSEED REGISTRATIONS
3,Moody's Investors Service Inc said it\nlowered...,"""NORM""",[],"""TRAIN""","""TRAINING-SET""","""5551""","""8""",['usa'],[],[],[],26-FEB-1987 15:15:40.12,USX &lt;X> DEBT DOWGRADED BY MOODY'S
4,Champion Products Inc said its\nboard of direc...,"""NORM""",['earn'],"""TRAIN""","""TRAINING-SET""","""5552""","""9""",['usa'],[],[],[],26-FEB-1987 15:17:11.20,CHAMPION PRODUCTS &lt;CH> APPROVES STOCK SPLIT


In [57]:
#define topics
topic_column = 'topics'
food = ['coconut', 'cotton-oil', 'sorghum', 'orange', 'rice', 'soybean', 'sun-meal', 
    'oilseed', 'sugar', 'hog', 'coffee', 'groundnut', 'sunseed', 'sun-oil', 'rye', 
    'lin-oil', 'copra-cake', 'potato', 'barley', 'tea', 'meal-feed', 'coconut-oil', 
    'palmkernel', 'cottonseed', 'castor-oil', 'l-cattle', 'livestock', 'soy-oil', 
    'rape-oil', 'palm-oil', 'cocoa', 'cotton', 'wheat', 'corn', 'f-cattle', 'grain', 
    'soy-meal', 'oat', 'groundnut-oil', 'veg-oil','rapeseed']
resource = ['platinum', 'lead', 'nickel', 'strategic-metal', 'copper', 'palladium', 'gold', 
    'zinc', 'tin', 'iron-steel', 'alum', 'silver', 'nat-gas', 'rubber', 'pet-chem', 'fuel', 'crude','lumber','propane']
finance = ['money-supply', 'dlr', 'nkr', 'lei', 'yen', 'dfl', 'sfr', 'cpi', 'instal-debt', 
    'money-fx', 'gnp', 'interest', 'income', 'dmk', 'rand', 'bop', 'reserves', 'nzdlr','acq']
personal_finance = ['housing','jobs','earn']
transport = ['jet', 'ship']
topics=[[food,'food'],[resource,'resource'],[finance,'finance'],[personal_finance,'personal_finance'],[transport,'transport']]
topics_to_remove = ['gas', 'heat', 'trade', 'retail', 'carcass', 'cpu', 'wpi', 'naphtha', 'ipi']

#columns with special treatment
list_columns=['places']
drop_columns=['text_type','people','orgs','exchanges','lewis_split','cgis_split','old_id','new_id']
notnan_columns=['text','topics']
date_columns=['date']
text_columns=['text','title']

In [58]:
def drop_row_notnan_columms(df,notnan_columns):
    for column in notnan_columns:
        df[column].dropna(inplace=True)
    return df


In [59]:
#funtion to reorganize a column of subtopics into  broader topics and removing some of them 
def categorize_topics(df,column,topics,remove):
    for topic in topics:
        for subtopic in topic[0]:
            df[column]=df[column].replace({'\''+subtopic+'\'':'\''+topic[1] +'\''},regex=True)
    for subtopic in remove:
        df[column]=df[column].replace({'\''+subtopic+'\'':''},regex=True)
    df[column]=df[column].replace({' ':''},regex=True)
    df[column]=series_to_list(df,column)
    df=df[df[column].str.len()==1]
    df[column]=df[column].apply(lambda x : x[0])
    return df

In [60]:
def explode_listcolumns(df,list_columns):
    for column in list_columns:
        df[column]=series_to_list(df,column)
        df = df.join(pd.crosstab((s:=df[column].explode()).index, s).add_prefix(column+'_'))
        df = df.drop(columns=column)
        for col  in [col for col in df if col.startswith(column+'_')]:
            df[col].fillna(value=0,inplace=True)
    return df

In [61]:
def format_datecolumns(df,date_columns):
    for column in date_columns:
        # Die Zeichenkette in ein Datum konvertieren
        df[column] = pd.to_datetime(df[column].str.strip().str.split(' ').str.get(0))
        df[column+'_month'] = df[column].dt.month
        df[column+'_month'] = (df[column+'_month'] - df[column+'_month'].mean()) / df[column+'_month'].std()

        # Woche extrahieren (altes Verhalten, ab Pandas 1.1.0 ist isocalendar().week empfohlen)
        df[column+'_day_month'] = df[column].dt.day
        df[column+'_day_month'] = (df[column+'_day_month'] - df[column+'_day_month'].mean()) / df[column+'_day_month'].std()

        # Tag extrahieren
        df[column+'_day_year'] = df[column].dt.day_of_year
        df[column+'_day_year'] = (df[column+'_day_year'] - df[column+'_day_year'].mean()) / df[column+'_day_year'].std()


        # Wochentag extrahieren (Montag=0, Sonntag=6)
        df[column+'_weekday'] = df[column].dt.day_name('en')

        df[column+'_quarter_year'] = df[column].dt.quarter
        df=pd.get_dummies(df, columns=[column+'_weekday'])
        weekdays=['weekday_Monday','weekday_Tuesday','weekday_Wednesday','weekday_Thursday','weekday_Friday','weekday_Saturday','weekday_Sunday']
        for weekday in weekdays:
            if not column+'_'+weekday in df.columns:
                df[column+'_'+weekday]=0
            else:
                df[column+'_'+weekday]=df[column+'_'+weekday].astype(int)
        
        df=df.drop(columns=column)
    return df

In [62]:
def format_textcolumns(df,text_columns):
    for column in text_columns:
        df[column].replace({'&lt;':'<'},regex=True,inplace=True)
        df[column].replace({'\\n':' '},regex=True,inplace=True)
        df[column]=df[column].str.replace('\s+', ' ', regex=True)
        df[column]=df[column].str.lower()
        df[column]=df[column].fillna(value='')
    return df

In [63]:
def handle_special_columns(df,list_columns,drop_columns,date_columns,notnan_columns,text_columns):
    df=df.drop(columns=drop_columns)
    df= explode_listcolumns(df,list_columns)
    df= format_datecolumns(df,date_columns)
    df= drop_row_notnan_columms(df,notnan_columns)
    df= format_textcolumns(df,text_columns)
    return df

In [64]:
#has to expanded to make it readeble by model
def preprocessing(df,topic_column,topics,topics_to_remove,list_columns,drop_columns,date_columns,notnan_columns,text_columns,encoder,fit_encoder=True):
    df=categorize_topics(df,topic_column,topics,topics_to_remove)
    df = handle_special_columns(df,list_columns,drop_columns,date_columns,notnan_columns,text_columns)
    
    #create torch tensor 
    additional_features = torch.tensor(df.drop(columns=(text_columns+[topic_column])).values)

    #labels
    if fit_encoder:
        labels = torch.tensor(encoder.fit_transform(df[topic_column]))
    else:
        labels = torch.tensor(encoder.transform(df[topic_column]))
    return df[text_columns],additional_features,labels,encoder

In [65]:
label_encoder = LabelEncoder()

In [66]:

train_df_text,train_additional_features,train_labels,label_encoder = preprocessing(df_train,topic_column,topics,topics_to_remove,list_columns,drop_columns,date_columns,notnan_columns,text_columns,label_encoder)

test_df_text,test_additional_features,test_labels,label_encoder = preprocessing(df_test,topic_column,topics,topics_to_remove,list_columns,drop_columns,date_columns,notnan_columns,text_columns,label_encoder,fit_encoder=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column]=df[column].apply(lambda x : x[0])
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(value=0,inplace=True)
  df[column] = pd.to_datetime(df[column].str.strip().str.split(' ').str.get(0))
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing

In [82]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

def tokenize_texts(text,length=128):
    tokenized_text = tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=length)
    return tokenized_text
tokenized_inputs1 = []
tokenized_inputs2 = []

for idx, row in train_df_text.iterrows():
    inputs1 = tokenize_texts(row['text'],256)
    inputs2 = tokenize_texts(row['title'],16)
    tokenized_inputs1.append(inputs1)
    tokenized_inputs2.append(inputs2)

In [83]:
tokenized_inputs2

[{'input_ids': tensor([[  101, 23224, 47691, 10112, 12893,   102,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])},
 {'input_ids': tensor([[  101, 13752, 19208,   133, 14879,   135, 35821, 47318, 10107, 16913,
          18389,   102,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]])},
 {'input_ids': tensor([[  101, 14831, 18354, 13946,   133, 31275, 43615,   135, 84838, 14773,
            102,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])},
 {'input_ids': tensor([[  101, 10348, 10799, 10805, 13565,   133, 54786, 10132,   

In [76]:
class Text_Text_Feature_Dataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_inputs1, tokenized_inputs2, additional_features, labels):
        self.tokenized_inputs1 = tokenized_inputs1
        self.tokenized_inputs2 = tokenized_inputs2
        self.additional_features = additional_features
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        input_ids1 = self.tokenized_inputs1[idx]['input_ids'].squeeze()
        attention_mask1 = self.tokenized_inputs1[idx]['attention_mask'].squeeze()
        input_ids2 = self.tokenized_inputs2[idx]['input_ids'].squeeze()
        attention_mask2 = self.tokenized_inputs2[idx]['attention_mask'].squeeze()
        additional_features = self.additional_features[idx]
        label = self.labels[idx]
        return input_ids1, attention_mask1, input_ids2, attention_mask2, additional_features, label


In [84]:
train_dataset = Text_Text_Feature_Dataset(tokenized_inputs1, tokenized_inputs2,train_additional_features,train_labels)

In [None]:
class MultilingualBERTClassifier(nn.Module):
    def __init__(self, bert_model_name='bert-base-multilingual-uncased', num_additional_features=10, num_classes=2):
        super(MultilingualBERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.additional_features_layer = nn.Linear(num_additional_features, 128)
        self.classifier = nn.Linear(768 * 2 + 128, num_classes)
        self.dropout = nn.Dropout(0.1)
        
    def forward(self, input_ids1, attention_mask1, input_ids2, attention_mask2, additional_features):
        outputs1 = self.bert(input_ids1, attention_mask=attention_mask1)
        pooled_output1 = outputs1[1]  # [CLS] token representation
        
        outputs2 = self.bert(input_ids2, attention_mask=attention_mask2)
        pooled_output2 = outputs2[1]  # [CLS] token representation
        
        additional_features_output = self.additional_features_layer(additional_features)
        additional_features_output = torch.relu(additional_features_output)
        
        combined_output = torch.cat((pooled_output1, pooled_output2, additional_features_output), dim=1)
        combined_output = self.dropout(combined_output)
        
        logits = self.classifier(combined_output)
        
        return logits

# Beispiel Initialisierung
num_additional_features = 10  # Anzahl der zusätzlichen Features
num_classes = 5  # Anzahl der Klassen (z.B. binäre Klassifikation)

model = MultilingualBERTClassifier(num_additional_features=num_additional_features, num_classes=num_classes)

# Tokenizer initialisieren
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Beispiel Eingabedaten
text1 = "Dies ist ein Beispielsatz."
text2 = "Dies ist ein anderer Beispielsatz."
additional_features = torch.randn(1, num_additional_features)  # Dummy-Daten für zusätzliche Features

# Tokenisieren
inputs1 = tokenizer(text1, return_tensors='pt', padding=True, truncation=True, max_length=128)
inputs2 = tokenizer(text2, return_tensors='pt', padding=True, truncation=True, max_length=128)

# Vorhersage
model.eval()
with torch.no_grad():
    logits = model(inputs1['input_ids'], inputs1['attention_mask'], inputs2['input_ids'], inputs2['attention_mask'], additional_features)
    predictions = torch.softmax(logits, dim=1)

print(predictions)