# Jupyter notebook for the baseline system of the "call to action" (C2A) detection model

## 1. Data balancing

First of all, we want to preprocess the data we are working with.

In [2]:
import pandas as pd

# Relative path to the dataset; the file is semicolon-separated, hence sep=';'.
filename = "data/c2a.csv"
df = pd.read_csv(filename, sep=';')
# Last expression of the cell, so the notebook renders a preview of the first rows.
df.head()

Unnamed: 0,id,description,C2A
0,1064396393598783,"Oliver, ich guck doch schon mindestens einmal ...",False
1,1069077806463975,Schlafen Sie schon oder reden sie noch ?,False
2,1065824423455980,Ich bin auf einen hoffentlich ehrlichen abschl...,False
3,1069073689797720,Und sowas ist Ministerpräsident. Unglaublich.,False
4,1065801100124979,https://www.youtube.com/watch?v=134uhbtYBM4,False


Our data is imbalanced, so we will undersample the majority class (`C2A == False`) down to the size of the minority class.

In [3]:
from sklearn.utils import resample

# Partition the rows by label; the C2A == True rows are treated as the minority class.
is_call_to_action = df['C2A'].eq(True)
is_not_call_to_action = df['C2A'].eq(False)
minority_class = df.loc[is_call_to_action]
majority_class = df.loc[is_not_call_to_action]

# Draw, without replacement, as many majority rows as there are minority rows.
majority_undersampled = resample(
    majority_class,
    replace=False,
    n_samples=minority_class.shape[0],
    random_state=42,
)
df = pd.concat([minority_class, majority_undersampled])

print(df['C2A'].value_counts())

C2A
True     952
False    952
Name: count, dtype: int64


## 2. Data preprocessing
Once our data is balanced, we can preprocess it.

- First we will clean it

In [4]:
import re


def clean_text(text):
    """Remove URLs, @mentions and #hashtags from a piece of text.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. ``None`` or the ``NaN`` float
        pandas produces for an empty CSV field) is treated as empty text.

    Returns
    -------
    str
        ``text`` with every URL (``http://``, ``https://`` or ``www.``
        prefixed), ``@mention`` and ``#hashtag`` removed. Whitespace around
        a removed match is left untouched.
    """
    # re.sub raises TypeError on non-strings; without this guard a single
    # missing description would abort the whole `.apply(clean_text)` pass.
    if not isinstance(text, str):
        return ''

    url_pattern = r'https?://\S+|www\.\S+'
    mention_pattern = r'@\w+'
    hashtag_pattern = r'#\w+'
    combined_pattern = f'({url_pattern})|({mention_pattern})|({hashtag_pattern})'

    return re.sub(combined_pattern, '', text)


# Overwrite the description column with the cleaned text
# (URLs, @mentions and #hashtags removed by clean_text), then preview the result.
df['description'] = df['description'].apply(clean_text)
print(df.head())

                   id                                        description   C2A
29   1069125156459240  Oh, Ihr Schwaben habt auch ein Volltrottel als...  True
35   1064470026924753  JEDE STIMME ZÄHLT! Die SVP-Initiative könnte a...  True
76   1069142563124166  Schaut hin, ihr jungen Menschen, diese "alte H...  True
166  1066221073416315      Sehrt gut. Dan bitte anfangen mit Mein Kampf.  True
217  1064431840261905  immer weiter so bis zum bitteren ende nicht au...  True


- Secondly, we will tokenize the text, drop punctuation, and replace each token with its lowercased lemma.

In [5]:
import spacy

nlp = spacy.load('de_core_news_md')


def text_lemmatize_tokenize(text):
    """Tokenize ``text`` with the module-level spaCy pipeline ``nlp`` and lemmatize it.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    str
        The lowercased lemmas of all tokens that are neither punctuation nor
        whitespace, joined by single spaces.
    """
    doc = nlp(text)
    # spaCy keeps runs of extra whitespace as standalone tokens (is_space).
    # Such runs appear wherever a URL/mention/hashtag was cut out of the text,
    # so skip them to avoid blank "lemmas" in the joined output.
    tokens = [
        token.lemma_.lower()
        for token in doc
        if not (token.is_punct or token.is_space)
    ]
    return ' '.join(tokens)


# Overwrite the description column with its lemmatized form
# (one spaCy pipeline call per row), then preview the result.
df['description'] = df['description'].apply(text_lemmatize_tokenize)
print(df.head())

                   id                                        description   C2A
29   1069125156459240  oh ihr schwaben haben auch ein volltrottel als...  True
35   1064470026924753  jede stimme zählt der svp-initiative können al...  True
76   1069142563124166  schauen hin ihr jung mensch dieser alt herrenr...  True
166  1066221073416315        sehrt gut dan bitte anfangen mit mein kampf  True
217  1064431840261905  immer weiter so bis zu bitter ende nicht aufgeben  True


- Next, we will add some features. In this case we will add the sentiment polarity of each text, computed with the `textblob-de` library (`TextBlobDE`).

In [6]:
from textblob_de import TextBlobDE


def add_polarity(df):
    """Add a ``polarity`` column with the TextBlobDE sentiment polarity of each description.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``description`` column of text.

    Returns
    -------
    pandas.DataFrame
        The same frame object, which is modified in place, now carrying the
        additional ``polarity`` column.
    """
    def calculate_sentiment_features(text):
        """Return the sentiment polarity TextBlobDE assigns to ``text``."""
        blob = TextBlobDE(text)
        return blob.sentiment.polarity

    # The helper returns one scalar per row, so a plain Series.apply is enough;
    # wrapping each value in a pd.Series only built a throw-away one-column frame.
    df['polarity'] = df['description'].apply(calculate_sentiment_features)

    return df


# Attach the polarity feature column and preview the result.
df = add_polarity(df)
print(df.head())

                   id                                        description  \
29   1069125156459240  oh ihr schwaben haben auch ein volltrottel als...   
35   1064470026924753  jede stimme zählt der svp-initiative können al...   
76   1069142563124166  schauen hin ihr jung mensch dieser alt herrenr...   
166  1066221073416315        sehrt gut dan bitte anfangen mit mein kampf   
217  1064431840261905  immer weiter so bis zu bitter ende nicht aufgeben   

      C2A  polarity  
29   True      1.00  
35   True      0.45  
76   True      0.70  
166  True      1.00  
217  True      0.35  


- After this step, we will compute sentence embeddings with a pretrained Sentence-Transformers model and append them as additional feature columns.

In [7]:
from sentence_transformers import SentenceTransformer


def add_semantic_features(df):
    """Append sentence-embedding columns computed from the ``description`` column.

    Each description is cast to ``str`` and encoded with the
    'distiluse-base-multilingual-cased-v2' SentenceTransformer model. Every
    embedding dimension becomes one column named ``embedding_<i>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``description`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) whose index is reset to
        0..n-1, holding the original columns followed by the embedding columns.
    """
    encoder = SentenceTransformer('distiluse-base-multilingual-cased-v2')
    sentences = df['description'].astype(str).tolist()

    vectors = encoder.encode(sentences, show_progressbar=True)
    column_names = [f'embedding_{i}' for i in range(vectors.shape[1])]
    vectors_df = pd.DataFrame(vectors, columns=column_names)

    # Both sides need the same 0..n-1 index so the column-wise concat aligns row by row.
    base_df = df.reset_index(drop=True)
    return pd.concat([base_df, vectors_df.reset_index(drop=True)], axis=1)


# Append the embedding columns. add_semantic_features also resets the index,
# so from here on the rows are labelled 0..n-1.
df = add_semantic_features(df)
print(df.head())

                 id                                        description   C2A  \
0  1069125156459240  oh ihr schwaben haben auch ein volltrottel als...  True   
1  1064470026924753  jede stimme zählt der svp-initiative können al...  True   
2  1069142563124166  schauen hin ihr jung mensch dieser alt herrenr...  True   
3  1066221073416315        sehrt gut dan bitte anfangen mit mein kampf  True   
4  1064431840261905  immer weiter so bis zu bitter ende nicht aufgeben  True   

   polarity  embedding_0  embedding_1  embedding_2  embedding_3  embedding_4  \
0      1.00     0.025917     0.020020     0.014230    -0.018579    -0.005564   
1      0.45     0.039895     0.040401    -0.051886    -0.019407     0.100345   
2      0.70    -0.019509    -0.003412    -0.009561    -0.003204     0.076088   
3      1.00     0.015723    -0.006647     0.014712    -0.014438     0.002726   
4      0.35    -0.016261    -0.027691     0.032199    -0.013078     0.053044   

   embedding_5  ...  embedding_502  em

Now that the preprocessing steps are finished, we will randomly split our dataset into a training set and a test set. For testing, we randomly hold out 50 True and 50 False samples.

In [8]:
# Hold out 50 rows per class as the test set (drawn without replacement, fixed seed).
c2a_rows = df[df['C2A'] == True]
non_c2a_rows = df[df['C2A'] == False]
sampled_minority = resample(c2a_rows, replace=False, n_samples=50, random_state=42)
sampled_majority = resample(non_c2a_rows, replace=False, n_samples=50, random_state=42)

test_df = pd.concat([sampled_minority, sampled_majority])
# Every row whose index label was not drawn into the test set becomes training data.
train_df = df.drop(index=test_df.index)

Now we can train our model on our training data.

In [9]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import GradientBoostingClassifier

# Features are every column except the identifier, the text itself and the label.
X = train_df.drop(columns=['id', 'description', 'C2A'])
y = train_df['C2A']

model = GradientBoostingClassifier(random_state=42)
# Fit on the full training set; this fitted `model` is the one the test cell calls .predict on.
model.fit(X, y)

# 5-fold cross-validated predictions on the training data, used only for the report below.
# NOTE(review): cross_val_predict is documented to fit clones of the estimator,
# so it should neither use nor alter the fit above - confirm against the scikit-learn docs.
y_pred = cross_val_predict(model, X, y, cv=5)

accuracy = accuracy_score(y, y_pred)
report = classification_report(y, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(report)

Accuracy: 0.7489
Classification Report:
              precision    recall  f1-score   support

       False       0.75      0.75      0.75       902
        True       0.75      0.75      0.75       902

    accuracy                           0.75      1804
   macro avg       0.75      0.75      0.75      1804
weighted avg       0.75      0.75      0.75      1804



Now we will evaluate our model on the held-out test data we sampled before.

In [10]:
# Same feature selection as for training: drop the identifier, the text and the label.
X_test = test_df.drop(columns=['id', 'description', 'C2A'])
y_test = test_df['C2A']

# Predict with the `model` fitted in the training cell.
y_test_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy:.4f}")
print("Test Classification Report:")
print(test_report)


Test Accuracy: 0.8300
Test Classification Report:
              precision    recall  f1-score   support

       False       0.85      0.80      0.82        50
        True       0.81      0.86      0.83        50

    accuracy                           0.83       100
   macro avg       0.83      0.83      0.83       100
weighted avg       0.83      0.83      0.83       100

