In [1]:
import re
import nltk
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
from transformers import pipeline, AutoTokenizer
import pandas as pd

In [None]:
# loading article data
df = pd.read_csv("/Users/reppmazc/Documents/IRONHACK/quests/final_project/cleaned_articles_wo_date.csv")

In [None]:
# test three different models on 100 randomly selected articles
# Ensure stopwords are downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('german'))

# Step 1: Define text preprocessing function
def preprocess_text(text):
    # Remove numbers and punctuation, lowercase, and remove stopwords
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\W', ' ', text)  # Remove punctuation
    text = text.lower()  # Convert to lowercase
    tokens = simple_preprocess(text)  # Tokenize
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return ' '.join(tokens)

# Apply preprocessing to the 'content' column
df['processed_content'] = df['content'].apply(preprocess_text)

# Step 2: Select a random sample of 100 articles
sample_df = df.sample(n=100, random_state=1)

# Step 3: Load tokenizers and define truncation for each model
model_names = {"XLM-RoBERTa-German-sentiment": "ssary/XLM-RoBERTa-German-sentiment",
               "GermanFinBert_SC_Sentiment": "scherrmann/GermanFinBert_SC_Sentiment",
               "twitter-xlm-roberta-base-sentiment-finetunned": "citizenlab/twitter-xlm-roberta-base-sentiment-finetunned"}

# Set up tokenizers with truncation for each model
max_token_length = 512
tokenizers = {name: AutoTokenizer.from_pretrained(model) for name, model in model_names.items()}

# Truncate each text if it exceeds the maximum length for each model
def truncate_text(text, tokenizer):
    tokens = tokenizer.encode(text, truncation=True, max_length=max_token_length)
    return tokenizer.decode(tokens, skip_special_tokens=True)

# Apply truncation for each model's processed content
for model_name, tokenizer in tokenizers.items():
    sample_df[f'processed_content_{model_name}'] = sample_df['processed_content'].apply(lambda x: truncate_text(x, tokenizer))

# Step 4: Load pipelines for each model with max_length and truncation set
pipelines = {
    name: pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, max_length=max_token_length, truncation=True)
    for name, (model, tokenizer) in zip(model_names.keys(), zip(model_names.values(), tokenizers.values()))}

# Apply each model's pipeline to the truncated text and save results in separate columns
for model_name, sentiment_pipeline in pipelines.items():
    sample_df[f'sentiment_{model_name}'] = sample_df[f'processed_content_{model_name}'].apply(
        lambda x: sentiment_pipeline(x)[0]['label'])

# Step 5: Save the sample DataFrame with sentiment results to a new CSV file
sample_df.to_csv("sample_processed_articles_with_multiple_sentiments.csv", index=False)

print("Sentiment analysis with multiple models, including the Twitter model, completed and results saved to 'sample_processed_articles_with_multiple_sentiments.csv'")

# Sentiment analysis by sources and topic

In [9]:
# Load the datasets
articles_df = pd.read_csv('/Users/reppmazc/Documents/IRONHACK/quests/final_project/df_toKenized_topic.csv')
topics_df = pd.read_csv('/Users/reppmazc/Documents/IRONHACK/quests/final_project/df_topics_edited.csv')

# Convert column names to lowercase for consistency
topics_df.columns = map(str.lower, topics_df.columns)
articles_df.columns = map(str.lower, articles_df.columns)

# Merge the dataframes on the 'topic' column using a left join
merged_df = articles_df.merge(topics_df, on='topic', how='left')

# Ensure stopwords are downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('german'))

# Step 1: Identify the top 3 most common topics (excluding -1) and the top 3 sources
top_topics = merged_df[~merged_df['topic'].isin([-1, 2])]['topic'].value_counts().head(16).index
top_sources = merged_df['source'].value_counts().head(18).index

# Step 2: Filter the dataset for these topics and sources
filtered_df = merged_df[(merged_df['topic'].isin(top_topics)) & 
                        (merged_df['source'].isin(top_sources))]

# Optional: Limit rows further if necessary
filtered_df = filtered_df.sample(n=1000, random_state=1)  # Adjust sample size if needed

# Step 3: Define text preprocessing function
def preprocess_text(text):
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\W', ' ', text)  # Remove punctuation
    text = text.lower()  # Convert to lowercase
    tokens = simple_preprocess(text)  # Tokenize
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return ' '.join(tokens)

# Apply preprocessing to the 'content' column
filtered_df['processed_content'] = filtered_df['content'].apply(preprocess_text)

# Step 4: Load tokenizer and pipeline for one model
model_name = "ssary/XLM-RoBERTa-German-sentiment"  # Select one model for testing
max_token_length = 512
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Truncate text for tokenization
def truncate_text(text, tokenizer):
    tokens = tokenizer.encode(text, truncation=True, max_length=max_token_length)
    return tokenizer.decode(tokens, skip_special_tokens=True)

filtered_df['processed_content'] = filtered_df['processed_content'].apply(lambda x: truncate_text(x, tokenizer))

# Load sentiment analysis pipeline
sentiment_pipeline = pipeline("sentiment-analysis", model=model_name, tokenizer=tokenizer, max_length=max_token_length, truncation=True)

# Step 5: Apply the model to get sentiment
filtered_df['sentiment'] = filtered_df['processed_content'].apply(lambda x: sentiment_pipeline(x)[0]['label'])

# Step 6: Save filtered and processed results to CSV
#filtered_df.to_csv(f"filtered_topic_sentiment_test_{model_name.split('/')[0]}.csv", index=False)
#print("Sentiment analysis completed for top topics and sources. Results saved to 'filtered_topic_sentiment_test.csv'")

# Step 7: Analyze sentiment by topic and source
sentiment_by_topic_source = filtered_df.groupby(['topic', 'source'])['sentiment'].value_counts(normalize=True).unstack(fill_value=0)
print("Sentiment distribution by top topics and sources:")
print(sentiment_by_topic_source)

# Optional: Save distribution table to CSV
sentiment_by_topic_source.to_csv(f"filtered_sentiment_distribution_by_topic_source_{model_name.split('/')[0]}.csv")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/reppmazc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Sentiment analysis completed for top topics and sources. Results saved to 'filtered_topic_sentiment_test.csv'
Sentiment distribution by top topics and sources:
sentiment           negative   neutral  positive
topic source                                    
0     bild          0.000000  1.000000  0.000000
      faz           0.000000  1.000000  0.000000
      morgenpost    0.000000  1.000000  0.000000
      stern         0.111111  0.777778  0.111111
      sueddeutsche  0.000000  1.000000  0.000000
...                      ...       ...       ...
15    tagesschau    0.000000  1.000000  0.000000
      welt          1.000000  0.000000  0.000000
      zeit          0.000000  1.000000  0.000000
16    faz           0.000000  1.000000  0.000000
      jungewelt     1.000000  0.000000  0.000000

[159 rows x 3 columns]


In [8]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
from transformers import AutoTokenizer, pipeline

# Load the datasets
articles_df = pd.read_csv('/Users/reppmazc/Documents/IRONHACK/quests/final_project/df_toKenized_topic.csv')
topics_df = pd.read_csv('/Users/reppmazc/Documents/IRONHACK/quests/final_project/df_topics.csv')

# Convert column names to lowercase for consistency
topics_df.columns = map(str.lower, topics_df.columns)
articles_df.columns = map(str.lower, articles_df.columns)

# Merge the dataframes on the 'topic' column using a left join
merged_df = articles_df.merge(topics_df, on='topic', how='left')

merged_df.to_csv('/Users/reppmazc/Documents/IRONHACK/quests/final_project/topics.csv')

# Ensure stopwords are downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('german'))

# Step 1: Identify top topics with 'count' > 200 but <= 1000, and top sources appearing more than 90 times
top_topics = merged_df[(merged_df['count'] > 200) & (merged_df['count'] <= 1000)]['topic'].unique()
top_sources = merged_df['source'].value_counts()[lambda x: x > 90].index

# Step 2: Filter the dataset for these topics and sources
filtered_df = merged_df[(merged_df['topic'].isin(top_topics)) & 
                        (merged_df['source'].isin(top_sources))]

filtered_size = len(filtered_df)
print(filtered_size)
sample_size = min(10000, filtered_size)  # Use 10,000 if possible, or the maximum available

# Sample with the adjusted size
filtered_df = filtered_df.sample(n=sample_size, random_state=1)

# Step 3: Define text preprocessing function
def preprocess_text(text):
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\W', ' ', text)  # Remove punctuation
    text = text.lower()  # Convert to lowercase
    tokens = simple_preprocess(text)  # Tokenize
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return ' '.join(tokens)

# Apply preprocessing to the 'content' column
filtered_df['processed_content'] = filtered_df['content'].apply(preprocess_text)

# Step 4: Load tokenizer and pipeline for one model
model_name = "ssary/XLM-RoBERTa-German-sentiment"  # Select one model for testing
max_token_length = 512
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Truncate text for tokenization
def truncate_text(text, tokenizer):
    tokens = tokenizer.encode(text, truncation=True, max_length=max_token_length)
    return tokenizer.decode(tokens, skip_special_tokens=True)

filtered_df['processed_content'] = filtered_df['processed_content'].apply(lambda x: truncate_text(x, tokenizer))

# Load sentiment analysis pipeline
sentiment_pipeline = pipeline("sentiment-analysis", model=model_name, tokenizer=tokenizer, max_length=max_token_length, truncation=True)

# Step 5: Apply the model to get sentiment
filtered_df['sentiment'] = filtered_df['processed_content'].apply(lambda x: sentiment_pipeline(x)[0]['label'])

# Replace topic numbers with topic names in the final results
filtered_df['topic'] = filtered_df['name']  # Assuming 'name' column in topics_df contains the topic name

# Step 6: Save filtered and processed results to CSV
filtered_df.to_csv(f"filtered_topic_sentiment_test_{model_name.split('/')[0]}.csv", index=False)
print(f"Sentiment analysis completed for top topics and sources. Results saved to 'filtered_topic_sentiment_test_{model_name.split('/')[0]}.csv'")

# Step 7: Analyze sentiment by topic and source
sentiment_by_topic_source = filtered_df.groupby(['topic', 'source'])['sentiment'].value_counts(normalize=True).unstack(fill_value=0)
print("Sentiment distribution by top topics and sources:")
print(sentiment_by_topic_source)

# Optional: Save distribution table to CSV with topic names
sentiment_by_topic_source.to_csv(f"filtered_sentiment_distribution_by_topic_source_{model_name.split('/')[0]}.csv")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/reppmazc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


7907


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Sentiment analysis completed for top topics and sources. Results saved to 'filtered_topic_sentiment_test_ssary.csv'
Sentiment distribution by top topics and sources:
sentiment                                      negative   neutral  positive
topic                            source                                    
0_fc_trainer_fußball_bundesliga  bild          0.266667  0.666667  0.066667
                                 dw            0.142857  0.857143  0.000000
                                 faz           0.125000  0.812500  0.062500
                                 focus         0.000000  1.000000  0.000000
                                 jungewelt     0.000000  1.000000  0.000000
...                                                 ...       ...       ...
9_euro_millionen_milliarden_jahr sueddeutsche  0.250000  0.750000  0.000000
                                 tagesschau    0.000000  1.000000  0.000000
                                 tagesspiegel  0.064516  0.935484  0.00000

# political leaning classification with BERT

In [2]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset

In [3]:
df = pd.read_csv("/Users/reppmazc/Documents/IRONHACK/quests/final_project/cleaned_articles_wo_date.csv")

# Map sources to political spectrum labels
source_to_label = {"taz": "links", "jungewelt": "links", "freitag": "links", "sueddeutsche": "links",
                   "dw": "mitte", "tagesschau": "mitte",
                   "bild": "rechts", "focus": "rechts", "welt": "rechts", "jungefreiheit": "rechts"}

# Apply the mapping to the 'label' column
df['label'] = df['source'].map(source_to_label)

# Drop any rows where label is NaN (in case of unmatched sources)
df = df.dropna(subset=['label'])

# Sample 750 articles per spectrum label to create a balanced dataset
df = df.groupby('label').apply(lambda x: x.sample(n=800, random_state=42)).reset_index(drop=True)

# Fill any NaN in content with an empty string (to avoid errors)
df['content'] = df['content'].fillna("")

# Encode Labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

  df = df.groupby('label').apply(lambda x: x.sample(n=800, random_state=42)).reset_index(drop=True)


In [4]:
# Split into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert DataFrames to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_df[['content', 'label']])
test_dataset = Dataset.from_pandas(test_df[['content', 'label']])

In [5]:
# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
model = BertForSequenceClassification.from_pretrained('dbmdz/bert-base-german-cased', num_labels=len(label_encoder.classes_))

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['content'], padding="max_length", truncation=True, max_length=256)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set the format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1920 [00:00<?, ? examples/s]

Map:   0%|          | 0/480 [00:00<?, ? examples/s]

In [6]:
# Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10)

# Initialize Trainer
trainer = Trainer(model=model,
                  args=training_args,
                  train_dataset=train_dataset,
                  eval_dataset=test_dataset)



In [7]:
# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()

# Print the evaluation results
print("Evaluation Results:", results)

Epoch,Training Loss,Validation Loss
1,0.2806,0.294454
2,0.3022,0.32523
3,0.0015,0.302689


Evaluation Results: {'eval_loss': 0.3026885390281677, 'eval_runtime': 36.7475, 'eval_samples_per_second': 13.062, 'eval_steps_per_second': 1.633, 'epoch': 3.0}


# political leaning classification with BERT (2 epochs only)

In [3]:
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import Trainer, TrainingArguments, BertTokenizer, BertForSequenceClassification
from datasets import Dataset

df = pd.read_csv("/Users/reppmazc/Documents/IRONHACK/quests/final_project/cleaned_articles_wo_date.csv")

# Map sources to political spectrum labels
source_to_label = {"taz": "links", "jungewelt": "links", "freitag": "links", "sueddeutsche": "links",
                   "dw": "mitte", "tagesschau": "mitte",
                   "bild": "rechts", "focus": "rechts", "welt": "rechts", "jungefreiheit": "rechts"}

# Apply the mapping to the 'label' column
df['label'] = df['source'].map(source_to_label)

# Drop any rows where label is NaN (in case of unmatched sources)
df = df.dropna(subset=['label'])

# Sample 750 articles per spectrum label to create a balanced dataset
df = df.groupby('label').apply(lambda x: x.sample(n=800, random_state=42)).reset_index(drop=True)

# Fill any NaN in content with an empty string (to avoid errors)
df['content'] = df['content'].fillna("")

# Encode Labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# Split into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert DataFrames to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_df[['content', 'label']])
test_dataset = Dataset.from_pandas(test_df[['content', 'label']])

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
model = BertForSequenceClassification.from_pretrained('dbmdz/bert-base-german-cased', num_labels=len(label_encoder.classes_))

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['content'], padding="max_length", truncation=True, max_length=256)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set the format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Define the compute_metrics function for accuracy and F1-score
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')  # 'weighted' for multi-class
    return {"accuracy": acc, "f1": f1}

# Define Training Arguments with only 2 epochs
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,  # Set to 2 epochs
    logging_dir='./logs',
    logging_steps=10)

# Initialize Trainer with the compute_metrics function
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics)

# Train the model (will stop at 2 epochs)
trainer.train()

# Evaluate the model on the test set
results = trainer.evaluate()

# Print the evaluation results, including accuracy and F1-score
print("Evaluation Results:", results)

# Save the trained model and tokenizer
model_save_path = "./saved_model"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

# Save the label encoder (for decoding predictions later)
import joblib
joblib.dump(label_encoder, f"{model_save_path}/label_encoder.joblib")

print(f"Model, tokenizer, and label encoder saved to {model_save_path}")

  df = df.groupby('label').apply(lambda x: x.sample(n=800, random_state=42)).reset_index(drop=True)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1920 [00:00<?, ? examples/s]

Map:   0%|          | 0/480 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1814,0.216879,0.920833,0.920894
2,0.2243,0.248046,0.933333,0.933016


Evaluation Results: {'eval_loss': 0.2480456829071045, 'eval_accuracy': 0.9333333333333333, 'eval_f1': 0.9330157533797034, 'eval_runtime': 33.7792, 'eval_samples_per_second': 14.21, 'eval_steps_per_second': 1.776, 'epoch': 2.0}
Model, tokenizer, and label encoder saved to ./saved_model


check predicted vs true label

apply model to new data without the sources used for initial political leaning labels

In [14]:
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import Trainer, BertTokenizer, BertForSequenceClassification
import joblib

# Load the new data
file_path = "/Users/reppmazc/Documents/IRONHACK/quests/final_project/cleaned_articles_wo_date.csv"
df = pd.read_csv(file_path)

# Define sources to exclude
excluded_sources = ["taz", "jungewelt", "freitag", "sueddeutsche", "dw", "tagesschau", "bild", "focus", "welt", "jungefreiheit"]

# Filter out rows with excluded sources
new_data_df = df[~df['source'].isin(excluded_sources)]

print("Filtered Data Shape:", new_data_df.shape)
print("Filtered Data Preview:", new_data_df.head())

# Load the saved model, tokenizer, and label encoder
model_save_path = "./saved_model"
model = BertForSequenceClassification.from_pretrained(model_save_path)
tokenizer = BertTokenizer.from_pretrained(model_save_path)
label_encoder = joblib.load(f"{model_save_path}/label_encoder.joblib")

# Tokenize new data
new_dataset = Dataset.from_pandas(new_data_df[['content']])

def tokenize_new_data(examples):
    return tokenizer(examples['content'], padding="max_length", truncation=True, max_length=256)

new_dataset = new_dataset.map(tokenize_new_data, batched=True)
new_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Initialize Trainer with the loaded model
trainer = Trainer(model=model)

# Use the trainer to predict
predictions = trainer.predict(new_dataset)

# Extract predicted labels
predicted_indices = np.argmax(predictions.predictions, axis=1)
predicted_labels = label_encoder.inverse_transform(predicted_indices)

# Add predictions to the DataFrame
new_data_df['predicted_label'] = predicted_labels

# Calculate label percentages per source
label_distribution = (
    new_data_df.groupby('source')['predicted_label']
    .value_counts(normalize=True)
    .rename("percentage")
    .reset_index()
)

# Convert percentages to readable format
label_distribution['percentage'] = label_distribution['percentage'] * 100

# Display the distribution results
print(label_distribution)


Filtered Data Shape: (18784, 5)
Filtered Data Preview:     Unnamed: 0                                                url  \
2            2  https://www.faz.net/aktuell/politik/krieg-in-n...   
3            3  https://www.tagesspiegel.de/berlin/bezirke/sch...   
6            8  https://www.stern.de/gesellschaft/regional/sac...   
7            9  https://www.faz.net/aktuell/politik/ausland/us...   
10          12  https://www.stern.de/news/habeck-erwartet-haus...   

                                              content  datetime        source  
2   Krieg in Nahost : Israels heikle Optionen Ein ...       NaN           faz  
3    Carsten Berger Das Angebot kam plötzlich: 20 ...       NaN  tagesspiegel  
6    Link Der deutsche Meister SC Magdeburg bleibt...       NaN         stern  
7   Wahlkampf in den USA : Nichts läuft nach Plan ...       NaN           faz  
10   Link Vizekanzler Robert Habeck (Grüne) erwart...       NaN         stern  


Map:   0%|          | 0/18784 [00:00<?, ? examples/s]

         source predicted_label  percentage
0    abendblatt           mitte  100.000000
1      brigitte           links  100.000000
2          chip          rechts  100.000000
3   derstandard           links   50.000000
4   derstandard          rechts   50.000000
..          ...             ...         ...
64  volksfreund          rechts   63.636364
65  volksfreund           links   36.363636
66         zeit           links   47.679255
67         zeit          rechts   37.381467
68         zeit           mitte   14.939278

[69 rows x 3 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data_df['predicted_label'] = predicted_labels


In [22]:
# Calculate the majority label for each source
majority_labels = (
    new_data_df.groupby('source')['predicted_label']
    .agg(lambda x: x.value_counts().idxmax())
    .reset_index()
    .rename(columns={'predicted_label': 'majority_label'})
)

# Merge the majority label with the original label distribution
label_distribution = label_distribution.merge(majority_labels, on='source')

# Save the final result with majority label
label_distribution.to_csv('/Users/reppmazc/Documents/IRONHACK/quests/final_project/pol_leaning_classification_dist_with_majority.csv', index=False)


# political leaning classification with BERT (2 epochs only) - changed initial lables

In [28]:
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import Trainer, TrainingArguments, BertTokenizer, BertForSequenceClassification
from datasets import Dataset

df = pd.read_csv("/Users/reppmazc/Documents/IRONHACK/quests/final_project/cleaned_articles_wo_date.csv")

# Map sources to political spectrum labels
source_to_label = {"spiegel": "links", "jungewelt": "links", "freitag": "links", "sueddeutsche": "links",
                   "dw": "mitte", "tagesschau": "mitte",
                   "bild": "rechts", "focus": "rechts", "welt": "rechts", "jungefreiheit": "rechts"}

# Apply the mapping to the 'label' column
df['label'] = df['source'].map(source_to_label)

# Drop any rows where label is NaN (in case of unmatched sources)
df = df.dropna(subset=['label'])

# Sample 750 articles per spectrum label to create a balanced dataset
df = df.groupby('label').apply(lambda x: x.sample(n=800, random_state=42)).reset_index(drop=True)

# Fill any NaN in content with an empty string (to avoid errors)
df['content'] = df['content'].fillna("")

# Encode Labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# Split into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert DataFrames to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_df[['content', 'label']])
test_dataset = Dataset.from_pandas(test_df[['content', 'label']])

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
model = BertForSequenceClassification.from_pretrained('dbmdz/bert-base-german-cased', num_labels=len(label_encoder.classes_))

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['content'], padding="max_length", truncation=True, max_length=256)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set the format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Define the compute_metrics function for accuracy and F1-score
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')  # 'weighted' for multi-class
    return {"accuracy": acc, "f1": f1}

# Define Training Arguments with only 2 epochs
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,  # Set to 2 epochs
    logging_dir='./logs',
    logging_steps=10)

# Initialize Trainer with the compute_metrics function
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics)

# Train the model (will stop at 2 epochs)
trainer.train()

# Evaluate the model on the test set
results = trainer.evaluate()

# Print the evaluation results, including accuracy and F1-score
print("Evaluation Results:", results)

# Save the trained model and tokenizer
model_save_path = "./saved_model_changed_lables"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

# Save the label encoder (for decoding predictions later)
import joblib
joblib.dump(label_encoder, f"{model_save_path}/label_encoder.joblib")

print(f"Model, tokenizer, and label encoder saved to {model_save_path}")

  df = df.groupby('label').apply(lambda x: x.sample(n=800, random_state=42)).reset_index(drop=True)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1920 [00:00<?, ? examples/s]

Map:   0%|          | 0/480 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3292,0.276519,0.895833,0.895979
2,0.0782,0.287193,0.920833,0.920866


Evaluation Results: {'eval_loss': 0.28719285130500793, 'eval_accuracy': 0.9208333333333333, 'eval_f1': 0.9208658439944357, 'eval_runtime': 40.414, 'eval_samples_per_second': 11.877, 'eval_steps_per_second': 1.485, 'epoch': 2.0}
Model, tokenizer, and label encoder saved to ./saved_model_changed_lables


In [30]:
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import Trainer, BertTokenizer, BertForSequenceClassification
import joblib

# Load the new data
file_path = "/Users/reppmazc/Documents/IRONHACK/quests/final_project/cleaned_articles_wo_date.csv"
df = pd.read_csv(file_path)

# Define sources to exclude
excluded_sources = ["spiegel", "jungewelt", "freitag", "sueddeutsche", "dw", "tagesschau", "bild", "focus", "welt", "jungefreiheit"]

# Filter out rows with excluded sources
new_data_df = df[~df['source'].isin(excluded_sources)]

print("Filtered Data Shape:", new_data_df.shape)
print("Filtered Data Preview:", new_data_df.head())

# Load the saved model, tokenizer, and label encoder
model_save_path = "./saved_model_changed_lables"
model = BertForSequenceClassification.from_pretrained(model_save_path)
tokenizer = BertTokenizer.from_pretrained(model_save_path)
label_encoder = joblib.load(f"{model_save_path}/label_encoder.joblib")

# Tokenize new data
new_dataset = Dataset.from_pandas(new_data_df[['content']])

def tokenize_new_data(examples):
    return tokenizer(examples['content'], padding="max_length", truncation=True, max_length=256)

new_dataset = new_dataset.map(tokenize_new_data, batched=True)
new_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Initialize Trainer with the loaded model
trainer = Trainer(model=model)

# Use the trainer to predict
predictions = trainer.predict(new_dataset)

# Extract predicted labels
predicted_indices = np.argmax(predictions.predictions, axis=1)
predicted_labels = label_encoder.inverse_transform(predicted_indices)

# Add predictions to the DataFrame
new_data_df['predicted_label'] = predicted_labels

# Calculate label percentages per source
label_distribution = (
    new_data_df.groupby('source')['predicted_label']
    .value_counts(normalize=True)
    .rename("percentage")
    .reset_index()
)

# Convert percentages to readable format
label_distribution['percentage'] = label_distribution['percentage'] * 100

# Display the distribution results
print(label_distribution)


Filtered Data Shape: (18681, 5)
Filtered Data Preview:     Unnamed: 0                                                url  \
2            2  https://www.faz.net/aktuell/politik/krieg-in-n...   
3            3  https://www.tagesspiegel.de/berlin/bezirke/sch...   
6            8  https://www.stern.de/gesellschaft/regional/sac...   
7            9  https://www.faz.net/aktuell/politik/ausland/us...   
10          12  https://www.stern.de/news/habeck-erwartet-haus...   

                                              content  datetime        source  
2   Krieg in Nahost : Israels heikle Optionen Ein ...       NaN           faz  
3    Carsten Berger Das Angebot kam plötzlich: 20 ...       NaN  tagesspiegel  
6    Link Der deutsche Meister SC Magdeburg bleibt...       NaN         stern  
7   Wahlkampf in den USA : Nichts läuft nach Plan ...       NaN           faz  
10   Link Vizekanzler Robert Habeck (Grüne) erwart...       NaN         stern  


Map:   0%|          | 0/18681 [00:00<?, ? examples/s]

         source predicted_label  percentage
0    abendblatt           mitte  100.000000
1      brigitte           links  100.000000
2          chip           links  100.000000
3   derstandard           links   50.000000
4   derstandard          rechts   50.000000
..          ...             ...         ...
60  volksfreund          rechts   45.454545
61  volksfreund           mitte    1.010101
62         zeit           links   52.720013
63         zeit          rechts   37.215106
64         zeit           mitte   10.064881

[65 rows x 3 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data_df['predicted_label'] = predicted_labels


In [32]:
# Calculate the majority label for each source
majority_labels = (
    new_data_df.groupby('source')['predicted_label']
    .agg(lambda x: x.value_counts().idxmax())
    .reset_index()
    .rename(columns={'predicted_label': 'majority_label'})
)

# Merge the majority label with the original label distribution
label_distribution = label_distribution.merge(majority_labels, on='source')

# Save the final result with majority label
label_distribution.to_csv('/Users/reppmazc/Documents/IRONHACK/quests/final_project/pol_leaning_classification_dist_with_majority_new.csv', index=False)


# coding leaning manually

In [35]:
# generate 'pol_leaning' column based on 'source' column, accoroding to 'source_to_label' dictionary
import pandas as pd

# Load the data
df = pd.read_csv('/Users/reppmazc/Documents/IRONHACK/quests/final_project/cleaned_articles_wo_date.csv')

# Define the source to political leaning mapping
source_to_label = {
    "spiegel": "mitte_links",
    "taz": "links",
    "jungewelt": "links",
    "freitag": "links",
    "zeit": "mitte_links",
    "tagesspiegel": "mitte_links",
    "dw": "mitte",
    "tagesschau": "mitte",
    "stern": "mitte_rechts",
    "focus": "mitte_rechts",
    "welt": "mitte_rechts",
    "jungefreiheit": "rechts",
    "sueddeutsche": "mitte_links",
    "faz": "mitte_rechts",
    "morgenpost": "mitte_rechts",
    "bild": "rechts",
    "rbb24": "mitte",
    "express": "rechts"
}

# Create 'pol_leaning' column based on 'source' column using the dictionary
df['pol_leaning'] = df['source'].map(source_to_label)

# Save the updated DataFrame to a new CSV
df.to_csv('/Users/reppmazc/Documents/IRONHACK/quests/final_project/cleaned_articles_wo_date_leaning.csv', index=False)


# semtiment analysis on all data (only roberta)

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from transformers import AutoTokenizer, pipeline
from gensim.utils import simple_preprocess

# Ensure stopwords are downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('german'))

# Step 1: Define text preprocessing function
def preprocess_text(text):
    # Remove numbers and punctuation, lowercase, and remove stopwords
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\W', ' ', text)  # Remove punctuation
    text = text.lower()  # Convert to lowercase
    tokens = simple_preprocess(text)  # Tokenize
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return ' '.join(tokens)

# loading article data
df = pd.read_csv("/Users/reppmazc/Documents/IRONHACK/quests/final_project/cleaned_articles_wo_date.csv")

# Apply preprocessing to the 'content' column (Consider using batch processing if memory is an issue)
df['processed_content'] = df['content'].apply(preprocess_text)

# Step 2: Load the tokenizer and sentiment analysis pipeline for the selected model
model_name = "ssary/XLM-RoBERTa-German-sentiment"
max_token_length = 512
tokenizer = AutoTokenizer.from_pretrained(model_name)
sentiment_pipeline = pipeline("sentiment-analysis", model=model_name, tokenizer=tokenizer, max_length=max_token_length, truncation=True)

# Truncate text function
def truncate_text(text):
    tokens = tokenizer.encode(text, truncation=True, max_length=max_token_length)
    return tokenizer.decode(tokens, skip_special_tokens=True)

# Apply truncation to the processed content
df['processed_content_truncated'] = df['processed_content'].apply(truncate_text)

# Step 3: Apply the sentiment pipeline to each row in a memory-efficient way
# Process data in chunks if the dataset is too large for your laptop's memory
batch_size = 50  # You can adjust this based on your system's memory capacity
sentiment_results = []

for start in range(0, len(df), batch_size):
    batch_texts = df['processed_content_truncated'][start:start+batch_size].tolist()
    batch_sentiments = sentiment_pipeline(batch_texts)
    sentiment_results.extend([result['label'] for result in batch_sentiments])

# Save sentiment results to the DataFrame
df['sentiment_XLM_RoBERTa'] = sentiment_results

# Step 4: Save the entire DataFrame with sentiment results to a new CSV file
df.to_csv("all_processed_articles_with_XLM_RoBERTa_sentiment.csv", index=False)

print("Sentiment analysis with XLM-RoBERTa model completed and results saved to 'all_processed_articles_with_XLM_RoBERTa_sentiment.csv'")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/reppmazc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Sentiment analysis with XLM-RoBERTa model completed and results saved to 'all_processed_articles_with_XLM_RoBERTa_sentiment.csv'
