In [None]:
import pandas as pd
import xml.etree.ElementTree as ET
from google.colab import drive

# Make the Drive folder holding the corpus and the QID table available.
drive.mount('/content/drive')

# Input files on Google Drive.
xml_file_path = "/content/drive/My Drive/German Pol Speeches/Bundesregierung.xml"
qids_file_path = "/content/drive/My Drive/German Pol Speeches/QIDs.csv"

# Speaker table: semicolon-delimited, decoded as ISO-8859-1.
qids = pd.read_csv(qids_file_path, encoding="ISO-8859-1", delimiter=";")

# Parse the speech corpus.
tree = ET.parse(xml_file_path)
root = tree.getroot()

# Build one record per <text> element that is a direct child of the root.
# Attributes missing on an element fall back to an empty string.
xml_attributes = ["person", "titel", "datum", "ort", "untertitel", "url", "anrede"]
data = []
for text_node in root.findall("text"):
    record = [text_node.get(attribute, "") for attribute in xml_attributes]

    # The speech body lives in the <rohtext> child; "" when that child is absent.
    rohtext_node = text_node.find("rohtext")
    record.append("" if rohtext_node is None else rohtext_node.text)

    data.append(record)

column_names = ["Person", "Titel", "Datum", "Ort", "Untertitel", "URL", "Anrede", "Rohtext"]
df = pd.DataFrame(data, columns=column_names)

# Normalise speaker names on both sides of the join: trim the ends and
# collapse every whitespace run to a single space.
for frame, name_column in ((df, "Person"), (qids, "person")):
    trimmed = frame[name_column].str.strip()
    frame[name_column] = trimmed.str.replace(r"\s+", " ", regex=True)

# Outer join: speeches without a QID row and QID rows without a speech are both kept.
merged_df = df.merge(qids, left_on="Person", right_on="person", how="outer")

# Write the merged table next to the inputs.
output_file = "/content/drive/My Drive/German Pol Speeches/Merged_Speeches_QIDs.csv"
merged_df.to_csv(output_file, index=False)

print(f"✅ Merged dataset saved to Google Drive: {output_file}")


In [None]:
# Preview the merged speech/QID table (first five rows).
merged_df.head(n=5)


In [None]:
import pandas as pd
import re
# Keep only rows that carry a value in 'sex', working on an independent copy.
merged_df = merged_df.dropna(subset=["sex"]).copy()

# Store the sex indicator as an integer column.
merged_df["sex"] = merged_df["sex"].astype(int)


# Function to split text into sentences
def split_text(text):
    """Split ``text`` into sentences.

    A sentence boundary is any run of whitespace that directly follows
    ``.``, ``!`` or ``?``. Leading and trailing whitespace of the whole text
    is removed before splitting.

    Args:
        text: The raw speech text. Anything that is not a ``str``
            (e.g. ``None`` or NaN) is treated as "no text".

    Returns:
        A list of sentence strings. The list is empty for non-string input
        and for empty or whitespace-only text, so callers never receive an
        empty-string "sentence".
    """
    if not isinstance(text, str):
        return []

    stripped = text.strip()
    if not stripped:
        # re.split would return [""] here, which becomes an empty sentence row.
        return []

    return re.split(r'(?<=[.!?])\s+', stripped)

# German vocabulary used to flag sentences that refer to women.
# Matching below is a case-sensitive substring test, so compounds that contain
# a keyword are flagged too.
# NOTE(review): substring matching also flags unrelated words such as
# "Göttingen" (contains "Göttin") — confirm this is acceptable.
women_keywords = [
    "Frau", "Frauen",
    "Mädchen", "Mädel",
    "Dame", "Damen",
    "Weib", "Weiber",
    "Mutter", "Mütter",
    "Schwester", "Schwestern",
    "Tochter", "Töchter",
    "Ehefrau", "Ehefrauen",
    "Gattin", "Göttin",
    "Weiblichkeit", "weiblich",
    "Mütterlichkeit", "Mädchenhaft",
    "Frauenrecht", "Frauenrechte", "Frauenbewegung",
    "Feministin", "Feminismus",
    "Geschlechtergerechtigkeit"
]

# Speech-level fields copied unchanged onto every sentence row
# ('Rohtext' is deliberately left out to keep the output small).
speech_fields = ["sex", "party", "Datum", "Ort", "Titel", "Untertitel", "URL", "Anrede"]

# One dict per sentence; turned into a DataFrame once at the end.
sentence_data = []

for idx, row in merged_df.iterrows():
    # The row label of the speech doubles as its identifier.
    speech_id = f"{idx}"

    for position, sentence in enumerate(split_text(row["Rohtext"]), start=1):
        # Flag the sentence as soon as one keyword occurs in it.
        mentions_women = False
        for keyword in women_keywords:
            if keyword in sentence:
                mentions_women = True
                break

        sentence_data.append({
            "speech_id": speech_id,
            "sentence_id": f"{speech_id}_{position}",  # 1-based position within the speech
            "sentence": sentence.strip(),
            **{field: row[field] for field in speech_fields},
            "women": mentions_women,
        })

sentence_df = pd.DataFrame(sentence_data)

# Store the sentence-level table on Google Drive.
output_file = "/content/drive/My Drive/German Pol Speeches/Sentence_Level_Dataset.csv"
sentence_df.to_csv(output_file, index=False)

print(f"✅ Process complete! Sentence-level dataset saved to: {output_file}")


In [None]:
!pip install transformers torch


In [None]:
!pip install transformers datasets torch scikit-learn


In [None]:
import torch
import pandas as pd
import json
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
from google.colab import drive

import os

# Switch off Weights & Biases reporting via its environment flag.
os.environ["WANDB_DISABLED"] = "true"

# Make Google Drive available under /content/drive.
drive.mount('/content/drive')

# Labelled sentences, tuned hyperparameters, and the folder the model is saved to.
dataset_path = "/content/drive/MyDrive/complete_dataset_with_claims.csv"
best_params_path = "/content/drive/MyDrive/best_params.json"
save_model_path = "/content/drive/MyDrive/final_model"

df = pd.read_csv(dataset_path)

with open(best_params_path, "r") as params_file:
    best_params = json.load(params_file)

# Column holding the sentence text and column holding the 0/1 label.
text_column = "sentences"
label_column = "claim"

# Stop early when either column is absent.
if not {text_column, label_column}.issubset(df.columns):
    raise ValueError(f"Dataset must contain '{text_column}' (text) and '{label_column}' (label). Found columns: {df.columns}")

# Labels are stored as integers.
df[label_column] = df[label_column].astype(int)

# Hold out 10% of the rows for evaluation; the fixed seed makes the split repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df[text_column].tolist(),
    df[label_column].tolist(),
    test_size=0.1,
    random_state=42,
)

# Tokenizer of the multilingual cased BERT checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")


def tokenize_function(examples):
    """Tokenize the text column of a batch.

    Every example is truncated or padded to exactly 512 tokens.
    """
    texts = examples[text_column]
    return tokenizer(texts, truncation=True, padding="max_length", max_length=512)


# Wrap the Python lists as Hugging Face datasets, then tokenize them in batches.
train_dataset = Dataset.from_dict({text_column: train_texts, "label": train_labels})
train_dataset = train_dataset.map(tokenize_function, batched=True)

val_dataset = Dataset.from_dict({text_column: val_texts, "label": val_labels})
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Pre-trained encoder with a two-class classification head.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=2,
)

# Newer transformers releases call the evaluation schedule `eval_strategy`;
# older ones call it `evaluation_strategy`. Use whichever keyword the
# installed version accepts so the cell runs on both.
import inspect

_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

# Training configuration; learning rate, batch size, epochs and weight decay
# come from the tuned hyperparameters loaded into `best_params`.
training_args = TrainingArguments(
    output_dir="/content/output",
    save_strategy="epoch",  # checkpoint after every epoch ...
    learning_rate=best_params["learning_rate"],
    per_device_train_batch_size=best_params["batch_size"],
    per_device_eval_batch_size=best_params["batch_size"],
    num_train_epochs=best_params["epochs"],
    weight_decay=best_params["weight_decay"],
    logging_dir="/content/logs",
    logging_steps=100,
    save_total_limit=2,  # ... but keep only the two most recent checkpoints
    report_to="none",  # no experiment-tracker integrations (wandb etc.)
    **{_eval_strategy_key: "epoch"},  # evaluate after every epoch
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Fine-tune the model.
trainer.train()

# Save the model & tokenizer to Google Drive so later cells can reload them.
trainer.save_model(save_model_path)
tokenizer.save_pretrained(save_model_path)

print(f" Training complete! Model saved to: {save_model_path}")


In [None]:
# Sum of the 'claim' column of the training table (labels are 0/1 integers).
claim_labels = df["claim"]
num_claims = claim_labels.sum()

print(f"Number of claims found: {num_claims}")


In [None]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from google.colab import drive
from IPython.display import display

# Make Google Drive available under /content/drive.
drive.mount('/content/drive')

# Fine-tuned model folder, sentence table to classify, and the result file.
model_path = "/content/drive/My Drive/final_model"
sentence_file = "/content/drive/My Drive/German Pol Speeches/Sentence_Level_Dataset.csv"
output_file = "/content/drive/My Drive/German Pol Speeches/Classified_Sentences.csv"

sentence_df = pd.read_csv(sentence_file)

# The keyword flag decides which sentences get classified; fail early without it.
has_women_flag = "women" in sentence_df.columns
if not has_women_flag:
    raise ValueError(f"Dataset must contain a 'women' column. Found columns: {sentence_df.columns}")

# Reload tokenizer and classifier from the saved folder and switch to inference mode.
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
model.eval()

# Only sentences flagged as referring to women are classified.
women_mask = sentence_df["women"] == True
women_sentences = sentence_df.loc[women_mask].copy()

# Function to classify sentences
def classify_claims(sentences, batch_size=32):
    """Predict a class index for every sentence with the loaded classifier.

    Uses the module-level ``tokenizer`` and ``model``. Sentences are encoded
    and scored in batches, and the encoded tensors are moved to the device
    the model's parameters live on, so the function works unchanged whether
    the model sits on CPU or GPU. The predicted class is the arg-max of the
    logits (softmax is monotonic, so it is not needed for the arg-max).

    Args:
        sentences: Iterable of sentence strings.
        batch_size: Number of sentences per forward pass; must be >= 1.

    Returns:
        A list of integer class indices, one per input sentence, in input
        order. Empty input yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    sentences = list(sentences)

    # Follow the model's device; fall back to CPU for a parameter-free model.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    predictions = []
    with torch.no_grad():
        for start in range(0, len(sentences), batch_size):
            batch = sentences[start:start + batch_size]
            # padding=True pads each batch only to its own longest sentence.
            encoded = tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=512)
            encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
            logits = model(**encoded).logits
            predictions.extend(torch.argmax(logits, dim=-1).tolist())
    return predictions

# Classify the flagged sentences and store the predictions on the filtered copy.
sentence_texts = women_sentences["sentence"].tolist()
women_sentences["claim"] = classify_claims(sentence_texts)

# Copy the predictions back onto the full table; rows are matched by index label.
flagged_rows = sentence_df["women"] == True
sentence_df.loc[flagged_rows, "claim"] = women_sentences["claim"]

# Sentences that were never classified count as "no claim" (0).
sentence_df["claim"] = sentence_df["claim"].fillna(0).astype(int)

# Report how many sentences were labelled 1.
num_claims = sentence_df["claim"].sum()
print(f"Total 'TRUE' values in claim: {num_claims}")

# Write the classified table to Google Drive.
sentence_df.to_csv(output_file, index=False)

print(f"Classified dataset saved to: {output_file}")

# Show the first rows of the result.
display(sentence_df.head())


In [None]:
import pandas as pd
from google.colab import drive

# Make Google Drive available under /content/drive.
drive.mount('/content/drive')

# Sentence table, classified sentence table, and the speech-level output file.
sentence_file = "/content/drive/My Drive/German Pol Speeches/Sentence_Level_Dataset.csv"
classified_file = "/content/drive/My Drive/German Pol Speeches/Classified_Sentences.csv"
output_summary_file = "/content/drive/My Drive/German Pol Speeches/Speech_Level_Summary.csv"

# Load datasets with low_memory=False to avoid mixed-dtype warnings
df_sentences = pd.read_csv(sentence_file, low_memory=False)
df_classified = pd.read_csv(classified_file, low_memory=False)

# Fail early when a required column is missing.
if "speech_id" not in df_sentences.columns or "sentence" not in df_sentences.columns:
    raise ValueError(f"Sentence dataset must contain 'speech_id' and 'sentence' columns. Found: {df_sentences.columns}")

if "speech_id" not in df_classified.columns or "sentence_id" not in df_classified.columns or "claim" not in df_classified.columns:
    raise ValueError(f"Classified dataset must contain 'speech_id', 'sentence_id', and 'claim' columns. Found: {df_classified.columns}")

# Attach the claim label to every sentence; sentences without a match get NaN.
df_merged = df_sentences.merge(df_classified[["speech_id", "sentence_id", "claim"]], on=["speech_id", "sentence_id"], how="left")

# Convert 'claim' to an integer; missing or non-numeric values count as 0.
df_merged["claim"] = pd.to_numeric(df_merged["claim"], errors="coerce").fillna(0).astype(int)

# Every column except the sentence-level ones is treated as speech metadata.
metadata_columns = [col for col in df_merged.columns if col not in ["sentence_id", "sentence", "claim"]]

# Collapse to one row per speech.
speech_summary = df_merged.groupby("speech_id", as_index=False).agg({
    **{col: "first" for col in metadata_columns},  # first non-missing value per speech
    # Empty sentences are read back from CSV as NaN; a plain " ".join would
    # raise TypeError on them, so they are skipped and the rest joined as text.
    "sentence": lambda sentences: " ".join(sentences.dropna().astype(str)),
    "claim": "sum"  # number of claim sentences in the speech
})

# Save summarized dataset
speech_summary.to_csv(output_summary_file, index=False)

print(f"Speech-level summary saved to: {output_summary_file}")

# Display first few rows
from IPython.display import display
display(speech_summary.head())


In [None]:
# Sum of the per-speech claim counts over the whole summary table.
claim_counts = speech_summary["claim"]
total_claims = claim_counts.sum()

print(f"Total sum of claim for the whole dataset: {total_claims}")


In [None]:
# Binary indicator: 1 when the speech contains at least one claim, else 0.
contains_claim = speech_summary["claim"].gt(0)
speech_summary["claimbinary"] = contains_claim.astype(int)

# Total number of claims across all speeches.
total_claims = speech_summary["claim"].sum()
print(f"Total sum of claim for the whole dataset: {total_claims}")

# Overwrite the speech-level summary so it includes the new column.
output_summary_file = "/content/drive/My Drive/German Pol Speeches/Speech_Level_Summary.csv"
speech_summary.to_csv(output_summary_file, index=False)

print(f"Updated speech-level summary saved to: {output_summary_file}")

# Show the first rows of the updated table.
from IPython.display import display
display(speech_summary.head())


In [None]:
# claimbinary is 0/1, so its sum is the number of speeches with at least one claim.
claim_flags = speech_summary["claimbinary"]
num_speeches_with_claims = claim_flags.sum()

print(f"Number of speeches where claimbinary is 1: {num_speeches_with_claims}")



In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import drive

# Make Google Drive available under /content/drive.
drive.mount('/content/drive')

# Read back the speech-level summary from Google Drive.
output_summary_file = "/content/drive/My Drive/German Pol Speeches/Speech_Level_Summary.csv"
speech_summary = pd.read_csv(output_summary_file)

# Parse the speech date (unparseable values become NaT) and derive the year.
# NOTE(review): no explicit date format is passed — confirm pandas infers the intended one.
parsed_dates = pd.to_datetime(speech_summary["Datum"], errors="coerce")
speech_summary["Datum"] = parsed_dates
speech_summary["year"] = speech_summary["Datum"].dt.year

# Number of claim-making speeches per (year, sex) pair.
has_claim = speech_summary["claimbinary"] == 1
yearly_claims = (
    speech_summary.loc[has_claim]
    .groupby(["year", "sex"])
    .size()
    .reset_index(name="count")
)

# Line colour and legend label per value of 'sex'.
colors = {0: "#003399", 1: "#FFCC00"}  # Men (blue), Women (yellow)
labels = {0: "Men", 1: "Women"}

# One line per sex value, in order of first appearance.
plt.figure(figsize=(10, 6))

for sex_value in yearly_claims["sex"].unique():
    rows_for_sex = yearly_claims["sex"] == sex_value
    plt.plot(
        yearly_claims.loc[rows_for_sex, "year"],
        yearly_claims.loc[rows_for_sex, "count"],
        marker="o",
        label=labels[sex_value],
        color=colors[sex_value],
    )

plt.xlabel("Year")
plt.ylabel("Number of Speeches with Claims")
plt.title("Speeches with Claims Over Time by Sex")
plt.legend(title="Speaker Sex")
plt.show()


# Party harmonisation, topic merge and cross-institution comparison

In [None]:
import pandas as pd
from google.colab import drive

# Make Google Drive available under /content/drive.
drive.mount('/content/drive')

# Read the speech-level summary from Google Drive.
output_summary_file = "/content/drive/My Drive/German Pol Speeches/Speech_Level_Summary.csv"
speech_summary = pd.read_csv(output_summary_file)

# Lower-cased German party label -> European party label.
party_replacements = {"spd": "PES", "cdu": "EPP", "grüne": "EGP", "fdp": "ALDE"}

# Look up the lower-cased label; labels without a mapping keep their original value.
original_party = speech_summary["party"]
european_family = original_party.str.lower().map(party_replacements)
speech_summary["party"] = european_family.fillna(original_party)

# Write the recoded table to a separate file.
updated_summary_file = "/content/drive/My Drive/German Pol Speeches/Speech_Level_Summary_Updated.csv"
speech_summary.to_csv(updated_summary_file, index=False)

# Show the first rows.
print(speech_summary.head())

print(f"Updated dataset saved to: {updated_summary_file}")


In [None]:
# Parse the speech date; values that cannot be parsed become NaT.
speech_summary["Datum"] = pd.to_datetime(speech_summary["Datum"], errors="coerce")

# Count rows whose date already occurred in an earlier row.
is_repeated_date = speech_summary["Datum"].duplicated()
duplicate_dates = is_repeated_date.sum()

# Report whether the dates are unique.
if duplicate_dates:
    print(f"There are {duplicate_dates} duplicate entries in the 'Datum' column.")
else:
    print("All values in the 'Datum' column are unique.")

In [None]:
import pandas as pd
from google.colab import drive

# Make Google Drive available under /content/drive.
drive.mount('/content/drive')

# Speech-level topic table.
topics_file_path = "/content/drive/My Drive/germanoptimized_speeches_with_topics_updated.csv"
topics_df = pd.read_csv(topics_file_path)

# Print a short preview to confirm the file was read.
print("Dataset successfully loaded. Preview:")
topics_preview = topics_df.head()
print(topics_preview)


In [None]:
# Output file for the summary enriched with topic information.
merged_file_path = "/content/drive/My Drive/German Pol Speeches/Speech_Level_Summary_With_Topics.csv"

# Only the join key and the two topic columns are taken from the topic table.
topic_columns = topics_df[["speech_id", "w_top_tot", "length"]]

# Left join on speech_id: every speech is kept, with NaN where no topic row matches.
merged_df = pd.merge(speech_summary, topic_columns, on="speech_id", how="left")

# Write the merged table to Google Drive.
merged_df.to_csv(merged_file_path, index=False)



In [None]:
# Rename "claim" to "claimcount" and "Datum" to "date" in one step.
column_renames = {"claim": "claimcount", "Datum": "date"}
merged_df = merged_df.rename(columns=column_renames)




In [None]:
# Write the current state of merged_df to the merged CSV file, without the index column.
merged_df.to_csv(merged_file_path, index=False)

In [None]:
import pandas as pd

# Commission speeches and the speech-level table with topics built earlier.
commission_path = "/content/drive/My Drive/final_data_cleaned.csv"
bundestag_path = "/content/drive/My Drive/German Pol Speeches/Speech_Level_Summary_With_Topics.csv"

df_commission_raw = pd.read_csv(commission_path)
df_bundestag_raw = pd.read_csv(bundestag_path)

# Show the first rows of each table to confirm both files were read.
previews = (
    ("Commission Data:", df_commission_raw),
    ("\nBundestag Data:", df_bundestag_raw),
)
for heading, frame in previews:
    print(heading)
    display(frame.head())


In [None]:
# Frequency of each claimbinary value in the Bundestag table.
claimbinary_counts = df_bundestag_raw['claimbinary'].value_counts()

print("Claimbinary Value Counts in Bundestag Data:", claimbinary_counts, sep="\n")

# Number of speeches with claimbinary == 1; 0 when that value never occurs.
true_claims = claimbinary_counts.get(1, 0)
print(f"\nNumber of speeches with claims (claimbinary = 1): {true_claims}")


In [None]:
# Print the claimbinary frequency table for each dataset in turn.
count_reports = (
    ("Bundestag Data - Claimbinary Counts:", df_bundestag_raw),
    ("\nCommission Data - Claimbinary Counts:", df_commission_raw),
)
for heading, frame in count_reports:
    print(heading)
    print(frame['claimbinary'].value_counts())


In [None]:
# Share of speeches with claimbinary == 1, as a percentage, per dataset.
bundestag_share = df_bundestag_raw['claimbinary'].mean()
commission_share = df_commission_raw['claimbinary'].mean()

bundestag_claim_prob = bundestag_share * 100
commission_claim_prob = commission_share * 100

# Report both percentages with two decimals.
print(f"Bundestag - Claim Probability: {bundestag_claim_prob:.2f}%")
print(f"Commission - Claim Probability: {commission_claim_prob:.2f}%")


In [None]:
import matplotlib.pyplot as plt

# One bar per system: label, percentage and fill colour share the same position.
systems = ["Bundesregierung", "European Commission"]
claim_probs = [bundestag_claim_prob, commission_claim_prob]
colors = ["#FFCC00", "#003399"]  # Bundesregierung yellow, Commission blue

plt.figure(figsize=(8,5))
plt.bar(systems, claim_probs, color=colors, edgecolor="black")

# Write the percentage slightly above each bar.
for position in range(len(claim_probs)):
    prob = claim_probs[position]
    plt.text(position, prob + 0.5, f"{prob:.2f}%", ha="center", fontsize=12, fontweight="bold")

# Axis labels, title and head-room above the tallest bar.
plt.xlabel("System")
plt.ylabel("Percentage of Speeches with Claims")
plt.title("Claim-Making Probability in Bundesregierung vs. European Commission")
plt.ylim(0, max(claim_probs) * 1.2)
# Grid call with an empty line style and zero alpha, i.e. nothing visible is drawn.
plt.grid(axis="y", linestyle="", alpha=0)

plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Ensure date is in datetime format
df_bundestag_raw['date'] = pd.to_datetime(df_bundestag_raw['date'])
df_commission_raw['date'] = pd.to_datetime(df_commission_raw['date'])

# Add system labels
df_bundestag_raw['system'] = 'Bundesregierung'
df_commission_raw['system'] = 'European Commission'

# Select relevant columns and stack both datasets
df_bundestag = df_bundestag_raw[['date', 'claimbinary', 'system']]
df_commission = df_commission_raw[['date', 'claimbinary', 'system']]
df_combined = pd.concat([df_bundestag, df_commission], ignore_index=True)

# Extract the year and floor it to the start of its 5-year bin (e.g. 2007 -> 2005)
df_combined['year'] = df_combined['date'].dt.year
df_combined['year_bin'] = (df_combined['year'] // 5) * 5

# Mean of claimbinary per 5-year bin and system
df_time = df_combined.groupby(['year_bin', 'system'])['claimbinary'].mean().reset_index()

# Colours are keyed by system name so they cannot swap when the systems
# appear in a different order in the data.
system_colors = {"Bundesregierung": "#FFCC00", "European Commission": "#003399"}

# Plot
plt.figure(figsize=(10,6))
sns.lineplot(x='year_bin', y='claimbinary', hue='system', data=df_time, marker="o", palette=system_colors)

# Formatting
plt.xlabel("Year (5-Year Intervals)")
plt.ylabel("Average Claim Probability")
plt.title("Claim-Making Probability Over Time (5-Year Steps)")
plt.ylim(0, df_time['claimbinary'].max() * 1.2)  # head-room above the highest point
plt.xticks(rotation=45)
# Grid call with an empty line style and zero alpha, i.e. nothing visible is drawn.
plt.grid(axis='y', linestyle="", alpha=0)
plt.legend(title="System")

# Show plot
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

source_frames = (
    (df_bundestag_raw, 'Bundestag'),
    (df_commission_raw, 'Commission'),
)

# Coerce the women's-topic share to numbers; unparseable entries become NaN.
for frame, _ in source_frames:
    frame['w_top_tot'] = pd.to_numeric(frame['w_top_tot'], errors='coerce')

# Tag each raw table with its system label.
for frame, system_label in source_frames:
    frame['system'] = system_label

# Keep the plotted columns, drop incomplete rows, and stack both datasets.
plot_columns = ['w_top_tot', 'claimbinary', 'system']
df_bundestag = df_bundestag_raw[plot_columns].dropna()
df_commission = df_commission_raw[plot_columns].dropna()
df_combined = pd.concat([df_bundestag, df_commission], ignore_index=True)

# Scatter of claimbinary against topic share with a LOWESS smoother on top.
plt.figure(figsize=(10,6))
sns.regplot(
    data=df_combined,
    x='w_top_tot',
    y='claimbinary',
    lowess=True,
    scatter_kws={'alpha':0.3},
    line_kws={'color':'black'},
)

# Labels, title and a dashed horizontal grid.
plt.xlabel("Proportion of Speech on Women's Topics")
plt.ylabel("Claim Probability (Binary Outcome)")
plt.title("Effect of Women's Topic Share on Claim-Making")
plt.grid(axis='y', linestyle="--", alpha=0.7)

plt.show()


In [None]:
import pandas as pd
import numpy as np

# Ensure dates are in datetime format
df_commission_raw['date'] = pd.to_datetime(df_commission_raw['date'])
df_bundestag_raw['date'] = pd.to_datetime(df_bundestag_raw['date'])

# Harmonise the speaker columns to 'party' and 'sex'.
# Both spellings of the Commission party column (with a dot and with a space)
# are accepted; rename ignores keys that are not present in a frame, so the
# same dictionary can be applied to both datasets.
rename_dict = {
    'European.Party_speakerinfo': 'party',
    'European Party_speakerinfo': 'party',
    'Sex_speakerinfo': 'sex'
}
df_commission_raw = df_commission_raw.rename(columns=rename_dict)
df_bundestag_raw = df_bundestag_raw.rename(columns=rename_dict)

# Event (cutoff) date for each system
event_commission = pd.Timestamp("2019-12-01")
event_bundestag = pd.Timestamp("2005-11-01")

# Running variable: whole days relative to the event (negative = before)
df_commission_raw['time_to_event'] = (df_commission_raw['date'] - event_commission).dt.days
df_bundestag_raw['time_to_event'] = (df_bundestag_raw['date'] - event_bundestag).dt.days

# Keep the model variables and drop rows with any missing value among them
columns_to_keep = ['time_to_event', 'claimbinary', 'w_top_tot', 'length', 'party', 'sex']
df_commission = df_commission_raw[columns_to_keep].dropna()
df_bundestag = df_bundestag_raw[columns_to_keep].dropna()

# Add system identifier
df_commission['system'] = "Commission"
df_bundestag['system'] = "Bundestag"

# Combine datasets
df_combined = pd.concat([df_commission, df_bundestag], ignore_index=True)

# Show data structure
print(df_combined.head())


In [None]:
# Number of missing values in each column of the combined table.
missing_per_column = df_combined.isna().sum()
print("Missing values per column:\n")
print(missing_per_column)

# Distinct values of the two categorical variables.
sex_values = df_combined['sex'].unique()
party_values = df_combined['party'].unique()
print("\nUnique values for 'sex':", sex_values)
print("Unique values for 'party':", party_values)


In [None]:
# List the column labels of the combined table.
print("Available columns in df_combined:", df_combined.columns, sep="\n")


In [None]:
print("Available columns in df_commission_raw:")
print(df_commission_raw.columns)

print("\nAvailable columns in df_bundestag_raw:")
print(df_bundestag_raw.columns)


In [None]:
# Candidate source names for the harmonised 'party' and 'sex' columns;
# keys that do not exist in a frame are ignored by rename.
rename_dict = {
    'European.Party_speakerinfo': 'party',
    'Sex_speakerinfo': 'sex',
    'sex_speakerinfo': 'sex',  # lower-case spelling of the same column
}

df_commission_raw, df_bundestag_raw = (
    frame.rename(columns=rename_dict)
    for frame in (df_commission_raw, df_bundestag_raw)
)

# Show the resulting column labels of both tables.
print("\nColumns after renaming:")
for frame in (df_commission_raw, df_bundestag_raw):
    print(frame.columns)


In [None]:
# In the Commission table the party column is spelled with a space;
# map it (and the sex column, if still present under its old name) to the short names.
rename_dict = {
    'European Party_speakerinfo': 'party',
    'Sex_speakerinfo': 'sex',
}

df_commission_raw = df_commission_raw.rename(columns=rename_dict)

# Show the column labels of both tables after the rename.
column_reports = (
    ("\nColumns after renaming in Commission dataset:", df_commission_raw),
    ("\nColumns after renaming in Bundestag dataset:", df_bundestag_raw),
)
for heading, frame in column_reports:
    print(heading)
    print(frame.columns)


In [None]:
columns_to_keep = ['time_to_event', 'claimbinary', 'w_top_tot', 'length', 'party', 'sex']

df_commission = df_commission_raw[columns_to_keep].dropna()
df_bundestag = df_bundestag_raw[columns_to_keep].dropna()

df_commission['system'] = "Commission"
df_bundestag['system'] = "Bundestag"

# Merge datasets
df_combined = pd.concat([df_commission, df_bundestag], ignore_index=True)

# Final check: Do party and sex exist?
print("\nFinal columns in df_combined:")
print(df_combined.columns)


In [None]:
# Number of missing values in each column of the combined table.
missing_per_column = df_combined.isna().sum()
print("Missing values per column in df_combined:")
print(missing_per_column)

# Distinct values of the two categorical variables.
sex_values = df_combined['sex'].unique()
party_values = df_combined['party'].unique()
print("\nUnique values for 'sex':", sex_values)
print("Unique values for 'party':", party_values)


In [None]:
# Raw label variants grouped by the party label they are normalised to.
party_variants = {
    'PES': ['PES', 'PES[34]', 'PES\xa0/'],  # last variant contains a non-breaking space
    'ALDE': ['ALDE', 'ALDE[23]', 'ALDE[25]', 'ALDE[5]', 'ALDE[32]', 'ALDE[38]'],
    'EPP': ['EPP', 'csu'],  # CSU is counted as EPP
    'EGP': ['EGP'],
    'ECR': ['ECR', 'ECR[41]'],
}

# Flatten to a variant -> normalised label lookup.
party_mapping = {
    variant: family
    for family, variants in party_variants.items()
    for variant in variants
}

# Replace exact matches in both raw tables; other values are left untouched.
df_commission_raw['party'] = df_commission_raw['party'].replace(party_mapping)
df_bundestag_raw['party'] = df_bundestag_raw['party'].replace(party_mapping)

# Show the labels that remain after cleaning.
print("Unique values for 'party' after cleaning:")
for frame in (df_commission_raw, df_bundestag_raw):
    print(frame['party'].unique())


In [None]:
# Party labels that are not among the normalised target labels.
known_parties = party_mapping.values()

commission_unknown = ~df_commission_raw['party'].isin(known_parties)
bundestag_unknown = ~df_bundestag_raw['party'].isin(known_parties)

print("\nUnexpected party values (Commission):", df_commission_raw.loc[commission_unknown, 'party'].unique())
print("Unexpected party values (Bundestag):", df_bundestag_raw.loc[bundestag_unknown, 'party'].unique())


In [None]:
# Keep only Bundestag rows that have a party label.
has_party = df_bundestag_raw['party'].notna()
df_bundestag_raw = df_bundestag_raw[has_party]

remaining_missing = df_bundestag_raw['party'].isna().sum()
print("\nRemaining missing values for 'party' in Bundestag dataset:", remaining_missing)


In [None]:
# Distinct party labels per dataset, Bundestag first.
print("\nFinal unique values for 'party':")
for frame in (df_bundestag_raw, df_commission_raw):
    print(frame['party'].unique())


In [None]:
# Model variables shared by both datasets.
columns_to_keep = ['time_to_event', 'claimbinary', 'w_top_tot', 'length', 'party', 'sex']

# Per system: keep those columns, drop incomplete rows, add the system label.
df_commission = df_commission_raw.loc[:, columns_to_keep].dropna().assign(system="Commission")
df_bundestag = df_bundestag_raw.loc[:, columns_to_keep].dropna().assign(system="Bundestag")

# Stack both systems into one table with a fresh index.
df_combined = pd.concat([df_commission, df_bundestag], ignore_index=True)

# Show the resulting column labels.
print("\nFinal columns in df_combined:")
print(df_combined.columns)

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Bandwidth around the event: ±1460 days (about four years)
bandwidth = 1460

# Rows inside the window; .copy() so the recode below does not write to a slice
df_rdd = df_combined[np.abs(df_combined['time_to_event']) <= bandwidth].copy()

# Integer sex codes (0/1) are mapped to the 'male'/'female' labels used in
# the filters below; without this, rows coded 0/1 match neither filter and
# silently drop out of both models.
df_rdd['sex'] = df_rdd['sex'].replace({0: 'male', 1: 'female'})

# Split by gender
df_men = df_rdd[df_rdd['sex'] == 'male']
df_women = df_rdd[df_rdd['sex'] == 'female']

# Logistic regression of claim-making on event time and controls.
# NOTE(review): the formula has no post-event indicator, so it estimates a
# trend in time_to_event rather than a jump at the cutoff — confirm intended.
formula = "claimbinary ~ time_to_event + w_top_tot + length + party"

# Fit one model per gender
model_men = smf.logit(formula, data=df_men).fit()
model_women = smf.logit(formula, data=df_women).fit()

# Display results
print("RDD Logistic Regression Results for Men:")
print(model_men.summary())

print("\nRDD Logistic Regression Results for Women:")
print(model_women.summary())


In [None]:
# Row counts of the windowed table and of the two gender subsets.
size_reports = (
    ("Total rows in df_rdd", df_rdd),
    ("Rows for men", df_men),
    ("Rows for women", df_women),
)
for description, frame in size_reports:
    print(f"{description}: {frame.shape[0]}")

# Distinct codes present in the 'sex' column.
print("\nUnique values in 'sex' column:")
print(df_rdd['sex'].unique())


In [None]:
# Map integer sex codes to labels (0 -> 'male', 1 -> 'female'); values that are
# already labels are left as they are. assign() builds a new frame instead of
# writing a column into a slice of df_combined.
df_rdd = df_rdd.assign(sex=df_rdd['sex'].replace({0: 'male', 1: 'female'}))

# Re-filter after correction
df_men = df_rdd[df_rdd['sex'] == 'male']
df_women = df_rdd[df_rdd['sex'] == 'female']

# Check unique values again
print("\nFixed unique values in 'sex' column:")
print(df_rdd['sex'].unique())

print(f"Rows for men after fix: {df_men.shape[0]}")
print(f"Rows for women after fix: {df_women.shape[0]}")


In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Logistic regression of claim-making on event time and controls.
formula = "claimbinary ~ time_to_event + w_top_tot + length + party"

# Fit the men's model only when that subset has rows.
if len(df_men) == 0:
    print("Not enough data for men.")
else:
    model_men = smf.logit(formula, data=df_men).fit()
    print("RDD Logistic Regression Results for Men:")
    print(model_men.summary())

# Same guard for the women's model.
if len(df_women) == 0:
    print("Not enough data for women.")
else:
    model_women = smf.logit(formula, data=df_women).fit()
    print("\nRDD Logistic Regression Results for Women:")
    print(model_women.summary())


In [None]:
# Split the windowed table (df_rdd) by system.
df_commission_rdd = df_rdd.loc[df_rdd['system'] == "Commission"]
df_bundestag_rdd = df_rdd.loc[df_rdd['system'] == "Bundestag"]

# Split each system by gender.
df_commission_men = df_commission_rdd.loc[df_commission_rdd['sex'] == 'male']
df_commission_women = df_commission_rdd.loc[df_commission_rdd['sex'] == 'female']
df_bundestag_men = df_bundestag_rdd.loc[df_bundestag_rdd['sex'] == 'male']
df_bundestag_women = df_bundestag_rdd.loc[df_bundestag_rdd['sex'] == 'female']

# Logistic regression of claim-making on event time and controls.
formula = "claimbinary ~ time_to_event + w_top_tot + length + party"


def _fit_and_report(heading, data):
    """Print the heading, fit the logit model on ``data``, print and return it."""
    print(heading)
    fitted = smf.logit(formula, data=data).fit()
    print(fitted.summary())
    return fitted


# One model per system/gender combination.
model_commission_men = _fit_and_report("\nCommission - Men", df_commission_men)
model_commission_women = _fit_and_report("\nCommission - Women", df_commission_women)
model_bundestag_men = _fit_and_report("\nBundestag - Men", df_bundestag_men)
model_bundestag_women = _fit_and_report("\nBundestag - Women", df_bundestag_women)


In [None]:
# Pooled model: event time interacted with system, plus the usual controls.
formula_interaction = "claimbinary ~ time_to_event * system + w_top_tot + length + party"

# Fit on the whole windowed table.
interaction_spec = smf.logit(formula_interaction, data=df_rdd)
model_interaction = interaction_spec.fit()

# Report the estimates.
print("\nInteraction Model: Does Institutional Setup Change Time Effects?")
print(model_interaction.summary())


In [None]:
# Ensure datasets are loaded and merged
import pandas as pd
import numpy as np

# Stack the per-system tables prepared earlier into one table with a fresh index.
df_combined = pd.concat([df_commission, df_bundestag], ignore_index=True)

# Keep observations within ±10000 days of the event.
bandwidth = 10000
inside_window = df_combined['time_to_event'].abs() <= bandwidth
df_rdd = df_combined[inside_window]

# One subset per institution.
df_commission_rdd = df_rdd[df_rdd['system'] == "Commission"]
df_bundestag_rdd = df_rdd[df_rdd['system'] == "Bundestag"]

# Report how many observations each subset holds.
commission_rows = df_commission_rdd.shape[0]
bundestag_rows = df_bundestag_rdd.shape[0]
print(f"Commission RDD dataset: {commission_rows} observations")
print(f"Bundestag RDD dataset: {bundestag_rows} observations")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Width of each event-time bin, in days
bin_width = 200


def bin_time_data(df, system_name, color, width=None):
    """Plot the mean of ``claimbinary`` per event-time bin for one institution.

    Each row falls into the bin ``floor(time_to_event / width) * width``. The
    per-bin mean of ``claimbinary`` is drawn as points plus a connecting line,
    with a dashed red vertical line at day 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``time_to_event`` and ``claimbinary`` columns. It is left
        untouched: the bin labels live on a derived frame, so nothing is
        written into the caller's (possibly filtered) frame.
    system_name : str
        Used for the legend label and the title.
    color : str
        Matplotlib colour for the points and the line.
    width : number, optional
        Bin width in days. Defaults to the module-level ``bin_width``.
    """
    step = bin_width if width is None else width

    # Left edge of the bin each observation belongs to
    binned = df.assign(time_bin=np.floor(df['time_to_event'] / step) * step)
    bin_avg = binned.groupby('time_bin')['claimbinary'].mean().reset_index()

    plt.figure(figsize=(10,6))
    sns.scatterplot(x='time_bin', y='claimbinary', data=bin_avg, color=color, label=system_name)
    sns.lineplot(x='time_bin', y='claimbinary', data=bin_avg, color=color)

    plt.axvline(0, color='red', linestyle="--", label="Event Date")
    plt.xlabel("Days from Event")
    plt.ylabel("Average Claim Probability")
    plt.title(f"RDD Effect on Claim-Making ({system_name}) - Binned Data")
    plt.legend()
    plt.grid(axis='y', linestyle="--", alpha=0.7)
    plt.show()

# Plot for Commission and Bundestag
bin_time_data(df_commission_rdd, "European Commission", "#003399")
bin_time_data(df_bundestag_rdd, "German Bundestag", "#FFCC00")


In [None]:
# Flag rows with negative event time, i.e. before the event
df_rdd['before_event'] = df_rdd['time_to_event'].lt(0)

# Mean of claimbinary within each (period, institution) cell
mean_claims = (
    df_rdd
    .groupby(['before_event', 'system'])['claimbinary']
    .mean()
    .reset_index()
)

# Swap the boolean flag for readable axis labels
mean_claims['before_event'] = mean_claims['before_event'].replace({True: "Before Event", False: "After Event"})

# Grouped bars: one pair per period, coloured by institution
plt.figure(figsize=(10,6))
sns.barplot(
    data=mean_claims,
    x='before_event',
    y='claimbinary',
    hue='system',
    palette={"Commission": "#003399", "Bundestag": "#FFCC00"},
)

plt.gca().set(
    xlabel="Event Period",
    ylabel="Mean Claim Probability",
    title="Comparison of Claim Probability Before and After the Event",
)
plt.legend(title="Institution")
plt.grid(axis='y', linestyle="--", alpha=0.7)
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns

# Evenly spaced event-time grid: 100 points from -8000 to +8000 days
time_range = np.linspace(-8000, 8000, 100)

# Prediction frame: vary event time, hold covariates at the df_rdd means and
# fix party at "EPP"
df_pred = pd.DataFrame({'time_to_event': time_range}).assign(
    w_top_tot=df_rdd['w_top_tot'].mean(),
    length=df_rdd['length'].mean(),
    party="EPP",
)


def _predict_for_system(system_label, commission_dummy, fitted_model):
    """Copy ``df_pred``, tag it with the institution columns and add ``pred_prob``."""
    frame = df_pred.assign(system=system_label, system_Commission=commission_dummy)
    frame['pred_prob'] = fitted_model.predict(frame)
    return frame


# NOTE(review): the names below point at the models fitted on the male
# subsamples, whose formula contains neither `system` nor `system_Commission`.
# If the pooled interaction model was intended here, swap it in - confirm.
df_pred_commission = _predict_for_system("Commission", 1, model_commission_men)
df_pred_bundestag = _predict_for_system("Bundestag", 0, model_bundestag_men)

# Stack both institutions so seaborn can colour by `system`
df_pred_combined = pd.concat([df_pred_commission, df_pred_bundestag])

# One predicted-probability curve per institution
plt.figure(figsize=(10,6))
sns.lineplot(
    data=df_pred_combined,
    x='time_to_event',
    y='pred_prob',
    hue='system',
    palette={"Commission": "#003399", "Bundestag": "#FFCC00"},
)

plt.axvline(0, color='red', linestyle="--", label="Event Date")
plt.gca().set(
    xlabel="Days from Event",
    ylabel="Predicted Claim Probability",
    title="Regression Discontinuity: Estimated Claim-Making Probability",
)
plt.legend(title="Institution")
plt.grid(axis='y', linestyle="--", alpha=0.7)
plt.show()


In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def fit_rdd_models(df):
    """Fit one logit on the pre-event rows and one on the post-event rows.

    Rows with ``time_to_event < 0`` form the pre-event sample and rows with
    ``time_to_event >= 0`` the post-event sample. Both samples use the same
    specification.

    Returns
    -------
    tuple
        ``(model_pre, model_post)`` fitted results.
    """
    spec = "claimbinary ~ time_to_event + w_top_tot + length + party"

    before_event = df['time_to_event'] < 0
    from_event_on = df['time_to_event'] >= 0

    model_pre = smf.logit(spec, data=df[before_event]).fit()
    model_post = smf.logit(spec, data=df[from_event_on]).fit()
    return model_pre, model_post

# One (pre-event, post-event) pair of fitted models per institution
model_commission_pre, model_commission_post = fit_rdd_models(df=df_commission_rdd)
model_bundestag_pre, model_bundestag_post = fit_rdd_models(df=df_bundestag_rdd)


In [None]:
# Prediction grids in days: 50 points on [-800, -1] and 50 points on [0, 800]
time_range_pre, time_range_post = (
    np.linspace(-800, -1, num=50),
    np.linspace(0, 800, num=50),
)

def predict_rdd(model_pre, model_post, df, time_range_pre, time_range_post, party="EPP"):
    """Predict claim probabilities on each side of the event.

    Two prediction frames are built, one per time grid. In both,
    ``w_top_tot`` and ``length`` are held at their means in ``df`` and
    ``party`` is fixed, so only ``time_to_event`` varies.

    Parameters
    ----------
    model_pre, model_post : fitted results
        Objects with a ``predict(frame)`` method; ``model_pre`` is applied to
        the pre-event grid and ``model_post`` to the post-event grid.
    df : pandas.DataFrame
        Source of the covariate means (``w_top_tot`` and ``length`` columns).
    time_range_pre, time_range_post : array-like
        ``time_to_event`` values for each side. They may differ in length.
    party : str, optional
        Party level used for every prediction row. Defaults to ``"EPP"``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_pred_pre, df_pred_post)``, each with columns ``time_to_event``,
        ``w_top_tot``, ``length``, ``party`` and ``pred_prob``.
    """
    covariates = {
        'w_top_tot': df['w_top_tot'].mean(),
        'length': df['length'].mean(),
        'party': party,
    }

    def _grid(times):
        # Built independently for each side: overwriting the time column of a
        # copied pre-event frame fails when the two grids differ in length.
        return pd.DataFrame({'time_to_event': times}).assign(**covariates)

    df_pred_pre = _grid(time_range_pre)
    df_pred_post = _grid(time_range_post)

    # Predict before attaching the result column so each model sees only its inputs
    df_pred_pre['pred_prob'] = model_pre.predict(df_pred_pre)
    df_pred_post['pred_prob'] = model_post.predict(df_pred_post)

    return df_pred_pre, df_pred_post

# Pre/post prediction frames for each institution on the shared time grids
df_commission_pred_pre, df_commission_pred_post = predict_rdd(
    model_pre=model_commission_pre,
    model_post=model_commission_post,
    df=df_commission_rdd,
    time_range_pre=time_range_pre,
    time_range_post=time_range_post,
)
df_bundestag_pred_pre, df_bundestag_pred_post = predict_rdd(
    model_pre=model_bundestag_pre,
    model_post=model_bundestag_post,
    df=df_bundestag_rdd,
    time_range_pre=time_range_pre,
    time_range_post=time_range_post,
)


In [None]:
def plot_rdd_discontinuity(df_pred_pre, df_pred_post, system_name, color):
    """Draw the pre- and post-event prediction curves for one institution.

    Both frames need ``time_to_event`` and ``pred_prob`` columns. The
    pre-event curve is solid and the post-event curve dashed, both in
    ``color``; a dashed red vertical line marks day 0.
    """
    plt.figure(figsize=(10,6))

    # (frame, period label, extra line styling), in draw order
    for frame, period, line_kwargs in (
        (df_pred_pre, "Before Event", {}),
        (df_pred_post, "After Event", {"linestyle": "dashed"}),
    ):
        sns.lineplot(
            data=frame,
            x='time_to_event',
            y='pred_prob',
            color=color,
            label=f"{system_name} ({period})",
            **line_kwargs,
        )

    # Event marker and labelling
    plt.axvline(0, color='red', linestyle="--", label="Event Date")
    plt.gca().set(
        xlabel="Days from Event",
        ylabel="Predicted Claim Probability",
        title=f"RDD Effect on Claim-Making ({system_name}) - True Discontinuity",
    )
    plt.legend()
    plt.grid(axis='y', linestyle="--", alpha=0.7)
    plt.show()

# One figure per institution
plot_rdd_discontinuity(
    df_commission_pred_pre, df_commission_pred_post,
    system_name="European Commission", color="#003399",
)
plot_rdd_discontinuity(
    df_bundestag_pred_pre, df_bundestag_pred_post,
    system_name="German Bundestag", color="#FFCC00",
)


In [None]:
plt.figure(figsize=(10,6))

# (prediction frame, colour, extra line styling, legend label), in draw order
_curves = (
    (df_commission_pred_pre, "#003399", {}, "Commission (Before Event)"),
    (df_commission_pred_post, "#003399", {"linestyle": "dashed"}, "Commission (After Event)"),
    (df_bundestag_pred_pre, "#FFCC00", {}, "Bundesregierung (Before Event)"),
    (df_bundestag_pred_post, "#FFCC00", {"linestyle": "dashed"}, "Bundesregierung (After Event)"),
)
for _frame, _colour, _line_kwargs, _legend in _curves:
    sns.lineplot(x='time_to_event', y='pred_prob', data=_frame, color=_colour, label=_legend, **_line_kwargs)

# Event marker and labelling
plt.axvline(0, color='red', linestyle="--", label="Event Date")
plt.gca().set(
    xlabel="Days from Event",
    ylabel="Predicted Claim Probability",
    title="Woman Leader Effect on Claim-Making: Institutional Comparison",
)
plt.legend()
# Empty linestyle with alpha=0: the y-grid is requested but drawn fully transparent
plt.grid(axis='y', linestyle="", alpha=0)

plt.show()


In [None]:
# Prediction grids covering 1000 days on each side of the event (50 points each)
time_range_pre = np.linspace(-1000, -1, num=50)
time_range_post = np.linspace(0, 1000, num=50)

# Re-predict both institutions on the new grids
df_commission_pred_pre, df_commission_pred_post = predict_rdd(
    model_commission_pre, model_commission_post, df_commission_rdd,
    time_range_pre=time_range_pre, time_range_post=time_range_post,
)
df_bundestag_pred_pre, df_bundestag_pred_post = predict_rdd(
    model_bundestag_pre, model_bundestag_post, df_bundestag_rdd,
    time_range_pre=time_range_pre, time_range_post=time_range_post,
)

plt.figure(figsize=(10,6))

# (prediction frame, colour, extra line styling, legend label), in draw order
_curves = (
    (df_commission_pred_pre, "#003399", {}, "European Commission (Before Event)"),
    (df_commission_pred_post, "#003399", {"linestyle": "dashed"}, "European Commission (After Event)"),
    (df_bundestag_pred_pre, "#FFCC00", {}, "Bundesregierung (Before Event)"),
    (df_bundestag_pred_post, "#FFCC00", {"linestyle": "dashed"}, "Bundesregierung (After Event)"),
)
for _frame, _colour, _line_kwargs, _legend in _curves:
    sns.lineplot(x='time_to_event', y='pred_prob', data=_frame, color=_colour, label=_legend, **_line_kwargs)

# Event marker and labelling
plt.axvline(0, color='red', linestyle="--", label="Event Date")
plt.gca().set(
    xlabel="Days from Event",
    ylabel="Predicted Claim Probability",
    title="RDD Effect on Claim-Making: Institutional Comparison (±1000 days)",
)
plt.legend()
# Empty linestyle with alpha=0: the y-grid is requested but drawn fully transparent
plt.grid(axis='y', linestyle="", alpha=0)

plt.show()


In [None]:
def predict_with_ci(model, df, time_range):
    """Predict claim probabilities over ``time_range`` with 95% intervals.

    ``w_top_tot`` and ``length`` are held at their means in ``df`` and
    ``party`` is fixed at "EPP", so only ``time_to_event`` varies.

    The interval comes from the fitted result's own prediction inference
    (``get_prediction(...).summary_frame``), so it varies with the covariate
    values. The intercept's coefficient standard error is a log-odds quantity
    and is not the uncertainty of a predicted probability.

    Parameters
    ----------
    model : fitted results
        Must provide ``get_prediction(frame)``.
        NOTE(review): statsmodels discrete results offer this in recent
        releases - confirm the installed version.
    df : pandas.DataFrame
        Source of the covariate means.
    time_range : array-like
        ``time_to_event`` values to predict at.

    Returns
    -------
    pandas.DataFrame
        Columns ``time_to_event``, ``w_top_tot``, ``length``, ``party``,
        ``pred_prob``, ``lower_ci`` and ``upper_ci``.
    """
    df_pred = pd.DataFrame({'time_to_event': time_range})
    df_pred['w_top_tot'] = df['w_top_tot'].mean()
    df_pred['length'] = df['length'].mean()
    df_pred['party'] = "EPP"  # Reference party

    summary = model.get_prediction(df_pred).summary_frame(alpha=0.05)  # 95% CI

    # Prediction results name their columns either predicted/ci_* or mean/mean_ci_*
    if 'predicted' in summary.columns:
        point, lower, upper = 'predicted', 'ci_lower', 'ci_upper'
    else:
        point, lower, upper = 'mean', 'mean_ci_lower', 'mean_ci_upper'

    # to_numpy(): assign by position, independent of the summary's row labels
    df_pred['pred_prob'] = summary[point].to_numpy()
    df_pred['lower_ci'] = summary[lower].to_numpy()
    df_pred['upper_ci'] = summary[upper].to_numpy()

    return df_pred

# Prediction frames with interval columns, one per institution and side
df_commission_pre_ci = predict_with_ci(model_commission_pre, df_commission_rdd, time_range=time_range_pre)
df_commission_post_ci = predict_with_ci(model_commission_post, df_commission_rdd, time_range=time_range_post)
df_bundestag_pre_ci = predict_with_ci(model_bundestag_pre, df_bundestag_rdd, time_range=time_range_pre)
df_bundestag_post_ci = predict_with_ci(model_bundestag_post, df_bundestag_rdd, time_range=time_range_post)

plt.figure(figsize=(10,6))

# Per institution: both curves first, then the shaded interval bands
_institutions = (
    ("Commission", "#003399", df_commission_pre_ci, df_commission_post_ci),
    ("Bundestag", "#FFCC00", df_bundestag_pre_ci, df_bundestag_post_ci),
)
for _name, _colour, _pre, _post in _institutions:
    sns.lineplot(x='time_to_event', y='pred_prob', data=_pre, color=_colour, label=f"{_name} (Before Event)")
    sns.lineplot(x='time_to_event', y='pred_prob', data=_post, color=_colour, linestyle="dashed", label=f"{_name} (After Event)")

    for _band in (_pre, _post):
        plt.fill_between(_band['time_to_event'], _band['lower_ci'], _band['upper_ci'], color=_colour, alpha=0.2)

# Event marker and labelling
plt.axvline(0, color='red', linestyle="--", label="Event Date")
plt.gca().set(
    xlabel="Days from Event",
    ylabel="Predicted Claim Probability",
    title="RDD Effect on Claim-Making: Institutional Comparison (with Confidence Intervals)",
)
plt.legend()
plt.grid(axis='y', linestyle="--", alpha=0.7)

plt.show()


In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Same definition as the earlier fit_rdd_models cell; kept so this cell is self-contained
def fit_rdd_models(df):
    """Return ``(model_pre, model_post)``: logits fitted on each side of the event.

    The pre-event sample is ``time_to_event < 0`` and the post-event sample is
    ``time_to_event >= 0``; both use the same formula.
    """
    spec = "claimbinary ~ time_to_event + w_top_tot + length + party"
    is_pre = df['time_to_event'] < 0
    is_post = df['time_to_event'] >= 0

    # The generator fits lazily in tuple order: pre-event first, then post-event
    model_pre, model_post = (
        smf.logit(spec, data=df[side]).fit() for side in (is_pre, is_post)
    )
    return model_pre, model_post


In [None]:
# Pre/post model pairs for every institution × sex subgroup.
# The filters compare against the string labels 'male' / 'female'.
_commission_sex = df_commission_rdd['sex']
_bundestag_sex = df_bundestag_rdd['sex']

model_commission_men_pre, model_commission_men_post = fit_rdd_models(
    df_commission_rdd[_commission_sex == 'male']
)
model_commission_women_pre, model_commission_women_post = fit_rdd_models(
    df_commission_rdd[_commission_sex == 'female']
)

model_bundestag_men_pre, model_bundestag_men_post = fit_rdd_models(
    df_bundestag_rdd[_bundestag_sex == 'male']
)
model_bundestag_women_pre, model_bundestag_women_post = fit_rdd_models(
    df_bundestag_rdd[_bundestag_sex == 'female']
)


In [None]:
# Map a numeric sex coding (0 / 1) onto the string labels the subgroup filters
# compare against; values that are already labels pass through unchanged.
# assign() returns a new frame, so nothing is written into a frame that may be
# a filtered slice of another one (avoids SettingWithCopyWarning).
df_commission_rdd = df_commission_rdd.assign(
    sex=df_commission_rdd['sex'].replace({0: 'male', 1: 'female'})
)
df_bundestag_rdd = df_bundestag_rdd.assign(
    sex=df_bundestag_rdd['sex'].replace({0: 'male', 1: 'female'})
)

# Verify the correction
print("Unique values after correction:", df_commission_rdd['sex'].unique())


In [None]:
# Subgroup frames for the RDD models: one per institution × sex
df_commission_men = df_commission_rdd.loc[df_commission_rdd['sex'].eq('male')]
df_commission_women = df_commission_rdd.loc[df_commission_rdd['sex'].eq('female')]

df_bundestag_men = df_bundestag_rdd.loc[df_bundestag_rdd['sex'].eq('male')]
df_bundestag_women = df_bundestag_rdd.loc[df_bundestag_rdd['sex'].eq('female')]

# Row counts, to check each subgroup is large enough before modelling
print("\nNumber of observations by gender in Commission dataset:")
print(f"Men: {len(df_commission_men)}, Women: {len(df_commission_women)}")

print("\nNumber of observations by gender in Bundestag dataset:")
print(f"Men: {len(df_bundestag_men)}, Women: {len(df_bundestag_women)}")


In [None]:
# Fitted models keyed "<subgroup>_pre" / "<subgroup>_post"
models = {}

# Subgroups in fitting order
_rdd_subgroups = {
    "commission_men": df_commission_men,
    "commission_women": df_commission_women,
    "bundestag_men": df_bundestag_men,
    "bundestag_women": df_bundestag_women,
}

# NOTE(review): the loop variables `label` and `df` are module-level names, so
# after this cell `df` refers to the last subgroup frame.
for label, df in _rdd_subgroups.items():
    # Subgroups with 10 rows or fewer are reported and skipped
    if df.shape[0] <= 10:
        print(f"Skipping RDD model for {label} (not enough data).")
        continue

    print(f"Fitting RDD model for {label}...")
    models[f"{label}_pre"], models[f"{label}_post"] = fit_rdd_models(df)


In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Same definition as the earlier fit_rdd_models cells; kept so this cell is self-contained
def fit_rdd_models(df):
    """Fit the claim-making logit separately before and after the event.

    Rows with ``time_to_event < 0`` feed the first model and rows with
    ``time_to_event >= 0`` feed the second.

    Returns ``(model_pre, model_post)``.
    """
    def _fit(sample):
        # Same specification on both sides of the cutoff
        return smf.logit("claimbinary ~ time_to_event + w_top_tot + length + party", data=sample).fit()

    pre_rows = df[df['time_to_event'] < 0]
    post_rows = df[df['time_to_event'] >= 0]

    return _fit(pre_rows), _fit(post_rows)


In [None]:
# Pre/post model pairs by institution and sex: Commission first, men before women
model_commission_men_pre, model_commission_men_post = fit_rdd_models(
    df=df_commission_rdd.loc[df_commission_rdd['sex'].eq('male')]
)
model_commission_women_pre, model_commission_women_post = fit_rdd_models(
    df=df_commission_rdd.loc[df_commission_rdd['sex'].eq('female')]
)

model_bundestag_men_pre, model_bundestag_men_post = fit_rdd_models(
    df=df_bundestag_rdd.loc[df_bundestag_rdd['sex'].eq('male')]
)
model_bundestag_women_pre, model_bundestag_women_post = fit_rdd_models(
    df=df_bundestag_rdd.loc[df_bundestag_rdd['sex'].eq('female')]
)


In [None]:
from scipy.special import expit  # Sigmoid function

def predict_with_fixed_ci(model, df, time_range):
    """Predict claim probabilities over ``time_range`` with 95% intervals.

    ``w_top_tot`` and ``length`` are held at their means in ``df`` and
    ``party`` is fixed at "EPP".

    Probabilities and interval bounds are read from the fitted result's
    prediction inference, which is already on the probability scale. They must
    not be passed through the sigmoid again. The band reflects the uncertainty
    of each prediction rather than the intercept's standard error.

    Parameters
    ----------
    model : fitted results
        Must provide ``get_prediction(frame)``.
        NOTE(review): statsmodels discrete results offer this in recent
        releases - confirm the installed version.
    df : pandas.DataFrame
        Source of the covariate means.
    time_range : array-like
        ``time_to_event`` values to predict at.

    Returns
    -------
    pandas.DataFrame
        Columns ``time_to_event``, ``w_top_tot``, ``length``, ``party``,
        ``logit`` (log-odds of ``pred_prob``), ``pred_prob``, ``lower_ci`` and
        ``upper_ci``.
    """
    df_pred = pd.DataFrame({'time_to_event': time_range})
    df_pred['w_top_tot'] = df['w_top_tot'].mean()
    df_pred['length'] = df['length'].mean()
    df_pred['party'] = "EPP"  # Reference party

    summary = model.get_prediction(df_pred).summary_frame(alpha=0.05)  # 95% CI

    # Prediction results name their columns either predicted/ci_* or mean/mean_ci_*
    if 'predicted' in summary.columns:
        point, lower, upper = 'predicted', 'ci_lower', 'ci_upper'
    else:
        point, lower, upper = 'mean', 'mean_ci_lower', 'mean_ci_upper'

    prob = summary[point].to_numpy()

    # Log-odds recovered from the probability: log(p / (1 - p))
    df_pred['logit'] = np.log(prob) - np.log1p(-prob)
    df_pred['pred_prob'] = prob
    df_pred['lower_ci'] = summary[lower].to_numpy()
    df_pred['upper_ci'] = summary[upper].to_numpy()

    return df_pred

# Prediction grids in days: 50 points on [-1000, -1] and 50 points on [0, 1000]
time_range_pre = np.linspace(start=-1000, stop=-1, num=50)
time_range_post = np.linspace(start=0, stop=1000, num=50)

# Predictions for men and women, pre-event grid then post-event grid.
# Covariate means come from the full institution frame, not the sex subgroup.
df_commission_men_pre = predict_with_fixed_ci(model_commission_men_pre, df_commission_rdd, time_range_pre)
df_commission_men_post = predict_with_fixed_ci(model_commission_men_post, df_commission_rdd, time_range_post)
df_commission_women_pre = predict_with_fixed_ci(model_commission_women_pre, df_commission_rdd, time_range_pre)
df_commission_women_post = predict_with_fixed_ci(model_commission_women_post, df_commission_rdd, time_range_post)

df_bundestag_men_pre = predict_with_fixed_ci(model_bundestag_men_pre, df_bundestag_rdd, time_range_pre)
df_bundestag_men_post = predict_with_fixed_ci(model_bundestag_men_post, df_bundestag_rdd, time_range_post)
df_bundestag_women_pre = predict_with_fixed_ci(model_bundestag_women_pre, df_bundestag_rdd, time_range_pre)
df_bundestag_women_post = predict_with_fixed_ci(model_bundestag_women_post, df_bundestag_rdd, time_range_post)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_rdd_by_sex(df_men_pre, df_men_post, df_women_pre, df_women_post, system_name, color_men, color_women):
    """Plot pre/post prediction curves with interval bands for men and women.

    Every frame needs ``time_to_event``, ``pred_prob``, ``lower_ci`` and
    ``upper_ci`` columns. Within each sex the pre-event curve is solid and the
    post-event curve dashed; the bands share the curve colour at alpha 0.2.
    """
    plt.figure(figsize=(10,6))

    # Men are drawn first, then women; curves before bands within each group
    groups = (
        ("Men", color_men, df_men_pre, df_men_post),
        ("Women", color_women, df_women_pre, df_women_post),
    )
    for sex_label, shade, pre, post in groups:
        sns.lineplot(x='time_to_event', y='pred_prob', data=pre, color=shade, label=f"{system_name} - {sex_label} (Before Event)")
        sns.lineplot(x='time_to_event', y='pred_prob', data=post, color=shade, linestyle="dashed", label=f"{system_name} - {sex_label} (After Event)")

        for band in (pre, post):
            plt.fill_between(band['time_to_event'], band['lower_ci'], band['upper_ci'], color=shade, alpha=0.2)

    # Event marker and labelling
    plt.axvline(0, color='red', linestyle="--", label="Event Date")
    plt.gca().set(
        xlabel="Days from Event",
        ylabel="Predicted Claim Probability",
        title=f"RDD Effect on Claim-Making: {system_name} (Men vs. Women)",
    )
    plt.legend()
    plt.grid(axis='y', linestyle="--", alpha=0.7)

    plt.show()

# One figure per institution
plot_rdd_by_sex(
    df_commission_men_pre, df_commission_men_post,
    df_commission_women_pre, df_commission_women_post,
    system_name="European Commission", color_men="#003399", color_women="#6699FF",
)
plot_rdd_by_sex(
    df_bundestag_men_pre, df_bundestag_men_post,
    df_bundestag_women_pre, df_bundestag_women_post,
    system_name="German Bundestag", color_men="#FFCC00", color_women="#FFD700",
)


In [None]:
def predict_with_fixed_ci(model, df, time_range):
    """Predict claim probabilities over ``time_range`` with 95% intervals.

    ``w_top_tot`` and ``length`` are held at their means in ``df`` and
    ``party`` is fixed at "EPP".

    The prediction summary is already on the probability scale, so its values
    are used directly. Passing them through the sigmoid again would squeeze
    every value into roughly (0.5, 0.73).

    Parameters
    ----------
    model : fitted results
        Must provide ``get_prediction(frame)``.
    df : pandas.DataFrame
        Source of the covariate means.
    time_range : array-like
        ``time_to_event`` values to predict at.

    Returns
    -------
    pandas.DataFrame
        Columns ``time_to_event``, ``w_top_tot``, ``length``, ``party``,
        ``pred_prob``, ``lower_ci`` and ``upper_ci``.
    """
    df_pred = pd.DataFrame({'time_to_event': time_range})
    df_pred['w_top_tot'] = df['w_top_tot'].mean()
    df_pred['length'] = df['length'].mean()
    df_pred['party'] = "EPP"  # Reference party

    # Get predictions with confidence intervals
    pred_results = model.get_prediction(df_pred)
    pred_summary = pred_results.summary_frame(alpha=0.05)  # 95% CI

    # Prediction results name their columns either predicted/ci_* or mean/mean_ci_*
    if 'predicted' in pred_summary.columns:
        point, lower, upper = 'predicted', 'ci_lower', 'ci_upper'
    else:
        point, lower, upper = 'mean', 'mean_ci_lower', 'mean_ci_upper'

    # to_numpy(): assign by position, independent of the summary's row labels
    df_pred['pred_prob'] = pred_summary[point].to_numpy()
    df_pred['lower_ci'] = pred_summary[lower].to_numpy()
    df_pred['upper_ci'] = pred_summary[upper].to_numpy()

    return df_pred

# Refresh the Commission prediction frames (men, then women) using the
# predict_with_fixed_ci defined in this cell
df_commission_men_pre = predict_with_fixed_ci(model_commission_men_pre, df_commission_rdd, time_range_pre)
df_commission_men_post = predict_with_fixed_ci(model_commission_men_post, df_commission_rdd, time_range_post)
df_commission_women_pre = predict_with_fixed_ci(model_commission_women_pre, df_commission_rdd, time_range_pre)
df_commission_women_post = predict_with_fixed_ci(model_commission_women_post, df_commission_rdd, time_range_post)


In [None]:
def predict_with_fixed_ci(model, df, time_range):
    """Return predicted claim probabilities and 95% bounds on a time grid.

    Covariates other than ``time_to_event`` are fixed: ``w_top_tot`` and
    ``length`` at their means in ``df``, ``party`` at "EPP".

    Values from the prediction summary are on the probability scale and are
    copied as they are; no sigmoid is applied.

    Parameters
    ----------
    model : fitted results
        Must provide ``get_prediction(frame)``.
    df : pandas.DataFrame
        Source of the covariate means.
    time_range : array-like
        ``time_to_event`` values to predict at.

    Returns
    -------
    pandas.DataFrame
        Columns ``time_to_event``, ``w_top_tot``, ``length``, ``party``,
        ``pred_prob``, ``lower_ci`` and ``upper_ci``.
    """
    df_pred = pd.DataFrame({'time_to_event': time_range})
    df_pred['w_top_tot'] = df['w_top_tot'].mean()
    df_pred['length'] = df['length'].mean()
    df_pred['party'] = "EPP"  # Reference party

    pred_summary = model.get_prediction(df_pred).summary_frame(alpha=0.05)  # 95% CI

    # Output column -> candidate summary columns; the first one present is used
    # (prediction results report either predicted/ci_* or mean/mean_ci_*)
    sources = {
        'pred_prob': ('predicted', 'mean'),
        'lower_ci': ('ci_lower', 'mean_ci_lower'),
        'upper_ci': ('ci_upper', 'mean_ci_upper'),
    }
    for target, candidates in sources.items():
        column = next(name for name in candidates if name in pred_summary.columns)
        # to_numpy(): assign by position, independent of the summary's row labels
        df_pred[target] = pred_summary[column].to_numpy()

    return df_pred

# Refresh the Commission prediction frames: men first, then women
df_commission_men_pre, df_commission_men_post = (
    predict_with_fixed_ci(model=model_commission_men_pre, df=df_commission_rdd, time_range=time_range_pre),
    predict_with_fixed_ci(model=model_commission_men_post, df=df_commission_rdd, time_range=time_range_post),
)
df_commission_women_pre, df_commission_women_post = (
    predict_with_fixed_ci(model=model_commission_women_pre, df=df_commission_rdd, time_range=time_range_pre),
    predict_with_fixed_ci(model=model_commission_women_post, df=df_commission_rdd, time_range=time_range_post),
)


In [None]:
def predict_with_fixed_ci(model, df, time_range):
    """Predict claim probabilities over ``time_range`` with 95% intervals.

    ``w_top_tot`` and ``length`` are held at their means in ``df`` and
    ``party`` is fixed at "EPP".

    The point predictions and the ``conf_int`` bounds of the prediction
    results are already probabilities, so they are stored without a further
    sigmoid transform.

    Parameters
    ----------
    model : fitted results
        Must provide ``get_prediction(frame)`` returning an object with
        ``conf_int(alpha=...)`` and either a ``predicted`` or a
        ``predicted_mean`` attribute.
    df : pandas.DataFrame
        Source of the covariate means.
    time_range : array-like
        ``time_to_event`` values to predict at.

    Returns
    -------
    pandas.DataFrame
        Columns ``time_to_event``, ``w_top_tot``, ``length``, ``party``,
        ``pred_prob``, ``lower_ci`` and ``upper_ci``.
    """
    df_pred = pd.DataFrame({'time_to_event': time_range})
    df_pred['w_top_tot'] = df['w_top_tot'].mean()
    df_pred['length'] = df['length'].mean()
    df_pred['party'] = "EPP"  # Reference party

    # Get predicted values
    pred_results = model.get_prediction(df_pred)

    # The point prediction is exposed as `predicted` on some result classes
    # and as `predicted_mean` on others
    point = getattr(pred_results, 'predicted', None)
    if point is None:
        point = pred_results.predicted_mean
    df_pred['pred_prob'] = np.asarray(point)

    # 95% interval: column 0 is the lower bound, column 1 the upper bound
    ci = np.asarray(pred_results.conf_int(alpha=0.05))
    df_pred['lower_ci'] = ci[:, 0]
    df_pred['upper_ci'] = ci[:, 1]

    return df_pred


In [None]:
# Rebuild the pooled RDD frame from the two institution-level frames
df_rdd = pd.concat([df_commission_rdd, df_bundestag_rdd], ignore_index=True)

# Institution × sex subgroups
_commission_sex = df_commission_rdd['sex']
_bundestag_sex = df_bundestag_rdd['sex']

df_commission_men = df_commission_rdd[_commission_sex == 'male']
df_commission_women = df_commission_rdd[_commission_sex == 'female']

df_bundestag_men = df_bundestag_rdd[_bundestag_sex == 'male']
df_bundestag_women = df_bundestag_rdd[_bundestag_sex == 'female']

# Row counts per subgroup
print("\nNumber of observations by gender and institution:")
print(f"Commission - Men: {len(df_commission_men)}, Women: {len(df_commission_women)}")
print(f"Bundestag - Men: {len(df_bundestag_men)}, Women: {len(df_bundestag_women)}")


In [None]:
# Function to test statistical significance of RDD discontinuity
import statsmodels.formula.api as smf

def test_rdd_significance(df, system_name, sex_category):
    """
    Runs a logistic regression model including a discontinuity term and prints the statistical significance.
    The model includes an interaction term between `time_to_event` and a binary indicator for post-event period.
    """
    df = df.copy()

    # Create binary variable for post-event period
    df['post_event'] = (df['time_to_event'] >= 0).astype(int)

    # Fit the logistic regression model with discontinuity
    model = smf.logit("claimbinary ~ time_to_event + post_event + time_to_event:post_event + w_top_tot + length + party", data=df).fit()

    # Print model results
    print(f"\nRDD Discontinuity Test - {system_name} ({sex_category}):")
    print(model.summary())

    return model

# Discontinuity models for each institution × sex subgroup
model_rdd_commission_men = test_rdd_significance(
    df_commission_men, system_name="European Commission", sex_category="Men"
)
model_rdd_commission_women = test_rdd_significance(
    df_commission_women, system_name="European Commission", sex_category="Women"
)
model_rdd_bundestag_men = test_rdd_significance(
    df_bundestag_men, system_name="German Bundestag", sex_category="Men"
)
model_rdd_bundestag_women = test_rdd_significance(
    df_bundestag_women, system_name="German Bundestag", sex_category="Women"
)


In [None]:
# Collapse party labels into two blocs; any label not listed becomes 'other'
party_mapping = {
    'PES': 'left',
    'EGP': 'left',
    'EPP': 'cons',
    'ALDE': 'cons'
}

# assign() returns new frames, so the new column is not written into a frame
# that may be a filtered slice of another one (avoids SettingWithCopyWarning)
df_commission_rdd = df_commission_rdd.assign(
    party_group=df_commission_rdd['party'].map(party_mapping).fillna('other')
)
df_bundestag_rdd = df_bundestag_rdd.assign(
    party_group=df_bundestag_rdd['party'].map(party_mapping).fillna('other')
)

# Check new value distribution
print("\nNew Party Group Distribution (Commission):\n", df_commission_rdd['party_group'].value_counts())
print("\nNew Party Group Distribution (Bundestag):\n", df_bundestag_rdd['party_group'].value_counts())


In [None]:
import statsmodels.formula.api as smf

def test_rdd_significance_with_party(df, system_name, sex_category):
    df = df.copy()
    df['post_event'] = (df['time_to_event'] >= 0).astype(int)

    # Fit logistic regression model using 'party_group'
    model = smf.logit("claimbinary ~ time_to_event + post_event + time_to_event:post_event + w_top_tot + length + C(party_group)", data=df).fit()

    print(f"\nRDD Discontinuity Test (Using 'party_group') - {system_name} ({sex_category}):")
    print(model.summary())

    return model

# Discontinuity models with the grouped party control, per institution × sex.
# These reuse the model_rdd_* names, replacing any earlier results.
model_rdd_commission_men = test_rdd_significance_with_party(
    df_commission_rdd.loc[df_commission_rdd['sex'].eq('male')],
    system_name="European Commission", sex_category="Men",
)
model_rdd_commission_women = test_rdd_significance_with_party(
    df_commission_rdd.loc[df_commission_rdd['sex'].eq('female')],
    system_name="European Commission", sex_category="Women",
)

model_rdd_bundestag_men = test_rdd_significance_with_party(
    df_bundestag_rdd.loc[df_bundestag_rdd['sex'].eq('male')],
    system_name="German Bundestag", sex_category="Men",
)
model_rdd_bundestag_women = test_rdd_significance_with_party(
    df_bundestag_rdd.loc[df_bundestag_rdd['sex'].eq('female')],
    system_name="German Bundestag", sex_category="Women",
)


In [None]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from itertools import combinations
from scipy.spatial.distance import euclidean

# Calendar year of each row plus the start year of its 10-year and 5-year bin
# (e.g. 2017 -> 2010 and 2015). The `date` column must already be datetime.
# assign() builds new frames rather than writing columns into frames that may
# be filtered slices (avoids SettingWithCopyWarning); later keyword arguments
# can use the columns created by earlier ones.
df_commission_rdd = df_commission_rdd.assign(
    year=lambda d: d["date"].dt.year,
    year_bin_10=lambda d: (d["year"] // 10) * 10,
    year_bin_5=lambda d: (d["year"] // 5) * 5,
)
df_bundestag_rdd = df_bundestag_rdd.assign(
    year=lambda d: d["date"].dt.year,
    year_bin_10=lambda d: (d["year"] // 10) * 10,
    year_bin_5=lambda d: (d["year"] // 5) * 5,
)

# Check bins
print("Unique 10-year bins (Commission):", df_commission_rdd["year_bin_10"].unique())
print("Unique 10-year bins (Bundestag):", df_bundestag_rdd["year_bin_10"].unique())


In [None]:
# List the columns of each institution frame
for _frame_name, _frame in (
    ("df_commission_rdd", df_commission_rdd),
    ("df_bundestag_rdd", df_bundestag_rdd),
):
    print(f"Columns in {_frame_name}:", _frame.columns)


In [None]:
import pandas as pd

# Read both institution files from Google Drive
df_commission_rdd = pd.read_csv("/content/drive/My Drive/final_data_cleaned.csv")
df_bundestag_rdd = pd.read_csv("/content/drive/My Drive/germanoptimized_speeches_with_topics_updated.csv")

# Parse `date`; values that cannot be parsed become NaT
df_commission_rdd = df_commission_rdd.assign(
    date=pd.to_datetime(df_commission_rdd['date'], errors='coerce')
)
df_bundestag_rdd = df_bundestag_rdd.assign(
    date=pd.to_datetime(df_bundestag_rdd['date'], errors='coerce')
)

# Confirm loading
print("\n Datasets reloaded successfully!")


In [None]:
# Event dates that `time_to_event` is measured from
event_commission = pd.Timestamp("2019-12-01")
event_bundestag = pd.Timestamp("2005-11-01")


def _dates_and_bins_from_offsets(frame, event_date):
    """Return ``frame`` with ``date`` rebuilt as event_date + time_to_event days, plus year bins."""
    return frame.assign(
        date=lambda d: event_date + pd.to_timedelta(d["time_to_event"], unit="D"),
        year=lambda d: d["date"].dt.year,
        year_bin_10=lambda d: (d["year"] // 10) * 10,
        year_bin_5=lambda d: (d["year"] // 5) * 5,
    )


df_commission_rdd = _dates_and_bins_from_offsets(df_commission_rdd, event_commission)
df_bundestag_rdd = _dates_and_bins_from_offsets(df_bundestag_rdd, event_bundestag)

# Show the first rows of both frames to check the derived columns
(
    df_commission_rdd[["time_to_event", "date", "year", "year_bin_10", "year_bin_5"]].head(),
    df_bundestag_rdd[["time_to_event", "date", "year", "year_bin_10", "year_bin_5"]].head(),
)


In [None]:
# Column inventory for both institution frames
_commission_columns = df_commission_rdd.columns
_bundestag_columns = df_bundestag_rdd.columns
print("Columns in df_commission_rdd:", _commission_columns)
print("Columns in df_bundestag_rdd:", _bundestag_columns)


In [None]:
# Event dates that `time_to_event` is measured from
event_commission = pd.Timestamp("2019-12-01")
event_bundestag = pd.Timestamp("2005-11-01")

# Parse `date` (unparseable values become NaT), then count whole days from the
# event: negative before it, positive after it
df_commission_rdd = df_commission_rdd.assign(
    date=lambda d: pd.to_datetime(d["date"], errors='coerce'),
    time_to_event=lambda d: (d["date"] - event_commission).dt.days,
)
df_bundestag_rdd = df_bundestag_rdd.assign(
    date=lambda d: pd.to_datetime(d["date"], errors='coerce'),
    time_to_event=lambda d: (d["date"] - event_bundestag).dt.days,
)

# Show the first rows of both frames to check the result
(
    df_commission_rdd[["date", "time_to_event"]].head(),
    df_bundestag_rdd[["date", "time_to_event"]].head(),
)


In [None]:
def _add_year_bins(frame):
    """Return ``frame`` with ``year`` and the start years of its 10- and 5-year bins."""
    calendar_year = frame["date"].dt.year
    return frame.assign(
        year=calendar_year,
        year_bin_10=(calendar_year // 10) * 10,
        year_bin_5=(calendar_year // 5) * 5,
    )


# Same derivation for both institutions
df_commission_rdd = _add_year_bins(df_commission_rdd)
df_bundestag_rdd = _add_year_bins(df_bundestag_rdd)

# Show the first rows of both frames to check the result
(
    df_commission_rdd[["date", "time_to_event", "year", "year_bin_10", "year_bin_5"]].head(),
    df_bundestag_rdd[["date", "time_to_event", "year", "year_bin_10", "year_bin_5"]].head(),
)


In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from itertools import combinations
from scipy.spatial.distance import euclidean

# The Commission dataset labels its party column "European Party_speakerinfo"
# (with a space, not a dot) and its sex column "Sex_speakerinfo"; both are
# renamed to the short names used in the regression formulas below.
rename_dict = {
    'European Party_speakerinfo': 'party',   # Note: space instead of dot
    'Sex_speakerinfo': 'sex'
}
if "European Party_speakerinfo" in df_commission_rdd.columns:
    df_commission_rdd.rename(columns=rename_dict, inplace=True)
elif "party" not in df_commission_rdd.columns:
    # Only report a problem when neither the raw nor the already-renamed column
    # exists: the rename is in place, so re-running this cell is not an error.
    print("Error: Cannot find the column 'European Party_speakerinfo' in the Commission dataset.")

# For the Bundestag dataset, check if 'party' exists; if not, rename if possible (adjust if needed)
if "party" not in df_bundestag_rdd.columns:
    if "European Party_speakerinfo" in df_bundestag_rdd.columns:
        df_bundestag_rdd.rename(columns={'European Party_speakerinfo': 'party'}, inplace=True)
    else:
        print("Warning: 'party' column not found in the Bundestag dataset. Please verify the column name.")

# Recode party labels: collapse footnoted / variant spellings onto one label per party
party_mapping = {
    'PES': 'PES',
    'PES[34]': 'PES',
    'PES\xa0/': 'PES',  # Handles special character issues (non-breaking space)
    'ALDE': 'ALDE',
    'ALDE[23]': 'ALDE',
    'ALDE[25]': 'ALDE',
    'ALDE[5]': 'ALDE',
    'ALDE[32]': 'ALDE',
    'ALDE[38]': 'ALDE',
    'EPP': 'EPP',
    'csu': 'EPP',       # Map CSU to EPP
    'EGP': 'EGP',
    'ECR': 'ECR',
    'ECR[41]': 'ECR'
}

# Both recodes are guarded so that a missing 'party' column shows up as the
# message printed above instead of a KeyError that aborts the cell.
if "party" in df_commission_rdd.columns:
    df_commission_rdd["party"] = df_commission_rdd["party"].replace(party_mapping)
if "party" in df_bundestag_rdd.columns:
    df_bundestag_rdd["party"] = df_bundestag_rdd["party"].replace(party_mapping)



# Ensure 'date' is in datetime format (errors='coerce' will turn invalid dates to NaT)
df_commission_rdd["date"] = pd.to_datetime(df_commission_rdd["date"], errors="coerce")
df_bundestag_rdd["date"] = pd.to_datetime(df_bundestag_rdd["date"], errors="coerce")

# Define event dates:
# - European Commission event: December 1, 2019
# - German Bundestag event: November 1, 2005
event_commission = pd.Timestamp("2019-12-01")
event_bundestag = pd.Timestamp("2005-11-01")

# Calculate time_to_event (signed number of days from the event; NaT dates give NaN)
df_commission_rdd["time_to_event"] = (df_commission_rdd["date"] - event_commission).dt.days
df_bundestag_rdd["time_to_event"] = (df_bundestag_rdd["date"] - event_bundestag).dt.days

# Extract year from date
df_commission_rdd["year"] = df_commission_rdd["date"].dt.year
df_bundestag_rdd["year"] = df_bundestag_rdd["date"].dt.year

# Create 10-year bins and 5-year bins (year floored to a multiple of 10 / 5)
df_commission_rdd["year_bin_10"] = (df_commission_rdd["year"] // 10) * 10
df_bundestag_rdd["year_bin_10"] = (df_bundestag_rdd["year"] // 10) * 10

df_commission_rdd["year_bin_5"] = (df_commission_rdd["year"] // 5) * 5
df_bundestag_rdd["year_bin_5"] = (df_bundestag_rdd["year"] // 5) * 5

# Optional: Verify the new variables
print(df_commission_rdd[["date", "time_to_event", "year", "year_bin_10", "year_bin_5"]].head())
print(df_bundestag_rdd[["date", "time_to_event", "year", "year_bin_10", "year_bin_5"]].head())



# For analysis, work on copies so the *_rdd frames are not touched by later filtering:
df_commission = df_commission_rdd.copy()
df_bundestag = df_bundestag_rdd.copy()

def run_logistic_by_time(df, time_var, institution,
                         formula="claimbinary ~ C(sex) + C(party) + length + w_top_tot",
                         min_obs=10):
    """Fit one logistic regression per time bin.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``time_var`` and every column named in ``formula``.
    time_var : str
        Column holding the bin label (e.g. "year_bin_10").
    institution : str
        Label used only in the progress messages.
    formula : str, optional
        Formula handed to ``smf.logit``; defaults to the original specification.
    min_obs : int, optional
        A bin is fitted only when it has more than this many rows.

    Returns
    -------
    dict
        Maps each successfully fitted bin label to its fitted result.
    """
    results = {}
    # Drop NaN labels (rows whose date could not be parsed): NaN breaks the
    # ordering produced by sorted() and only ever yields an empty subset.
    for time_period in sorted(df[time_var].dropna().unique()):
        subset = df[df[time_var] == time_period]
        if subset.shape[0] > min_obs:  # Only run if enough observations
            try:
                model = smf.logit(formula, data=subset).fit(disp=False)
                results[time_period] = model
            except Exception as e:
                # One bin that cannot be fitted should not abort the others.
                print(f"Error in {institution} {time_period}: {e}. Skipping this time bin.")
        else:
            print(f"Skipping {institution} {time_period} due to insufficient data.")
    print(f"Logistic regressions completed for {institution} ({time_var}).")
    return results

# Fit the per-bin logit models: decade and half-decade bins for each institution
models_commission_10 = run_logistic_by_time(
    df=df_commission, time_var="year_bin_10", institution="European Commission"
)
models_commission_5 = run_logistic_by_time(
    df=df_commission, time_var="year_bin_5", institution="European Commission"
)
models_bundestag_10 = run_logistic_by_time(
    df=df_bundestag, time_var="year_bin_10", institution="German Bundestag"
)
models_bundestag_5 = run_logistic_by_time(
    df=df_bundestag, time_var="year_bin_5", institution="German Bundestag"
)



def wald_test_over_time(models, institution):
    """Compare fitted coefficients between every pair of time bins.

    For each pair of models the statistic is the sum, over the parameters both
    models share, of ``(b1 - b2)**2 / (se1**2 + se2**2)``. It is referred to a
    chi-square distribution with one degree of freedom per shared parameter.
    Only standard errors enter the statistic, so covariances between
    coefficients are not accounted for.

    Parameters
    ----------
    models : dict
        Maps a time-bin label to a fitted result exposing ``params`` and
        ``bse`` as pandas Series indexed by parameter name.
    institution : str
        Label used only in the completion message.

    Returns
    -------
    dict
        Maps ``(t1, t2)`` to the upper-tail p-value (small = coefficients
        differ). Pairs without any shared parameter are omitted.
    """
    # Imported locally: this cell's import block does not bring in scipy.stats,
    # and statsmodels' `sm.stats` does not appear to expose `chi2`.
    from scipy.stats import chi2

    test_results = {}
    for t1, t2 in combinations(models.keys(), 2):
        model1, model2 = models[t1], models[t2]
        common_params = model1.params.index.intersection(model2.params.index)
        if len(common_params) == 0:
            # Nothing to compare; a chi-square with 0 df would give NaN.
            continue
        diff = model1.params[common_params] - model2.params[common_params]
        var_diff = model1.bse[common_params] ** 2 + model2.bse[common_params] ** 2
        wald_stat = ((diff ** 2) / var_diff).sum(skipna=False)
        # The survival function IS the p-value; the former `1 - sf(...)`
        # returned the CDF, i.e. one minus the p-value.
        test_results[(t1, t2)] = chi2.sf(wald_stat, df=len(common_params))
    print(f"Wald tests completed for {institution}.")
    return test_results

# Pairwise coefficient comparisons across time bins, per institution and bin width
wald_commission_10 = wald_test_over_time(
    models=models_commission_10, institution="European Commission"
)
wald_commission_5 = wald_test_over_time(
    models=models_commission_5, institution="European Commission"
)
wald_bundestag_10 = wald_test_over_time(
    models=models_bundestag_10, institution="German Bundestag"
)
wald_bundestag_5 = wald_test_over_time(
    models=models_bundestag_5, institution="German Bundestag"
)



def find_most_similar_periods(models_commission, models_bundestag):
    """Match every Commission time bin to its closest Bundestag time bin.

    Closeness is the Euclidean distance between the two coefficient vectors,
    restricted to the parameter names both fitted models share. A Commission
    bin with no comparable Bundestag model is mapped to ``None``. Ties keep the
    Bundestag bin encountered first.

    Returns a dict ``{commission_bin: bundestag_bin_or_None}``.
    """
    matches = {}
    for period_c, fit_c in models_commission.items():
        closest, closest_dist = None, float("inf")
        for period_b, fit_b in models_bundestag.items():
            shared = list(set(fit_c.params.index) & set(fit_b.params.index))
            if not shared:
                # No overlapping coefficients, so no distance can be computed
                continue
            dist = euclidean(fit_c.params[shared], fit_b.params[shared])
            if dist < closest_dist:
                closest, closest_dist = period_b, dist
        matches[period_c] = closest
    print("Similarity analysis completed between European Commission and German Bundestag.")
    return matches

# Match Commission bins to Bundestag bins, once per bin width
similar_periods_10 = find_most_similar_periods(
    models_commission=models_commission_10, models_bundestag=models_bundestag_10
)
similar_periods_5 = find_most_similar_periods(
    models_commission=models_commission_5, models_bundestag=models_bundestag_5
)

# Report the matches
print(
    "Most similar time periods (10-year bins):",
    similar_periods_10,
)
print(
    "Most similar time periods (5-year bins):",
    similar_periods_5,
)


In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from itertools import combinations
from scipy.spatial.distance import euclidean
from scipy.stats import chi2


# For the European Commission dataset, rename the raw party / sex columns to the
# short names used in the regression formula. The rename is in place, so on a
# re-run the raw name is gone and only the note below is printed.
if "European Party_speakerinfo" in df_commission_rdd.columns:
    df_commission_rdd.rename(columns={'European Party_speakerinfo': 'party',
                                       'Sex_speakerinfo': 'sex'}, inplace=True)
else:
    print("Note: 'European Party_speakerinfo' not found – using existing 'party' column in Commission dataset.")

# For the Bundestag dataset, if 'party' is missing, try renaming
if "party" not in df_bundestag_rdd.columns:
    if "European Party_speakerinfo" in df_bundestag_rdd.columns:
        df_bundestag_rdd.rename(columns={'European Party_speakerinfo': 'party'}, inplace=True)
    else:
        print("Warning: 'party' column not found in Bundestag dataset.")

# Recode party labels: collapse footnoted / variant spellings onto one label per party
party_mapping = {
    'PES': 'PES',
    'PES[34]': 'PES',
    'PES\xa0/': 'PES',  # Handles special character issues (non-breaking space)
    'ALDE': 'ALDE',
    'ALDE[23]': 'ALDE',
    'ALDE[25]': 'ALDE',
    'ALDE[5]': 'ALDE',
    'ALDE[32]': 'ALDE',
    'ALDE[38]': 'ALDE',
    'EPP': 'EPP',
    'csu': 'EPP',       # Map CSU to EPP
    'EGP': 'EGP',
    'ECR': 'ECR',
    'ECR[41]': 'ECR'
}
if "party" in df_commission_rdd.columns:
    df_commission_rdd["party"] = df_commission_rdd["party"].replace(party_mapping)
if "party" in df_bundestag_rdd.columns:
    df_bundestag_rdd["party"] = df_bundestag_rdd["party"].replace(party_mapping)


# Ensure 'date' is datetime (invalid entries become NaT)
df_commission_rdd["date"] = pd.to_datetime(df_commission_rdd["date"], errors="coerce")
df_bundestag_rdd["date"] = pd.to_datetime(df_bundestag_rdd["date"], errors="coerce")

# Define event dates:
#   - European Commission event: December 1, 2019
#   - German Bundestag event: November 1, 2005
event_commission = pd.Timestamp("2019-12-01")
event_bundestag = pd.Timestamp("2005-11-01")

# Calculate time_to_event (signed days from event)
df_commission_rdd["time_to_event"] = (df_commission_rdd["date"] - event_commission).dt.days
df_bundestag_rdd["time_to_event"] = (df_bundestag_rdd["date"] - event_bundestag).dt.days

# Extract year from date
df_commission_rdd["year"] = df_commission_rdd["date"].dt.year
df_bundestag_rdd["year"] = df_bundestag_rdd["date"].dt.year

# Create 10-year and 5-year bins (year floored to a multiple of 10 / 5)
df_commission_rdd["year_bin_10"] = (df_commission_rdd["year"] // 10) * 10
df_bundestag_rdd["year_bin_10"] = (df_bundestag_rdd["year"] // 10) * 10

df_commission_rdd["year_bin_5"] = (df_commission_rdd["year"] // 5) * 5
df_bundestag_rdd["year_bin_5"] = (df_bundestag_rdd["year"] // 5) * 5

# Verify the new variables
print("Commission dataset sample:")
print(df_commission_rdd[["date", "time_to_event", "year", "year_bin_10", "year_bin_5"]].head())
print("\nBundestag dataset sample:")
print(df_bundestag_rdd[["date", "time_to_event", "year", "year_bin_10", "year_bin_5"]].head())

# Work on copies so the filtering below leaves the *_rdd frames untouched.
df_commission = df_commission_rdd.copy()
df_bundestag = df_bundestag_rdd.copy()

# Ensure the dependent variable 'claimbinary' is numeric (non-numeric entries become NaN).
# Both frames are treated the same way: the former `.squeeze()` on the Bundestag
# column did nothing for a Series except collapse a one-row column to a scalar.
df_commission["claimbinary"] = pd.to_numeric(df_commission["claimbinary"], errors="coerce")
df_bundestag["claimbinary"] = pd.to_numeric(df_bundestag["claimbinary"], errors="coerce")

# Keep only rows where claimbinary is 0 or 1 (this also drops the NaN rows)
df_commission = df_commission[df_commission["claimbinary"].isin([0, 1])]
df_bundestag = df_bundestag[df_bundestag["claimbinary"].isin([0, 1])]


def run_logistic_by_time(df, time_var, institution,
                         formula="claimbinary ~ C(sex) + C(party) + length + w_top_tot",
                         min_obs=10):
    """Fit one logistic regression per time bin.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``time_var`` and every column named in ``formula``.
    time_var : str
        Column holding the bin label (e.g. "year_bin_10").
    institution : str
        Label used only in the progress messages.
    formula : str, optional
        Formula handed to ``smf.logit``; defaults to the original specification.
    min_obs : int, optional
        A bin is fitted only when it has more than this many rows.

    Returns
    -------
    dict
        Maps each successfully fitted bin label to its fitted result. Bins
        whose fit raises are reported and left out.
    """
    results = {}
    # Drop NaN labels (rows whose date could not be parsed): NaN breaks the
    # ordering produced by sorted() and only ever yields an empty subset.
    for time_period in sorted(df[time_var].dropna().unique()):
        subset = df[df[time_var] == time_period]
        if subset.shape[0] > min_obs:
            try:
                model = smf.logit(formula, data=subset).fit(disp=False)
                results[time_period] = model
            except Exception as e:
                # One bin that cannot be fitted should not abort the others.
                print(f"Error in {institution} {time_period}: {e}. Skipping this time bin.")
        else:
            print(f"Skipping {institution} {time_period} due to insufficient data.")
    print(f"Logistic regressions completed for {institution} ({time_var}).")
    return results

# Fit the per-bin logit models: decade and half-decade bins for each institution
models_commission_10 = run_logistic_by_time(
    df=df_commission, time_var="year_bin_10", institution="European Commission"
)
models_commission_5 = run_logistic_by_time(
    df=df_commission, time_var="year_bin_5", institution="European Commission"
)
models_bundestag_10 = run_logistic_by_time(
    df=df_bundestag, time_var="year_bin_10", institution="German Bundestag"
)
models_bundestag_5 = run_logistic_by_time(
    df=df_bundestag, time_var="year_bin_5", institution="German Bundestag"
)


def wald_test_over_time(models, institution):
    """Compare fitted coefficients between every pair of time bins.

    For each pair of models the statistic is the sum, over the parameters both
    models share, of ``(b1 - b2)**2 / (se1**2 + se2**2)``. It is referred to a
    chi-square distribution with one degree of freedom per shared parameter.
    Only standard errors enter the statistic, so covariances between
    coefficients are not accounted for.

    Parameters
    ----------
    models : dict
        Maps a time-bin label to a fitted result exposing ``params`` and
        ``bse`` as pandas Series indexed by parameter name.
    institution : str
        Label used only in the completion message.

    Returns
    -------
    dict
        Maps ``(t1, t2)`` to the upper-tail p-value (small = coefficients
        differ). Pairs without any shared parameter are omitted.
    """
    test_results = {}
    for t1, t2 in combinations(models.keys(), 2):
        model1, model2 = models[t1], models[t2]
        common_params = model1.params.index.intersection(model2.params.index)
        if len(common_params) == 0:
            continue
        diff = model1.params[common_params] - model2.params[common_params]
        var_diff = model1.bse[common_params] ** 2 + model2.bse[common_params] ** 2
        wald_stat = ((diff ** 2) / var_diff).sum(skipna=False)
        # The survival function IS the p-value; the former `1 - chi2.sf(...)`
        # returned the CDF, i.e. one minus the p-value.
        test_results[(t1, t2)] = chi2.sf(wald_stat, df=len(common_params))
    print(f"Wald tests completed for {institution}.")
    return test_results

# Pairwise coefficient comparisons across time bins, per institution and bin width
wald_commission_10 = wald_test_over_time(
    models=models_commission_10, institution="European Commission"
)
wald_commission_5 = wald_test_over_time(
    models=models_commission_5, institution="European Commission"
)
wald_bundestag_10 = wald_test_over_time(
    models=models_bundestag_10, institution="German Bundestag"
)
wald_bundestag_5 = wald_test_over_time(
    models=models_bundestag_5, institution="German Bundestag"
)


def find_most_similar_periods(models_commission, models_bundestag):
    """Match every Commission time bin to its closest Bundestag time bin.

    Closeness is the Euclidean distance between the two coefficient vectors,
    restricted to the parameter names both fitted models share. A Commission
    bin with no comparable Bundestag model is mapped to ``None``. Ties keep the
    Bundestag bin encountered first.

    Returns a dict ``{commission_bin: bundestag_bin_or_None}``.
    """
    matches = {}
    for period_c, fit_c in models_commission.items():
        closest, closest_dist = None, float("inf")
        for period_b, fit_b in models_bundestag.items():
            shared = list(set(fit_c.params.index) & set(fit_b.params.index))
            if not shared:
                # No overlapping coefficients, so no distance can be computed
                continue
            dist = euclidean(fit_c.params[shared], fit_b.params[shared])
            if dist < closest_dist:
                closest, closest_dist = period_b, dist
        matches[period_c] = closest
    print("Similarity analysis completed between European Commission and German Bundestag.")
    return matches

# Match Commission bins to Bundestag bins, once per bin width
similar_periods_10 = find_most_similar_periods(
    models_commission=models_commission_10, models_bundestag=models_bundestag_10
)
similar_periods_5 = find_most_similar_periods(
    models_commission=models_commission_5, models_bundestag=models_bundestag_5
)

# Report the matches
print(
    "Most similar time periods (10-year bins):",
    similar_periods_10,
)
print(
    "Most similar time periods (5-year bins):",
    similar_periods_5,
)


In [None]:
# Diagnose the Bundestag outcome column: report its dtype, then preview its values
print(
    "Claimbinary (Bundestag) dtype:",
    df_bundestag["claimbinary"].dtype,
)
print("First few values of claimbinary (Bundestag):")
print(df_bundestag["claimbinary"].head())

# Dump every column name so a duplicated 'claimbinary' column would be visible
print("All columns in Bundestag dataset:")
print(df_bundestag.columns.to_list())


In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from itertools import combinations
from scipy.spatial.distance import euclidean
from scipy.stats import chi2


# For Commission dataset, rename "European Party_speakerinfo" if it exists; otherwise, use the existing 'party' column.
# The rename is in place, so on a re-run only the note below is printed.
if "European Party_speakerinfo" in df_commission_rdd.columns:
    df_commission_rdd.rename(columns={'European Party_speakerinfo': 'party',
                                      'Sex_speakerinfo': 'sex'}, inplace=True)
else:
    print("Note: 'European Party_speakerinfo' not found – using existing 'party' column in Commission dataset.")

# For Bundestag dataset, if 'party' is missing, try renaming (adjust if needed)
if "party" not in df_bundestag_rdd.columns:
    if "European Party_speakerinfo" in df_bundestag_rdd.columns:
        df_bundestag_rdd.rename(columns={'European Party_speakerinfo': 'party'}, inplace=True)
    else:
        print("Warning: 'party' column not found in Bundestag dataset.")

# Recode party labels: collapse footnoted / variant spellings onto one label per party
party_mapping = {
    'PES': 'PES',
    'PES[34]': 'PES',
    'PES\xa0/': 'PES',  # Handles special character issues (non-breaking space)
    'ALDE': 'ALDE',
    'ALDE[23]': 'ALDE',
    'ALDE[25]': 'ALDE',
    'ALDE[5]': 'ALDE',
    'ALDE[32]': 'ALDE',
    'ALDE[38]': 'ALDE',
    'EPP': 'EPP',
    'csu': 'EPP',       # Map CSU to EPP
    'EGP': 'EGP',
    'ECR': 'ECR',
    'ECR[41]': 'ECR'
}
if "party" in df_commission_rdd.columns:
    df_commission_rdd["party"] = df_commission_rdd["party"].replace(party_mapping)
if "party" in df_bundestag_rdd.columns:
    df_bundestag_rdd["party"] = df_bundestag_rdd["party"].replace(party_mapping)


# Ensure 'date' is in datetime format (invalid entries become NaT)
df_commission_rdd["date"] = pd.to_datetime(df_commission_rdd["date"], errors="coerce")
df_bundestag_rdd["date"] = pd.to_datetime(df_bundestag_rdd["date"], errors="coerce")

# Define event dates:
#   - European Commission event: December 1, 2019
#   - German Bundestag event: November 1, 2005
event_commission = pd.Timestamp("2019-12-01")
event_bundestag = pd.Timestamp("2005-11-01")

# Calculate time_to_event (signed, in days)
df_commission_rdd["time_to_event"] = (df_commission_rdd["date"] - event_commission).dt.days
df_bundestag_rdd["time_to_event"] = (df_bundestag_rdd["date"] - event_bundestag).dt.days

# Extract year from the date
df_commission_rdd["year"] = df_commission_rdd["date"].dt.year
df_bundestag_rdd["year"] = df_bundestag_rdd["date"].dt.year

# Create 10-year and 5-year bins (year floored to a multiple of 10 / 5)
df_commission_rdd["year_bin_10"] = (df_commission_rdd["year"] // 10) * 10
df_bundestag_rdd["year_bin_10"] = (df_bundestag_rdd["year"] // 10) * 10

df_commission_rdd["year_bin_5"] = (df_commission_rdd["year"] // 5) * 5
df_bundestag_rdd["year_bin_5"] = (df_bundestag_rdd["year"] // 5) * 5

# Optional: Verify the new variables
print("Commission dataset sample:")
print(df_commission_rdd[["date", "time_to_event", "year", "year_bin_10", "year_bin_5"]].head())
print("\nBundestag dataset sample:")
print(df_bundestag_rdd[["date", "time_to_event", "year", "year_bin_10", "year_bin_5"]].head())


# Define df_commission and df_bundestag as copies
df_commission = df_commission_rdd.copy()
df_bundestag = df_bundestag_rdd.copy()

# Convert the dependent variable 'claimbinary' to numeric (non-numeric entries become NaN)
df_commission["claimbinary"] = pd.to_numeric(df_commission["claimbinary"], errors="coerce")
df_bundestag["claimbinary"] = pd.to_numeric(df_bundestag["claimbinary"], errors="coerce")

# Keep only rows where claimbinary is exactly 0 or 1. This must happen BEFORE
# the integer cast: casting a column containing NaN raises, and truncation
# would otherwise let values such as 1.7 slip through the filter as 1.
df_commission = df_commission[df_commission["claimbinary"].isin([0, 1])].copy()
df_bundestag = df_bundestag[df_bundestag["claimbinary"].isin([0, 1])].copy()

# Now that only 0/1 remain, the cast to integer is safe
df_commission["claimbinary"] = df_commission["claimbinary"].astype(int)
df_bundestag["claimbinary"] = df_bundestag["claimbinary"].astype(int)


def run_logistic_by_time(df, time_var, institution,
                         formula="claimbinary ~ C(sex) + length + w_top_tot",
                         min_obs=10):
    """Fit one logistic regression per time bin.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``time_var`` and every column named in ``formula``.
    time_var : str
        Column holding the bin label (e.g. "year_bin_10").
    institution : str
        Label used only in the progress messages.
    formula : str, optional
        Formula handed to ``smf.logit``; the default omits the party term.
    min_obs : int, optional
        A bin is fitted only when it has more than this many rows.

    Returns
    -------
    dict
        Maps each successfully fitted bin label to its fitted result. Bins
        whose fit raises are reported and left out.
    """
    results = {}
    # Drop NaN labels (rows whose date could not be parsed): NaN breaks the
    # ordering produced by sorted() and only ever yields an empty subset.
    for time_period in sorted(df[time_var].dropna().unique()):
        subset = df[df[time_var] == time_period]
        if subset.shape[0] > min_obs:
            try:
                model = smf.logit(formula, data=subset).fit(disp=False)
                results[time_period] = model
            except Exception as e:
                # One bin that cannot be fitted should not abort the others.
                print(f"Error in {institution} {time_period}: {e}. Skipping this time bin.")
        else:
            print(f"Skipping {institution} {time_period} due to insufficient data.")
    print(f"Logistic regressions completed for {institution} ({time_var}).")
    return results

# Fit the per-bin logit models: decade and half-decade bins for each institution
models_commission_10 = run_logistic_by_time(
    df=df_commission, time_var="year_bin_10", institution="European Commission"
)
models_commission_5 = run_logistic_by_time(
    df=df_commission, time_var="year_bin_5", institution="European Commission"
)
models_bundestag_10 = run_logistic_by_time(
    df=df_bundestag, time_var="year_bin_10", institution="German Bundestag"
)
models_bundestag_5 = run_logistic_by_time(
    df=df_bundestag, time_var="year_bin_5", institution="German Bundestag"
)


def wald_test_over_time(models, institution):
    """Compare fitted coefficients between every pair of time bins.

    For each pair of models the statistic is the sum, over the parameters both
    models share, of ``(b1 - b2)**2 / (se1**2 + se2**2)``. It is referred to a
    chi-square distribution with one degree of freedom per shared parameter.
    Only standard errors enter the statistic, so covariances between
    coefficients are not accounted for.

    Parameters
    ----------
    models : dict
        Maps a time-bin label to a fitted result exposing ``params`` and
        ``bse`` as pandas Series indexed by parameter name.
    institution : str
        Label used only in the completion message.

    Returns
    -------
    dict
        Maps ``(t1, t2)`` to the upper-tail p-value (small = coefficients
        differ). Pairs without any shared parameter are omitted.
    """
    test_results = {}
    for t1, t2 in combinations(models.keys(), 2):
        model1, model2 = models[t1], models[t2]
        common_params = model1.params.index.intersection(model2.params.index)
        if len(common_params) == 0:
            continue
        diff = model1.params[common_params] - model2.params[common_params]
        var_diff = model1.bse[common_params] ** 2 + model2.bse[common_params] ** 2
        wald_stat = ((diff ** 2) / var_diff).sum(skipna=False)
        # The survival function IS the p-value; the former `1 - chi2.sf(...)`
        # returned the CDF, i.e. one minus the p-value.
        test_results[(t1, t2)] = chi2.sf(wald_stat, df=len(common_params))
    print(f"Wald tests completed for {institution}.")
    return test_results

# Pairwise coefficient comparisons across time bins, per institution and bin width
wald_commission_10 = wald_test_over_time(
    models=models_commission_10, institution="European Commission"
)
wald_commission_5 = wald_test_over_time(
    models=models_commission_5, institution="European Commission"
)
wald_bundestag_10 = wald_test_over_time(
    models=models_bundestag_10, institution="German Bundestag"
)
wald_bundestag_5 = wald_test_over_time(
    models=models_bundestag_5, institution="German Bundestag"
)


def find_most_similar_periods(models_commission, models_bundestag):
    """Match every Commission time bin to its closest Bundestag time bin.

    Closeness is the Euclidean distance between the two coefficient vectors,
    restricted to the parameter names both fitted models share. A Commission
    bin with no comparable Bundestag model is mapped to ``None``. Ties keep the
    Bundestag bin encountered first.

    Returns a dict ``{commission_bin: bundestag_bin_or_None}``.
    """
    matches = {}
    for period_c, fit_c in models_commission.items():
        closest, closest_dist = None, float("inf")
        for period_b, fit_b in models_bundestag.items():
            shared = list(set(fit_c.params.index) & set(fit_b.params.index))
            if not shared:
                # No overlapping coefficients, so no distance can be computed
                continue
            dist = euclidean(fit_c.params[shared], fit_b.params[shared])
            if dist < closest_dist:
                closest, closest_dist = period_b, dist
        matches[period_c] = closest
    print("Similarity analysis completed between European Commission and German Bundestag.")
    return matches

# Match Commission bins to Bundestag bins, once per bin width
similar_periods_10 = find_most_similar_periods(
    models_commission=models_commission_10, models_bundestag=models_bundestag_10
)
similar_periods_5 = find_most_similar_periods(
    models_commission=models_commission_5, models_bundestag=models_bundestag_5
)

# Report the matches
print(
    "Most similar time periods (10-year bins):",
    similar_periods_10,
)
print(
    "Most similar time periods (5-year bins):",
    similar_periods_5,
)


In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from itertools import combinations
from scipy.spatial.distance import euclidean
from scipy.stats import chi2


# For Commission dataset, rename "European Party_speakerinfo" if it exists; otherwise, use the existing 'party' column.
# The rename is in place, so on a re-run only the note below is printed.
if "European Party_speakerinfo" in df_commission_rdd.columns:
    df_commission_rdd.rename(columns={'European Party_speakerinfo': 'party',
                                      'Sex_speakerinfo': 'sex'}, inplace=True)
else:
    print("Note: 'European Party_speakerinfo' not found – using existing 'party' column in Commission dataset.")

# For Bundestag dataset, if 'party' is missing, try renaming (adjust if needed)
if "party" not in df_bundestag_rdd.columns:
    if "European Party_speakerinfo" in df_bundestag_rdd.columns:
        df_bundestag_rdd.rename(columns={'European Party_speakerinfo': 'party'}, inplace=True)
    else:
        print("Warning: 'party' column not found in Bundestag dataset.")

# Recode party labels: collapse footnoted / variant spellings onto one label per party
party_mapping = {
    'PES': 'PES',
    'PES[34]': 'PES',
    'PES\xa0/': 'PES',  # Handles special character issues (non-breaking space)
    'ALDE': 'ALDE',
    'ALDE[23]': 'ALDE',
    'ALDE[25]': 'ALDE',
    'ALDE[5]': 'ALDE',
    'ALDE[32]': 'ALDE',
    'ALDE[38]': 'ALDE',
    'EPP': 'EPP',
    'csu': 'EPP',       # Map CSU to EPP
    'EGP': 'EGP',
    'ECR': 'ECR',
    'ECR[41]': 'ECR'
}
if "party" in df_commission_rdd.columns:
    df_commission_rdd["party"] = df_commission_rdd["party"].replace(party_mapping)
if "party" in df_bundestag_rdd.columns:
    df_bundestag_rdd["party"] = df_bundestag_rdd["party"].replace(party_mapping)


# Ensure 'date' is in datetime format (invalid entries become NaT)
df_commission_rdd["date"] = pd.to_datetime(df_commission_rdd["date"], errors="coerce")
df_bundestag_rdd["date"] = pd.to_datetime(df_bundestag_rdd["date"], errors="coerce")

# Define event dates:
#   - European Commission event: December 1, 2019
#   - German Bundestag event: November 1, 2005
event_commission = pd.Timestamp("2019-12-01")
event_bundestag = pd.Timestamp("2005-11-01")

# Calculate time_to_event (signed, in days)
df_commission_rdd["time_to_event"] = (df_commission_rdd["date"] - event_commission).dt.days
df_bundestag_rdd["time_to_event"] = (df_bundestag_rdd["date"] - event_bundestag).dt.days

# Extract year from the date
df_commission_rdd["year"] = df_commission_rdd["date"].dt.year
df_bundestag_rdd["year"] = df_bundestag_rdd["date"].dt.year

# Create 2-year and 5-year bins (year floored to a multiple of 2 / 5)
df_commission_rdd["year_bin_2"] = (df_commission_rdd["year"] // 2) * 2
df_bundestag_rdd["year_bin_2"] = (df_bundestag_rdd["year"] // 2) * 2

df_commission_rdd["year_bin_5"] = (df_commission_rdd["year"] // 5) * 5
df_bundestag_rdd["year_bin_5"] = (df_bundestag_rdd["year"] // 5) * 5

# Optional: Verify the new variables
print("Commission dataset sample:")
print(df_commission_rdd[["date", "time_to_event", "year", "year_bin_2", "year_bin_5"]].head())
print("\nBundestag dataset sample:")
print(df_bundestag_rdd[["date", "time_to_event", "year", "year_bin_2", "year_bin_5"]].head())


# Define df_commission and df_bundestag as copies
df_commission = df_commission_rdd.copy()
df_bundestag = df_bundestag_rdd.copy()

# Convert the dependent variable 'claimbinary' to numeric (non-numeric entries become NaN)
df_commission["claimbinary"] = pd.to_numeric(df_commission["claimbinary"], errors="coerce")
df_bundestag["claimbinary"] = pd.to_numeric(df_bundestag["claimbinary"], errors="coerce")

# Keep only rows where claimbinary is exactly 0 or 1. This must happen BEFORE
# the integer cast: casting a column containing NaN raises, and truncation
# would otherwise let values such as 1.7 slip through the filter as 1.
df_commission = df_commission[df_commission["claimbinary"].isin([0, 1])].copy()
df_bundestag = df_bundestag[df_bundestag["claimbinary"].isin([0, 1])].copy()

# Now that only 0/1 remain, the cast to integer is safe
df_commission["claimbinary"] = df_commission["claimbinary"].astype(int)
df_bundestag["claimbinary"] = df_bundestag["claimbinary"].astype(int)


def run_logistic_by_time(df, time_var, institution,
                         formula="claimbinary ~ C(sex) + length + w_top_tot",
                         min_obs=10):
    """Fit one logistic regression per time bin.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``time_var`` and every column named in ``formula``.
    time_var : str
        Column holding the bin label (e.g. "year_bin_2").
    institution : str
        Label used only in the progress messages.
    formula : str, optional
        Formula handed to ``smf.logit``; the default omits the party term.
    min_obs : int, optional
        A bin is fitted only when it has more than this many rows.

    Returns
    -------
    dict
        Maps each successfully fitted bin label to its fitted result. Bins
        whose fit raises are reported and left out.
    """
    results = {}
    # Drop NaN labels (rows whose date could not be parsed): NaN breaks the
    # ordering produced by sorted() and only ever yields an empty subset.
    for time_period in sorted(df[time_var].dropna().unique()):
        subset = df[df[time_var] == time_period]
        if subset.shape[0] > min_obs:
            try:
                model = smf.logit(formula, data=subset).fit(disp=False)
                results[time_period] = model
            except Exception as e:
                # One bin that cannot be fitted should not abort the others.
                print(f"Error in {institution} {time_period}: {e}. Skipping this time bin.")
        else:
            print(f"Skipping {institution} {time_period} due to insufficient data.")
    print(f"Logistic regressions completed for {institution} ({time_var}).")
    return results

# Fit the per-bin logit models: 2-year and 5-year bins for each institution
models_commission_2 = run_logistic_by_time(
    df=df_commission, time_var="year_bin_2", institution="European Commission"
)
models_commission_5 = run_logistic_by_time(
    df=df_commission, time_var="year_bin_5", institution="European Commission"
)
models_bundestag_2 = run_logistic_by_time(
    df=df_bundestag, time_var="year_bin_2", institution="German Bundestag"
)
models_bundestag_5 = run_logistic_by_time(
    df=df_bundestag, time_var="year_bin_5", institution="German Bundestag"
)

def wald_test_over_time(models, institution):
    """Compare fitted coefficients between every pair of time bins.

    For each pair of models the statistic is the sum, over the parameters both
    models share, of ``(b1 - b2)**2 / (se1**2 + se2**2)``. It is referred to a
    chi-square distribution with one degree of freedom per shared parameter.
    Only standard errors enter the statistic, so covariances between
    coefficients are not accounted for.

    Parameters
    ----------
    models : dict
        Maps a time-bin label to a fitted result exposing ``params`` and
        ``bse`` as pandas Series indexed by parameter name.
    institution : str
        Label used only in the completion message.

    Returns
    -------
    dict
        Maps ``(t1, t2)`` to the upper-tail p-value (small = coefficients
        differ). Pairs without any shared parameter are omitted.
    """
    test_results = {}
    for t1, t2 in combinations(models.keys(), 2):
        model1, model2 = models[t1], models[t2]
        common_params = model1.params.index.intersection(model2.params.index)
        if len(common_params) == 0:
            continue
        diff = model1.params[common_params] - model2.params[common_params]
        var_diff = model1.bse[common_params] ** 2 + model2.bse[common_params] ** 2
        wald_stat = ((diff ** 2) / var_diff).sum(skipna=False)
        # The survival function IS the p-value; the former `1 - chi2.sf(...)`
        # returned the CDF, i.e. one minus the p-value.
        test_results[(t1, t2)] = chi2.sf(wald_stat, df=len(common_params))
    print(f"Wald tests completed for {institution}.")
    return test_results

# Pairwise coefficient comparisons across time bins, per institution and bin width
wald_commission_2 = wald_test_over_time(
    models=models_commission_2, institution="European Commission"
)
wald_commission_5 = wald_test_over_time(
    models=models_commission_5, institution="European Commission"
)
wald_bundestag_2 = wald_test_over_time(
    models=models_bundestag_2, institution="German Bundestag"
)
wald_bundestag_5 = wald_test_over_time(
    models=models_bundestag_5, institution="German Bundestag"
)

def find_most_similar_periods(models_commission, models_bundestag):
    """Match every Commission time bin to its closest Bundestag time bin.

    Closeness is the Euclidean distance between the two coefficient vectors,
    restricted to the parameter names both fitted models share. A Commission
    bin with no comparable Bundestag model is mapped to ``None``. Ties keep the
    Bundestag bin encountered first.

    Returns a dict ``{commission_bin: bundestag_bin_or_None}``.
    """
    matches = {}
    for period_c, fit_c in models_commission.items():
        closest, closest_dist = None, float("inf")
        for period_b, fit_b in models_bundestag.items():
            shared = list(set(fit_c.params.index) & set(fit_b.params.index))
            if not shared:
                # No overlapping coefficients, so no distance can be computed
                continue
            dist = euclidean(fit_c.params[shared], fit_b.params[shared])
            if dist < closest_dist:
                closest, closest_dist = period_b, dist
        matches[period_c] = closest
    print("Similarity analysis completed between European Commission and German Bundestag.")
    return matches

# Match Commission bins to Bundestag bins, once per bin width
similar_periods_2 = find_most_similar_periods(
    models_commission=models_commission_2, models_bundestag=models_bundestag_2
)
similar_periods_5 = find_most_similar_periods(
    models_commission=models_commission_5, models_bundestag=models_bundestag_5
)

# Report the matches
print(
    "Most similar time periods (2-year bins):",
    similar_periods_2,
)
print(
    "Most similar time periods (5-year bins):",
    similar_periods_5,
)


In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.discrete.count_model import ZeroInflatedNegativeBinomialP
import patsy
from itertools import combinations
from scipy.spatial.distance import euclidean
from scipy.stats import chi2


# Commission frame: bring the raw party / sex column names to their short form.
# When the raw party column is absent, only a note is printed.
if "European Party_speakerinfo" not in df_commission_rdd.columns:
    print("Note: 'European Party_speakerinfo' not found – using existing 'party' column in Commission dataset.")
else:
    df_commission_rdd.rename(
        columns={'European Party_speakerinfo': 'party', 'Sex_speakerinfo': 'sex'},
        inplace=True,
    )

# Bundestag frame: only touched when it has no 'party' column yet.
if "party" not in df_bundestag_rdd.columns:
    if "European Party_speakerinfo" not in df_bundestag_rdd.columns:
        print("Warning: 'party' column not found in Bundestag dataset.")
    else:
        df_bundestag_rdd.rename(columns={'European Party_speakerinfo': 'party'}, inplace=True)

# Party recoding table, built from (canonical label, spellings seen in the data).
# 'PES\xa0/' carries a non-breaking space; 'csu' is folded into EPP.
party_mapping = {
    variant: canonical
    for canonical, variants in (
        ('PES', ('PES', 'PES[34]', 'PES\xa0/')),
        ('ALDE', ('ALDE', 'ALDE[23]', 'ALDE[25]', 'ALDE[5]', 'ALDE[32]', 'ALDE[38]')),
        ('EPP', ('EPP', 'csu')),
        ('EGP', ('EGP',)),
        ('ECR', ('ECR', 'ECR[41]')),
    )
    for variant in variants
}

if "party" in df_commission_rdd.columns:
    df_commission_rdd["party"] = df_commission_rdd["party"].replace(party_mapping)
if "party" in df_bundestag_rdd.columns:
    df_bundestag_rdd["party"] = df_bundestag_rdd["party"].replace(party_mapping)


# Parse 'date' into datetimes; entries that cannot be parsed become NaT
df_commission_rdd["date"] = pd.to_datetime(
    df_commission_rdd["date"], errors="coerce"
)
df_bundestag_rdd["date"] = pd.to_datetime(
    df_bundestag_rdd["date"], errors="coerce"
)

# Reference dates of the two events:
#   - European Commission: December 1, 2019
#   - German Bundestag:    November 1, 2005
event_commission = pd.Timestamp("2019-12-01")
event_bundestag = pd.Timestamp("2005-11-01")

# Signed distance to the event in whole days
df_commission_rdd["time_to_event"] = df_commission_rdd["date"].sub(event_commission).dt.days
df_bundestag_rdd["time_to_event"] = df_bundestag_rdd["date"].sub(event_bundestag).dt.days

# Calendar year of each row
df_commission_rdd["year"] = df_commission_rdd["date"].dt.year
df_bundestag_rdd["year"] = df_bundestag_rdd["date"].dt.year

# Decade and half-decade bins (year floored to a multiple of 10 / 5)
df_commission_rdd["year_bin_10"] = df_commission_rdd["year"].floordiv(10).mul(10)
df_bundestag_rdd["year_bin_10"] = df_bundestag_rdd["year"].floordiv(10).mul(10)

df_commission_rdd["year_bin_5"] = df_commission_rdd["year"].floordiv(5).mul(5)
df_bundestag_rdd["year_bin_5"] = df_bundestag_rdd["year"].floordiv(5).mul(5)

# Quick look at the derived columns
print("Commission dataset sample:")
print(df_commission_rdd[["date", "time_to_event", "year", "year_bin_10", "year_bin_5"]].head())
print("\nBundestag dataset sample:")
print(df_bundestag_rdd[["date", "time_to_event", "year", "year_bin_10", "year_bin_5"]].head())


# Analysis frames are copies, so the filtering below leaves the *_rdd frames alone
df_commission = df_commission_rdd.copy()
df_bundestag = df_bundestag_rdd.copy()

# Outcome 'claimcount' as a numeric column (non-numeric entries become NaN)
df_commission["claimcount"] = pd.to_numeric(df_commission["claimcount"], errors="coerce")
df_bundestag["claimcount"] = pd.to_numeric(df_bundestag["claimcount"], errors="coerce")

# Discard rows whose claimcount is missing
df_commission = df_commission[df_commission["claimcount"].notna()]
df_bundestag = df_bundestag[df_bundestag["claimcount"].notna()]


# use patsy to create design matrices and run ZINB regressions using ZeroInflatedNegativeBinomialP.

from statsmodels.discrete.count_model import ZeroInflatedNegativeBinomialP
import patsy

def run_zinb_by_time(df, time_var, institution):
    """Fit one zero-inflated negative binomial model per value of ``time_var``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data providing ``time_var`` plus the columns named in the formula
        (claimcount, sex, length, w_top_tot).
    time_var : str
        Column whose distinct values define the time bins.
    institution : str
        Label used only in the progress messages.

    Returns
    -------
    dict
        Maps each time-bin value to its fitted result. Bins with 10 or fewer
        rows, and bins whose fit raised an exception, are reported via
        ``print`` and left out of the dictionary.
    """
    # Count-model specification; the inflation part gets a constant only.
    formula = "claimcount ~ C(sex) + length + w_top_tot"
    fitted_by_bin = {}

    for time_period in sorted(df[time_var].unique()):
        rows = df[df[time_var] == time_period]

        if rows.shape[0] <= 10:
            print(f"Skipping {institution} {time_period} due to insufficient data.")
            continue

        try:
            endog, exog = patsy.dmatrices(formula, data=rows, return_type="dataframe")
            # Index taken from the design matrix so both inputs stay row-aligned.
            infl_exog = pd.DataFrame({"const": 1}, index=exog.index)
            zinb = ZeroInflatedNegativeBinomialP(
                endog, exog, exog_infl=infl_exog, inflation="logit"
            )
            fitted_by_bin[time_period] = zinb.fit(disp=0)
        except Exception as e:
            # Best effort: a failing bin is reported and skipped, the loop goes on.
            print(f"ZINB Error in {institution} {time_period}: {e}. Skipping this time bin.")

    print(f"ZINB regressions completed for {institution} ({time_var}).")
    return fitted_by_bin

# Run ZINB regressions for 10-year and 5-year bins, separately per institution.
# Each result is a dict keyed by the bin's starting year; bins that
# run_zinb_by_time skipped (too few rows or a failed fit) are simply absent.
models_commission_zinb_10 = run_zinb_by_time(df_commission, "year_bin_10", "European Commission")
models_commission_zinb_5 = run_zinb_by_time(df_commission, "year_bin_5", "European Commission")
models_bundestag_zinb_10 = run_zinb_by_time(df_bundestag, "year_bin_10", "German Bundestag")
models_bundestag_zinb_5 = run_zinb_by_time(df_bundestag, "year_bin_5", "German Bundestag")


def extract_count_params(model):
    """Return the entries of ``model.params`` that belong to the count model.

    Every parameter whose name contains the substring "inflate" is dropped;
    all other entries are returned unchanged.
    """
    return model.params[~model.params.index.str.contains("inflate")]


def wald_test_over_time_zinb(models, institution):
    """Pairwise Wald tests for coefficient differences between time bins.

    For every unordered pair of keys in ``models`` the count-model
    coefficients shared by both fits are compared with the statistic
    ``sum((b1 - b2)**2 / (se1**2 + se2**2))``, referred to a chi-square
    distribution with one degree of freedom per shared coefficient.

    Only the standard errors are used, i.e. the coefficients are treated as
    uncorrelated within each model.

    Parameters
    ----------
    models : dict
        Maps a time-bin key to a fitted result exposing ``params`` and
        ``bse`` indexable by parameter name.
    institution : str
        Label used only in the completion message.

    Returns
    -------
    dict
        ``{(t1, t2): p_value}``. Pairs without shared coefficients are
        omitted. The value is NaN when any combined variance is not strictly
        positive (including NaN standard errors).
    """
    test_results = {}
    for t1, t2 in combinations(models.keys(), 2):
        model1, model2 = models[t1], models[t2]
        params1 = extract_count_params(model1)
        params2 = extract_count_params(model2)

        # Sorted so the selection order does not depend on set iteration order.
        common_params = sorted(set(params1.index) & set(params2.index))
        if not common_params:
            continue

        diff = params1[common_params] - params2[common_params]
        var_diff = model1.bse[common_params] ** 2 + model2.bse[common_params] ** 2

        # A zero or NaN variance makes the statistic undefined.
        if not (var_diff > 0).all():
            test_results[(t1, t2)] = float("nan")
            continue

        # skipna=False: a NaN coefficient must surface as NaN, not be dropped.
        wald_stat = float((diff ** 2 / var_diff).sum(skipna=False))

        # The p-value is the upper-tail probability, which is exactly chi2.sf;
        # "1 - chi2.sf" would be the CDF and invert the test's conclusion.
        test_results[(t1, t2)] = float(chi2.sf(wald_stat, df=len(common_params)))

    print(f"ZINB Wald tests completed for {institution}.")
    return test_results

# Pairwise Wald comparisons between time bins, per institution and bin width.
# Each result maps a (bin_a, bin_b) tuple to the value returned by
# wald_test_over_time_zinb for that pair of fitted ZINB models.
wald_commission_zinb_10 = wald_test_over_time_zinb(models_commission_zinb_10, "European Commission")
wald_commission_zinb_5 = wald_test_over_time_zinb(models_commission_zinb_5, "European Commission")
wald_bundestag_zinb_10 = wald_test_over_time_zinb(models_bundestag_zinb_10, "German Bundestag")
wald_bundestag_zinb_5 = wald_test_over_time_zinb(models_bundestag_zinb_5, "German Bundestag")

def find_most_similar_periods_zinb(models_commission, models_bundestag):
    """Match each Commission time bin to its closest Bundestag time bin.

    Closeness is the Euclidean distance between the count-model coefficients
    (as returned by ``extract_count_params``) that both fits share.

    Parameters
    ----------
    models_commission, models_bundestag : dict
        Map a time-bin key to a fitted result exposing ``params``.

    Returns
    -------
    dict
        ``{commission_bin: bundestag_bin}``. The value is ``None`` when no
        Bundestag model produced a distance below infinity (for example when
        there are no shared coefficients). On ties the Bundestag bin seen
        first in iteration order wins.
    """
    closest = {}
    for comm_time, comm_model in models_commission.items():
        comm_coefs = extract_count_params(comm_model)
        nearest_bin, nearest_dist = None, float("inf")

        for bund_time, bund_model in models_bundestag.items():
            bund_coefs = extract_count_params(bund_model)
            shared = list(set(comm_coefs.index) & set(bund_coefs.index))
            if not shared:
                continue
            gap = euclidean(comm_coefs[shared], bund_coefs[shared])
            # Strict comparison keeps the first candidate on ties.
            if gap < nearest_dist:
                nearest_bin, nearest_dist = bund_time, gap

        closest[comm_time] = nearest_bin

    print("ZINB similarity analysis completed between European Commission and German Bundestag.")
    return closest

# Nearest Bundestag bin for every Commission bin, once per bin width.
# A value of None means find_most_similar_periods_zinb found no candidate.
similar_periods_zinb_10 = find_most_similar_periods_zinb(models_commission_zinb_10, models_bundestag_zinb_10)
similar_periods_zinb_5 = find_most_similar_periods_zinb(models_commission_zinb_5, models_bundestag_zinb_5)

# Display the similarity results
print("ZINB Most similar time periods (10-year bins):", similar_periods_zinb_10)
print("ZINB Most similar time periods (5-year bins):", similar_periods_zinb_5)


In [None]:
from scipy.stats import chi2
from itertools import combinations

def wald_test_sex_effect(models, sex_param="C(sex)[T.1]"):
    """Pairwise Wald tests for a change in the sex coefficient between time bins.

    For every unordered pair of keys in ``models`` the statistic
    ``(b1 - b2)**2 / (se1**2 + se2**2)`` is referred to a chi-square
    distribution with one degree of freedom.

    Parameters
    ----------
    models : dict
        Maps a time-bin key to a fitted result exposing ``params`` and
        ``bse`` indexable by parameter name.
    sex_param : str
        Name of the sex coefficient to compare.

    Returns
    -------
    dict
        ``{(t1, t2): p_value}``. The value is ``None`` when ``sex_param`` is
        missing from either model, and NaN when the combined variance is not
        strictly positive (including NaN standard errors).
    """
    test_results = {}
    for t1, t2 in combinations(models.keys(), 2):
        model1, model2 = models[t1], models[t2]

        # The coefficient must exist in both fits to be comparable.
        if sex_param not in model1.params.index or sex_param not in model2.params.index:
            test_results[(t1, t2)] = None
            continue

        diff = model1.params[sex_param] - model2.params[sex_param]
        var_diff = model1.bse[sex_param] ** 2 + model2.bse[sex_param] ** 2

        # A zero or NaN variance makes the statistic undefined.
        if not var_diff > 0:
            test_results[(t1, t2)] = float("nan")
            continue

        wald_stat = diff ** 2 / var_diff
        # The p-value is the upper-tail probability, which is exactly chi2.sf;
        # "1 - chi2.sf" would be the CDF and invert the test's conclusion.
        test_results[(t1, t2)] = float(chi2.sf(wald_stat, df=1))
    return test_results

# For example, for the European Commission models:
# (If sex is coded as 0/1, the dummy for value 1 is likely "C(sex)[T.1]".)
# NOTE(review): models_commission_10/_5 and models_bundestag_10/_5 are not
# defined in this cell; presumably they come from an earlier cell - confirm
# the notebook still runs top to bottom on a fresh kernel.
wald_commission_sex_10 = wald_test_sex_effect(models_commission_10, sex_param="C(sex)[T.1]")
wald_commission_sex_5 = wald_test_sex_effect(models_commission_5, sex_param="C(sex)[T.1]")

# For the German Bundestag models:
# (If sex is boolean, the dummy might be "C(sex)[T.True]". Adjust as needed.)
# wald_test_sex_effect stores None for a pair when sex_param is missing from
# either model, so an all-None result points to a wrong parameter name.
wald_bundestag_sex_10 = wald_test_sex_effect(models_bundestag_10, sex_param="C(sex)[T.True]")
wald_bundestag_sex_5 = wald_test_sex_effect(models_bundestag_5, sex_param="C(sex)[T.True]")

print("Wald tests for sex effect (European Commission, 10-year bins):", wald_commission_sex_10)
print("Wald tests for sex effect (European Commission, 5-year bins):", wald_commission_sex_5)
print("Wald tests for sex effect (German Bundestag, 10-year bins):", wald_bundestag_sex_10)
print("Wald tests for sex effect (German Bundestag, 5-year bins):", wald_bundestag_sex_5)


In [None]:
from scipy.stats import chi2
from itertools import combinations

def compare_sex_effect_across_institutions(models_commission, models_bundestag,
                                           sex_param_comm="C(sex)[T.1]",
                                           sex_param_bund="C(sex)[T.True]"):
    """Wald test of the sex coefficient between institutions, per shared time bin.

    For each time bin present in both dictionaries the statistic
    ``(b_comm - b_bund)**2 / (se_comm**2 + se_bund**2)`` is referred to a
    chi-square distribution with one degree of freedom.

    Parameters
    ----------
    models_commission, models_bundestag : dict
        Map a time-bin key to a fitted result exposing ``params`` and ``bse``
        indexable by parameter name.
    sex_param_comm, sex_param_bund : str
        Name of the sex coefficient in the respective institution's models.

    Returns
    -------
    dict
        ``{time_bin: p_value}`` for the shared bins, in sorted key order.
        The value is ``None`` when a coefficient is missing from either
        model, and NaN when the combined variance is not strictly positive.
    """
    comparisons = {}
    common_bins = set(models_commission.keys()) & set(models_bundestag.keys())
    for time_bin in sorted(common_bins):
        mod_comm = models_commission[time_bin]
        mod_bund = models_bundestag[time_bin]

        if sex_param_comm not in mod_comm.params.index or sex_param_bund not in mod_bund.params.index:
            comparisons[time_bin] = None
            continue

        diff = mod_comm.params[sex_param_comm] - mod_bund.params[sex_param_bund]
        var_diff = mod_comm.bse[sex_param_comm] ** 2 + mod_bund.bse[sex_param_bund] ** 2

        # A zero or NaN variance makes the statistic undefined.
        if not var_diff > 0:
            comparisons[time_bin] = float("nan")
            continue

        wald_stat = diff ** 2 / var_diff
        # The p-value is the upper-tail probability, which is exactly chi2.sf;
        # "1 - chi2.sf" would be the CDF and invert the test's conclusion.
        comparisons[time_bin] = float(chi2.sf(wald_stat, df=1))
    return comparisons

# Compare the sex effect across institutions for 10-year bins (ZINB models).
# Only bins fitted for both institutions are compared; a None entry means one
# of the two parameter names was not found in that bin's model.
sex_diff_10 = compare_sex_effect_across_institutions(models_commission_zinb_10,
                                                     models_bundestag_zinb_10,
                                                     sex_param_comm="C(sex)[T.1]",
                                                     sex_param_bund="C(sex)[T.True]")

# And for 5-year bins:
sex_diff_5 = compare_sex_effect_across_institutions(models_commission_zinb_5,
                                                    models_bundestag_zinb_5,
                                                    sex_param_comm="C(sex)[T.1]",
                                                    sex_param_bund="C(sex)[T.True]")

# NOTE(review): the logistic comparison further down prints the same labels;
# consider telling the two outputs apart.
print("Wald test p-values for sex effect differences (10-year bins):", sex_diff_10)
print("Wald test p-values for sex effect differences (5-year bins):", sex_diff_5)


In [None]:
from scipy.stats import chi2
from itertools import combinations
import numpy as np

def compare_sex_effect_logistic(models_comm, models_bund, sex_param_comm="C(sex)[T.1]", sex_param_bund="C(sex)[T.True]"):
    """Wald test of the sex coefficient between institutions, per shared time bin.

    For each time bin present in both dictionaries the statistic
    ``(b_comm - b_bund)**2 / (se_comm**2 + se_bund**2)`` is referred to a
    chi-square distribution with one degree of freedom.

    Parameters
    ----------
    models_comm, models_bund : dict
        Map a time-bin key to a fitted result exposing ``params`` and ``bse``
        indexable by parameter name.
    sex_param_comm, sex_param_bund : str
        Name of the sex coefficient in the respective institution's models.

    Returns
    -------
    dict
        ``{time_bin: p_value}`` for the shared bins, in sorted key order.
        The value is ``None`` when a coefficient is missing from either
        model, and NaN when the combined variance is not strictly positive.
    """
    comparisons = {}
    # Only consider time bins that exist in both dictionaries
    common_bins = set(models_comm.keys()) & set(models_bund.keys())
    for time_bin in sorted(common_bins):
        mod_comm = models_comm[time_bin]
        mod_bund = models_bund[time_bin]

        if sex_param_comm not in mod_comm.params.index or sex_param_bund not in mod_bund.params.index:
            comparisons[time_bin] = None
            continue

        diff = mod_comm.params[sex_param_comm] - mod_bund.params[sex_param_bund]
        var_diff = mod_comm.bse[sex_param_comm] ** 2 + mod_bund.bse[sex_param_bund] ** 2

        # A zero or NaN variance makes the statistic undefined.
        if not var_diff > 0:
            comparisons[time_bin] = float("nan")
            continue

        wald_stat = diff ** 2 / var_diff
        # The p-value is the upper-tail probability, which is exactly chi2.sf;
        # "1 - chi2.sf" would be the CDF and invert the test's conclusion.
        comparisons[time_bin] = float(chi2.sf(wald_stat, df=1))
    return comparisons

# Now, compare the sex effect between the two institutions.
# For example, using your 10-year bin logistic models:
# NOTE(review): models_commission_10/_5 and models_bundestag_10/_5 are not
# defined in this cell; presumably they come from an earlier cell - confirm
# the notebook still runs top to bottom on a fresh kernel.
sex_diff_logistic_10 = compare_sex_effect_logistic(models_commission_10, models_bundestag_10,
                                                   sex_param_comm="C(sex)[T.1]",
                                                   sex_param_bund="C(sex)[T.True]")

# And similarly for the 5-year bins:
sex_diff_logistic_5 = compare_sex_effect_logistic(models_commission_5, models_bundestag_5,
                                                  sex_param_comm="C(sex)[T.1]",
                                                  sex_param_bund="C(sex)[T.True]")

# NOTE(review): these labels are identical to the ones printed for the ZINB
# comparison; consider telling the two outputs apart.
print("Wald test p-values for sex effect differences (10-year bins):", sex_diff_logistic_10)
print("Wald test p-values for sex effect differences (5-year bins):", sex_diff_logistic_5)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Convert the {EC period: GB period} dictionaries to two-column DataFrames.
# NOTE(review): similar_periods_10/_5 are not defined in this cell (the ZINB
# results above are named similar_periods_zinb_10/_5); presumably they come
# from an earlier cell - confirm which set is meant to be plotted.
df_sim_10 = pd.DataFrame(list(similar_periods_10.items()), columns=["EC Period", "GB Period"])
df_sim_5 = pd.DataFrame(list(similar_periods_5.items()), columns=["EC Period", "GB Period"])

# Create pivot tables for heatmap visualization
# NOTE(review): "EC Period" is used both as the index and as the cell values,
# so each filled cell shows (and is coloured by) the EC period itself rather
# than a similarity score - confirm this is the intended encoding.
heatmap_10 = df_sim_10.pivot(index="EC Period", columns="GB Period", values="EC Period")
heatmap_5 = df_sim_5.pivot(index="EC Period", columns="GB Period", values="EC Period")

# Plot heatmap for 10-year bins (pyplot state machine: each call below acts on
# the figure opened by plt.figure, so the statement order matters).
plt.figure(figsize=(8, 6))
sns.heatmap(heatmap_10, annot=True, fmt=".0f", cmap="Blues", linewidths=0.5, cbar=True)
plt.title("Similarity Heatmap (10-Year Bins)\nEuropean Commission vs. German Bundestag")
plt.xlabel("German Bundestag Period")
plt.ylabel("European Commission Period")
plt.show()

# Plot heatmap for 5-year bins (same layout, different colour map)
plt.figure(figsize=(8, 6))
sns.heatmap(heatmap_5, annot=True, fmt=".0f", cmap="Oranges", linewidths=0.5, cbar=True)
plt.title("Similarity Heatmap (5-Year Bins)\nEuropean Commission vs. German Bundestag")
plt.xlabel("German Bundestag Period")
plt.ylabel("European Commission Period")
plt.show()
