In [1]:
import pandas as pd
import statsmodels.api as sm

from statsmodels.stats.outliers_influence import variance_inflation_factor

import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np

from sklearn.neighbors import NearestNeighbors

In [2]:
# Find and remove highly correlated features
def remove_collinear_features(X: pd.DataFrame, threshold=0.95):
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    print("to_drop", to_drop)
    return X.drop(to_drop, axis=1)


def calculate_vif(X: pd.DataFrame):
    vif_data = pd.DataFrame()
    vif_data["Variable"] = X.columns
    vif_data["VIF"] = [
        variance_inflation_factor(X.values, i) for i in range(X.shape[1])
    ]
    return vif_data

In [4]:
df = pd.read_csv("./data/gold/panel_data_treated.csv")
df_cross = pd.read_csv("./data/gold/cross_section.csv")

df["date"] = pd.to_datetime(df["date"])

  df = pd.read_csv("./data/gold/panel_data_treated.csv")


In [52]:
important_columns = [
    'mep_id',
    'meetings',
    'questions_log'
]
country_columns = [c for c in df_cross.columns if 'country' in c]
p_groups_columns = [c for c in df_cross.columns if 'political_group' in c]
membership_columns = [c for c in df_cross.columns if c not in columns_to_keep][2:]

columns_to_keep = important_columns + country_columns + p_groups_columns + membership_columns


In [65]:
x_columns_psm

['questions_log',
 'country_0',
 'country_AUT',
 'country_BEL',
 'country_BGR',
 'country_CYP',
 'country_CZE',
 'country_DEU',
 'country_DNK',
 'country_ESP',
 'country_EST',
 'country_FIN',
 'country_GBR',
 'country_GRC',
 'country_HRV',
 'country_HUN',
 'country_IRL',
 'country_ITA',
 'country_LTU',
 'country_LUX',
 'country_LVA',
 'country_MLT',
 'country_NLD',
 'country_POL',
 'country_PRT',
 'country_ROU',
 'country_SVK',
 'country_SVN',
 'country_SWE',
 'political_group_1533',
 'political_group_1534',
 'political_group_1537',
 'political_group_1538',
 'political_group_1539',
 'political_group_1541',
 'political_group_1550',
 'political_group_1554',
 'political_group_3968',
 'political_group_4273',
 'political_group_4275',
 'political_group_4280',
 'political_group_4281',
 'political_group_4283',
 'political_group_4284',
 'political_group_4285',
 'political_group_4880',
 'political_group_4908',
 'political_group_5148',
 'political_group_5151',
 'political_group_5152',
 'political

In [71]:
# Run a PSM analysis
df_psm = df_cross.copy()[columns_to_keep]
# # Create treatment and control groups
df_psm["treatment"] = (df_psm["meetings"] > 0).astype(int)

# Prepare features for PSM
x_columns_psm = columns_to_keep[2:]

X_psm = df_psm[x_columns_psm].copy()
y_psm = df_psm["treatment"]

# Fit logistic regression for propensity scores
logit = sm.Logit(y_psm, sm.add_constant(X_psm))
logit_fit = logit.fit()


         Current function value: 0.319766
         Iterations: 35




In [72]:
logit_fit.summary()

0,1,2,3
Dep. Variable:,treatment,No. Observations:,1353.0
Model:,Logit,Df Residuals:,1281.0
Method:,MLE,Df Model:,71.0
Date:,"Mon, 17 Mar 2025",Pseudo R-squ.:,0.5386
Time:,16:49:01,Log-Likelihood:,-432.64
converged:,False,LL-Null:,-937.77
Covariance Type:,nonrobust,LLR p-value:,4.8470000000000005e-166

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
questions_log,-0.0410,0.107,-0.385,0.701,-0.250,0.168
country_0,-0.7280,1.180,-0.617,0.537,-3.042,1.586
country_AUT,0.4050,0.649,0.624,0.532,-0.866,1.676
country_BEL,0.5603,0.571,0.981,0.327,-0.559,1.679
country_BGR,-1.1744,0.666,-1.763,0.078,-2.480,0.131
country_CYP,-1.9605,1.023,-1.917,0.055,-3.965,0.044
country_CZE,-0.7221,0.548,-1.319,0.187,-1.795,0.351
country_DEU,0.4751,0.431,1.101,0.271,-0.370,1.321
country_DNK,0.3865,0.724,0.534,0.594,-1.033,1.806


In [73]:

# Calculate propensity scores
df_psm["propensity_score"] = logit_fit.predict()

# Separate treatment and control
treatment = df_psm[df_psm["treatment"] == 1]
control = df_psm[df_psm["treatment"] == 0]

# Find nearest neighbors
nbrs = NearestNeighbors(n_neighbors=1).fit(control[["propensity_score"]])
distances, indices = nbrs.kneighbors(treatment[["propensity_score"]])

# Get matched control group
matched_control = control.iloc[indices.flatten()]

# Combine matched samples
matched_df_psm = pd.concat([treatment, matched_control])


# Check balance of covariates
def check_balance(df_psm, features, treatment_col="treatment"):
    balance_stats = []
    for feature in features:
        treated_mean = df_psm[df_psm[treatment_col] == 1][feature].mean()
        control_mean = df_psm[df_psm[treatment_col] == 0][feature].mean()
        std_diff = (treated_mean - control_mean) / np.sqrt(
            (
                df_psm[df_psm[treatment_col] == 1][feature].var()
                + df_psm[df_psm[treatment_col] == 0][feature].var()
            )
            / 2
        )
        balance_stats.append(
            {
                "Feature": feature,
                "Treated Mean": treated_mean,
                "Control Mean": control_mean,
                "Std Diff": std_diff,
            }
        )
    return pd.DataFrame(balance_stats)


# Print balance statistics
print("\nCovariate Balance After Matching:")
check_balance(matched_df_psm, x_columns_psm)


Covariate Balance After Matching:


  std_diff = (treated_mean - control_mean) / np.sqrt(


Unnamed: 0,Feature,Treated Mean,Control Mean,Std Diff
0,questions_log,3.536373,3.547898,-0.008410
1,country_0,0.976119,0.964179,0.070155
2,country_AUT,0.028358,0.008955,0.143660
3,country_BEL,0.035821,0.011940,0.156776
4,country_BGR,0.019403,0.062687,-0.219316
...,...,...,...,...
67,COMMITTEE_PARLIAMENTARY_TEMPORARY,0.365672,0.341791,0.049924
68,DELEGATION_JOINT_COMMITTEE,0.428358,0.374627,0.109694
69,DELEGATION_PARLIAMENTARY,0.988060,0.977612,0.080445
70,DELEGATION_PARLIAMENTARY_ASSEMBLY,0.652239,0.577612,0.153698


# DID

In [74]:
df_did = df[df['mep_id'].isin(matched_df_psm.index)]

## DID regression

In [77]:
# Now, run a regression with fixed effects
from linearmodels import PanelOLS

In [121]:
# Set the index for panel data
df_panel = df_did.set_index(["mep_id", "date"])


# Set columns list
important_columns = [
    "quetions_log",
    # 'meetings',
    "treatment",
    "received_treatment_and_started",
]
country_columns = [c for c in df_panel.columns if "country" in c]
p_groups_columns = [c for c in df_panel.columns if "political_group" in c]
membership_columns = [
    c
    for c in df_panel.columns
    if c not in [*important_columns, *country_columns, *p_groups_columns]
    and c not in ["EU_INSTITUTION", "EU_POLITICAL_GROUP", "NATIONAL_CHAMBER"]
][8:]

columns_to_keep = (
    important_columns + country_columns + p_groups_columns + membership_columns
)


# Prepare dependent and independent variables
Y = df_panel['quetions_log']
X = df_panel[columns_to_keep[1:]]

# X = X.drop(to_drop_colinear_columns, axis=1)

model = sm.OLS(Y, sm.add_constant(X))

results = model.fit()

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:           quetions_log   R-squared:                       0.011
Model:                            OLS   Adj. R-squared:                  0.011
Method:                 Least Squares   F-statistic:                     22.42
Date:                Mon, 17 Mar 2025   Prob (F-statistic):           3.83e-66
Time:                        16:58:07   Log-Likelihood:                 45519.
No. Observations:               31790   AIC:                        -9.100e+04
Df Residuals:                   31773   BIC:                        -9.086e+04
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
const 

In [18]:
# Run panel regression with entity (MEP) fixed effects
model = PanelOLS(Y, X, entity_effects=True)
results_fe = model.fit()

print(results_fe)

ValueError: exog does not have full column rank. If you wish to proceed with model estimation irrespective of the numerical accuracy of coefficient estimates, you can set check_rank=False.