In [1]:
import pandas as pd
import statsmodels.api as sm

from statsmodels.stats.outliers_influence import variance_inflation_factor

import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np

from sklearn.neighbors import NearestNeighbors

In [2]:
# Find and remove highly correlated features
def remove_collinear_features(X: pd.DataFrame, threshold=0.95):
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    print("to_drop", to_drop)
    return X.drop(to_drop, axis=1)


def calculate_vif(X: pd.DataFrame):
    vif_data = pd.DataFrame()
    vif_data["Variable"] = X.columns
    vif_data["VIF"] = [
        variance_inflation_factor(X.values, i) for i in range(X.shape[1])
    ]
    return vif_data

In [3]:
df = pd.read_csv("./data/gold/panel_data_2.csv")
df["date"] = pd.to_datetime(df["date"])

df = df.fillna(0)

In [4]:
new_columns = []

# Drop political group 0 as reference category
political_groups = df["POLITICAL_GROUP"].unique()
political_groups = political_groups[political_groups != 0]  # Remove 0
for group in political_groups:
    col_name = f"political_group_{int(group)}"
    new_columns.append(col_name)
    df[col_name] = (df["POLITICAL_GROUP"] == group).astype(int)



# DROP country FRA as reference
countries = df["COUNTRY"].unique()
countries = countries[countries != "FRA"]
for country in countries:
    col_name = f"country_{country}"
    new_columns.append(col_name)
    df[col_name] = (df["COUNTRY"] == country).astype(int)









In [5]:
cols_to_ignore = [
    "ID",
    "member_id_x",
    "member_id_y",
    "meetings",
    "date",
    "POLITICAL_GROUP",
    "questions",
    "COUNTRY",
    " - CHAIR",
    " - CHAIR_VICE",
    " - MEMBER",
    " - MEMBER_PARLIAMENT",
    " - PRESIDENT",
    " - PRESIDENT_PARLIAMENT_STOA",
    " - PRESIDENT_VICE",
    " - QUAESTOR",
]

mep_data_columns = [c for c in df.columns if c not in cols_to_ignore]
mep_data_columns.sort()


y_column = "questions"

x_columns = [
    "meetings",
    *mep_data_columns,
    *new_columns,
]

In [6]:
# Run a PSM analysis
# Create treatment and control groups
df["treatment"] = (df["meetings"] > 0).astype(int)

# Prepare features for PSM
X_psm = df[x_columns].copy()
y_psm = df["treatment"]

# Fit logistic regression for propensity scores
logit = sm.Logit(y_psm, sm.add_constant(X_psm))
logit_fit = logit.fit()

# Calculate propensity scores
df["propensity_score"] = logit_fit.predict()

# Separate treatment and control
treatment = df[df["treatment"] == 1]
control = df[df["treatment"] == 0]

# Find nearest neighbors
nbrs = NearestNeighbors(n_neighbors=1).fit(control[["propensity_score"]])
distances, indices = nbrs.kneighbors(treatment[["propensity_score"]])

# Get matched control group
matched_control = control.iloc[indices.flatten()]

# Combine matched samples
matched_df = pd.concat([treatment, matched_control])


# Check balance of covariates
def check_balance(df, features, treatment_col="treatment"):
    balance_stats = []
    for feature in features:
        treated_mean = df[df[treatment_col] == 1][feature].mean()
        control_mean = df[df[treatment_col] == 0][feature].mean()
        std_diff = (treated_mean - control_mean) / np.sqrt(
            (
                df[df[treatment_col] == 1][feature].var()
                + df[df[treatment_col] == 0][feature].var()
            )
            / 2
        )
        balance_stats.append(
            {
                "Feature": feature,
                "Treated Mean": treated_mean,
                "Control Mean": control_mean,
                "Std Diff": std_diff,
            }
        )
    return pd.DataFrame(balance_stats)


# Print balance statistics
print("\nCovariate Balance After Matching:")
print(check_balance(matched_df, x_columns))

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


         Current function value: inf
         Iterations: 35


LinAlgError: Singular matrix