In [1]:
import pandas as pd

In [10]:
# Load the two input tables from ./data/gold.
# Both identifier columns are pinned to `str` so that the keys used when the
# two tables are joined (`mep_id` on the cross-section side, `Id` on the graph
# side) always share a dtype, whatever pandas would otherwise infer from the
# file contents.
df_cs = pd.read_csv("./data/gold/cross_section.csv", dtype={"mep_id": str})
df_graph_data = pd.read_csv("./data/gold/gephi_graph_data.csv", dtype={"Id": str})

In [68]:
# Pairwise correlations (pandas default method) between the community label
# and the candidate graph metrics.
graph_metric_columns = ["modularity_class", "degree", "Authority", "eigencentrality"]
correlation_matrix = df_graph_data[graph_metric_columns].corr()
print(correlation_matrix)

                  modularity_class    degree  Authority  eigencentrality
modularity_class          1.000000  0.385273   0.200847         0.208553
degree                    0.385273  1.000000   0.883139         0.899061
Authority                 0.200847  0.883139   1.000000         0.981707
eigencentrality           0.208553  0.899061   0.981707         1.000000


In [83]:
# Name of the single graph metric carried over from the graph table.
graph_var_to_choose = "eigencentrality"

# Columns taken from the graph table: join key, community label, chosen metric.
graph_columns = ["Id", "modularity_class", graph_var_to_choose]

# Reference categories removed from each dummy group.
reference_columns = [
    "modularity_class_0",
    "country_0",
    "political_group_4273",
    "COMMITTEE_PARLIAMENTARY_STANDING",  # because it is empty
]

df_merged = (
    # Left join: every row of the cross-section is kept.
    df_cs.merge(
        df_graph_data[graph_columns],
        how="left",
        left_on="mep_id",
        right_on="Id",
    )
    # One indicator column per modularity class.
    .pipe(pd.get_dummies, columns=["modularity_class"])
    .drop(columns=reference_columns)
    # Cast every column (identifiers included) to float.
    .astype(float)
)

In [84]:
from statsmodels.api import OLS


# Regression frame: identifiers and the pre-computed log column are left out.
df = df_merged.drop(columns=["mep_id", "Id", "questions_log"])

# Dependent variable: the `questions` column as stored in the merged frame.
Y = df["questions"]

# Regressors: every other column, with an explicit intercept column appended last.
X = df.drop(columns="questions").assign(const=1)

# Fit a simple OLS regression and show its summary table.
model = OLS(Y, X)
results = model.fit()

results.summary()

0,1,2,3
Dep. Variable:,questions,R-squared:,0.408
Model:,OLS,Adj. R-squared:,0.368
Method:,Least Squares,F-statistic:,10.28
Date:,"Fri, 02 May 2025",Prob (F-statistic):,1.25e-94
Time:,21:28:31,Log-Likelihood:,-7749.4
No. Observations:,1353,AIC:,15670.0
Df Residuals:,1267,BIC:,16120.0
Df Model:,85,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
meetings,-0.0009,0.052,-0.017,0.987,-0.103,0.102
country_AUT,-9.2004,15.323,-0.600,0.548,-39.262,20.861
country_BEL,57.0382,14.421,3.955,0.000,28.747,85.330
country_BGR,0.4379,16.887,0.026,0.979,-32.693,33.568
country_CYP,45.3526,27.279,1.663,0.097,-8.164,98.869
country_CZE,-20.8806,14.432,-1.447,0.148,-49.194,7.433
country_DEU,-22.1972,10.078,-2.203,0.028,-41.969,-2.426
country_DNK,12.8948,17.575,0.734,0.463,-21.585,47.375
country_ESP,66.8004,10.580,6.314,0.000,46.044,87.557

0,1,2,3
Omnibus:,826.999,Durbin-Watson:,1.733
Prob(Omnibus):,0.0,Jarque-Bera (JB):,11927.522
Skew:,2.587,Prob(JB):,0.0
Kurtosis:,16.595,Cond. No.,2150.0


## Testing with the log-transformed dependent variable

In [85]:
from statsmodels.api import OLS
import numpy as np

# Regression frame: identifiers and the pre-computed log column are left out,
# and `questions` is overwritten with log(questions + 1). The column keeps its
# original name, so the summary table still labels the dependent variable
# `questions`.
df = df_merged.drop(columns=["mep_id", "Id", "questions_log"]).assign(
    questions=lambda frame: np.log(frame["questions"] + 1)
)

# Dependent variable: the log-transformed column.
Y = df["questions"]

# Regressors: every other column, with an explicit intercept column appended last.
X = df.drop(columns="questions").assign(const=1)

# Fit a simple OLS regression and show its summary table.
model = OLS(Y, X)
results = model.fit()

results.summary()

0,1,2,3
Dep. Variable:,questions,R-squared:,0.576
Model:,OLS,Adj. R-squared:,0.547
Method:,Least Squares,F-statistic:,20.24
Date:,"Fri, 02 May 2025",Prob (F-statistic):,4.19e-180
Time:,21:28:37,Log-Likelihood:,-1801.5
No. Observations:,1353,AIC:,3775.0
Df Residuals:,1267,BIC:,4223.0
Df Model:,85,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
meetings,0.0014,0.001,2.212,0.027,0.000,0.003
country_AUT,-0.2830,0.189,-1.498,0.134,-0.653,0.088
country_BEL,0.3308,0.178,1.861,0.063,-0.018,0.679
country_BGR,-0.2287,0.208,-1.099,0.272,-0.637,0.180
country_CYP,0.0365,0.336,0.109,0.913,-0.623,0.696
country_CZE,-0.4385,0.178,-2.465,0.014,-0.787,-0.089
country_DEU,-0.5498,0.124,-4.426,0.000,-0.794,-0.306
country_DNK,0.1171,0.217,0.541,0.589,-0.308,0.542
country_ESP,0.5653,0.130,4.335,0.000,0.309,0.821

0,1,2,3
Omnibus:,0.021,Durbin-Watson:,1.711
Prob(Omnibus):,0.99,Jarque-Bera (JB):,0.044
Skew:,0.008,Prob(JB):,0.978
Kurtosis:,2.977,Cond. No.,2150.0


In [88]:
# Rows of the graph table in modularity class 9, highest eigencentrality first.
(
    df_graph_data
    .loc[df_graph_data["modularity_class"].eq(9)]
    .sort_values("eigencentrality", ascending=False)
)

Unnamed: 0,Id,Label,timeset,type,modularity_class,indegree,outdegree,degree,weighted indegree,weighted outdegree,weighted degree,Authority,Hub,componentnumber,strongcompnum,stat_inf_class,clustering,eigencentrality
38,197654,,,,9,128,0,128,311,0,311,0.083265,0.000000,0,39,0,0,0.677249
39,96668,Seán KELLY,,Mep,9,85,0,85,142,0,142,0.064284,0.000000,0,38,0,0,0.449735
227,101585,Niels FUGLSANG,,Mep,9,79,0,79,187,0,187,0.063928,0.000000,0,225,0,0,0.417989
490,124872,Morten PETERSEN,,Mep,9,75,0,75,146,0,146,0.055215,0.000000,0,485,0,0,0.396825
365,197573,,,,9,54,0,54,82,0,82,0.047799,0.000000,0,352,0,0,0.285714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1922,788025840814-90,JAE Public Affairs,,Self-employed individuals,9,0,4,4,0,6,6,0.000000,0.008060,0,1729,0,0,0.000000
1905,818300434979-49,PGE Polska Grupa Energetyczna SA,,Companies & groups,9,0,35,35,0,55,55,0.000000,0.065799,0,1715,0,0,0.000000
1873,43859808000-87,European Association for Storage of Energy,,Trade and business associations,9,0,12,12,0,16,16,0.000000,0.024938,0,1693,0,0,0.000000
1870,024782946888-95,Danfoss A/S,,Companies & groups,9,0,12,12,0,25,25,0.000000,0.027225,0,1690,0,0,0.000000
