In [26]:
import pandas as pd
pd.set_option('display.max_columns', None)
import pyreadstat as st
import numpy as np
import matplotlib.pyplot as plt


path = r"C:\Users\HP\OneDrive\Escritorio\David Guzzi\DiTella\MEC\Materias\2025\2025 1T\[MT09] Econometría de Datos de Panel\Clases prácticas\PS 2-20250525\data\murder.dta"

df, meta = st.read_dta(path)
df.head(1)

Unnamed: 0,id,state,year,mrdrte,exec,unem,d90,d93,cmrdrte,cexec,cunem,cexec_1,cunem_1
0,2,AL,87.0,9.3,2.0,7.8,0.0,0.0,,,,,


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153 entries, 0 to 152
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       153 non-null    int64  
 1   state    153 non-null    object 
 2   year     153 non-null    float64
 3   mrdrte   153 non-null    float64
 4   exec     153 non-null    float64
 5   unem     153 non-null    float64
 6   d90      153 non-null    float64
 7   d93      153 non-null    float64
 8   cmrdrte  102 non-null    float64
 9   cexec    102 non-null    float64
 10  cunem    102 non-null    float64
 11  cexec_1  51 non-null     float64
 12  cunem_1  51 non-null     float64
dtypes: float64(11), int64(1), object(1)
memory usage: 15.7+ KB


In [4]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Usamos patsy y statsmodels con fórmula
model = smf.ols('mrdrte ~ exec + unem + C(year)', data=df).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                 mrdrte   R-squared:                       0.076
Model:                            OLS   Adj. R-squared:                  0.051
Method:                 Least Squares   F-statistic:                     3.047
Date:                Mon, 26 May 2025   Prob (F-statistic):             0.0190
Time:                        20:25:50   Log-Likelihood:                -549.96
No. Observations:                 153   AIC:                             1110.
Df Residuals:                     148   BIC:                             1125.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -1.8644      3.070     

In [21]:
from linearmodels.panel import PanelOLS

# Crear una copia auxiliar para las dummies sin eliminar 'year'
year_dummies = pd.get_dummies(df['year'], prefix='year', drop_first=True)

# Concatenar las dummies al DataFrame original
df = pd.concat([df, year_dummies], axis=1)

# Luego fijar el índice
df = df.set_index(['state', 'year'])

# Definir fórmula
exog_vars = ['exec', 'unem'] + [col for col in df.columns if col.startswith('year_')]
X = df[exog_vars]
X = sm.add_constant(X)  # Añadir constante al modelo
y = df['mrdrte']

# Ajustar el modelo de efectos fijos (por entidad: state)
model = PanelOLS(y, X, entity_effects=True)
results = model.fit()

print(results.summary)

                          PanelOLS Estimation Summary                           
Dep. Variable:                 mrdrte   R-squared:                        0.0734
Estimator:                   PanelOLS   R-squared (Between):              0.0037
No. Observations:                 153   R-squared (Within):               0.0734
Date:                Mon, May 26 2025   R-squared (Overall):              0.0108
Time:                        20:38:42   Log-likelihood                   -375.63
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      1.9398
Entities:                          51   P-value                           0.1098
Avg Obs:                       3.0000   Distribution:                    F(4,98)
Min Obs:                       3.0000                                           
Max Obs:                       3.0000   F-statistic (robust):             1.9398
                            

In [19]:
# No coinciden los R overall y between.

In [24]:
# Creamos las columnas cd90 y cd93 de manera eficiente:

df['cd90'] = df['year'].map({90: 1, 93: -1})  # Solo asigna 1 si 1990, -1 si 1993
df['cd93'] = df['year'].map({90: 0, 93: 1})   # Solo asigna 0 si 1990, 1 si 1993

# Seleccionar las variables necesarias y eliminar filas con valores faltantes
cols = ['cmrdrte', 'cexec', 'cunem', 'cd90', 'cd93']
df_clean = df[cols].dropna()

# Definir variables dependiente e independientes
y = df_clean['cmrdrte']
X = df_clean[['cexec', 'cunem', 'cd90', 'cd93']]

# Estimar modelo sin constante
model = sm.OLS(y, X).fit()

# Mostrar resumen
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                cmrdrte   R-squared:                       0.025
Model:                            OLS   Adj. R-squared:                 -0.005
Method:                 Least Squares   F-statistic:                    0.8429
Date:                Mon, 26 May 2025   Prob (F-statistic):              0.474
Time:                        20:48:03   Log-Likelihood:                -291.48
No. Observations:                 102   AIC:                             591.0
Df Residuals:                      98   BIC:                             601.5
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
cexec         -0.1151      0.147     -0.781      0.4

In [27]:
from linearmodels.panel import PanelOLS, RandomEffects
from scipy.stats import chi2

# Asegurarse de que el DataFrame esté indexado correctamente
df = df.set_index(['id', 'year'])  # Reemplazá 'id' y 'year' con tus variables

# Definir fórmula del modelo
exog_vars = ['exec', 'unem']
exog = df[exog_vars]
y = df['mrdrte']

# Modelo FE
mod_fe = PanelOLS(y, exog, entity_effects=True)
res_fe = mod_fe.fit()

# Modelo RE
mod_re = RandomEffects(y, exog)
res_re = mod_re.fit()

# Hausman test manual
b_FE = res_fe.params.values
b_RE = res_re.params.values
diff = b_FE - b_RE

# Varianzas (solo para coeficientes, sin constante)
v_FE = res_fe.cov.values
v_RE = res_re.cov.values

# Diferencia de matrices de varianzas
V_diff = v_FE - v_RE

# Invertimos matriz (chequear si es singular)
try:
    stat = diff.T @ np.linalg.inv(V_diff) @ diff
    df_hausman = len(diff)  # grados de libertad
    pval = 1 - chi2.cdf(stat, df_hausman)
    print(f"Hausman test statistic: {stat:.4f}")
    print(f"Degrees of freedom: {df_hausman}")
    print(f"P-value: {pval:.4f}")
except np.linalg.LinAlgError:
    print("La matriz de varianzas es singular. No se puede aplicar el test de Hausman.")

Hausman test statistic: 16.8225
Degrees of freedom: 2
P-value: 0.0002


In [28]:
# No da resultado Test de Hausman.

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import pyreadstat as st
import numpy as np
import matplotlib.pyplot as plt


path = r"C:\Users\HP\OneDrive\Escritorio\David Guzzi\DiTella\MEC\Materias\2025\2025 1T\[MT09] Econometría de Datos de Panel\Clases prácticas\PS 2-20250525\data\wagepan.dta"

df, meta = st.read_dta(path)
df.head(1)

Unnamed: 0,nr,year,agric,black,bus,construc,ent,exper,fin,hisp,poorhlth,hours,manuf,married,min,nrthcen,nrtheast,occ1,occ2,occ3,occ4,occ5,occ6,occ7,occ8,occ9,per,pro,pub,rur,south,educ,tra,trad,union,lwage,d81,d82,d83,d84,d85,d86,d87,expersq
0,13,1980,0,0,1,0,0,1,0,0,0,2672,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,14,0,0,0,1.19754,0,0,0,0,0,0,0,1


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4360 entries, 0 to 4359
Data columns (total 44 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   nr        4360 non-null   int64  
 1   year      4360 non-null   int64  
 2   agric     4360 non-null   int64  
 3   black     4360 non-null   int64  
 4   bus       4360 non-null   int64  
 5   construc  4360 non-null   int64  
 6   ent       4360 non-null   int64  
 7   exper     4360 non-null   int64  
 8   fin       4360 non-null   int64  
 9   hisp      4360 non-null   int64  
 10  poorhlth  4360 non-null   int64  
 11  hours     4360 non-null   int64  
 12  manuf     4360 non-null   int64  
 13  married   4360 non-null   int64  
 14  min       4360 non-null   int64  
 15  nrthcen   4360 non-null   int64  
 16  nrtheast  4360 non-null   int64  
 17  occ1      4360 non-null   int64  
 18  occ2      4360 non-null   int64  
 19  occ3      4360 non-null   int64  
 20  occ4      4360 non-null   int6

In [None]:
# POLS
df = df.set_index(['nr', 'year'])


import statsmodels.api as sm

# Definir variables explicativas (dummies de años)
X = df[['d81', 'd82', 'd83', 'd84', 'd85', 'd86', 'd87']]
y = df['lwage']

# Ajustar el modelo POLS (Pooled OLS)
X = sm.add_constant(X)  # Agrega constante, si querés replicar exactamente sin constante, omití esta línea
ols_model = sm.OLS(y, X).fit()

print(ols_model.summary())

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.075
Model:                            OLS   Adj. R-squared:                  0.074
Method:                 Least Squares   F-statistic:                     50.54
Date:                Tue, 27 May 2025   Prob (F-statistic):           1.43e-69
Time:                        20:18:32   Log-Likelihood:                -3269.0
No. Observations:                4360   AIC:                             6554.
Df Residuals:                    4352   BIC:                             6605.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.3935      0.022     63.462      0.0

In [5]:
# RE
from linearmodels.panel import RandomEffects
import statsmodels.api as sm

exog = sm.add_constant(df[['d81', 'd82', 'd83', 'd84', 'd85', 'd86', 'd87']])
mod_re = RandomEffects(df['lwage'], exog)
re_res = mod_re.fit()
print(re_res.summary)

                        RandomEffects Estimation Summary                        
Dep. Variable:                  lwage   R-squared:                        0.1453
Estimator:              RandomEffects   R-squared (Between):            2.22e-16
No. Observations:                4360   R-squared (Within):               0.1625
Date:                Tue, May 27 2025   R-squared (Overall):              0.0752
Time:                        20:20:02   Log-likelihood                   -1660.0
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      105.73
Entities:                         545   P-value                           0.0000
Avg Obs:                       8.0000   Distribution:                  F(7,4352)
Min Obs:                       8.0000                                           
Max Obs:                       8.0000   F-statistic (robust):             105.73
                            

In [6]:
# FE
from linearmodels.panel import PanelOLS

# Sin constante porque ya hay dummies de año (como en Stata)
exog = df[['d81', 'd82', 'd83', 'd84', 'd85', 'd86', 'd87']]

mod_fe = PanelOLS(df['lwage'], exog, entity_effects=True)
fe_res = mod_fe.fit()
print(fe_res.summary)

                          PanelOLS Estimation Summary                           
Dep. Variable:                  lwage   R-squared:                        0.1625
Estimator:                   PanelOLS   R-squared (Between):              0.2709
No. Observations:                4360   R-squared (Within):               0.1625
Date:                Tue, May 27 2025   R-squared (Overall):              0.2661
Time:                        20:20:13   Log-likelihood                   -1372.4
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      105.56
Entities:                         545   P-value                           0.0000
Avg Obs:                       8.0000   Distribution:                  F(7,3808)
Min Obs:                       8.0000                                           
Max Obs:                       8.0000   F-statistic (robust):             105.56
                            

In [8]:
# Calcular diferencias primeras
df_diff = df.groupby(level=0).diff()

# Eliminar filas con NaN (generalmente la primera de cada grupo)
df_diff = df_diff.dropna()

# Variables independientes (sin constante)
exog_diff = df_diff[['d81', 'd82', 'd83', 'd84', 'd85', 'd86', 'd87']]

# Ajustar el modelo
mod_fd = sm.OLS(df_diff['lwage'], exog_diff).fit()
print(mod_fd.summary())

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     1.647
Date:                Tue, 27 May 2025   Prob (F-statistic):              0.130
Time:                        20:21:15   Log-Likelihood:                -2308.6
No. Observations:                3815   AIC:                             4631.
Df Residuals:                    3808   BIC:                             4675.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
d81            0.1194      0.019      6.284      0.0