In [1]:
# Ver repaso primeros minutos.

In [3]:
import pandas as pd
pd.set_option('display.max_columns', None)
import pyreadstat as st
import numpy as np
import matplotlib.pyplot as plt


path = r"C:\Users\HP\OneDrive\Escritorio\David Guzzi\DiTella\MEC\Materias\2025\2025 1T\[MT09] Econometría de Datos de Panel\Clases prácticas\PS 3-20250531\data\mod_abdata.dta"

df, meta = st.read_dta(path)
df['year'] = df['year'].astype('int')
df.head(1)

Unnamed: 0,c1,ind,year,emp,wage,cap,indoutpt,n,w,k,ys,rec,yearm1,id,nL1,nL2,wL1,kL1,kL2,ysL1,ysL2,yr1977,yr1978,yr1979,yr1980,yr1981,yr1982
0,1-1,7.0,1977,5.041,13.1516,0.5894,95.707199,1.617604,2.576543,-0.52865,4.561294,1.0,1977.0,1.0,,,,,,,,1,0,0,0,0,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 828 entries, 0 to 827
Data columns (total 27 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   c1        828 non-null    object 
 1   ind       828 non-null    float64
 2   year      828 non-null    float64
 3   emp       828 non-null    float64
 4   wage      828 non-null    float64
 5   cap       828 non-null    float64
 6   indoutpt  828 non-null    float64
 7   n         828 non-null    float64
 8   w         828 non-null    float64
 9   k         828 non-null    float64
 10  ys        828 non-null    float64
 11  rec       828 non-null    float64
 12  yearm1    828 non-null    float64
 13  id        828 non-null    float64
 14  nL1       770 non-null    float64
 15  nL2       632 non-null    float64
 16  wL1       770 non-null    float64
 17  kL1       770 non-null    float64
 18  kL2       632 non-null    float64
 19  ysL1      770 non-null    float64
 20  ysL2      632 non-null    float6

#### **a) POLS con errores estándar robustos.**

In [31]:
import statsmodels.api as sm
from linearmodels.panel import PooledOLS


df_1 = df.set_index(['id', 'year'])

# X: todas las explicativas, incluyendo las yr*
X_vars = ['nL1', 'nL2', 'w', 'wL1', 'k', 'kL1', 'kL2', 'ys', 'ysL1', 'ysL2']
yr_vars = sorted([col for col in df_1.columns if col.startswith('yr')])
yr_vars = yr_vars[1:-1]  # eliminar la primera y última dummy automáticamente
X = df_1[X_vars + yr_vars]

# Agregar constante
X = sm.add_constant(X)

# Variable dependiente
y = df_1['n']

model = PooledOLS(y, X)
results = model.fit(cov_type='clustered', cluster_entity=True)

print(results.summary)

                          PooledOLS Estimation Summary                          
Dep. Variable:                      n   R-squared:                        0.9948
Estimator:                  PooledOLS   R-squared (Between):              0.9989
No. Observations:                 632   R-squared (Within):               0.7224
Date:                Sat, Jun 28 2025   R-squared (Overall):              0.9948
Time:                        19:44:36   Log-likelihood                    573.37
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      8456.1
Entities:                         138   P-value                           0.0000
Avg Obs:                       4.5797   Distribution:                  F(14,617)
Min Obs:                       4.0000                                           
Max Obs:                       5.0000   F-statistic (robust):          1.513e+04
                            

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


#### **b) FE con errores estándar robustos.**

In [32]:
from linearmodels.panel import PanelOLS

# Asegurarse de tener panel indexado
df_1 = df.set_index(['id', 'year'])

# Variables explicativas (sin colinearidad perfecta)
X_vars = ['nL1', 'nL2', 'w', 'wL1', 'k', 'kL1', 'kL2', 'ys', 'ysL1', 'ysL2']
yr_vars = sorted([col for col in df_1.columns if col.startswith('yr')])
yr_vars = yr_vars[1:-1]  # omitir la primera y última dummy de año
X = df_1[X_vars + yr_vars]

# Variable dependiente
y = df_1['n']

# No agregar constante: PanelOLS con entity_effects=True ya incluye efectos fijos
model = PanelOLS(y, X, entity_effects=True)
results = model.fit(cov_type='clustered', cluster_entity=True)

print(results.summary)

                          PanelOLS Estimation Summary                           
Dep. Variable:                      n   R-squared:                        0.7708
Estimator:                   PanelOLS   R-squared (Between):              0.4470
No. Observations:                 632   R-squared (Within):               0.7708
Date:                Sat, Jun 28 2025   R-squared (Overall):              0.4530
Time:                        19:48:07   Log-likelihood                    709.13
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      115.31
Entities:                         138   P-value                           0.0000
Avg Obs:                       4.5797   Distribution:                  F(14,480)
Min Obs:                       4.0000                                           
Max Obs:                       5.0000   F-statistic (robust):             128.97
                            

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


#### **c) A-H.**

In [33]:
from linearmodels.iv import IV2SLS

# 1. Crear primeras diferencias
df_diff = df.sort_values(['id', 'year']).copy()

vars_diff = ['n', 'nL1', 'nL2', 'w', 'wL1', 'k', 'kL1', 'kL2', 'ys', 'ysL1', 'ysL2']
for var in vars_diff:
    df_diff[f'D_{var}'] = df_diff.groupby('id')[var].diff()

# dummies de año también en diferencias
yr_vars = sorted([col for col in df.columns if col.startswith('yr')])
yr_vars = yr_vars[1:-1]  # excluir una al inicio y fin
for col in yr_vars:
    df_diff[f'D_{col}'] = df_diff.groupby('id')[col].diff()

# Eliminar filas con NaN (producto de las diferencias)
df_diff = df_diff.dropna(subset=[f'D_{var}' for var in ['n', 'nL1', 'nL2']])

# 2. Definir variables
y = df_diff['D_n']
endog = df_diff['D_nL1']
instrument = df_diff['nL2']  # sin D, como en Stata
exog_vars = ['D_nL2', 'D_w', 'D_wL1', 'D_k', 'D_kL1', 'D_kL2', 'D_ys', 'D_ysL1', 'D_ysL2'] + [f'D_{col}' for col in yr_vars]
exog = df_diff[exog_vars]

# 3. Ajustar modelo IV
iv_model = IV2SLS(dependent=y, exog=exog, endog=endog, instruments=instrument)
iv_results = iv_model.fit(cov_type='robust')  # también podrías usar 'clustered' si quisieras

# 4. Mostrar resumen
print(iv_results.summary)

                          IV-2SLS Estimation Summary                          
Dep. Variable:                    D_n   R-squared:                     -13.610
Estimator:                    IV-2SLS   Adj. R-squared:                -14.006
No. Observations:                 494   F-statistic:                    23.516
Date:                Sat, Jun 28 2025   P-value (F-stat)                0.0524
Time:                        19:51:46   Distribution:                 chi2(14)
Cov. Estimator:                robust                                         
                                                                              
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
D_nL2         -0.9847     2.0197    -0.4875     0.6259     -4.9432      2.9738
D_w           -0.3323     0.5334    -0.6230     0.53

#### **d) A-B one-step GMM.**

In [19]:
import pandas as pd
from pydynpd.panel_data import panel_data
from pydynpd.variable import regular_variable
from pydynpd.info import options_info
from pydynpd.dynamic_panel_model import dynamic_panel_model

In [20]:
# Ordenar y preparar
df_sorted = df.sort_values(['id', 'year']).copy()
ids = df_sorted['id'].values
time = df_sorted['year'].values

# Opciones del modelo (one-step diff-GMM robust)
options = options_info()
options.level = False         # nolevel
options.steps = 1             # one-step
options.transformation = 'fd'
options.timedumm = False      # dummies explícitas
options.robust = True         # robust (White)

In [23]:
# Variables dependiente y explicativas (como en xtabond2)
dep_indep = [
    regular_variable('n', 0),     # n
    regular_variable('n', 1),     # L.n
    regular_variable('n', 2),     # L2.n
    regular_variable('w', 0),
    regular_variable('w', 1),
    regular_variable('k', 0),
    regular_variable('k', 1),
    regular_variable('k', 2),
    regular_variable('ys', 0),
    regular_variable('ys', 1),
    regular_variable('ys', 2),
]

# Dummies de año
yr_cols = ['yr1977', 'yr1978', 'yr1979', 'yr1980', 'yr1981', 'yr1982']
yr_dummies = [regular_variable(col, 0) for col in yr_cols]
dep_indep.extend(yr_dummies)

# Instrumentos GMM (solo L.n → con lags 2 a 5)
Dgmm = [
    regular_variable('n', 2),
    regular_variable('n', 3),
    regular_variable('n', 4),
    regular_variable('n', 5),
]

# Instrumentos estándar (IV)
iv = [
    regular_variable('n', 2),      # L2.n
    regular_variable('w', 0),
    regular_variable('w', 1),
    regular_variable('k', 0),
    regular_variable('k', 1),
    regular_variable('k', 2),
    regular_variable('ys', 0),
    regular_variable('ys', 1),
    regular_variable('ys', 2),
] + yr_dummies

variables = {
    'dep_indep': dep_indep,
    'Dgmm': Dgmm,
    'Lgmm': [],     # No usamos system-GMM
    'iv': iv
}

In [None]:
pdata = panel_data(
    df_sorted,
    identifiers=(ids, time),
    variables={
        'dep_indep': dep_indep,
        'Dgmm': Dgmm,
        'Lgmm': [],       # ← si hacés Difference GMM solamente
        'iv': iv
    },
    options=options
)

TypeError: list indices must be integers or slices, not str

In [7]:
# 1. Crear objeto panel_data
# Asegurarse de que el DataFrame esté indexado correctamente
df_indexed = df.set_index(['id', 'year'])

# Crear el objeto panel_data
pdata = panel_data(df_indexed)

# 2. Definir variables
dep_indep = [regular_variable('n', 0),  # dependiente
             regular_variable('n', 1),  # L.n
             regular_variable('n', 2),  # L2.n
             regular_variable('w', 0),
             regular_variable('w', 1),
             regular_variable('k', 0),
             regular_variable('k', 1),
             regular_variable('k', 2),
             regular_variable('ys', 0),
             regular_variable('ys', 1),
             regular_variable('ys', 2)]

# Dummies de año
yr_dummies = [regular_variable(col, 0) for col in ['yr1977', 'yr1978', 'yr1979', 'yr1980', 'yr1981', 'yr1982']]
dep_indep.extend(yr_dummies)

# GMM instruments: solo L.n
Dgmm = [regular_variable('n', 1, min_lag=2, max_lag=5)]  # L(1/5).n

# IVs estándar
iv = [regular_variable('n', 2),  # L2.n
      regular_variable('w', 0), regular_variable('w', 1),
      regular_variable('k', 0), regular_variable('k', 1), regular_variable('k', 2),
      regular_variable('ys', 0), regular_variable('ys', 1), regular_variable('ys', 2)] + yr_dummies

variables = {'dep_indep': dep_indep, 'Dgmm': Dgmm, 'Lgmm': [], 'iv': iv}

TypeError: panel_data.__init__() missing 3 required positional arguments: 'identifiers', 'variables', and 'options'

In [4]:
from pydynpd import DynPanelGMM

# Reordenar y seleccionar variables relevantes
vars_base = ['n', 'w', 'k', 'ys']
yr_dummies = sorted([col for col in df.columns if col.startswith('yr')])
all_vars = vars_base + yr_dummies

# Estimar modelo: Arellano-Bond Difference GMM (one-step, robust)
model = DynPanelGMM(
    data=df,
    depvar='n',
    exog_vars=['w', 'k', 'ys'] + yr_dummies,
    endog_vars=['n'],
    panel_id='id',
    time_var='year',
    lags=1,           # GMM instruments: L1 to L5 internally
    max_instr_lag=5,  # L(1/5).L.n como en xtabond2
    first_diff=True,
    robust=True,      # errores robustos tipo White
    twostep=False,    # One-step
    drop_missing=True # elimina automáticamente NA
)

result = model.fit()
print(result.summary())

ImportError: cannot import name 'DynPanelGMM' from 'pydynpd' (c:\Users\HP\AppData\Local\Programs\Python\Python312\Lib\site-packages\pydynpd\__init__.py)