In [1]:
# Ver repaso primeros minutos.

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import pyreadstat as st
import numpy as np
import matplotlib.pyplot as plt


import os

# Location of the Stata panel dataset (mod_abdata.dta).
# The default is the original local path; set the MOD_ABDATA_PATH environment
# variable to point at the file on any other machine instead of editing the
# notebook.
path = os.environ.get(
    "MOD_ABDATA_PATH",
    r"C:\Users\HP\OneDrive\Escritorio\David Guzzi\DiTella\MEC\Materias\2025\2025 1T\[MT09] Econometría de Datos de Panel\Clases prácticas\PS 3-20250531\data\mod_abdata.dta",
)

# read_dta returns the data frame together with a metadata object.
df, meta = st.read_dta(path)
# Cast the year column to an integer dtype.
df['year'] = df['year'].astype('int')
# Show the first row to check the column layout.
df.head(1)

Unnamed: 0,c1,ind,year,emp,wage,cap,indoutpt,n,w,k,ys,rec,yearm1,id,nL1,nL2,wL1,kL1,kL2,ysL1,ysL2,yr1977,yr1978,yr1979,yr1980,yr1981,yr1982
0,1-1,7.0,1977,5.041,13.1516,0.5894,95.707199,1.617604,2.576543,-0.52865,4.561294,1.0,1977.0,1.0,,,,,,,,1,0,0,0,0,0


In [2]:
# Print the column names, non-null counts and dtypes of the loaded DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 828 entries, 0 to 827
Data columns (total 27 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   c1        828 non-null    object 
 1   ind       828 non-null    float64
 2   year      828 non-null    int64  
 3   emp       828 non-null    float64
 4   wage      828 non-null    float64
 5   cap       828 non-null    float64
 6   indoutpt  828 non-null    float64
 7   n         828 non-null    float64
 8   w         828 non-null    float64
 9   k         828 non-null    float64
 10  ys        828 non-null    float64
 11  rec       828 non-null    float64
 12  yearm1    828 non-null    float64
 13  id        828 non-null    float64
 14  nL1       770 non-null    float64
 15  nL2       632 non-null    float64
 16  wL1       770 non-null    float64
 17  kL1       770 non-null    float64
 18  kL2       632 non-null    float64
 19  ysL1      770 non-null    float64
 20  ysL2      632 non-null    float6

#### **a) POLS con errores estándar robustos.**

In [31]:
import statsmodels.api as sm
from linearmodels.panel import PooledOLS


# Index the panel by (id, year) so the panel estimator can identify entities
# and time periods.
df_1 = df.set_index(['id', 'year'])

# Explanatory variables other than the year dummies.
X_vars = ['nL1', 'nL2', 'w', 'wL1', 'k', 'kL1', 'kL2', 'ys', 'ysL1', 'ysL2']
# Year dummies: take every yr* column in sorted order and drop the first and
# the last one (with the yr1977..yr1982 columns of this file, yr1978..yr1981
# remain).
yr_vars = sorted([col for col in df_1.columns if col.startswith('yr')])
yr_vars = yr_vars[1:-1]

# Design matrix with an intercept, and the dependent variable.
X = sm.add_constant(df_1[X_vars + yr_vars])
y = df_1['n']

# Build the estimation sample explicitly: keep only rows where the dependent
# variable and every regressor are observed. Otherwise PooledOLS drops those
# rows on its own and emits a missing-values warning.
complete = X.notna().all(axis=1) & y.notna()
X = X.loc[complete]
y = y.loc[complete]

# Pooled OLS with standard errors clustered by entity.
model = PooledOLS(y, X)
results = model.fit(cov_type='clustered', cluster_entity=True)

print(results.summary)

                          PooledOLS Estimation Summary                          
Dep. Variable:                      n   R-squared:                        0.9948
Estimator:                  PooledOLS   R-squared (Between):              0.9989
No. Observations:                 632   R-squared (Within):               0.7224
Date:                Sat, Jun 28 2025   R-squared (Overall):              0.9948
Time:                        19:44:36   Log-likelihood                    573.37
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      8456.1
Entities:                         138   P-value                           0.0000
Avg Obs:                       4.5797   Distribution:                  F(14,617)
Min Obs:                       4.0000                                           
Max Obs:                       5.0000   F-statistic (robust):          1.513e+04
                            

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


#### **b) FE con errores estándar robustos.**

In [None]:
from linearmodels.panel import PanelOLS

# Re-create the (id, year)-indexed panel used by the estimator.
df_1 = df.set_index(['id', 'year'])

# Explanatory variables other than the year dummies.
X_vars = ['nL1', 'nL2', 'w', 'wL1', 'k', 'kL1', 'kL2', 'ys', 'ysL1', 'ysL2']
# Year dummies: take every yr* column in sorted order and drop the first and
# the last one to avoid perfect collinearity (with the yr1977..yr1982 columns
# of this file, yr1978..yr1981 remain).
yr_vars = sorted([col for col in df_1.columns if col.startswith('yr')])
yr_vars = yr_vars[1:-1]

# Regressors and dependent variable.
X = df_1[X_vars + yr_vars]
y = df_1['n']

# Build the estimation sample explicitly: keep only rows where the dependent
# variable and every regressor are observed. Otherwise PanelOLS drops those
# rows on its own and emits a missing-values warning.
complete = X.notna().all(axis=1) & y.notna()
X = X.loc[complete]
y = y.loc[complete]

# No constant is added: the model is fit with entity effects
# (entity_effects=True). Standard errors are clustered by entity.
model = PanelOLS(y, X, entity_effects=True)
results = model.fit(cov_type='clustered', cluster_entity=True)

print(results.summary)

                          PanelOLS Estimation Summary                           
Dep. Variable:                      n   R-squared:                        0.7708
Estimator:                   PanelOLS   R-squared (Between):              0.4470
No. Observations:                 632   R-squared (Within):               0.7708
Date:                Sat, Jun 28 2025   R-squared (Overall):              0.4530
Time:                        19:48:07   Log-likelihood                    709.13
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      115.31
Entities:                         138   P-value                           0.0000
Avg Obs:                       4.5797   Distribution:                  F(14,480)
Min Obs:                       4.0000                                           
Max Obs:                       5.0000   F-statistic (robust):             128.97
                            

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


#### **c) Anderson-Hsiao (A-H): IV en primeras diferencias.**

In [33]:
from linearmodels.iv import IV2SLS

# 1. First differences within each id, with rows ordered by (id, year).
df_diff = df.sort_values(['id', 'year']).copy()

vars_diff = ['n', 'nL1', 'nL2', 'w', 'wL1', 'k', 'kL1', 'kL2', 'ys', 'ysL1', 'ysL2']

# Year dummies are differenced too: every yr* column in sorted order, minus
# the first and the last one.
yr_vars = sorted([col for col in df.columns if col.startswith('yr')])
yr_vars = yr_vars[1:-1]

# Difference all columns in a single grouped pass. diff() subtracts the
# previous row of the same id, so consecutive rows are treated as consecutive
# periods. New columns are named D_<original name>.
diff_cols = vars_diff + yr_vars
diffs = df_diff.groupby('id')[diff_cols].diff().add_prefix('D_')
df_diff = pd.concat([df_diff, diffs], axis=1)

# 2. Model pieces: differenced dependent variable, differenced first lag as
# the endogenous regressor, and nL2 in levels (not differenced) as its
# instrument, as in the Stata version.
exog_vars = ['D_nL2', 'D_w', 'D_wL1', 'D_k', 'D_kL1', 'D_kL2', 'D_ys', 'D_ysL1', 'D_ysL2'] + [f'D_{col}' for col in yr_vars]

# Keep complete cases over EVERY column the model uses, so the estimation
# sample is fully determined here rather than by the estimator dropping rows
# with missing values on its own.
model_cols = ['D_n', 'D_nL1', 'nL2'] + exog_vars
df_diff = df_diff.dropna(subset=model_cols)

y = df_diff['D_n']
endog = df_diff['D_nL1']
instrument = df_diff['nL2']
exog = df_diff[exog_vars]

# 3. Fit the IV model with a heteroskedasticity-robust covariance
# ('clustered' would be an alternative choice).
iv_model = IV2SLS(dependent=y, exog=exog, endog=endog, instruments=instrument)
iv_results = iv_model.fit(cov_type='robust')

# 4. Show the summary.
print(iv_results.summary)

                          IV-2SLS Estimation Summary                          
Dep. Variable:                    D_n   R-squared:                     -13.610
Estimator:                    IV-2SLS   Adj. R-squared:                -14.006
No. Observations:                 494   F-statistic:                    23.516
Date:                Sat, Jun 28 2025   P-value (F-stat)                0.0524
Time:                        19:51:46   Distribution:                 chi2(14)
Cov. Estimator:                robust                                         
                                                                              
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
D_nL2         -0.9847     2.0197    -0.4875     0.6259     -4.9432      2.9738
D_w           -0.3323     0.5334    -0.6230     0.53

#### **d) Arellano-Bond (A-B) one-step GMM.**

In [17]:
from pydynpd import regression

# One-step difference GMM with pydynpd, run on the DataFrame `df` with
# ['id', 'year'] as the panel identifiers.
# Intended to mirror the Stata call:
#   xtabond2 n L.n L2.n w L.w L(0/2).(k ys) yr*, gmm(L.n) iv(w L.w L(0/2).(k ys) yr*) nolevel robust
# The command has three parts separated by "|": model, instruments, options.
# NOTE(review): only three year dummies are listed, whereas the Stata call
# uses yr* — confirm this selection is intended.
year_dummies_clean = ['yr1979', 'yr1980', 'yr1982']
dummies_joined = ' '.join(year_dummies_clean)

command_final = (
    f'n L(1:2).n w L1.w L(0:2).k L(0:2).ys {dummies_joined}'
    f' | gmm(n, 2:6) iv(w L1.w L(0:2).k L(0:2).ys {dummies_joined})'
    ' | nolevel onestep'
)

# Echo the assembled command, followed by a blank line, before estimating.
print("Comando final:", command_final, "", sep="\n")

mydpd_final = regression.abond(command_final, df, ['id', 'year'])

Comando final:
n L(1:2).n w L1.w L(0:2).k L(0:2).ys yr1979 yr1980 yr1982 | gmm(n, 2:6) iv(w L1.w L(0:2).k L(0:2).ys yr1979 yr1980 yr1982) | nolevel onestep

 Dynamic panel-data estimation, one-step difference GMM
 Group variable: id                               Number of obs = 414     
 Time variable: year                              Min obs per group: 3    
 Number of instruments = 20                       Max obs per group: 3    
 Number of groups = 138                           Avg obs per group: 3.00 
+--------+------------+---------------------+------------+-----------+-----+
|   n    |   coef.    | Corrected Std. Err. |     z      |   P>|z|   |     |
+--------+------------+---------------------+------------+-----------+-----+
|  L1.n  | 0.5325962  |      0.4060438      | 1.3116717  | 0.1896309 |     |
|  L2.n  | -0.1678165 |      0.1088950      | -1.5410859 | 0.1232959 |     |
|   w    | -0.5435347 |      0.1878885      | -2.8928572 | 0.0038175 |  ** |
|  L1.w  | 0.0465042  |  

#### **e) Arellano-Bond (A-B) two-step GMM.**

In [18]:
from pydynpd import regression

# Two-step difference GMM: same model and instrument parts as the one-step
# specification, but the options part only carries `nolevel`.
year_dummies_clean = ['yr1979', 'yr1980', 'yr1982']
dummies_joined = ' '.join(year_dummies_clean)

command_twostep = (
    f'n L(1:2).n w L1.w L(0:2).k L(0:2).ys {dummies_joined}'
    f' | gmm(n, 2:6) iv(w L1.w L(0:2).k L(0:2).ys {dummies_joined})'
    ' | nolevel'
)

# Panel identifiers are passed as [entity, time].
mydpd_twostep = regression.abond(command_twostep, df, ['id', 'year'])

 Dynamic panel-data estimation, two-step difference GMM
 Group variable: id                               Number of obs = 414     
 Time variable: year                              Min obs per group: 3    
 Number of instruments = 20                       Max obs per group: 3    
 Number of groups = 138                           Avg obs per group: 3.00 
+--------+------------+---------------------+------------+-----------+----+
|   n    |   coef.    | Corrected Std. Err. |     z      |   P>|z|   |    |
+--------+------------+---------------------+------------+-----------+----+
|  L1.n  | 0.6080291  |      0.6739661      | 0.9021656  | 0.3669689 |    |
|  L2.n  | -0.1379420 |      0.1688863      | -0.8167743 | 0.4140574 |    |
|   w    | -0.4912924 |      0.2335444      | -2.1036365 | 0.0354102 | *  |
|  L1.w  | 0.1263267  |      0.2293468      | 0.5508109  | 0.5817633 |    |
|   k    | 0.2912765  |      0.0889614      | 3.2741907  | 0.0010597 | ** |
|  L1.k  | -0.0170639 |      0.21907

#### **f) Blundell-Bond (B-B) two-step system GMM.**

In [2]:
from pydynpd import regression

# Two-step system GMM: the command has only a model part and an instruments
# part (no options part).
# The year dummies below are hard-coded; adjust them to the data at hand.
year_dummies_clean = ['yr1978', 'yr1979', 'yr1980', 'yr1982']
dummies_joined = ' '.join(year_dummies_clean)

command_system = (
    f'n L1.n L(0:1).w L(0:1).k {dummies_joined}'
    f' | gmm(n, 2:99) gmm(w, 2:99) gmm(k, 2:99) iv({dummies_joined})'
)

# Panel identifiers are passed as [entity, time].
mydpd_system = regression.abond(command_system, df, ['id', 'year'])

 Dynamic panel-data estimation, two-step system GMM
 Group variable: id                               Number of obs = 552     
 Time variable: year                              Min obs per group: 4    
 Number of instruments = 47                       Max obs per group: 4    
 Number of groups = 138                           Avg obs per group: 4.00 
+--------+------------+---------------------+------------+-----------+-----+
|   n    |   coef.    | Corrected Std. Err. |     z      |   P>|z|   |     |
+--------+------------+---------------------+------------+-----------+-----+
|  L1.n  | 0.8920570  |      0.0524149      | 17.0191569 | 0.0000000 | *** |
|   w    | -0.4138471 |      0.2405795      | -1.7202092 | 0.0853944 |     |
|  L1.w  | 0.2322100  |      0.1663529      | 1.3958876  | 0.1627483 |     |
|   k    | 0.4563271  |      0.1222931      | 3.7314231  | 0.0001904 | *** |
|  L1.k  | -0.3582417 |      0.1210988      | -2.9582597 | 0.0030938 |  ** |
| yr1978 | 0.0405244  |      0.0

In [18]:
# Coefficient table of the first model held by the system-GMM result.
reg_table = mydpd_system.models[0].regression_table
print(reg_table)


# Keep only the main columns and index the table by variable name.
main_cols = ['variable', 'coefficient', 'std_err', 'z_value', 'p_value', 'sig']
resumen = reg_table[main_cols].set_index('variable')
print(resumen)

  variable  coefficient   std_err    z_value       p_value  sig
0     L1.n     0.892057  0.052415  17.019157  5.921711e-65  ***
1        w    -0.413847  0.240580  -1.720209  8.539443e-02     
2     L1.w     0.232210  0.166353   1.395888  1.627483e-01     
3        k     0.456327  0.122293   3.731423  1.904012e-04  ***
4     L1.k    -0.358242  0.121099  -2.958260  3.093814e-03   **
5   yr1978     0.040524  0.026564   1.525522  1.271289e-01     
6   yr1979     0.051387  0.019861   2.587289  9.673435e-03   **
7   yr1980     0.029608  0.020091   1.473648  1.405764e-01     
8   yr1982     0.029080  0.014089   2.064028  3.901510e-02    *
9     _con     0.671291  0.374447   1.792753  7.301243e-02     
          coefficient   std_err    z_value       p_value  sig
variable                                                     
L1.n         0.892057  0.052415  17.019157  5.921711e-65  ***
w           -0.413847  0.240580  -1.720209  8.539443e-02     
L1.w         0.232210  0.166353   1.395888  1.62