# Librerias

In [1]:
import pandas
import numpy
import os
import sys
import numpy

import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col

import sklearn
import sklearn.compose
import category_encoders
import sklearn.impute

from plotnine import *
from stargazer.stargazer import Stargazer
from IPython.core.display import HTML

import py_hep_functions

In [2]:

color = ["#3a5e8cFF", "#10a53dFF", "#541352FF", "#ffcf20FF", "#2f9aa0FF"]

def seq(start: float, stop: float, by: float, round_n=3) -> list:
    """
    Custom function, used for setting the breaks of plotnine scales.

       Parameters
    ----------
    start : float
        Start of the breaks.
    stop : float
        End of the breaks.
    by : float
        Steps between breaks.
    round_n: int, default=3
        Decimals to round floats in output.
    """
    epsilon = numpy.finfo("float").eps

    return [
        round(x, round_n) for x in list(numpy.arange(start, stop + (by - epsilon), by))
    ]
    
def normalize_frame(frame):
    """Normalize the data frame to make it performant and compatible with
    downstream libraries such as Scikit-Learn and CatBoost.
    
    In particular the following operations are performed:
    - Categorical levels that are not strings are converted into strings
    - Categorical missing values are converted into a distinct "N/A" level
    - Object-typed columns are converted into categorical columns
    """
    for col, series in frame.items():
        dtype = series.dtype
        if pandas.api.types.is_categorical_dtype(dtype):
            recode = False
            if series.hasnans:
                recode = True
            else:
                for value in series.cat.categories:
                    if not isinstance(value, str):
                        recode = True
                        break
            if recode:
                frame[col] = series.astype("str").fillna("N/A").astype(
                    "category")
        elif pandas.api.types.is_object_dtype(dtype):
            frame[col] = series.astype("str").fillna("N/A").astype("category")

    return frame

base_transformer = sklearn.pipeline.Pipeline([
    ("recode", sklearn.compose.ColumnTransformer(transformers=[
        ("category",
             #sklearn.preprocessing.OneHotEncoder(handle_unknown="ignore",
             sklearn.preprocessing.OneHotEncoder(drop="first",handle_unknown="ignore",                                                 
                min_frequency=0.01, max_categories=40, sparse_output= False),
             sklearn.compose.make_column_selector(dtype_exclude=numpy.number)
        ),
        ("number",
            sklearn.impute.SimpleImputer(strategy="mean"),
            sklearn.compose.make_column_selector(dtype_include=numpy.number)
        ),  
    ], n_jobs=-1)),
    #("rescale", sklearn.preprocessing.StandardScaler()),
])

# Analisis

In [3]:
output           = "C:/Users/et396/Dropbox/Docencia/Educate/Econometria/S2/Data"
os.chdir(output)

# Carga de data
base = pandas.read_stata("ENE_2015.dta")
base.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9457 entries, 0 to 9456
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   index      9457 non-null   int32   
 1   riruc      9457 non-null   object  
 2   rC20       9457 non-null   object  
 3   rexper     9457 non-null   float64 
 4   rneduca    9457 non-null   category
 5   rmujer     9457 non-null   int32   
 6   rencuesta  9457 non-null   int32   
 7   rDpto      9457 non-null   object  
 8   rcredito   9457 non-null   int32   
 9   rorga      9457 non-null   int32   
 10  rL         9457 non-null   float64 
 11  rexporta   9457 non-null   int32   
 12  rventas    9457 non-null   float64 
 13  rpt        9457 non-null   float64 
 14  rci        9457 non-null   float64 
 15  rpl        9457 non-null   float64 
 16  lnrpl      9457 non-null   float64 
dtypes: category(1), float64(7), int32(6), object(3)
memory usage: 970.0+ KB


In [4]:
# Filtro de variables
response = ['rexporta']
inputs   = ['rmujer','rexper','rDpto','rcredito','rL', 'rventas', 'rC20','rneduca','lnrpl']
data_analysis = base[response + inputs]

# Normalise informacion : number, category columns
data_fit = normalize_frame(data_analysis)
data_fit.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9457 entries, 0 to 9456
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   rexporta  9457 non-null   int32   
 1   rmujer    9457 non-null   int32   
 2   rexper    9457 non-null   float64 
 3   rDpto     9457 non-null   category
 4   rcredito  9457 non-null   int32   
 5   rL        9457 non-null   float64 
 6   rventas   9457 non-null   float64 
 7   rC20      9457 non-null   category
 8   rneduca   9457 non-null   category
 9   lnrpl     9457 non-null   float64 
dtypes: category(3), float64(4), int32(3)
memory usage: 435.3 KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [5]:
data_fit.describe()

Unnamed: 0,rexporta,rmujer,rexper,rcredito,rL,rventas,lnrpl
count,9457.0,9457.0,9457.0,9457.0,9457.0,9457.0,9457.0
mean,0.025695,0.274823,13.014698,0.391456,20.228508,725509.9,10.264639
std,0.158233,0.446449,10.217655,0.488102,44.93578,1758707.0,1.119788
min,0.0,0.0,2.0,0.0,1.0,1000.0,4.784164
25%,0.0,0.0,7.0,0.0,5.0,132274.0,9.549316
50%,0.0,0.0,10.0,0.0,10.0,285799.0,10.213642
75%,0.0,1.0,16.0,1.0,21.0,650000.0,10.982297
max,1.0,1.0,146.0,1.0,2457.0,95958440.0,15.43342


# Question 1

In [6]:
response = data_fit[['rexporta']]
predictors_cols1 = ['rmujer','rexper','rcredito','lnrpl','rneduca']
predictors_cols2 = ['rmujer','rexper','rcredito','lnrpl','rneduca','rDpto']

In [7]:
base1        = data_fit[predictors_cols1]
predictors  = data_fit[predictors_cols1]
transformer = sklearn.clone(base_transformer)
inputs      = transformer.fit_transform(predictors)
inputs      = pandas.DataFrame(inputs, index=base1.index,
                               columns=transformer.get_feature_names_out())
inputs      = sm.add_constant(inputs, has_constant="raise")

m0 = sm.Probit(response, inputs).fit(cov_type="HC1")

         Current function value: 0.117207
         Iterations: 35




In [8]:
stargazer = Stargazer([m0])
stargazer.custom_columns(["Exportar"], [1])
stargazer.covariate_order(["number__rmujer", 'number__lnrpl', 'number__rexper', 
                           "number__rcredito"])
stargazer.rename_covariates({"Intercept": "Constant"})
stargazer.add_line("Efectos fijos Dpto", ["No"])
stargazer

0,1
,
,Dependent variable: rexporta
,
,Exportar
,(1)
,
number__rmujer,-0.017
,(0.063)
number__lnrpl,0.111***
,(0.026)


# Question 2

In [9]:
base2        = data_fit[predictors_cols1]
predictors  = data_fit[predictors_cols1]
transformer = sklearn.clone(base_transformer)
inputs      = transformer.fit_transform(predictors)
inputs      = pandas.DataFrame(inputs, index=base2.index,
                               columns=transformer.get_feature_names_out())
inputs      = sm.add_constant(inputs, has_constant="raise")

m1 = sm.Probit(response, inputs).fit()
m2 = sm.Probit(response, inputs).fit(cov_type="HC1")


         Current function value: 0.117207
         Iterations: 35
         Current function value: 0.117207
         Iterations: 35




In [10]:
base3        = data_fit[predictors_cols2]
predictors  = data_fit[predictors_cols2]
transformer = sklearn.clone(base_transformer)
inputs      = transformer.fit_transform(predictors)
inputs      = pandas.DataFrame(inputs, index=base3.index,
                               columns=transformer.get_feature_names_out())
inputs      = sm.add_constant(inputs, has_constant="raise")

m3 = sm.Probit(response, inputs).fit(cov_type="HC1")

         Current function value: 0.109880
         Iterations: 35




In [11]:
stargazer = Stargazer([m1, m2, m3])
stargazer.custom_columns(["Exportar", "Exportar", "Exportar"], [1, 1, 1])
stargazer.covariate_order(["number__rmujer", 'number__lnrpl', 'number__rexper', 
                           "number__rcredito", "category__rneduca_primaria",
                           "category__rneduca_secundaria","category__rneduca_tecnica"])
stargazer.rename_covariates({"Intercept": "Constant"})
stargazer.add_line("Efectos fijos Dpto", ["", "No", "Yes"])
stargazer

0,1,2,3
,,,
,Dependent variable: rexporta,Dependent variable: rexporta,Dependent variable: rexporta
,,,
,Exportar,Exportar,Exportar
,(1),(2),(3)
,,,
number__rmujer,-0.017,-0.017,-0.019
,(0.063),(0.063),(0.066)
number__lnrpl,0.111***,0.111***,0.091***
,(0.024),(0.026),(0.027)


In [15]:
# Efecto marginal
AME = m2.get_margeff(at='mean', method='dydx', atexog=None, dummy=True,  count=False)
print(AME.summary())

       Probit Marginal Effects       
Dep. Variable:               rexporta
Method:                          dydx
At:                              mean
                                          dy/dx    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------
category__rneduca_primaria              -0.0079      0.007     -1.111      0.267      -0.022       0.006
category__rneduca_secundaria            -0.0145      0.003     -4.638      0.000      -0.021      -0.008
category__rneduca_tecnica               -0.0107      0.003     -3.111      0.002      -0.017      -0.004
category__rneduca_infrequent_sklearn    -0.0239      0.002    -14.703      0.000      -0.027      -0.021
number__rmujer                          -0.0010      0.003     -0.277      0.782      -0.008       0.006
number__rexper                         6.89e-05      0.000      0.513      0.608      -0.000       0.000
number__