<center style="font-size: 2em; font-weight: bold;"> Determinantes de las Condiciones de Vida  </center>
<center style="font-size: 2em; font-weight: bold;"> Analisis de base de datos  </center>
<left style="font-size: 4em; font-weight: bold;"> Autor: Edinson Tolentino </left>

# Librerias

In [1]:
import pandas
import numpy
import os
import sys
import numpy

import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col

import sklearn
import sklearn.compose
import category_encoders
import sklearn.impute

from plotnine import *
from stargazer.stargazer import Stargazer
from IPython.core.display import HTML

import py_hep_functions
import seaborn
import matplotlib.pyplot as plt

In [2]:

color = ["#3a5e8cFF", "#10a53dFF", "#541352FF", "#ffcf20FF", "#2f9aa0FF"]

def seq(start: float, stop: float, by: float, round_n=3) -> list:
    """
    Custom function, used for setting the breaks of plotnine scales.

       Parameters
    ----------
    start : float
        Start of the breaks.
    stop : float
        End of the breaks.
    by : float
        Steps between breaks.
    round_n: int, default=3
        Decimals to round floats in output.
    """
    epsilon = numpy.finfo("float").eps

    return [
        round(x, round_n) for x in list(numpy.arange(start, stop + (by - epsilon), by))
    ]
    
def normalize_frame(frame):
    """Normalize the data frame to make it performant and compatible with
    downstream libraries such as Scikit-Learn and CatBoost.
    
    In particular the following operations are performed:
    - Categorical levels that are not strings are converted into strings
    - Categorical missing values are converted into a distinct "N/A" level
    - Object-typed columns are converted into categorical columns
    """
    for col, series in frame.items():
        dtype = series.dtype
        if pandas.api.types.is_categorical_dtype(dtype):
            recode = False
            if series.hasnans:
                recode = True
            else:
                for value in series.cat.categories:
                    if not isinstance(value, str):
                        recode = True
                        break
            if recode:
                frame[col] = series.astype("str").fillna("N/A").astype(
                    "category")
        elif pandas.api.types.is_object_dtype(dtype):
            frame[col] = series.astype("str").fillna("N/A").astype("category")

    return frame

base_transformer = sklearn.pipeline.Pipeline([
    ("recode", sklearn.compose.ColumnTransformer(transformers=[
        ("category",
             #sklearn.preprocessing.OneHotEncoder(handle_unknown="ignore",
             sklearn.preprocessing.OneHotEncoder(drop="first",handle_unknown="ignore",                                                 
                min_frequency=0.01, max_categories=40, sparse_output= False),
             sklearn.compose.make_column_selector(dtype_exclude=numpy.number)
        ),
        ("number",
            sklearn.impute.SimpleImputer(strategy="mean"),
            sklearn.compose.make_column_selector(dtype_include=numpy.number)
        ),  
    ], n_jobs=-1)),
    #("rescale", sklearn.preprocessing.StandardScaler()),
])

# Data

In [3]:
output           = "C:/Users/et396/Dropbox/Docencia/Educate/Econometria/S3/Data"
os.chdir(output)

# Carga de data
base = pandas.read_stata("BD2_Multiproducto_2021.dta")
#base = pandas.read_csv("BD2_Multiproducto_2021.csv")
base.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19889 entries, 0 to 19888
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   codigo_persona  19889 non-null  object  
 1   rvida           19889 non-null  category
 2   rsexo           19889 non-null  category
 3   rpareja         19889 non-null  category
 4   redad           19889 non-null  float32 
 5   redadsq         19889 non-null  float32 
 6   reduca          19887 non-null  float32 
 7   rmu             19889 non-null  category
 8   ry              19889 non-null  float32 
 9   rly             19889 non-null  float32 
 10  rmiembros       19889 non-null  float32 
dtypes: category(4), float32(6), object(1)
memory usage: 699.9+ KB


In [4]:
# Filtro de variables
response = ['rvida']
inputs   = ['rsexo', 'rpareja','redad','redadsq','reduca','rmu','rly','rmiembros']
data_analysis = base[response + inputs]

# Normalise informacion : number, category columns
data_fit = normalize_frame(data_analysis)
data_fit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19889 entries, 0 to 19888
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   rvida      19889 non-null  category
 1   rsexo      19889 non-null  category
 2   rpareja    19889 non-null  category
 3   redad      19889 non-null  float32 
 4   redadsq    19889 non-null  float32 
 5   reduca     19887 non-null  float32 
 6   rmu        19889 non-null  category
 7   rly        19889 non-null  float32 
 8   rmiembros  19889 non-null  float32 
dtypes: category(4), float32(5)
memory usage: 466.8 KB




# Analisis

## Descriptivo

In [5]:
data_fit.describe()

Unnamed: 0,redad,redadsq,reduca,rly,rmiembros
count,19889.0,19889.0,19887.0,19889.0,19889.0
mean,50.586906,2776.024658,8.50616,6.359111,3.121726
std,14.730933,1553.588135,4.968214,0.678555,1.739702
min,16.0,256.0,0.0,3.84297,1.0
25%,39.0,1521.0,5.0,5.898921,2.0
50%,50.0,2500.0,9.0,6.342873,3.0
75%,61.0,3721.0,11.0,6.803259,4.0
max,98.0,9604.0,18.0,9.744512,14.0


In [6]:
#data_fit['rvida'].value_counts()
data_fit['rvida'].dtype

CategoricalDtype(categories=['Muy Mal', 'Mal', 'Bien', 'Muy bien'], ordered=True, categories_dtype=object)

# Modelos

In [7]:
data_fit['depend'] = numpy.where(data_fit['rvida']=='Muy Mal', 1,
                                 numpy.where(data_fit['rvida']=='Mal',2,
                                             numpy.where(data_fit['rvida']=='Bien',3,4)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [8]:
#data_fit['depend'].value_counts()
data_fit['rvida'].value_counts()


rvida
Bien        14485
Mal          4806
Muy Mal       349
Muy bien      249
Name: count, dtype: int64

In [16]:
response = data_fit[['rvida']]
predictors_cols1 = ['rsexo', 'rpareja','redad','redadsq','reduca']

In [17]:
import statsmodels 
statsmodels.__version__
#python.__version__

'0.14.1'

In [18]:
# Modelo OLS vs Probit
# Ordered probit
#import statsmodels 

base1       = data_fit[predictors_cols1]
predictors  = data_fit[predictors_cols1]
transformer = sklearn.clone(base_transformer)
inputs      = transformer.fit_transform(predictors)
inputs      = pandas.DataFrame(inputs, index=base1.index,
                               columns=transformer.get_feature_names_out())
inputs      = sm.add_constant(inputs, has_constant="raise")


In [19]:
inputs.head().T

Unnamed: 0,0,1,2,3,4
const,1.0,1.0,1.0,1.0,1.0
category__rsexo_Mujer,0.0,1.0,0.0,1.0,0.0
category__rpareja_Pareja,1.0,0.0,1.0,0.0,1.0
number__redad,56.0,21.0,62.0,62.0,57.0
number__redadsq,3136.0,441.0,3844.0,3844.0,3249.0
number__reduca,17.0,11.0,16.0,0.0,11.0


In [20]:
import statsmodels
m1  = statsmodels.discrete.discrete_model.OrderedResults(response, inputs).fit()
#m1  = statsmodels.miscmodels.ordinal_model.OrderedModel(response, inputs).fit()

AttributeError: 'DataFrame' object has no attribute 'df_model'

In [None]:
help(statsmodels)

In [None]:
help(statsmodels.discrete.discrete_model.OrderedResults)
#statsmodels.multivariate.

In [None]:
statsmodels.discrete.discrete_model.OrderedResults