# Juntamos en un dataframe los features más importantes

## Leemos nuestras librerías

In [None]:
import pandas as pd
import numpy as np

## Leemos nuestros datos ENAHO limpios

In [None]:
# Load the cleaned ENAHO extract for every survey year (2010-2021).
# All files live in the same folder, so the folder is spelled out only once.
_ENAHO_CLEAN_DIR = '/work/enaho-personas-estudios-caracteristicas-data-science/ENAHO-DATA-LIMPIA'

df_2010 = pd.read_csv(f'{_ENAHO_CLEAN_DIR}/2010.csv')
df_2011 = pd.read_csv(f'{_ENAHO_CLEAN_DIR}/2011.csv')
df_2012 = pd.read_csv(f'{_ENAHO_CLEAN_DIR}/2012.csv')
df_2013 = pd.read_csv(f'{_ENAHO_CLEAN_DIR}/2013.csv')
df_2014 = pd.read_csv(f'{_ENAHO_CLEAN_DIR}/2014.csv')
df_2015 = pd.read_csv(f'{_ENAHO_CLEAN_DIR}/2015.csv')
df_2016 = pd.read_csv(f'{_ENAHO_CLEAN_DIR}/2016.csv')
df_2017 = pd.read_csv(f'{_ENAHO_CLEAN_DIR}/2017.csv')
df_2018 = pd.read_csv(f'{_ENAHO_CLEAN_DIR}/2018.csv')
df_2019 = pd.read_csv(f'{_ENAHO_CLEAN_DIR}/2019.csv')
df_2020 = pd.read_csv(f'{_ENAHO_CLEAN_DIR}/2020.csv')
df_2021 = pd.read_csv(f'{_ENAHO_CLEAN_DIR}/2021.csv')

## Creamos nuestra función para unir todo y sacar los features más importantes

In [None]:
def get_df_principal_features(df, year_name):
    """Summarise one survey year into a single-row DataFrame of headline counts.

    The returned row contains:

    * one column per education level found in ``NOM_GRADO_ESTUDIOS``, holding
      the number of rows with that level. The known spelling variants of
      incomplete/complete university studies and of postgraduate studies are
      merged into ``sup_uni_incompleta``, ``sup_uni_completa`` and
      ``maestria_doctorado`` (matched ignoring case and surrounding
      whitespace); any other label keeps its original name;
    * ``anio``: ``year_name`` as given;
    * ``pregrado_psicologia``, ``hombres_pregrado_psicologia`` and
      ``mujeres_pregrado_psicologia``: rows with incomplete university
      studies whose career is psychology, in total and split by ``SEXO``;
    * ``pregrado_derecho``, ``pregrado_ing_civil``, ``pregrado_contabilidad``,
      ``pregrado_admin_empresas`` and ``pregrado_ing_sistemas``: rows with
      incomplete university studies for each of those careers.

    A category with no rows is reported as 0 instead of raising ``KeyError``.

    Args:
        df: DataFrame with the columns ``SEXO``, ``NOM_GRADO_ESTUDIOS`` and
            ``NOM_CARRERA_UNI``. It is only read, never modified, so the
            function can safely be called again on the same frame.
        year_name: Survey year label. When ``str(year_name) == '2014'`` the
            ``SEXO`` column is treated as a code where 1 means 'Hombre';
            otherwise it is treated as text and title-cased.

    Returns:
        A one-row ``pandas.DataFrame`` with the columns described above.
    """
    grade_aliases = {
        'maestria/doctorado': 'maestria_doctorado',
        'post-grado universitario': 'maestria_doctorado',
        'postgrado': 'maestria_doctorado',
        'superior universitaria completa': 'sup_uni_completa',
        'sup. univ. completa': 'sup_uni_completa',
        'superior universitaria incompleta': 'sup_uni_incompleta',
        'sup. univ. incompleta': 'sup_uni_incompleta',
    }

    def _canonical_grade(label):
        # Missing values (NaN) and unknown labels pass through untouched.
        if isinstance(label, str):
            return grade_aliases.get(label.strip().casefold(), label)
        return label

    # Normalise the labels BEFORE counting: if two spelling variants of the
    # same level appear in one year they are added together, instead of
    # producing two columns with the same name.
    grade = df['NOM_GRADO_ESTUDIOS'].map(_canonical_grade)

    # Work on derived Series rather than writing back into ``df``; rewriting
    # SEXO in place would turn every value into 'Mujer' on a second call for
    # the 2014 data.
    if str(year_name) == '2014':
        # NOTE(review): every value other than 1 (including missing values)
        # is labelled 'Mujer', as in the previous implementation.
        sexo = df['SEXO'].map(lambda code: 'Hombre' if code == 1 else 'Mujer')
    else:
        sexo = df['SEXO'].str.title()

    # value_counts() ignores missing labels and counts rows directly, so no
    # helper column of ones (and no sum over unrelated columns) is needed.
    grade_counts = grade.value_counts(sort=False).sort_index()
    df_final = pd.DataFrame([grade_counts.to_dict()])
    df_final['anio'] = year_name

    career = df['NOM_CARRERA_UNI']
    is_undergrad = grade == 'sup_uni_incompleta'
    is_psychology = career.str.upper().isin(['PSICOLOGÍA', 'PSICOLOGIA'])
    psychology_undergrad = is_undergrad & is_psychology

    df_final['pregrado_psicologia'] = int(psychology_undergrad.sum())

    sex_counts = sexo[psychology_undergrad].value_counts()
    df_final['hombres_pregrado_psicologia'] = int(sex_counts.get('Hombre', 0))
    df_final['mujeres_pregrado_psicologia'] = int(sex_counts.get('Mujer', 0))

    career_counts = career[is_undergrad].value_counts()
    tracked_careers = (
        ('pregrado_derecho', 'Derecho'),
        ('pregrado_ing_civil', 'Ingeniería Civil'),
        ('pregrado_contabilidad', 'Contabilidad'),
        ('pregrado_admin_empresas', 'Administración de Empresas'),
        ('pregrado_ing_sistemas', 'Ingeniería de Sistemas'),
    )
    for column, career_name in tracked_careers:
        df_final[column] = int(career_counts.get(career_name, 0))
    return df_final


## Creamos nuestro dataframe con la estructura final

In [None]:
# Empty template for the combined table: it fixes the names and the order of
# the leading columns before any yearly row is appended.
_summary_columns = [
    'anio',
    'sup_uni_incompleta',
    'sup_uni_completa',
    'maestria_doctorado',
    'pregrado_psicologia',
    'hombres_pregrado_psicologia',
    'mujeres_pregrado_psicologia',
    'pregrado_derecho',
    'pregrado_ing_civil',
    'pregrado_contabilidad',
    'pregrado_admin_empresas',
    'pregrado_ing_sistemas',
]
df_junto = pd.DataFrame({column_name: [] for column_name in _summary_columns})

## Unimos todos nuestros dataframes en uno solo

In [None]:
# Append the one-row summary of each survey year, in chronological order,
# to the combined table.
_frames_by_year = {
    '2010': df_2010,
    '2011': df_2011,
    '2012': df_2012,
    '2013': df_2013,
    '2014': df_2014,
    '2015': df_2015,
    '2016': df_2016,
    '2017': df_2017,
    '2018': df_2018,
    '2019': df_2019,
    '2020': df_2020,
    '2021': df_2021,
}
for _year, _frame in _frames_by_year.items():
    _year_summary = get_df_principal_features(_frame, _year)
    df_junto = pd.concat([df_junto, _year_summary], ignore_index=True)

## Luego de unir verificamos que estén todos los años

In [None]:
# Preview the first rows of the combined table (the recorded output shows
# rows 0-4, i.e. years 2010-2014).
df_junto.head()

Unnamed: 0,anio,sup_uni_incompleta,sup_uni_completa,maestria_doctorado,pregrado_psicologia,hombres_pregrado_psicologia,mujeres_pregrado_psicologia,pregrado_derecho,pregrado_ing_civil,pregrado_contabilidad,pregrado_admin_empresas,pregrado_ing_sistemas
0,2010,3393.0,3109.0,606.0,67.0,20.0,47.0,213.0,137.0,287.0,244.0,155.0
1,2011,3964.0,3713.0,650.0,100.0,34.0,66.0,273.0,179.0,362.0,242.0,187.0
2,2012,4396.0,4132.0,809.0,106.0,35.0,71.0,276.0,233.0,442.0,316.0,170.0
3,2013,5345.0,5557.0,1060.0,144.0,30.0,114.0,313.0,323.0,488.0,372.0,233.0
4,2014,5616.0,5578.0,1179.0,146.0,40.0,106.0,338.0,360.0,543.0,416.0,221.0


In [None]:
# Preview the last rows of the combined table (the recorded output shows
# rows 7-11, i.e. years 2017-2021).
# NOTE(review): the head() and tail() previews together do not display
# rows 5-6 (2015-2016); inspect the full frame to check every year.
df_junto.tail()

Unnamed: 0,anio,sup_uni_incompleta,sup_uni_completa,maestria_doctorado,pregrado_psicologia,hombres_pregrado_psicologia,mujeres_pregrado_psicologia,pregrado_derecho,pregrado_ing_civil,pregrado_contabilidad,pregrado_admin_empresas,pregrado_ing_sistemas
7,2017,6035.0,6450.0,,247.0,62.0,185.0,396.0,413.0,491.0,326.0,199.0
8,2018,6401.0,7028.0,1441.0,260.0,77.0,183.0,423.0,413.0,498.0,334.0,218.0
9,2019,5853.0,6685.0,1404.0,260.0,75.0,185.0,401.0,404.0,398.0,311.0,168.0
10,2020,5779.0,6930.0,1219.0,120.0,37.0,83.0,216.0,205.0,174.0,139.0,93.0
11,2021,4885.0,5718.0,1024.0,221.0,66.0,155.0,365.0,302.0,289.0,236.0,153.0


# Guardamos nuestro dataframe que engloba los mejores features

In [None]:
# Write the combined table to disk without the row index, then confirm.
_output_csv = '/work/enaho-personas-estudios-caracteristicas-data-science/ENAHO_2010_2021.csv'
df_junto.to_csv(_output_csv, index=False)
print('Guardado!')

Guardado!


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=61180446-cfea-4338-ac0e-81e50a732b69' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>