In [10]:
import pandas as pd

In [11]:
df = pd.read_csv("../data/processed/df_res.csv")
df_complete = pd.read_csv("../data/processed/df_cleaned.csv")

In [12]:
variable_types = {
    'outAnal':[
        'FACTOR_y',
        
        ],

    'numeric_continuous': [
        'EDAD',
        't_horas',
        'q_hombres',
        'q_mujeres',
        
        'i_confidence_edu',
        'i_t_tec',
        'i_mental_h',
        'i_tec',
        'i_m_trad',
        'i_m_tech',
    ],

    'binary': [
        'InternetF',
        'inscrito',
        'inscrito_actual',
        'ev_trad',
        'ev_outro',
        'entr_cuidado',
        'apoio',
        'ev_digital',
        'med_virtual',
        'med_presencial',
        'asesorias',
        'extraord',
        'recursado',
        'terminado'
    ],

    'categorical_encoded': [
        'SEXO',
        'nivel_edu',
        'max_nivel_esp',
        'nt_causa',
        'nins_causa',
    ]
}


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15841 entries, 0 to 15840
Data columns (total 30 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   SEXO              15841 non-null  float64
 1   EDAD              15841 non-null  int64  
 2   inscrito          15841 non-null  float64
 3   nivel_edu         15841 non-null  float64
 4   terminado         14818 non-null  float64
 5   nt_causa          15841 non-null  float64
 6   asesorias         13461 non-null  float64
 7   extraord          6391 non-null   float64
 8   recursado         6392 non-null   float64
 9   ev_digital        14593 non-null  float64
 10  inscrito_actual   15841 non-null  float64
 11  nins_causa        15841 non-null  float64
 12  med_presencial    14239 non-null  float64
 13  t_horas           15834 non-null  float64
 14  max_nivel_esp     15841 non-null  float64
 15  q_hombres         15841 non-null  int64  
 16  q_mujeres         15841 non-null  int64 

In [14]:
import plotly.express as px

geralcorr = variable_types['binary'] + variable_types['numeric_continuous']


corr = df[geralcorr].corr()


fig = px.imshow(
    corr,
    text_auto='.4f',          
    aspect='auto',            
    title='Matriz de Correlação',
    color_continuous_scale='RdBu'
)

fig.update_layout(
    width=1200,
    height=1000,
    margin=dict(l=100, r=100, t=100, b=100),
)

fig.show()


In [15]:
import numpy as np
import pingouin as pg

pvals = corr.corr(method = lambda x, y: pg.corr(x, y)["p-val"].iloc[0])
np.fill_diagonal(pvals.values, np.nan)

fig = px.imshow(
    pvals,
    text_auto='.4f',
    color_continuous_scale='RdBu_r'
    )

fig.update_layout(
    width=1300,
    height=1100,
    margin=dict(l=100, r=100, t=100, b=100),

)

fig

In [16]:
d = df[['inscrito_actual', 'inscrito']].value_counts()
print(d)

print(df['inscrito'].corr(df["inscrito_actual"]))

inscrito_actual  inscrito
1.0              1.0         14031
0.0              0.0           804
                 1.0           787
1.0              0.0           219
Name: count, dtype: int64
0.5992155307334157


In [17]:
weighted_group = df.groupby(["inscrito_actual", "terminado"])["FACTOR_y"].sum().reset_index(name="qtd_ponderada")

tot_ponderado = df["FACTOR_y"].sum()

weighted_group["relative_ponderada"] = (weighted_group["qtd_ponderada"] / tot_ponderado) * 100

print(weighted_group)

   inscrito_actual  terminado  qtd_ponderada  relative_ponderada
0              0.0        0.0       222707.0            0.806650
1              0.0        1.0      1307181.0            4.734638
2              1.0        0.0       134307.0            0.486463
3              1.0        1.0     23496988.0           85.106608


In [18]:
def check_significance(pvals, alpha=0.05):
  n = len(pvals.columns)
  for i in range(n):
      for j in range(i + 1, n):
          if pvals.iloc[i, j] < alpha:
              var1 = pvals.columns[i]
              var2 = pvals.columns[j]
              print(f"Correlação significativa entre {var1} e {var2} (p-valor = {pvals.iloc[i, j]:.4f})")

check_significance(pvals)

Correlação significativa entre InternetF e i_tec (p-valor = 0.0000)
Correlação significativa entre inscrito e inscrito_actual (p-valor = 0.0000)
Correlação significativa entre inscrito e ev_trad (p-valor = 0.0000)
Correlação significativa entre inscrito e apoio (p-valor = 0.0352)
Correlação significativa entre inscrito e med_virtual (p-valor = 0.0052)
Correlação significativa entre inscrito e extraord (p-valor = 0.0262)
Correlação significativa entre inscrito e recursado (p-valor = 0.0159)
Correlação significativa entre inscrito e terminado (p-valor = 0.0000)
Correlação significativa entre inscrito e EDAD (p-valor = 0.0419)
Correlação significativa entre inscrito_actual e ev_trad (p-valor = 0.0002)
Correlação significativa entre inscrito_actual e entr_cuidado (p-valor = 0.0031)
Correlação significativa entre inscrito_actual e apoio (p-valor = 0.0004)
Correlação significativa entre inscrito_actual e med_virtual (p-valor = 0.0000)
Correlação significativa entre inscrito_actual e EDAD (p-

In [26]:
import scipy.stats as stats 
important = ['inscrito_actual', 'ev_trad', 'ev_outro', 'apoio', 'asesorias', 'recursado', 'terminado'] + variable_types['categorical_encoded']

catImportant = df[important]


def anova(df, num_col, cat_col):

    grouped = df.groupby(cat_col)[num_col]

    groups = [group.values[~pd.isnull(group.values)] for _, group in grouped]

    f_val, p_val = stats.f_oneway(*groups)
    return p_val

results = []

def testing_anova(df, categorical_columns, numerical_cols):

  for cat_col in categorical_columns:
    if len(df[cat_col].unique()) > 2:
      for num_col in numerical_cols:
          p_val = anova(df, num_col, cat_col)

          if p_val is not None and p_val < 0.05:
              print(f"Significativo: {num_col} e {cat_col} (p-valor={p_val:.6f})")
              results.append((cat_col, num_col, p_val))

testing_anova(df, catImportant, geralcorr)



Significativo: InternetF e asesorias (p-valor=0.042447)
Significativo: inscrito_actual e asesorias (p-valor=0.047327)
Significativo: ev_trad e asesorias (p-valor=0.000000)
Significativo: entr_cuidado e asesorias (p-valor=0.000000)
Significativo: med_virtual e asesorias (p-valor=0.000526)
Significativo: med_presencial e asesorias (p-valor=0.010433)
Significativo: asesorias e asesorias (p-valor=0.000000)
Significativo: extraord e asesorias (p-valor=0.000000)
Significativo: recursado e asesorias (p-valor=0.000000)
Significativo: EDAD e asesorias (p-valor=0.000000)
Significativo: i_confidence_edu e asesorias (p-valor=0.020671)
Significativo: i_mental_h e asesorias (p-valor=0.036140)
Significativo: i_tec e asesorias (p-valor=0.000001)
Significativo: i_m_tech e asesorias (p-valor=0.000000)
Significativo: inscrito_actual e recursado (p-valor=0.000000)
Significativo: ev_outro e recursado (p-valor=0.017178)
Significativo: apoio e recursado (p-valor=0.046507)
Significativo: med_virtual e recursa


Each of the input arrays is constant; the F statistic is not defined or infinite


One or more sample arguments is too small; all returned values will be NaN. See documentation for sample size requirements.



Significativo: med_virtual e nins_causa (p-valor=0.000000)
Significativo: asesorias e nins_causa (p-valor=0.041131)
Significativo: extraord e nins_causa (p-valor=0.000000)
Significativo: recursado e nins_causa (p-valor=0.000000)
Significativo: terminado e nins_causa (p-valor=0.000000)
Significativo: EDAD e nins_causa (p-valor=0.000000)
Significativo: t_horas e nins_causa (p-valor=0.000000)
Significativo: q_hombres e nins_causa (p-valor=0.000000)
Significativo: i_confidence_edu e nins_causa (p-valor=0.006493)
Significativo: i_t_tec e nins_causa (p-valor=0.000000)
Significativo: i_mental_h e nins_causa (p-valor=0.000000)
Significativo: i_tec e nins_causa (p-valor=0.000000)
Significativo: i_m_trad e nins_causa (p-valor=0.000000)
Significativo: i_m_tech e nins_causa (p-valor=0.000000)


In [None]:
from scipy.stats import chi2_contingency
# Matriz para armazenar resultados
cramers_matrix = pd.DataFrame(
    np.nan,
    index=important,
    columns=important
)

np.fill_diagonal(cramers_matrix.values, 1.0)


def cramers_v(contingency_table):
    chi2, _, _, _ = chi2_contingency(contingency_table)
    n = contingency_table.sum().sum()
    phi2 = chi2 / n
    r, k = contingency_table.shape
    return np.sqrt(phi2 / min(k-1, r-1))


for i, col1 in enumerate(important):
    for j, col2 in enumerate(important):
        if i < j:
            contingency = pd.crosstab(
                df[col1],
                df[col2]
            )
            cv = cramers_v(contingency.values)
            cramers_matrix.loc[col1, col2] = cv
            cramers_matrix.loc[col2, col1] = cv

fig = px.imshow(
    cramers_matrix,
    text_auto='.4f',
    color_continuous_scale='RdBu_r'
    )

fig.update_layout(
    width=1000,
    height=800,
    margin=dict(l=100, r=100, t=100, b=100),

)

fig




invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide



In [31]:
percentual_evasores = df['inscrito_actual'].value_counts(normalize=True) * 100

display(percentual_evasores)

inscrito_actual
1.0    89.956442
0.0    10.043558
Name: proportion, dtype: float64

In [53]:
for i in df.columns:
    table = df.groupby("inscrito_actual")[i]
    display(i)
    display(table.describe())


'SEXO'

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
inscrito_actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,1591.0,0.565682,0.495823,0.0,0.0,1.0,1.0,1.0
1.0,14250.0,0.511719,0.49988,0.0,0.0,1.0,1.0,1.0


'EDAD'

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
inscrito_actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,1591.0,16.005657,2.659852,6.0,15.0,17.0,18.0,18.0
1.0,14250.0,11.821754,3.609133,6.0,9.0,12.0,15.0,18.0


'inscrito'

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
inscrito_actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,1591.0,0.494657,0.500129,0.0,0.0,0.0,1.0,1.0
1.0,14250.0,0.984632,0.123017,0.0,1.0,1.0,1.0,1.0


'nivel_edu'

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
inscrito_actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,1591.0,2.510371,2.719089,0.0,0.0,0.0,6.0,9.0
1.0,14250.0,3.723368,1.395015,0.0,3.0,3.0,4.0,9.0


'terminado'

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
inscrito_actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,787.0,0.824651,0.380508,0.0,1.0,1.0,1.0,1.0
1.0,14031.0,0.993799,0.078502,0.0,1.0,1.0,1.0,1.0


'nt_causa'

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
inscrito_actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,1591.0,0.409177,1.606713,0.0,0.0,0.0,0.0,14.0
1.0,14250.0,0.030877,0.468415,0.0,0.0,0.0,0.0,14.0


'asesorias'

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
inscrito_actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,627.0,0.141946,0.349273,0.0,0.0,0.0,0.0,1.0
1.0,12834.0,0.115864,0.320074,0.0,0.0,0.0,0.0,1.0


'extraord'

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
inscrito_actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,540.0,0.159259,0.366257,0.0,0.0,0.0,0.0,1.0
1.0,5851.0,0.080841,0.272614,0.0,0.0,0.0,0.0,1.0


'recursado'

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
inscrito_actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,537.0,0.080074,0.271662,0.0,0.0,0.0,0.0,1.0
1.0,5855.0,0.03228,0.176758,0.0,0.0,0.0,0.0,1.0


'ev_digital'

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
inscrito_actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,649.0,0.27735,0.448036,0.0,0.0,0.0,1.0,1.0
1.0,13944.0,0.324943,0.46837,0.0,0.0,0.0,1.0,1.0


'inscrito_actual'

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
inscrito_actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,1591.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.0,14250.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


'nins_causa'

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
inscrito_actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,1591.0,5.226901,4.470525,1.0,2.0,3.0,8.0,15.0
1.0,14250.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


'med_presencial'

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
inscrito_actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,0.0,,,,,,,
1.0,14239.0,0.41604,0.492918,0.0,0.0,0.0,1.0,1.0


't_horas'

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
inscrito_actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,1591.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.0,14243.0,5.166327,5.693556,0.0,0.0,4.0,10.0,60.0


'max_nivel_esp'

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
inscrito_actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,1591.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.0,14250.0,6.315088,2.118651,0.0,7.0,7.0,7.0,8.0


'q_hombres'

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
inscrito_actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,1591.0,2.738529,1.388911,0.0,2.0,3.0,4.0,9.0
1.0,14250.0,2.509193,1.296337,0.0,2.0,2.0,3.0,12.0


'q_mujeres'

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
inscrito_actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,1591.0,2.721559,1.51588,0.0,2.0,2.0,3.0,12.0
1.0,14250.0,2.733895,1.363916,0.0,2.0,3.0,3.0,12.0


'InternetF'

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
inscrito_actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,1591.0,0.510999,0.500036,0.0,0.0,1.0,1.0,1.0
1.0,14250.0,0.720842,0.448601,0.0,0.0,1.0,1.0,1.0


'FACTOR_y'

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
inscrito_actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,1591.0,2162.954745,2906.964069,12.0,668.5,1336.0,2518.0,34401.0
1.0,14249.0,1696.092849,2072.600792,33.0,590.0,1097.0,2020.0,34401.0


'i_confidence_edu'

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
inscrito_actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,1591.0,0.597528,0.436903,0.0,0.0,0.666667,1.0,1.0
1.0,14250.0,0.633591,0.419275,0.0,0.0,0.666667,1.0,1.0


'i_t_tec'

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
inscrito_actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,1591.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.0,14250.0,0.662316,0.472937,0.0,0.0,1.0,1.0,1.0


'i_mental_h'

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
inscrito_actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,1591.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.0,14250.0,0.445614,0.497051,0.0,0.0,0.0,1.0,1.0


'i_tec'

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
inscrito_actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,1591.0,0.29164,0.454661,0.0,0.0,0.0,1.0,1.0
1.0,14250.0,0.507439,0.499962,0.0,0.0,1.0,1.0,1.0


'i_m_trad'

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
inscrito_actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,1591.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.0,14250.0,0.71986,0.449083,0.0,0.0,1.0,1.0,1.0


'i_m_tech'

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
inscrito_actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,1591.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.0,14250.0,0.589474,0.398249,0.0,0.5,0.5,1.0,1.0


'ev_trad'

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
inscrito_actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,1591.0,0.386549,0.487112,0.0,0.0,0.0,1.0,1.0
1.0,14250.0,0.90793,0.289135,0.0,1.0,1.0,1.0,1.0


'ev_outro'

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
inscrito_actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,1591.0,0.231301,0.421797,0.0,0.0,0.0,0.0,1.0
1.0,14250.0,0.56,0.496404,0.0,0.0,1.0,1.0,1.0


'entr_cuidado'

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
inscrito_actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,1591.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.0,14250.0,0.634526,0.48158,0.0,0.0,1.0,1.0,1.0


'apoio'

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
inscrito_actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,1591.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.0,14250.0,0.716982,0.450481,0.0,0.0,1.0,1.0,1.0


'med_virtual'

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
inscrito_actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,1591.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.0,14250.0,0.857123,0.34996,0.0,1.0,1.0,1.0,1.0


In [80]:
import plotly.express as px

mapa_labels = {
    "(5.98, 9.0]": "6–9 anos",
    "(9.0, 12.0]": "10–12 anos",
    "(12.0, 15.0]": "13–15 anos",
    "(15.0, 18.0]": "16–18 anos"
}


df['faixa_etaria'] = pd.cut(df['EDAD'], 4 )


idade = df[['faixa_etaria','inscrito_actual']].groupby('faixa_etaria').mean().reset_index()
idade['inscrito_actual'] *= 100
idade['faixa_etaria'] = idade['faixa_etaria'].astype(str)

idade['faixa_etaria'] = idade['faixa_etaria'].replace(mapa_labels)
display(idade)

fig = px.bar(idade, x='faixa_etaria', 
             y='inscrito_actual', 
             color='inscrito_actual', 
             color_continuous_scale='Reds_r', 

             title = "Taxa de evasão escolar em cada idade")




fig.update_coloraxes(colorbar_title='% Inscritos')
fig.show()






Unnamed: 0,faixa_etaria,inscrito_actual
0,"(5.988, 9.0]",98.386374
1,10–12 anos,97.613883
2,13–15 anos,91.382028
3,16–18 anos,71.608752
