Primero es necesario filtrar los datos de las variables de interés (*disbursements*) para conservar solo las instituciones de tipo 'public', 'private' y 'proprietary'.

In [149]:
import pandas as pd
import numpy as np
import scipy.stats as st
from scipy.stats import skew, kurtosis
import warnings


# Load the cleaned 2010 dashboard data set.
df = pd.read_csv("../data/clean/dashboard_2010_clean.csv")

# Institution categories kept for the analysis (lower-case for matching).
tipos = ['public', 'private', 'proprietary']

# Case-insensitive match of 'School Type' against the allowed categories;
# rows whose type is missing or not listed are dropped.
es_tipo_valido = df['School Type'].str.lower().isin(tipos)
df_filtrado = df.loc[es_tipo_valido]

# Display the filtered frame as the cell output.
df_filtrado

Unnamed: 0,OPE ID,School,State,Zip Code,School Type,FFEL SUBSIDIZED Recipients,FFEL SUBSIDIZED # of Loans Originated,FFEL SUBSIDIZED $ of Loans Originated,FFEL SUBSIDIZED # of Disbursements,FFEL SUBSIDIZED $ of Disbursements,...,FFEL PARENT PLUS Recipients,FFEL PARENT PLUS # of Loans Originated,FFEL PARENT PLUS $ of Loans Originated,FFEL PARENT PLUS # of Disbursements,FFEL PARENT PLUS $ of Disbursements,FFEL GRAD PLUS Recipients,FFEL GRAD PLUS # of Loans Originated,FFEL GRAD PLUS $ of Loans Originated,FFEL GRAD PLUS # of Disbursements,FFEL GRAD PLUS $ of Disbursements
0,106100,ALASKA PACIFIC UNIVERSITY,AK,995084672,PRIVATE,5,5,16999,6,21249,...,0,0,0,1,93,0,0,0,0,0
1,106300,UNIVERSITY OF ALASKA FAIRBANKS,AK,997757500,PUBLIC,99,104,320529,110,316818,...,2,2,8157,2,8157,0,0,0,0,0
2,106500,UNIVERSITY OF ALASKA SOUTHEAST,AK,998018680,PUBLIC,38,40,78165,57,121239,...,0,0,0,0,0,1,1,3233,1,3233
3,1146200,UNIVERSITY OF ALASKA ANCHORAGE,AK,995088050,PUBLIC,10,10,15851,46,80427,...,1,1,5000,2,8333,1,1,8500,1,8500
4,2541000,ALASKA CAREER COLLEGE,AK,995071033,PROPRIETARY,117,118,372926,274,434764,...,5,5,37500,8,23616,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3788,393200,UNIVERSITY OF WYOMING,WY,820713663,PUBLIC,31,31,104491,39,107241,...,9,9,28150,9,28150,6,6,30446,6,30446
3789,393300,WESTERN WYOMING COMMUNITY COLLEGE,WY,829010428,PUBLIC,4,4,11521,6,12820,...,0,0,0,0,0,0,0,0,0,0
3790,728900,CENTRAL WYOMING COLLEGE,WY,825012215,PUBLIC,52,54,82101,54,81908,...,0,0,0,0,0,0,0,0,0,0
3791,915700,WYOTECH,WY,820729519,PROPRIETARY,1805,1933,3101590,3228,4599611,...,559,627,3895466,949,6113387,0,0,0,0,0


Se deben mencionar la asimetría y la curtosis para justificar la elección de la escala logarítmica.

Según la literatura estadística, la curtosis describe la forma de una distribución en términos de su “apuntamiento” y el peso de sus colas, mientras que la asimetría (skewness) refleja la falta de simetría respecto a la media. En conjunto, ambos momentos estandarizados complementan la media y la varianza al caracterizar la forma total de una distribución.

Por ello, una transformación logarítmica resulta adecuada cuando los datos presentan alta asimetría o curtosis, ya que ayuda a corregir la falta de normalidad y a estabilizar la varianza, lo que mejora la validez de los análisis posteriores.

In [150]:
def resumen_disbursements(df_filtrado, tipo_col='School Type'):
    """
    Summarize skewness and kurtosis of every 'Disbursements' column by institution type.

    For each column whose name contains 'Disbursements' and each distinct
    non-null value of ``tipo_col``, skewness and excess kurtosis are computed
    on the original scale and on the ``log(1 + x)`` scale, and a
    recommendation is made on whether the log scale is preferable.

    Parameters
    ----------
    df_filtrado : pandas.DataFrame
        Input data; must contain ``tipo_col``.
    tipo_col : str, default 'School Type'
        Name of the column holding the institution type.

    Returns
    -------
    pandas.DataFrame
        One row per (variable, institution type) with the columns
        ``variable``, ``tipo_universidad``, ``skew_original``,
        ``kurt_original``, ``skew_log``, ``kurt_log`` and ``recomendacion``,
        sorted by variable and institution type. The frame is empty (but
        keeps these columns) when there is nothing to summarize.

    Notes
    -----
    ``scipy.stats.kurtosis`` returns Fisher (excess) kurtosis by default, so
    a normal distribution scores 0, not 3. The decision rule therefore
    compares absolute excess kurtosis directly.
    """
    columnas = [
        "variable",
        "tipo_universidad",
        "skew_original",
        "kurt_original",
        "skew_log",
        "kurt_log",
        "recomendacion",
    ]

    # Non-string column labels cannot contain the marker text; skip them.
    cols_disb = [
        c for c in df_filtrado.columns
        if isinstance(c, str) and 'Disbursements' in c
    ]
    tipos = df_filtrado[tipo_col].dropna().unique()

    # Split the rows by institution type once instead of once per column.
    grupos = {tipo: df_filtrado[df_filtrado[tipo_col] == tipo] for tipo in tipos}

    registros = []

    for col in cols_disb:
        for tipo, grupo in grupos.items():
            data = grupo[col].dropna()

            # Original scale
            s_orig = skew(data)
            k_orig = kurtosis(data)

            # log(1 + x) scale
            # NOTE(review): assumes values are >= 0 (log1p yields NaN below -1) - confirm.
            data_log = np.log1p(data)
            s_log = skew(data_log)
            k_log = kurtosis(data_log)

            # Decision rule: prefer log only if it brings BOTH skewness and
            # excess kurtosis closer to 0 (the values of a normal distribution).
            # NaN statistics compare as False, so the original scale is kept.
            mejor_log = (abs(s_log) < abs(s_orig)) and (abs(k_log) < abs(k_orig))
            recomendacion = "usar log" if mejor_log else "mantener original"

            registros.append({
                "variable": col,
                "tipo_universidad": tipo,
                "skew_original": round(s_orig, 3),
                "kurt_original": round(k_orig, 3),
                "skew_log": round(s_log, 3),
                "kurt_log": round(k_log, 3),
                "recomendacion": recomendacion,
            })

    # Passing the column list keeps the schema (and the sort below) valid
    # even when no record was produced.
    resumen = pd.DataFrame(registros, columns=columnas)
    return resumen.sort_values(["variable", "tipo_universidad"]).reset_index(drop=True)

In [151]:
# Build and display the skewness/kurtosis summary for the filtered institutions.
resumen_disbursements(df_filtrado)

Unnamed: 0,variable,tipo_universidad,skew_original,kurt_original,skew_log,kurt_log,recomendacion
0,FFEL GRAD PLUS # of Disbursements,PRIVATE,7.702,77.896,1.436,0.92,usar log
1,FFEL GRAD PLUS # of Disbursements,PROPRIETARY,13.205,198.856,5.492,31.234,usar log
2,FFEL GRAD PLUS # of Disbursements,PUBLIC,10.925,151.004,3.017,8.619,usar log
3,FFEL GRAD PLUS $ of Disbursements,PRIVATE,7.861,77.325,0.608,-1.414,usar log
4,FFEL GRAD PLUS $ of Disbursements,PROPRIETARY,19.409,460.677,4.272,16.882,usar log
5,FFEL GRAD PLUS $ of Disbursements,PUBLIC,12.951,237.62,2.017,2.326,usar log
6,FFEL PARENT PLUS # of Disbursements,PRIVATE,12.549,182.789,0.871,0.582,usar log
7,FFEL PARENT PLUS # of Disbursements,PROPRIETARY,8.635,101.739,0.572,-0.64,usar log
8,FFEL PARENT PLUS # of Disbursements,PUBLIC,6.296,52.593,1.156,0.524,usar log
9,FFEL PARENT PLUS $ of Disbursements,PRIVATE,12.435,193.841,-0.425,-1.601,usar log
