<a href="https://colab.research.google.com/github/Areliortiz/emparejamiento_puntuacion_propensi-n/blob/main/emmparejamiento_puntuaacion_poropewnsionipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
# Paso 1: Importar las bibliotecas necesarias
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
from statsmodels.api import Logit

# Paso 2: Cargar el archivo CSV
mydata = pd.read_csv('https://docs.google.com/spreadsheets/d/e/2PACX-1vQ71BwI_10C3K-Kfx3AR49_aJ5D36aIAGF4Hnxisco-q9EqLxTo6JWu9dnetspG1Z_Md1rV34HZ8a5H/pub?output=csv')
mydata


Unnamed: 0,school,tot,min,dis,stw
0,SKANEATELES MIDDLE SCHOOL,380,0.03,0.00,0
1,MARCUS WHITMAN MIDDLE SCHOOL,276,0.04,0.00,0
2,BLIND BROOK-RYE MIDDLE SCHOOL,376,0.09,0.00,0
3,BRONXVILLE MIDDLE SCHOOL,404,0.11,0.00,0
4,BRIARCLIFF MIDDLE SCHOOL,374,0.12,0.00,0
...,...,...,...,...,...
579,A A KINGSTON MIDDLE SCHOOL-STW,359,0.06,0.30,1
580,MORAVIA JUNIOR-SENIOR HIGH SCHOOL-STW,505,0.03,0.36,1
581,GLENS FALLS MIDDLE SCHOOL-STW,606,0.13,0.47,1
582,LONGWOOD JUNIOR HIGH SCHOOL-STW,1369,0.48,0.50,1


In [20]:
# Modelo logístico para calcular propensity scores
logit_model = Logit(mydata['stw'], mydata[['tot', 'min', 'dis']])
propensity_scores = logit_model.fit().predict()
mydata['propensity_score'] = propensity_scores

Optimization terminated successfully.
         Current function value: 0.165739
         Iterations 9


In [35]:
treated = mydata[mydata['stw'] == 1]
control = mydata[mydata['stw'] == 0]

nn = NearestNeighbors(n_neighbors=1)
nn.fit(control[['propensity_score']])

# Encontrar vecinos más cercanos
distances, indices = nn.kneighbors(treated[['propensity_score']])
matched_controls = control.iloc[indices.flatten()].reset_index(drop=True)

# Combinar datos tratados con sus controles emparejados
matched_data = pd.concat([treated.reset_index(drop=True), matched_controls], axis=0)


In [42]:
def summarize_balance(data, treated_col, covariates):
    """Generar un resumen de balance para los datos."""
    summary = {}
    for covariate in covariates:
        treated_mean = data[data[treated_col] == 1][covariate].mean()
        control_mean = data[data[treated_col] == 0][covariate].mean()
        treated_sd = data[data[treated_col] == 1][covariate].std()
        control_sd = data[data[treated_col] == 0][covariate].std()
        treated_meadian = data[data[treated_col] == 1][covariate].median()
        control_meadian = data[data[treated_col] == 0][covariate].median()
        mean_diff = treated_mean - control_mean
        eQQ_med = treated_meadian - control_meadian
        summary[covariate] = {
            'Treated Mean': treated_mean,
            'Control Mean': control_mean,
            #'Treated SD': treated_sd,
            'Control SD': control_sd,
            'Mean Diff': mean_diff,
            'eQQ Med': eQQ_med
        }
    return pd.DataFrame(summary).T

# Variables a analizar
covariates = ['tot', 'min', 'dis'] #, 'propensity_score'

# Resumen antes del emparejamiento
balance_before = summarize_balance(mydata, 'stw', covariates)
print("Balance antes del emparejamiento:")
print(balance_before)

# Resumen después del emparejamiento
balance_after = summarize_balance(matched_data, 'stw', covariates)
print("\nBalance después del emparejamiento:")
print(balance_after)


Balance antes del emparejamiento:
     Treated Mean  Control Mean  Control SD   Mean Diff  eQQ Med
tot      832.6400    568.899821  333.674606  263.740179   253.00
min        0.1664      0.276673    0.301121   -0.110273    -0.02
dis        0.1840      0.407889    0.249954   -0.223889    -0.25

Balance después del emparejamiento:
     Treated Mean  Control Mean  Control SD  Mean Diff  eQQ Med
tot      832.6400      683.2400  351.475850    149.400    70.00
min        0.1664        0.1684    0.123682     -0.002    -0.01
dis        0.1840        0.1900    0.138022     -0.006    -0.01


In [59]:
import pandas as pd
import numpy as np

# Supongamos que tienes tus datos cargados en un DataFrame llamado mydata
# Las columnas relevantes son 'stw', 'tot', 'min', 'dis'

# Dividir los datos en grupos tratado y control
treated = mydata[mydata['stw'] == 1]
control = mydata[mydata['stw'] == 0]

# Función para calcular eQQ Median, eQQ Mean, y eQQ Max
def calculate_eQQ(treated_values, control_values):
    # Cuantiles del grupo tratado y control
    quantiles = np.linspace(0, 1, 101)  # Cuantiles de 0% a 100%
    treated_quantiles = np.quantile(treated_values, quantiles)
    control_quantiles = np.quantile(control_values, quantiles)

    # Diferencias absolutas entre cuantiles
    differences = np.abs(treated_quantiles - control_quantiles)

    # Métricas eQQ
    eqq_median = np.median(differences)
    eqq_mean = np.mean(differences)
    eqq_max = np.max(differences)

    return eqq_median, eqq_mean, eqq_max

# Calcular las métricas para cada variable
variables = ['tot', 'min', 'dis']  # Variables a analizar
results = {}

for var in variables:
    eqq_median, eqq_mean, eqq_max = calculate_eQQ(treated[var], control[var])
    results[var] = {
        'eQQ Median': eqq_median,
        'eQQ Mean': eqq_mean,
        'eQQ Max': eqq_max
    }

# Mostrar resultados
for var, metrics in results.items():
    print(f"Resultados para {var}:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value}")


Resultados para tot:
  eQQ Median: 290.79999999999984
  eQQ Mean: 280.88
  eQQ Max: 1124.0
Resultados para min:
  eQQ Median: 0.019999999999999997
  eQQ Mean: 0.12445742574257428
  eQQ Max: 0.6295999999999997
Resultados para dis:
  eQQ Median: 0.25
  eQQ Mean: 0.22732277227722775
  eQQ Max: 0.49240000000000006


In [60]:
import pandas as pd

def summarize_balance(data, treated_col, covariates):
    """
    Genera un resumen de balance para los datos.
    Calcula métricas como las medias, desviaciones estándar,
    diferencias de medias y eQQ Median.
    """
    summary = {}
    for covariate in covariates:
        # Filtrar los valores para los grupos tratado y control
        treated_values = data[data[treated_col] == 1][covariate]
        control_values = data[data[treated_col] == 0][covariate]

        # Cálculos de las métricas
        treated_mean = treated_values.mean()
        control_mean = control_values.mean()
        treated_sd = treated_values.std()
        control_sd = control_values.std()
        mean_diff = treated_mean - control_mean

        # Cálculo de eQQ Median
        treated_median = treated_values.median()
        control_median = control_values.median()
        eqq_med = treated_median - control_median

        # Guardar resultados en el resumen
        summary[covariate] = {
            'Treated Mean': treated_mean,
            'Control Mean': control_mean,
            'Control SD': control_sd,
            'Mean Diff': mean_diff,
            'eQQ Med': eqq_med
        }

    return pd.DataFrame(summary).T

# Variables a analizar
covariates = ['tot', 'min', 'dis']

# Resumen antes del emparejamiento
balance_before = summarize_balance(mydata, 'stw', covariates)
print("Balance antes del emparejamiento:")
print(balance_before)

# Supongamos que tienes un DataFrame llamado `matched_data` después del emparejamiento
# Resumen después del emparejamiento
balance_after = summarize_balance(matched_data, 'stw', covariates)
print("\nBalance después del emparejamiento:")
print(balance_after)


Balance antes del emparejamiento:
     Treated Mean  Control Mean  Control SD   Mean Diff  eQQ Med
tot      832.6400    568.899821  333.674606  263.740179   253.00
min        0.1664      0.276673    0.301121   -0.110273    -0.02
dis        0.1840      0.407889    0.249954   -0.223889    -0.25

Balance después del emparejamiento:
     Treated Mean  Control Mean  Control SD  Mean Diff  eQQ Med
tot      832.6400      683.2400  351.475850    149.400    70.00
min        0.1664        0.1684    0.123682     -0.002    -0.01
dis        0.1840        0.1900    0.138022     -0.006    -0.01
