In [2]:
# 1. Librerías
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Plot defaults: seaborn "whitegrid" style, then the default figure size
# (same statement order as before, so the size is assigned last).
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 6)})

# Load the dataset from the remote CSV and report its dimensions.
url = (
    "https://breathecode.herokuapp.com/asset/internal-link"
    "?id=733&path=demographic_health_data.csv"
)
df = pd.read_csv(url)
print("Dataset cargado. Shape:", df.shape)


Dataset cargado. Shape: (3140, 108)


In [3]:
# 2. Initial inspection: column names, a preview of the first rows, and the
# 20 columns with the highest count of missing values.
print("Columnas del dataset:", df.columns.tolist(), sep="\n")

print("\nPrimeras filas:")
display(df.head())

print("\nValores nulos por columna (top 20):")
display(
    df.isna()                       # same boolean mask as isnull()
      .sum()                        # missing-value count per column
      .sort_values(ascending=False)
      .iloc[:20]
)


Columnas del dataset:
['fips', 'TOT_POP', '0-9', '0-9 y/o % of total pop', '19-Oct', '10-19 y/o % of total pop', '20-29', '20-29 y/o % of total pop', '30-39', '30-39 y/o % of total pop', '40-49', '40-49 y/o % of total pop', '50-59', '50-59 y/o % of total pop', '60-69', '60-69 y/o % of total pop', '70-79', '70-79 y/o % of total pop', '80+', '80+ y/o % of total pop', 'White-alone pop', '% White-alone', 'Black-alone pop', '% Black-alone', 'Native American/American Indian-alone pop', '% NA/AI-alone', 'Asian-alone pop', '% Asian-alone', 'Hawaiian/Pacific Islander-alone pop', '% Hawaiian/PI-alone', 'Two or more races pop', '% Two or more races', 'POP_ESTIMATE_2018', 'N_POP_CHG_2018', 'GQ_ESTIMATES_2018', 'R_birth_2018', 'R_death_2018', 'R_NATURAL_INC_2018', 'R_INTERNATIONAL_MIG_2018', 'R_DOMESTIC_MIG_2018', 'R_NET_MIG_2018', 'Less than a high school diploma 2014-18', 'High school diploma only 2014-18', "Some college or associate's degree 2014-18", "Bachelor's degree or higher 2014-18", 'Perc

Unnamed: 0,fips,TOT_POP,0-9,0-9 y/o % of total pop,19-Oct,10-19 y/o % of total pop,20-29,20-29 y/o % of total pop,30-39,30-39 y/o % of total pop,...,COPD_number,diabetes_prevalence,diabetes_Lower 95% CI,diabetes_Upper 95% CI,diabetes_number,CKD_prevalence,CKD_Lower 95% CI,CKD_Upper 95% CI,CKD_number,Urban_rural_code
0,1001,55601,6787,12.206615,7637,13.735364,6878,12.370281,7089,12.749771,...,3644,12.9,11.9,13.8,5462,3.1,2.9,3.3,1326,3
1,1003,218022,24757,11.355276,26913,12.344167,23579,10.814964,25213,11.564429,...,14692,12.0,11.0,13.1,20520,3.2,3.0,3.5,5479,4
2,1005,24881,2732,10.980266,2960,11.896628,3268,13.13452,3201,12.865239,...,2373,19.7,18.6,20.6,3870,4.5,4.2,4.8,887,6
3,1007,22400,2456,10.964286,2596,11.589286,3029,13.522321,3113,13.897321,...,1789,14.1,13.2,14.9,2511,3.3,3.1,3.6,595,2
4,1009,57840,7095,12.266598,7570,13.087828,6742,11.656293,6884,11.901798,...,4661,13.5,12.6,14.5,6017,3.4,3.2,3.7,1507,2



Valores nulos por columna (top 20):


fips                        0
TOT_POP                     0
0-9                         0
0-9 y/o % of total pop      0
19-Oct                      0
10-19 y/o % of total pop    0
20-29                       0
20-29 y/o % of total pop    0
30-39                       0
30-39 y/o % of total pop    0
40-49                       0
40-49 y/o % of total pop    0
50-59                       0
50-59 y/o % of total pop    0
60-69                       0
60-69 y/o % of total pop    0
70-79                       0
70-79 y/o % of total pop    0
80+                         0
80+ y/o % of total pop      0
dtype: int64

A continuación el notebook selecciona automáticamente una variable objetivo entre una lista de candidatas típicas en este tipo de datasets (por ejemplo: 'Life_expectancy', 'mortality_rate', 'poor_health_days', 'obesity_rate', 'uninsured_rate', 'preventable_hospitalizations', 'diabetes_prevalence').  
Si encuentra varias, selecciona la primera en la lista. Si no encuentra ninguna, te pedirá elegir manualmente.


In [10]:
# Print every column name preceded by its 1-based position.
# The number field is as wide as the largest position (never narrower than the
# original two characters), so names stay aligned once the frame has 100 or
# more columns; output for up to 99 columns is unchanged.
print("=== LISTA COMPLETA DE VARIABLES ===")
ancho = max(2, len(str(len(df.columns))))
for numero, columna in enumerate(df.columns, start=1):
    print(f"{numero:{ancho}d}. {columna}")

=== LISTA COMPLETA DE VARIABLES ===
 1. fips
 2. TOT_POP
 3. 0-9
 4. 0-9 y/o % of total pop
 5. 19-Oct
 6. 10-19 y/o % of total pop
 7. 20-29
 8. 20-29 y/o % of total pop
 9. 30-39
10. 30-39 y/o % of total pop
11. 40-49
12. 40-49 y/o % of total pop
13. 50-59
14. 50-59 y/o % of total pop
15. 60-69
16. 60-69 y/o % of total pop
17. 70-79
18. 70-79 y/o % of total pop
19. 80+
20. 80+ y/o % of total pop
21. White-alone pop
22. % White-alone
23. Black-alone pop
24. % Black-alone
25. Native American/American Indian-alone pop
26. % NA/AI-alone
27. Asian-alone pop
28. % Asian-alone
29. Hawaiian/Pacific Islander-alone pop
30. % Hawaiian/PI-alone
31. Two or more races pop
32. % Two or more races
33. POP_ESTIMATE_2018
34. N_POP_CHG_2018
35. GQ_ESTIMATES_2018
36. R_birth_2018
37. R_death_2018
38. R_NATURAL_INC_2018
39. R_INTERNATIONAL_MIG_2018
40. R_DOMESTIC_MIG_2018
41. R_NET_MIG_2018
42. Less than a high school diploma 2014-18
43. High school diploma only 2014-18
44. Some college or associate's de

In [11]:
# Search for candidate target variables: keep every column whose lowercased
# name contains at least one of the healthcare-resource keywords below.
print("RECURSOS SANITARIOS (OBJETIVO):")
palabras_clave = ('physician', 'hospital', 'nurse', 'bed', 'medical', 'care')
objetivos = []
for col in df.columns:
    nombre = col.lower()
    # Plain substring test, so 'care' also matches names such as "... Primary Care".
    if any(palabra in nombre for palabra in palabras_clave):
        objetivos.append(col)

# List the matches, one bullet per column, in their original column order.
for obj in objetivos:
    print(f"• {obj}")

RECURSOS SANITARIOS (OBJETIVO):
• Active Physicians per 100000 Population 2018 (AAMC)
• Total Active Patient Care Physicians per 100000 Population 2018 (AAMC)
• Active Primary Care Physicians per 100000 Population 2018 (AAMC)
• Active Patient Care Primary Care Physicians per 100000 Population 2018 (AAMC)
• Active Patient Care General Surgeons per 100000 Population 2018 (AAMC)
• Total nurse practitioners (2019)
• Total physician assistants (2019)
• Total Hospitals (2019)
• Internal Medicine Primary Care (2019)
• Family Medicine/General Practice Primary Care (2019)
• Total Specialist Physicians (2019)
• ICU Beds_x
