# Explore here

In [3]:
# Standard library imports
from pathlib import Path

# Third-party imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.preprocessing import OrdinalEncoder, PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.exceptions import ConvergenceWarning

# Local imports
from pathlib import Path
print("CWD:", Path.cwd())
print("Found:", list(Path(".").rglob("helper_functions.py")))

# Set display options for pandas
pd.set_option('display.max_rows', 100)

CWD: /workspaces/Jack-Porter-s-Linear-Regression-Using-Pandas-Project/src
Found: []


In [5]:
# Load and save the data

data_url = 'https://raw.githubusercontent.com/4GeeksAcademy/regularized-linear-regression-project-tutorial/main/demographic_health_data.csv'
data_df = pd.read_csv(data_url, sep=',')

Path('../data/raw').mkdir(parents=True, exist_ok=True)

In [6]:
# Inspect

data_df.head().transpose()
data_df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3140 entries, 0 to 3139
Data columns (total 108 columns):
 #    Column                                                                         Non-Null Count  Dtype  
---   ------                                                                         --------------  -----  
 0    fips                                                                           3140 non-null   int64  
 1    TOT_POP                                                                        3140 non-null   int64  
 2    0-9                                                                            3140 non-null   int64  
 3    0-9 y/o % of total pop                                                         3140 non-null   float64
 4    19-Oct                                                                         3140 non-null   int64  
 5    10-19 y/o % of total pop                                                       3140 non-null   float64
 6    20-29         

In [7]:
# Initial Feature Selection
# Our label is the rate of any condition per 100 people
label = data_df['anycondition_number'] / (data_df['TOT_POP'] / 100)

In [8]:
# Age Features
# Select age-related features
age_features = data_df.columns[2:20]
age_df = data_df[age_features].copy()

# Remove age count features, keeping only percentages
feature_drops = age_df.filter(regex = '^\d+-\d+$').columns
age_df.drop(columns=feature_drops, inplace=True)
age_df.drop(columns=['19-Oct', '80+'], inplace=True)

age_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3140 entries, 0 to 3139
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   0-9 y/o % of total pop    3140 non-null   float64
 1   10-19 y/o % of total pop  3140 non-null   float64
 2   20-29 y/o % of total pop  3140 non-null   float64
 3   30-39 y/o % of total pop  3140 non-null   float64
 4   40-49 y/o % of total pop  3140 non-null   float64
 5   50-59 y/o % of total pop  3140 non-null   float64
 6   60-69 y/o % of total pop  3140 non-null   float64
 7   70-79 y/o % of total pop  3140 non-null   float64
 8   80+ y/o % of total pop    3140 non-null   float64
dtypes: float64(9)
memory usage: 220.9 KB


In [9]:
# Ethnicity Features
# Select Ethnicity-related features
ethnicity_features = data_df.columns[20:32]
ethnicity_df = data_df[ethnicity_features].copy()

# Remove count features, keeping only percentages
features = ethnicity_df.filter(regex = '^\%').columns
ethnicity_df = ethnicity_df[features]

ethnicity_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3140 entries, 0 to 3139
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   % White-alone        3140 non-null   float64
 1   % Black-alone        3140 non-null   float64
 2   % NA/AI-alone        3140 non-null   float64
 3   % Asian-alone        3140 non-null   float64
 4   % Hawaiian/PI-alone  3140 non-null   float64
 5   % Two or more races  3140 non-null   float64
dtypes: float64(6)
memory usage: 147.3 KB


In [10]:
# Population Features
population_features = ['TOT_POP', 'POP_ESTIMATE_2018', 'N_POP_CHG_2018', 'R_birth_2018', 'R_death_2018']
population_df = data_df[population_features].copy()
population_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3140 entries, 0 to 3139
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   TOT_POP            3140 non-null   int64  
 1   POP_ESTIMATE_2018  3140 non-null   int64  
 2   N_POP_CHG_2018     3140 non-null   int64  
 3   R_birth_2018       3140 non-null   float64
 4   R_death_2018       3140 non-null   float64
dtypes: float64(2), int64(3)
memory usage: 122.8 KB


In [11]:
# Educationb Features
education_features = data_df.columns[45:49]
education_df = data_df[education_features].copy()
education_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3140 entries, 0 to 3139
Data columns (total 4 columns):
 #   Column                                                                   Non-Null Count  Dtype  
---  ------                                                                   --------------  -----  
 0   Percent of adults with less than a high school diploma 2014-18           3140 non-null   float64
 1   Percent of adults with a high school diploma only 2014-18                3140 non-null   float64
 2   Percent of adults completing some college or associate's degree 2014-18  3140 non-null   float64
 3   Percent of adults with a bachelor's degree or higher 2014-18             3140 non-null   float64
dtypes: float64(4)
memory usage: 98.3 KB


In [12]:
# Employment Features
employment_features = data_df.columns[56:62]
employment_df = data_df[employment_features].copy()
employment_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3140 entries, 0 to 3139
Data columns (total 6 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Civilian_labor_force_2018                  3140 non-null   int64  
 1   Employed_2018                              3140 non-null   int64  
 2   Unemployed_2018                            3140 non-null   int64  
 3   Unemployment_rate_2018                     3140 non-null   float64
 4   Median_Household_Income_2018               3140 non-null   int64  
 5   Med_HH_Income_Percent_of_State_Total_2018  3140 non-null   float64
dtypes: float64(2), int64(4)
memory usage: 147.3 KB


In [13]:
# Healthcare Access Features
healthcare_features = data_df.columns[62:75]
healthcare_df = data_df[healthcare_features].copy()
healthcare_df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3140 entries, 0 to 3139
Data columns (total 13 columns):
 #   Column                                                                         Non-Null Count  Dtype  
---  ------                                                                         --------------  -----  
 0   Active Physicians per 100000 Population 2018 (AAMC)                            3140 non-null   float64
 1   Total Active Patient Care Physicians per 100000 Population 2018 (AAMC)         3140 non-null   float64
 2   Active Primary Care Physicians per 100000 Population 2018 (AAMC)               3140 non-null   float64
 3   Active Patient Care Primary Care Physicians per 100000 Population 2018 (AAMC)  3140 non-null   float64
 4   Active General Surgeons per 100000 Population 2018 (AAMC)                      3140 non-null   float64
 5   Active Patient Care General Surgeons per 100000 Population 2018 (AAMC)         3140 non-null   float64
 6   Total nurse practitioner

In [14]:
# Other Features
other_features = data_df.columns[78:82]
other_df = data_df[other_features].copy()
other_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3140 entries, 0 to 3139
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   COUNTY_NAME  3140 non-null   object
 1   STATE_NAME   3140 non-null   object
 2   STATE_FIPS   3140 non-null   int64 
 3   CNTY_FIPS    3140 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 98.3+ KB


In [16]:
# Recombine Selected Features
data_df = pd.concat([age_df,ethnicity_df,population_df,education_df,employment_df,healthcare_df,other_df], axis=1)
data_df['morbidity'] = label

# Rename features with shorter, more plot-friendly names
column_renames = {
    # Age features (percentages)
 
    '0-9 y/o % of total pop': 'pct_0-9',
    '10-19 y/o % of total pop': 'pct_10-19',
    '20-29 y/o % of total pop': 'pct_20-29',
    '30-39 y/o % of total pop': 'pct_30-39',
    '40-49 y/o % of total pop': 'pct_40-49',
    '50-59 y/o % of total pop': 'pct_50-59',
    '60-69 y/o % of total pop': 'pct_60-69',
    '70-79 y/o % of total pop': 'pct_70-79',
    '80+ y/o % of total pop': 'pct_80+',
    
    # Ethnicity features
    '% Not Hispanic or Latino': 'pct_non_hispanic',
    '% Hispanic or Latino': 'pct_hispanic',
    '% American Indian and Alaska Native Alone': 'pct_native_american',
    '% Asian Alone': 'pct_asian',
    '% Black or African American Alone': 'pct_black',
    '% Native Hawaiian and Other Pacific Islander Alone': 'pct_pacific_islander',
    '% Two or More Races': 'pct_multiracial',
    '% White Alone': 'pct_white',
    
    # Population features
    'TOT_POP': 'total_population',
    'POP_ESTIMATE_2018': 'pop_estimate_2018',
    'N_POP_CHG_2018': 'pop_change_2018',
    'R_birth_2018': 'birth_rate_2018',
    'R_death_2018': 'death_rate_2018',
    
    # Education features
    'Percent of adults with less than a high school diploma 2014-18': 'pct_no_hs_diploma',
    'Percent of adults with a high school diploma only 2014-18': 'pct_hs_only',
    'Percent of adults completing some college or associate\'s degree 2014-18': 'pct_some_college',
    'Percent of adults with a bachelor\'s degree or higher 2014-18': 'pct_bachelor_plus',
    
    # Employment features
    'Civilian_labor_force_2018': 'civilian_labor_force',
    'Employed_2018': 'employed_2018',
    'Unemployed_2018': 'unemployed_2018',
    'Unemployment_rate_2018': 'unemployment_rate',
    'Median_Household_Income_2018': 'median_household_income',
    'Med_HH_Income_Percent_of_State_Total_2018': 'income_pct_of_state',
    
    # Healthcare access features
    'Active Physicians per 100000 Population 2018 (AAMC)': 'active_physicians_per_100k',
    'Active Patient Care Primary Care Physicians per 100000 Population 2018 (AAMC)': 'primary_physicians_per_100k',
    'Active Primary Care Physicians per 100000 Population 2018 (AAMC)': 'active_primary_care_per_100k',
    'Active General Surgeons per 100000 Population 2018 (AAMC)': 'general_surgeons_per_100k',
    'Active Patient Care General Surgeons per 100000 Population 2018 (AAMC)': 'surgeons_per_100k',
    'Total Active Patient Care Physicians per 100000 Population 2018 (AAMC)': 'total_physicians_per_100k',
    'Total nurse practitioners (2019)': 'total_nurse_practitioners',
    'Total physician assistants (2019)': 'total_physician_assistants',
    'Total Hospitals (2019)': 'total_hospitals',
    'Internal Medicine Primary Care (2019)': 'internal_medicine_physicians',
    'Family Medicine/General Practice Primary Care (2019)': 'family_medicine_physicians',
    'Total Specialist Physicians (2019)': 'total_specialist_physicians',
}

data_df.rename(columns=column_renames, inplace=True)
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3140 entries, 0 to 3139
Data columns (total 48 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   pct_0-9                       3140 non-null   float64
 1   pct_10-19                     3140 non-null   float64
 2   pct_20-29                     3140 non-null   float64
 3   pct_30-39                     3140 non-null   float64
 4   pct_40-49                     3140 non-null   float64
 5   pct_50-59                     3140 non-null   float64
 6   pct_60-69                     3140 non-null   float64
 7   pct_70-79                     3140 non-null   float64
 8   pct_80+                       3140 non-null   float64
 9   % White-alone                 3140 non-null   float64
 10  % Black-alone                 3140 non-null   float64
 11  % NA/AI-alone                 3140 non-null   float64
 12  % Asian-alone                 3140 non-null   float64
 13  % H