In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

def load_survey(path: str = 'merged_nhis.csv') -> pd.DataFrame:
    """
    Load the merged NHIS CSV and strip pandas' auto-generated columns.

    Parameters
    ----------
    path : str
        Location of the CSV file to read. Defaults to 'merged_nhis.csv'.

    Returns
    -------
    pd.DataFrame
        The CSV contents without any column whose name begins with
        'Unnamed', indexed 0..n-1.
    """
    raw = pd.read_csv(path)

    # Columns named "Unnamed: <n>" appear when a CSV was written with its
    # index but without a header for that index column.
    is_unnamed = raw.columns.str.contains('^Unnamed')
    kept = raw.loc[:, ~is_unnamed]

    return kept.reset_index(drop=True)

# Load the merged NHIS extract from the default path.
survey = load_survey()

# Print explicitly: a bare expression is only echoed when it is the final
# statement of a notebook cell, so the preview would otherwise be discarded.
print(survey.head())

# Per-column count of missing values.
print(survey.isnull().sum())

In [None]:
def fit_age_model(data: pd.DataFrame) -> sm.regression.linear_model.RegressionResultsWrapper:
    """
    Fit an OLS regression of diabetes status on age.

    The regressors are the 'Age' column with a constant added by
    ``sm.add_constant``. Rows with a missing value in either 'Age' or
    'Diabetes' are excluded from the fit.

    Parameters
    ----------
    data : pd.DataFrame
        Survey dataframe containing 'Age' and 'Diabetes' columns.

    Returns
    -------
    RegressionResultsWrapper
        Fitted statsmodels OLS results object.

    Raises
    ------
    KeyError
        If 'Age' or 'Diabetes' is not a column of ``data``.
    """
    # NOTE(review): if 'Diabetes' is a 0/1 indicator, this is a linear
    # probability model -- confirm that is the intended specification.
    X_const = sm.add_constant(data['Age'])

    # statsmodels' default (missing='none') does no NaN checking; 'drop'
    # removes incomplete observations and is a no-op on complete data.
    model = sm.OLS(data['Diabetes'], X_const, missing='drop').fit()
    return model

# Fit the age-only regression and print the statsmodels summary table.
age_model = fit_age_model(survey)
print(age_model.summary())

# Scatter of the observations with a red regression line; the title is set
# on the Axes that regplot returns.
sns.regplot(
    data=survey,
    x='Age',
    y='Diabetes',
    scatter_kws={'s': 10},
    line_kws={'color': 'red'},
).set_title('Diabetes vs. Age')
plt.show()

In [None]:
def fit_sex_model(data: pd.DataFrame) -> sm.regression.linear_model.RegressionResultsWrapper:
    """
    Fit an OLS regression of diabetes status on sex.

    The regressors are the 'Sex' column with a constant added by
    ``sm.add_constant``. Rows with a missing value in either 'Sex' or
    'Diabetes' are excluded from the fit.

    Parameters
    ----------
    data : pd.DataFrame
        Survey dataframe containing 'Sex' and 'Diabetes' columns.

    Returns
    -------
    RegressionResultsWrapper
        Fitted statsmodels OLS results object.

    Raises
    ------
    KeyError
        If 'Sex' or 'Diabetes' is not a column of ``data``.
    """
    # NOTE(review): assumes 'Sex' is numerically coded; a string-valued
    # column would need dummy encoding first -- confirm against the data.
    X_const = sm.add_constant(data['Sex'])

    # statsmodels' default (missing='none') does no NaN checking; 'drop'
    # removes incomplete observations and is a no-op on complete data.
    model = sm.OLS(data['Diabetes'], X_const, missing='drop').fit()
    return model

# Fit the sex-only regression and print the statsmodels summary table.
sex_model = fit_sex_model(survey)
print(sex_model.summary())

# Box plot of 'Diabetes' grouped by 'Sex'; the title is set on the Axes
# that boxplot returns.
sns.boxplot(
    data=survey,
    x='Sex',
    y='Diabetes',
).set_title('Diabetes by Sex')
plt.show()