In [1]:
# import the libraries to use
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_theme()

# Step 1: Problem statement and data collection

We can see the data information in the page <https://raw.githubusercontent.com/4GeeksAcademy/regularized-linear-regression-project-tutorial/main/demographic_health_data.csv>

**IMPORTANT**

The csv file has a problem with the naming of a column, that name was changed in the local CSV file (the '19-Oct' was changed to '10-19').

In [2]:
from src.utils import load_data, ReadCsvParams, SaveCsvParams

file_path = '../data/raw/demographic_health_data.csv'
url = 'https://raw.githubusercontent.com/4GeeksAcademy/regularized-linear-regression-project-tutorial/main/demographic_health_data.csv'
read_csv_params: ReadCsvParams = {'delimiter': ','}
save_csv_params: SaveCsvParams = {'sep': ','}

df: pd.DataFrame = load_data(
    file_path=file_path, url=url, read_csv_params=read_csv_params, save_csv_params=save_csv_params
)

Loading data from file: ../data/raw/demographic_health_data.csv


## Problem to solve:
Create a regression model to predict a health-related target value. We chose the target value "anycondition_number".

# Step 2: Exploration and data cleaning

### Dataframe information

Let's see how is the data, the info and a little of its distribution.

In [3]:
# head of the dataframe
df.head()

Unnamed: 0,fips,TOT_POP,0-9,0-9 y/o % of total pop,10-19,10-19 y/o % of total pop,20-29,20-29 y/o % of total pop,30-39,30-39 y/o % of total pop,...,COPD_number,diabetes_prevalence,diabetes_Lower 95% CI,diabetes_Upper 95% CI,diabetes_number,CKD_prevalence,CKD_Lower 95% CI,CKD_Upper 95% CI,CKD_number,Urban_rural_code
0,1001,55601,6787,12.206615,7637,13.735364,6878,12.370281,7089,12.749771,...,3644,12.9,11.9,13.8,5462,3.1,2.9,3.3,1326,3
1,1003,218022,24757,11.355276,26913,12.344167,23579,10.814964,25213,11.564429,...,14692,12.0,11.0,13.1,20520,3.2,3.0,3.5,5479,4
2,1005,24881,2732,10.980266,2960,11.896628,3268,13.13452,3201,12.865239,...,2373,19.7,18.6,20.6,3870,4.5,4.2,4.8,887,6
3,1007,22400,2456,10.964286,2596,11.589286,3029,13.522321,3113,13.897321,...,1789,14.1,13.2,14.9,2511,3.3,3.1,3.6,595,2
4,1009,57840,7095,12.266598,7570,13.087828,6742,11.656293,6884,11.901798,...,4661,13.5,12.6,14.5,6017,3.4,3.2,3.7,1507,2


In [4]:
# tail of the dataframe
df.tail()

Unnamed: 0,fips,TOT_POP,0-9,0-9 y/o % of total pop,10-19,10-19 y/o % of total pop,20-29,20-29 y/o % of total pop,30-39,30-39 y/o % of total pop,...,COPD_number,diabetes_prevalence,diabetes_Lower 95% CI,diabetes_Upper 95% CI,diabetes_number,CKD_prevalence,CKD_Lower 95% CI,CKD_Upper 95% CI,CKD_number,Urban_rural_code
3135,56037,43051,6104,14.178532,6326,14.6942,5359,12.448027,6577,15.277229,...,2098,8.9,8.3,9.6,2834,2.6,2.4,2.8,821,5
3136,56039,23081,2384,10.328842,2185,9.466661,2967,12.854729,4093,17.7332,...,928,7.2,6.5,8.0,1360,2.4,2.2,2.6,447,5
3137,56041,20299,3121,15.375142,3205,15.788955,2153,10.606434,2702,13.311001,...,1163,10.4,9.5,11.2,1500,3.0,2.8,3.2,430,5
3138,56043,7885,858,10.88142,1113,14.115409,715,9.06785,903,11.452124,...,506,11.3,10.3,12.1,686,3.4,3.2,3.7,207,6
3139,56045,6967,780,11.195637,779,11.181283,681,9.774652,906,13.004162,...,480,11.7,10.7,12.7,644,3.4,3.1,3.6,185,6


In [5]:
# info of the dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3140 entries, 0 to 3139
Columns: 108 entries, fips to Urban_rural_code
dtypes: float64(61), int64(45), object(2)
memory usage: 2.6+ MB


In [6]:
# describe the dataframe
df.describe()

Unnamed: 0,fips,TOT_POP,0-9,0-9 y/o % of total pop,10-19,10-19 y/o % of total pop,20-29,20-29 y/o % of total pop,30-39,30-39 y/o % of total pop,...,COPD_number,diabetes_prevalence,diabetes_Lower 95% CI,diabetes_Upper 95% CI,diabetes_number,CKD_prevalence,CKD_Lower 95% CI,CKD_Upper 95% CI,CKD_number,Urban_rural_code
count,3140.0,3140.0,3140.0,3140.0,3140.0,3140.0,3140.0,3140.0,3140.0,3140.0,...,3140.0,3140.0,3140.0,3140.0,3140.0,3140.0,3140.0,3140.0,3140.0,3140.0
mean,30401.640764,104189.4,12740.3,11.871051,13367.98,12.694609,14469.33,12.283979,13916.49,11.751535,...,5827.242357,13.073503,12.088089,14.053726,9326.577707,3.446242,3.207516,3.710478,2466.234076,4.63535
std,15150.559265,333583.4,41807.3,2.124081,42284.39,1.815044,49577.73,3.126297,48990.95,1.696599,...,15720.551934,2.724351,2.622948,2.824828,29754.601185,0.568059,0.52774,0.613069,7730.422067,1.510447
min,1001.0,88.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,6.092789,...,7.0,6.1,5.5,6.7,11.0,1.8,1.7,1.9,3.0,1.0
25%,18180.5,10963.25,1280.5,10.594639,1374.5,11.674504,1263.75,10.496774,1232.75,10.689322,...,815.0,11.2,10.3,12.1,1187.75,3.1,2.9,3.3,314.75,3.0
50%,29178.0,25800.5,3057.0,11.802727,3274.0,12.687422,3108.0,11.772649,3000.5,11.580861,...,1963.5,12.8,11.8,13.8,2743.0,3.4,3.2,3.7,718.0,5.0
75%,45081.5,67913.0,8097.0,12.95184,8822.25,13.659282,8976.25,13.18226,8314.25,12.639379,...,4727.0,14.8,13.7,15.9,6679.25,3.8,3.5,4.1,1776.25,6.0
max,56045.0,10105520.0,1208253.0,25.460677,1239139.0,23.304372,1557073.0,37.570198,1501844.0,22.225129,...,434075.0,25.6,24.2,27.0,952335.0,6.2,5.8,6.6,237766.0,6.0


## Cols for the different types of data

In [7]:
# categorical columns
categorical_cols: list[str] = ['COUNTY_NAME', 'STATE_NAME']

# target variable
target: str = 'anycondition_number'

## Eliminate duplicates

In [8]:
df.duplicated(subset=['fips']).sum()

0

Not duplicates, so we do not have to do any work.

## Drop the fips feature

In [9]:
df.drop('fips', axis=1, inplace=True)

## Missing value analysis

A **missing** value is a space that has no value assigned to it in the observation of a specific variable. These types of values are quite common and can arise for many reasons. For example, there could be an error in data collection, someone may have refused to answer a question in a survey, or it could simply be that certain information is not available or not applicable.

In [10]:
# verify non values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3140 entries, 0 to 3139
Columns: 107 entries, TOT_POP to Urban_rural_code
dtypes: float64(61), int64(44), object(2)
memory usage: 2.6+ MB


No missing values.

# Step 3. Divide the set into train and test,

In [11]:
from sklearn.model_selection import train_test_split


# variables
X: pd.DataFrame = df.drop(target, axis=1)
y: pd.Series = df[target]

# divide the dataset into training and test samples
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [12]:
X_train

Unnamed: 0,TOT_POP,0-9,0-9 y/o % of total pop,10-19,10-19 y/o % of total pop,20-29,20-29 y/o % of total pop,30-39,30-39 y/o % of total pop,40-49,...,COPD_number,diabetes_prevalence,diabetes_Lower 95% CI,diabetes_Upper 95% CI,diabetes_number,CKD_prevalence,CKD_Lower 95% CI,CKD_Upper 95% CI,CKD_number,Urban_rural_code
1292,26625,3221,12.097653,3463,13.006573,2922,10.974648,2829,10.625352,2831,...,2314,13.7,12.6,14.9,2823,3.8,3.5,4.1,771,6
2302,51266,5272,10.283619,5751,11.217961,5137,10.020286,5341,10.418211,5880,...,4097,13.1,11.9,14.2,5416,3.5,3.2,3.8,1454,5
761,37779,3915,10.362900,5118,13.547209,6202,16.416528,4363,11.548744,4451,...,2792,12.2,11.2,13.1,3698,2.9,2.7,3.1,871,2
2194,91984,11163,12.135806,12646,13.748043,11595,12.605453,11357,12.346712,11444,...,5716,11.2,10.4,12.0,7913,3.0,2.8,3.2,2118,3
1241,134487,16698,12.416070,17666,13.135842,17281,12.849569,15993,11.891856,15845,...,10002,12.5,11.7,13.4,12987,3.4,3.2,3.6,3490,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3092,42555,4478,10.522853,6520,15.321349,6989,16.423452,4822,11.331218,4879,...,1914,8.0,7.3,8.7,2699,2.5,2.3,2.7,838,2
1095,56031,7624,13.606753,7993,14.265317,7669,13.687066,7740,13.813782,7959,...,3631,11.1,10.2,12.0,4670,2.6,2.4,2.8,1094,3
1130,33443,4680,13.993960,4568,13.659062,4778,14.286996,4199,12.555692,3763,...,3111,16.0,15.0,16.9,3999,4.0,3.8,4.3,1013,6
1294,5795,331,5.711821,493,8.507334,335,5.780846,379,6.540121,540,...,661,16.1,14.6,17.7,816,4.5,4.2,5.0,229,6


# Step 4. Feature scaling and OneHotEncoder

We are going to scale the data using a MinMaxScaler and use OneHotEncoder for categorical values.

In [13]:
# calculate the columns to scale
columns_to_scale = df.drop(columns=categorical_cols + [target], axis=1).columns.tolist()
columns_to_scale

['TOT_POP',
 '0-9',
 '0-9 y/o % of total pop',
 '10-19',
 '10-19 y/o % of total pop',
 '20-29',
 '20-29 y/o % of total pop',
 '30-39',
 '30-39 y/o % of total pop',
 '40-49',
 '40-49 y/o % of total pop',
 '50-59',
 '50-59 y/o % of total pop',
 '60-69',
 '60-69 y/o % of total pop',
 '70-79',
 '70-79 y/o % of total pop',
 '80+',
 '80+ y/o % of total pop',
 'White-alone pop',
 '% White-alone',
 'Black-alone pop',
 '% Black-alone',
 'Native American/American Indian-alone pop',
 '% NA/AI-alone',
 'Asian-alone pop',
 '% Asian-alone',
 'Hawaiian/Pacific Islander-alone pop',
 '% Hawaiian/PI-alone',
 'Two or more races pop',
 '% Two or more races',
 'POP_ESTIMATE_2018',
 'N_POP_CHG_2018',
 'GQ_ESTIMATES_2018',
 'R_birth_2018',
 'R_death_2018',
 'R_NATURAL_INC_2018',
 'R_INTERNATIONAL_MIG_2018',
 'R_DOMESTIC_MIG_2018',
 'R_NET_MIG_2018',
 'Less than a high school diploma 2014-18',
 'High school diploma only 2014-18',
 "Some college or associate's degree 2014-18",
 "Bachelor's degree or higher 201

In [14]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer


# create a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', MinMaxScaler(), columns_to_scale),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
    ],
    remainder='passthrough'  # keep other columns unchanged
)

# fit and transform to the train data and transform the test data
train_transformed_data = preprocessor.fit_transform(X_train)
test_transformed_data = preprocessor.transform(X_test)

# get the output feature names after transformation
feature_names = preprocessor.get_feature_names_out()

# create a new DataFrame with the transformed data
X_train_transformed = pd.DataFrame(train_transformed_data, columns=feature_names)
X_test_transformed = pd.DataFrame(test_transformed_data, columns=feature_names)

# Step 6: Save the data

In [15]:
from src.constants import X_TRAIN_PATH, X_TEST_PATH, Y_TRAIN_PATH, Y_TEST_PATH

# save the processed data to their corresponding files
X_train_transformed.to_csv(path_or_buf=X_TRAIN_PATH, sep=',', index=False, )
X_test_transformed.to_csv(path_or_buf=X_TEST_PATH, sep=',', index=False, )

y_train.to_csv(path_or_buf=Y_TRAIN_PATH, sep=',', index=False, )
y_test.to_csv(path_or_buf=Y_TEST_PATH, sep=',', index=False, )