In [1]:
# importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import statsmodels.formula.api as smf
import sklearn.linear_model 
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ARDRegression

The code below reads an Excel file, can be set to another data just by changing the
path to the file.

In [2]:
#reading excel file birthweight_low.xlsx
file = './__datasets/birthweight_low.xlsx'
data = pd.read_excel(io = file, header = 0, sheet_name = 0)
data.head(5)

Unnamed: 0,mage,meduc,monpre,npvis,fage,feduc,omaps,fmaps,cigs,drink,male,mwhte,mblck,moth,fwhte,fblck,foth,bwght
0,69,,5,2.0,62,,4,7,23,9,1,0,1,0,0,1,0,697
1,68,12.0,3,10.0,61,11.0,4,6,25,11,1,1,0,0,1,0,0,1290
2,71,12.0,3,6.0,46,12.0,2,7,21,12,1,0,1,0,0,1,0,1490
3,59,16.0,1,8.0,48,16.0,7,8,21,10,0,0,0,1,0,0,1,1720
4,48,12.0,4,6.0,39,12.0,2,9,17,13,0,1,0,0,1,0,0,1956


In [3]:
# formatting and printing the dimensions of the dataset
print(f"""
Size of Original Dataset
------------------------
Observations: {data.shape[0]}
Features:     {data.shape[1]}
""")


Size of Original Dataset
------------------------
Observations: 196
Features:     18



In [4]:
# Overall information about each variable
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196 entries, 0 to 195
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   mage    196 non-null    int64  
 1   meduc   193 non-null    float64
 2   monpre  196 non-null    int64  
 3   npvis   193 non-null    float64
 4   fage    196 non-null    int64  
 5   feduc   189 non-null    float64
 6   omaps   196 non-null    int64  
 7   fmaps   196 non-null    int64  
 8   cigs    196 non-null    int64  
 9   drink   196 non-null    int64  
 10  male    196 non-null    int64  
 11  mwhte   196 non-null    int64  
 12  mblck   196 non-null    int64  
 13  moth    196 non-null    int64  
 14  fwhte   196 non-null    int64  
 15  fblck   196 non-null    int64  
 16  foth    196 non-null    int64  
 17  bwght   196 non-null    int64  
dtypes: float64(3), int64(15)
memory usage: 27.7 KB


In [5]:
# descriptive statistics for numeric data
data.describe(include = 'number').round(2)

Unnamed: 0,mage,meduc,monpre,npvis,fage,feduc,omaps,fmaps,cigs,drink,male,mwhte,mblck,moth,fwhte,fblck,foth,bwght
count,196.0,193.0,196.0,193.0,196.0,189.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0
mean,40.15,13.91,2.34,11.6,39.29,13.85,8.19,8.96,10.93,5.4,0.55,0.27,0.38,0.35,0.35,0.34,0.31,3334.09
std,10.25,2.06,1.36,4.27,8.98,2.63,1.58,0.65,6.1,3.0,0.5,0.45,0.49,0.48,0.48,0.48,0.46,646.7
min,23.0,8.0,1.0,2.0,23.0,1.0,2.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,697.0
25%,33.0,12.0,2.0,10.0,34.75,12.0,8.0,9.0,6.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2916.25
50%,39.0,14.0,2.0,12.0,38.0,14.0,9.0,9.0,11.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3452.0
75%,46.0,16.0,3.0,12.0,43.0,16.0,9.0,9.0,15.25,7.25,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3759.5
max,71.0,17.0,8.0,35.0,73.0,17.0,10.0,10.0,25.0,14.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4933.0


In [6]:
data.isnull().sum(axis = 0)

mage      0
meduc     3
monpre    0
npvis     3
fage      0
feduc     7
omaps     0
fmaps     0
cigs      0
drink     0
male      0
mwhte     0
mblck     0
moth      0
fwhte     0
fblck     0
foth      0
bwght     0
dtype: int64

<h3>Working with missing values</h3>
1) Imputing average education to mother and father education lists, average education stands for
finished high school what is likely to be true, and there are only 3 and 7 missing values, so it will
not dramatically change the model
2) For number of visits missing values will be imputed with the median, what is common number of visits

In [7]:
# instantiating an imputation value
fill = 13


# imputing 'Mother and Father Education'
data['meduc'] = data['meduc'].fillna(fill)
data['feduc'] = data['feduc'].fillna(fill)

In [8]:
# instantiating an imputation value
fill = 12.0


# imputing 'Number of Visits'
data['npvis'] = data['npvis'].fillna(fill)

In [9]:
# turning float to integer
data['npvis'] = data['npvis'].apply(np.int64)
data['meduc'] = data['meduc'].apply(np.int64)
data['feduc'] = data['feduc'].apply(np.int64)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196 entries, 0 to 195
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   mage    196 non-null    int64
 1   meduc   196 non-null    int64
 2   monpre  196 non-null    int64
 3   npvis   196 non-null    int64
 4   fage    196 non-null    int64
 5   feduc   196 non-null    int64
 6   omaps   196 non-null    int64
 7   fmaps   196 non-null    int64
 8   cigs    196 non-null    int64
 9   drink   196 non-null    int64
 10  male    196 non-null    int64
 11  mwhte   196 non-null    int64
 12  mblck   196 non-null    int64
 13  moth    196 non-null    int64
 14  fwhte   196 non-null    int64
 15  fblck   196 non-null    int64
 16  foth    196 non-null    int64
 17  bwght   196 non-null    int64
dtypes: int64(18)
memory usage: 27.7 KB


Researches showed that ofter 37 the probability of having a healthy baby drops significantly
Whats why we identifying a "risk" group mothers.

https://www.healthline.com/health/womens-health/childbearing-age#:~:text=Experts%20say%20the%20best%20time,a%20first%20child%20as%2030.5.
    
    

In [11]:
# placeholder variables 
data['mage_under37']  = 0
data['mage_over_37']    = 0


for index, value in data.iterrows():
    
    # Creating a dummy for mage under 23 years old
    if data.loc[index, 'mage'] < 37:
        data.loc[index, 'mage_under37'] = 1

        
    # Creating a dummy for mage between 23 and 35 years old
    if data.loc[index, 'mage'] >= 37:
        data.loc[index, 'mage_over_37'] = 1

        
data.head(5)

Unnamed: 0,mage,meduc,monpre,npvis,fage,feduc,omaps,fmaps,cigs,drink,male,mwhte,mblck,moth,fwhte,fblck,foth,bwght,mage_under37,mage_over_37
0,69,13,5,2,62,13,4,7,23,9,1,0,1,0,0,1,0,697,0,1
1,68,12,3,10,61,11,4,6,25,11,1,1,0,0,1,0,0,1290,0,1
2,71,12,3,6,46,12,2,7,21,12,1,0,1,0,0,1,0,1490,0,1
3,59,16,1,8,48,16,7,8,21,10,0,0,0,1,0,0,1,1720,0,1
4,48,12,4,6,39,12,2,9,17,13,0,1,0,0,1,0,0,1956,0,1


In [12]:
# placeholder variables 
data['fage_under37']  = 0
data['fage_over_37']    = 0


for index, value in data.iterrows():
    
    # Creating a dummy for mage under 23 years old
    if data.loc[index, 'fage'] < 37:
        data.loc[index, 'fage_under37'] = 1

        
    # Creating a dummy for mage between 23 and 35 years old
    if data.loc[index, 'fage'] >= 37:
        data.loc[index, 'fage_over_37'] = 1

        
data.head(5)

Unnamed: 0,mage,meduc,monpre,npvis,fage,feduc,omaps,fmaps,cigs,drink,...,mblck,moth,fwhte,fblck,foth,bwght,mage_under37,mage_over_37,fage_under37,fage_over_37
0,69,13,5,2,62,13,4,7,23,9,...,1,0,0,1,0,697,0,1,0,1
1,68,12,3,10,61,11,4,6,25,11,...,0,0,1,0,0,1290,0,1,0,1
2,71,12,3,6,46,12,2,7,21,12,...,1,0,0,1,0,1490,0,1,0,1
3,59,16,1,8,48,16,7,8,21,10,...,0,1,0,0,1,1720,0,1,0,1
4,48,12,4,6,39,12,2,9,17,13,...,0,0,1,0,0,1956,0,1,0,1


In [13]:
# applying modelin scikit-learn
x_variables = [ 'npvis', 'cigs', 'drink', 'male', 'monpre', 'mage_over_37', 'fage']

# preparing x-variables from the OLS model
lasso_data = data.loc[ : , x_variables]


# preparing response variable
target = data.loc[ : , 'bwght']

# applying modelin scikit-learn
x_variables = ['mage', 'meduc', 'monpre', 'npvis', 'fage', 'feduc', 'omaps', 'fmaps', 
'cigs', 'drink', 'male', 'mwhte', 'mblck', 'moth', 'fwhte', 'fblck', 'foth', 'fage_under37',
'fage_over_37', 'mage_over_37', 'mage_under37']

# preparing x-variables from the OLS model
lasso_data = data.loc[ : , x_variables]


# preparing response variable
target = data.loc[ : , 'bwght']

# applying modelin scikit-learn
x_variables = ['npvis', 'cigs', 'drink', 'male', 'fage_under37', 'mage_over_37']

# preparing x-variables from the OLS model
lasso_data = data.loc[ : , x_variables]


# preparing response variable
target = data.loc[ : , 'bwght']

In [14]:

###############################################
## setting up more than one train-test split ##
###############################################
# FULL X-dataset (normal Y)
x_train, x_test, y_train, y_test = train_test_split(
            lasso_data, # x-variables
            target,# y-variable
            test_size = 0.25,
            random_state = 219)


In [15]:
# preparing explanatory variable data
data   = data.drop(['bwght', 'omaps', 'fmaps', 'fage', 'feduc', 'mage', 'monpre', 'moth', 'foth', 'fwhte', 'fblck' ],
                               axis = 1)
# INSTANTIATING a model object
lr = LinearRegression()


# FITTING to the training data
lr_fit = lr.fit(x_train, y_train)


# PREDICTING on new data
lr_pred = lr_fit.predict(x_test)


# SCORING the results
print('OLS Training Score :', lr.score(x_train, y_train).round(4))  # using R-square
print('OLS Testing Score  :',  lr.score(x_test, y_test).round(4)) # using R-square

lr_train_score = lr.score(x_train, y_train).round(4)
lr_test_score  = lr.score(x_test, y_test).round(4)

# displaying and saving the gap between training and testing
print('OLS Train-Test Gap :', abs(lr_train_score - lr_test_score).round(4))
lr_test_gap = abs(lr_train_score - lr_test_score).round(4)

OLS Training Score : 0.6841
OLS Testing Score  : 0.6942
OLS Train-Test Gap : 0.0101


In [16]:
# INSTANTIATING a model object
lasso_model = sklearn.linear_model.Lasso(alpha = 1.0,
                                         normalize = True) # default magitude


# FITTING to the training data
lasso_fit = lasso_model.fit(x_train, y_train)


# PREDICTING on new data
lasso_pred = lasso_fit.predict(x_test)


# SCORING the results
print('Lasso Training Score :', lasso_model.score(x_train, y_train).round(4))
print('Lasso Testing Score  :', lasso_model.score(x_test, y_test).round(4))


## the following code has been provided for you ##

# saving scoring data for future use
lasso_train_score = lasso_model.score(x_train, y_train).round(4) # using R-square
lasso_test_score  = lasso_model.score(x_test, y_test).round(4)   # using R-square


# displaying and saving the gap between training and testing
print('Lasso Train-Test Gap :', abs(lasso_train_score - lasso_test_score).round(4))
lasso_test_gap = abs(lasso_train_score - lasso_test_score).round(4)

Lasso Training Score : 0.6827
Lasso Testing Score  : 0.706
Lasso Train-Test Gap : 0.0233


In [17]:
# zipping each feature name to its coefficient
lasso_model_values = zip(data.columns, lasso_fit.coef_.round(decimals = 2))


# setting up a placeholder list to store model features
lasso_model_lst = [('intercept', lasso_fit.intercept_.round(decimals = 2))]


# printing out each feature-coefficient pair one by one
for val in lasso_model_values:
    lasso_model_lst.append(val)
    

# checking the results
for pair in lasso_model_lst:
    print(pair)

('intercept', 4868.01)
('meduc', 0.34)
('npvis', -35.64)
('cigs', -115.27)
('drink', 36.56)
('male', -13.02)
('mwhte', -79.3)
('mblck', -11.47)


In [18]:
from sklearn.linear_model import ARDRegression 
# INSTANTIATING a model object
ard_model = ARDRegression()


# FITTING the training data
ard_fit = ard_model.fit(x_train, y_train)


# PREDICTING on new data
ard_pred = ard_fit.predict(x_test)


print('Training Score:', ard_model.score(x_train, y_train))
print('Testing Score :',  ard_model.score(x_test, y_test))


# saving scoring data for future use
ard_train_score = ard_model.score(x_train, y_train)
ard_test_score  = ard_model.score(x_test, y_test)


# displaying and saving the gap between training and testing
print('ARD Train-Test Gap :', abs(ard_train_score - ard_test_score).round(4))
ard_test_gap = abs(ard_train_score - ard_test_score).round(4)

Training Score: 0.6794886156258924
Testing Score : 0.7191626718306878
ARD Train-Test Gap : 0.0397


In [19]:
# INSTANTIATING a model object
lr = LinearRegression()


# FITTING to the training data
lr_fit = lr.fit(x_train, y_train)


# PREDICTING on new data
lr_pred = lr_fit.predict(x_test)


# SCORING the results
print('OLS Training Score :', lr.score(x_train, y_train).round(4))  # using R-square
print('OLS Testing Score  :',  lr.score(x_test, y_test).round(4)) # using R-square

lr_train_score = lr.score(x_train, y_train).round(4)
lr_test_score  = lr.score(x_test, y_test).round(4)

# displaying and saving the gap between training and testing
print('OLS Train-Test Gap :', abs(lr_train_score - lr_test_score).round(4))
lr_test_gap = abs(lr_train_score - lr_test_score).round(4)

OLS Training Score : 0.6841
OLS Testing Score  : 0.6942
OLS Train-Test Gap : 0.0101


Final model

In [20]:
from sklearn.linear_model import ARDRegression 
# INSTANTIATING a model object
ard_model = ARDRegression()


# FITTING the training data
ard_fit = ard_model.fit(x_train, y_train)


# PREDICTING on new data
ard_pred = ard_fit.predict(x_test)


print('Training Score:', ard_model.score(x_train, y_train))
print('Testing Score :',  ard_model.score(x_test, y_test))


# saving scoring data for future use
ard_train_score = ard_model.score(x_train, y_train)
ard_test_score  = ard_model.score(x_test, y_test)


# displaying and saving the gap between training and testing
print('ARD Train-Test Gap :', abs(ard_train_score - ard_test_score).round(4))
ard_test_gap = abs(ard_train_score - ard_test_score).round(4)

Training Score: 0.6794886156258924
Testing Score : 0.7191626718306878
ARD Train-Test Gap : 0.0397
