In [1]:
"""
    Introduction:
        Goal: To predict birthweight using scikit-learn models.

    Note:   The script and the Excel file must be in the same folder.

    Known bugs: No bug check was done because we ran out of time.
"""

# importing libraries
import matplotlib.pyplot as plt # data visualization
import numpy as np # mathematical essentials
import pandas as pd # data science essentials
import seaborn as sns # enhanced data visualization
import sklearn.linear_model # linear models
import statsmodels.formula.api as smf # regression modeling
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split # train/test split
from sklearn.pipeline import make_pipeline # chaining scaler + estimator
from sklearn.preprocessing import StandardScaler # feature scaling


# widening the pandas display limits so wide/long frames print in full
for option, limit in (('display.max_rows', 500),
                      ('display.max_columns', 500),
                      ('display.width', 1000)):
    pd.set_option(option, limit)


# the workbook is expected next to this notebook
file = './birthweight_low.xlsx'


# loading the workbook into a DataFrame
birthweight_raw = pd.read_excel(file)

# previewing the first five rows
birthweight_raw.head(5)

Unnamed: 0,mage,meduc,monpre,npvis,fage,feduc,omaps,fmaps,cigs,drink,male,mwhte,mblck,moth,fwhte,fblck,foth,bwght
0,69,,5,2.0,62,,4,7,23,9,1,0,1,0,0,1,0,697
1,68,12.0,3,10.0,61,11.0,4,6,25,11,1,1,0,0,1,0,0,1290
2,71,12.0,3,6.0,46,12.0,2,7,21,12,1,0,1,0,0,1,0,1490
3,59,16.0,1,8.0,48,16.0,7,8,21,10,0,0,0,1,0,0,1,1720
4,48,12.0,4,6.0,39,12.0,2,9,17,13,0,1,0,0,1,0,0,1956


In [2]:
# listing each column's dtype and non-null count to spot missing values
birthweight_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196 entries, 0 to 195
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   mage    196 non-null    int64  
 1   meduc   193 non-null    float64
 2   monpre  196 non-null    int64  
 3   npvis   193 non-null    float64
 4   fage    196 non-null    int64  
 5   feduc   189 non-null    float64
 6   omaps   196 non-null    int64  
 7   fmaps   196 non-null    int64  
 8   cigs    196 non-null    int64  
 9   drink   196 non-null    int64  
 10  male    196 non-null    int64  
 11  mwhte   196 non-null    int64  
 12  mblck   196 non-null    int64  
 13  moth    196 non-null    int64  
 14  fwhte   196 non-null    int64  
 15  fblck   196 non-null    int64  
 16  foth    196 non-null    int64  
 17  bwght   196 non-null    int64  
dtypes: float64(3), int64(15)
memory usage: 27.7 KB


In [3]:
# Median-imputing the columns that contain nulls:
# mother's education, number of prenatal visits, father's education
for column in ['meduc', 'npvis', 'feduc']:
    fill = birthweight_raw[column].median()
    birthweight_raw[column] = birthweight_raw[column].fillna(fill)

# confirming that no missing values remain anywhere in the frame
birthweight_raw.isnull().any().any()

False

Transformations 

omaps and fmaps were removed because they are Apgar scores, which are recorded after birth and so cannot be used to predict birthweight.
source: https://www.acog.org/clinical/clinical-guidance/committee-opinion/articles/2015/10/the-apgar-score


Drink counts below 3 are set to 0, on the assumption that fewer than 3 drinks does not affect birthweight.
source : https://www.health.harvard.edu/blog/study-no-connection-between-drinking-alcohol-early-in-pregnancy-and-birth-problems-201309106667

In [4]:
# log transforming birthweight and saving it to the dataset
birthweight_raw['log_bwght'] = np.log(birthweight_raw['bwght'])


# Recoding light drinking as no drinking: every 'drink' value below 3 becomes 0.
# A single boolean-mask assignment replaces the former row-by-row iterrows()
# loop; the resulting column is the same, and re-running the cell is harmless
# because values already set to 0 still satisfy the mask.
birthweight_raw.loc[birthweight_raw['drink'] < 3, 'drink'] = 0


# NOTE: this binds a second name to the SAME DataFrame object (not a copy),
# so any later change made through either name is visible through both.
birthweight = birthweight_raw

In [5]:
# columns kept out of the explanatory set: both forms of the response,
# the two Apgar scores, month prenatal care began, and the race indicators
dropped_columns = ['bwght', 'log_bwght',
                   'omaps', 'fmaps',
                   'monpre',
                   'mwhte', 'mblck', 'moth',
                   'fwhte', 'fblck', 'foth']

# explanatory variables = everything that is left
birthweight_data = birthweight_raw.drop(columns = dropped_columns)


# response variables (original scale and log scale)
birthweight_target     = birthweight_raw['bwght']
log_birthweight_target = birthweight_raw['log_bwght']


# 75/25 train-test split with a fixed seed (all letters are lowercase)
x_train, x_test, y_train, y_test = train_test_split(birthweight_data,
                                                    birthweight_target,
                                                    test_size    = 0.25,
                                                    random_state = 219)


# reporting the dimensions of each piece of the split
print(f"""
Training Data
-------------
X-side: {x_train.shape}
y-side: {y_train.shape}


Testing Data
------------
X-side: {x_test.shape}
y-side: {y_test.shape}
""")


Training Data
-------------
X-side: (147, 8)
y-side: (147,)


Testing Data
------------
X-side: (49, 8)
y-side: (49,)



In [6]:
# applying model in scikit-learn


# response variable on its original scale
birthweight_target = birthweight['bwght']


###############################################
## setting up train-test split ##
###############################################
# FULL X-dataset (normal Y): same 75/25 proportions and seed as before
full_split = train_test_split(birthweight_data,      # x-variables
                              birthweight_target,    # y-variable
                              test_size    = 0.25,
                              random_state = 219)

# unpacking in the order train_test_split returns the pieces
x_train_FULL, x_test_FULL, y_train_FULL, y_test_FULL = full_split





In [7]:
import sklearn.linear_model # linear models

In [8]:
# INSTANTIATING a model object
# Lasso's `normalize` argument was deprecated in scikit-learn 1.0 and removed
# in 1.2, so the feature scaling is done by an explicit pipeline step instead.
# normalize=True divided each centred column by its l2 norm; StandardScaler
# divides by the standard deviation, which is that norm / sqrt(n_samples).
# Multiplying alpha by sqrt(n_samples) therefore keeps the same penalised
# objective. with_mean=False is sufficient because Lasso centres the data
# itself when fitting an intercept (its default).
lasso_alpha = 3.0 # shrinkage strength (not the default, which is 1.0)

lasso_model = make_pipeline(
    StandardScaler(with_mean = False),
    sklearn.linear_model.Lasso(
        alpha = lasso_alpha * np.sqrt(len(x_train_FULL))))
# the fitted Lasso step itself is reachable as lasso_model.named_steps['lasso']


# FITTING to the training data
lasso_fit = lasso_model.fit(x_train_FULL, y_train_FULL)


# PREDICTING on new data
lasso_pred = lasso_fit.predict(x_test_FULL)


# SCORING the results (R-square), computed once and saved for future use.
# The built-in round() is used because it works whether .score() returns a
# NumPy scalar or a plain Python float (which has no .round() method).
lasso_train_score = round(lasso_model.score(x_train_FULL, y_train_FULL), 4)
lasso_test_score  = round(lasso_model.score(x_test_FULL, y_test_FULL), 4)

# gap between training and testing performance
lasso_test_gap = round(abs(lasso_train_score - lasso_test_score), 4)


# displaying the results
print('Lasso Training Score :', lasso_train_score)
print('Lasso Testing Score  :', lasso_test_score)
print('Lasso Train-Test Gap :', lasso_test_gap)

Lasso Training Score : 0.7101
Lasso Testing Score  : 0.6608
Lasso Train-Test Gap : 0.0493


In [9]:
# comparing results: laying out the saved Lasso scores as a small text table

comparison_table = f"""
Model      Train Score      Test Score     Train-Test Gap
-----      -----------      ----------     --------------

Lasso      {lasso_train_score}           {lasso_test_score}    \
     {lasso_test_gap}

Final Model
"""

print(comparison_table)




Model      Train Score      Test Score     Train-Test Gap
-----      -----------      ----------     --------------

Lasso      0.7101           0.6608         0.0493

Final Model

