In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import pandas as pd
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)
#data source: https://data.nber.org/data/vital-statistics-natality-data.html

In [28]:
#import the csv file via spark way
#create DF
births = spark.read.csv('nat18.csv', header = True)

In [29]:
#print the data schema
births.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- birth_place: string (nullable = true)
 |-- mothers_age: string (nullable = true)
 |-- fathers_age: string (nullable = true)
 |-- prental_care: string (nullable = true)
 |-- cigarettes_before_pregnancy: string (nullable = true)
 |-- cigarettes_1_trimester: string (nullable = true)
 |-- cigarettes_2_trimester: string (nullable = true)
 |-- cigarettes_3_trimester: string (nullable = true)
 |-- mothers_height: string (nullable = true)
 |-- bmi: string (nullable = true)
 |-- prepregnancy_weight: string (nullable = true)
 |-- delivery_weight: string (nullable = true)
 |-- weight_gain: string (nullable = true)
 |-- prepregnancy_diabetes: string (nullable = true)
 |-- gestational_diabetes: string (nullable = true)
 |-- prepregnancy_hypertension: string (nullable = true)
 |-- gestational_hypertension: string (nullable = true)
 |-- hypertension_eclampsia: string (nullable = true)
 |-- previous_preterm_birth: string (nullable = true)
 |-- infant_sex: s

In [4]:
#First we need to filter our dataset to get rid of any unknown records
#according to our user guide, 'U' means unknown
births = births.filter((births.infant_live != 'U'))
births = births.filter((births.prepregnancy_diabetes != 'U'))
births = births.filter((births.gestational_diabetes != 'U'))
births = births.filter((births.prepregnancy_hypertension != 'U'))
births = births.filter((births.gestational_hypertension!= 'U'))
births = births.filter((births.hypertension_eclampsia!= 'U'))
births = births.filter((births.previous_preterm_birth!= 'U'))
births = births.filter((births.fathers_age!= 99))
births = births.filter((births.cigarettes_before_pregnancy!= 99))
births = births.filter((births.cigarettes_1_trimester!= 99))
births = births.filter((births.cigarettes_2_trimester!= 99))
births = births.filter((births.cigarettes_3_trimester!= 99))
births = births.filter((births.mothers_height!= 99))
births = births.filter((births.bmi!= 99.9))
births = births.filter((births.prepregnancy_weight!= 999))
births = births.filter((births.delivery_weight!= 999))
births = births.filter((births.weight_gain!= 99))

In [5]:
#count the frequency distribution of living cases
births.groupby('infant_live').count().show()
#It seems our dataset is imbalanced.

+-----------+-------+
|infant_live|  count|
+-----------+-------+
|          Y|3224846|
|          N|   2213|
+-----------+-------+



In [6]:
#Basical dataset statistical check
#step1: if the dataset contains duplicate rows:
print('Count of rows: {0}'.format(births.count()))
print('Count of distinct rows: {0}'.format(births.distinct().count()))
#the results show our dataset contains no same rows

Count of rows: 3227059
Count of distinct rows: 3227059


In [7]:
#step2: check if the dataset contains missing values:
bool(births.head(1)) #if the result is False，then the dataset has missing values，otherwise no.

True

In [8]:
#step3: For numerical data, apply the describe() to get statistical summay.
numerical = ['mothers_age', 'fathers_age', 'cigarettes_before_pregnancy','cigarettes_1_trimester'
            ,'cigarettes_2_trimester','cigarettes_3_trimester','mothers_height','bmi'
            ,'prepregnancy_weight','delivery_weight','weight_gain']
desc = births.describe(numerical)
desc.toPandas()

Unnamed: 0,summary,mothers_age,fathers_age,cigarettes_before_pregnancy,cigarettes_1_trimester,cigarettes_2_trimester,cigarettes_3_trimester,mothers_height,bmi,prepregnancy_weight,delivery_weight,weight_gain
0,count,3227059.0,3227059.0,3227059.0,3227059.0,3227059.0,3227059.0,3227059.0,3227059.0,3227059.0,3227059.0,3227059.0
1,mean,29.369372236454307,31.78780741225989,0.9047463960218888,0.536430229506185,0.4000633394059421,0.3547152376203843,64.14300017446226,27.0217423356628,158.3141612843149,187.8532930448436,29.630798817127296
2,stddev,5.65609680246426,6.794544600008535,4.268990604512946,3.0017986187169297,2.466927699787348,2.3158727974928977,2.82244356691906,6.603225258403328,40.895831495380655,40.439737222800055,14.838047763424376
3,min,12.0,11.0,0.0,0.0,0.0,0.0,36.0,13.0,100.0,100.0,0.0
4,max,50.0,95.0,98.0,98.0,98.0,98.0,78.0,69.9,99.0,400.0,98.0


In [9]:
#feature engineering - covert categorical data to dummy variable
import pyspark.sql.functions as fn
#1.
categ = births.select('infant_live').distinct().rdd.flatMap(lambda x:x).collect()
exprs = [fn.when(fn.col('infant_live') == Y,1).otherwise(0)\
            .alias(str(Y)) for Y in categ]
births = births.select(exprs+births.columns)
#drop useless columns and rename target column
births = births.drop('N')
births = births.drop('_c0')
births = births.drop('infant_live')
births = births.withColumnRenamed('Y','infant_live_encoded')

In [22]:
births = births.withColumn('prepregnancy_diabetes_int',births['prepregnancy_diabetes']
                          .cast(typ.IntegerType()))

In [26]:
#test cell - OneHotEncoding rather than hard coding:
import pyspark.ml.feature as ft
import pyspark.sql.types as typ
from pyspark.ml import Pipeline
#births = births.withColumn('prepregnancy_diabetes_int',births['prepregnancy_diabetes']
                          #.cast(typ.IntegerType()))
encoder = ft.OneHotEncoder(inputCol = 'prepregnancy_diabetes',
                          outputCol = 'prepregnancy_diabetes_vec')
ft = ft.VectorAssembler(inputCols = ['prepregnancy_diabetes_vec'],outputCol = 'feature1')
test_pipe = Pipeline(stages=[encoder,ft]).fit(births)
births.groupby('prepregnancy_diabetes_int').count().show()

IllegalArgumentException: requirement failed: Column prepregnancy_diabetes must be of type numeric but was actually of type string.

In [10]:
#2.
cat2 = births.select('prepregnancy_diabetes').distinct().rdd.flatMap(lambda x:x).collect()
expr2 = [fn.when(fn.col('prepregnancy_diabetes') == Y,1).otherwise(0)\
            .alias(str(Y)) for Y in cat2]
births = births.select(expr2+births.columns)
births = births.drop('N')
births = births.drop('prepregnancy_diabetes')
births = births.withColumnRenamed('Y','prepregnancy_diabetes_encoded')

In [11]:
#3.
cat3 = births.select('gestational_diabetes').distinct().rdd.flatMap(lambda x:x).collect()
expr3 = [fn.when(fn.col('gestational_diabetes') == Y,1).otherwise(0)\
            .alias(str(Y)) for Y in cat3]
births = births.select(expr3+births.columns)
births = births.drop('N')
births = births.drop('gestational_diabetes')
births = births.withColumnRenamed('Y','gestational_diabetes_encoded')

In [12]:
#4.
cat4 = births.select('prepregnancy_hypertension').distinct().rdd.flatMap(lambda x:x).collect()
expr4 = [fn.when(fn.col('prepregnancy_hypertension') == Y,1).otherwise(0)\
            .alias(str(Y)) for Y in cat4]
births = births.select(expr4+births.columns)
births = births.drop('N')
births = births.drop('prepregnancy_hypertension')
births = births.withColumnRenamed('Y','prepregnancy_hypertension_encoded')

In [13]:
#5.
cat5 = births.select('gestational_hypertension').distinct().rdd.flatMap(lambda x:x).collect()
expr5 = [fn.when(fn.col('gestational_hypertension') == Y,1).otherwise(0)\
            .alias(str(Y)) for Y in cat5]
births = births.select(expr5 + births.columns)
births = births.drop('N')
births = births.drop('gestational_hypertension')
births = births.withColumnRenamed('Y','gestational_hypertension_encoded')

In [14]:
#6.
cat6 = births.select('hypertension_eclampsia').distinct().rdd.flatMap(lambda x:x).collect()
expr6 = [fn.when(fn.col('hypertension_eclampsia') == Y,1).otherwise(0)\
            .alias(str(Y)) for Y in cat6]
births = births.select(expr6 + births.columns)
births = births.drop('N')
births = births.drop('hypertension_eclampsia')
births = births.withColumnRenamed('Y','hypertension_eclampsia_encoded')

In [15]:
#7.
cat7 = births.select('previous_preterm_birth').distinct().rdd.flatMap(lambda x:x).collect()
expr7 = [fn.when(fn.col('previous_preterm_birth') == Y,1).otherwise(0)\
            .alias(str(Y)) for Y in cat7]
births = births.select(expr7 + births.columns)
births = births.drop('N')
births = births.drop('previous_preterm_birth')
births = births.withColumnRenamed('Y','previous_preterm_birth_encoded')

In [16]:
#8.
cat8 = births.select('infant_sex').distinct().rdd.flatMap(lambda x:x).collect()
expr8 = [fn.when(fn.col('infant_sex') == M,1).otherwise(0)\
            .alias(str(M)) for M in cat8]
births = births.select(expr8 + births.columns)
births = births.drop('F')
births = births.drop('infant_sex')
births = births.withColumnRenamed('M','infant_sex_encoded')

In [19]:
#cast data type:
from pyspark.sql.functions import col
import pyspark.sql.types as typ
births = births.withColumn("birth_place", col("birth_place").cast(typ.IntegerType()))
births = births.withColumn("mothers_age", col("mothers_age").cast(typ.IntegerType()))
births = births.withColumn("fathers_age", col("fathers_age").cast(typ.IntegerType()))
births = births.withColumn("prental_care", col("prental_care").cast(typ.IntegerType()))
births = births.withColumn("cigarettes_before_pregnancy", col("cigarettes_before_pregnancy").cast(typ.IntegerType()))
births = births.withColumn("cigarettes_1_trimester", col("cigarettes_1_trimester").cast(typ.IntegerType()))
births = births.withColumn("cigarettes_2_trimester", col("cigarettes_2_trimester").cast(typ.IntegerType()))
births = births.withColumn("cigarettes_3_trimester", col("cigarettes_3_trimester").cast(typ.IntegerType()))
births = births.withColumn("mothers_height", col("mothers_height").cast(typ.IntegerType()))
births = births.withColumn("bmi", col("bmi").cast(typ.FloatType()))
births = births.withColumn("prepregnancy_weight", col("prepregnancy_weight").cast(typ.IntegerType()))
births = births.withColumn("delivery_weight", col("delivery_weight").cast(typ.IntegerType()))
births = births.withColumn("weight_gain", col("weight_gain").cast(typ.IntegerType()))

In [22]:
#Add one more pairplot
births.groupby('infant_sex_encoded').count().show()

+------------------+-------+
|infant_sex_encoded|  count|
+------------------+-------+
|                 1|1650185|
|                 0|1576874|
+------------------+-------+



In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
#infant sex and infant live case
ax = sns.pairplot(births.toPandas()[['infant_live_encoded','infant_sex_encoded']])
ax.fig.suptitle("Vital Data Smoking Pair Plot", y=1.01)

In [None]:
#mother's age and smoking before pregnancy
ax = sns.pairplot(births.toPandas()[['mothers_age','cigarettes_before_pregnancy']])
ax.fig.suptitle("Vital Data Smoking Pair Plot", y=1.01)

In [24]:
#make a copy of the original data frame
births_v1 = births
births_v1.printSchema()

root
 |-- infant_sex_encoded: integer (nullable = false)
 |-- previous_preterm_birth_encoded: integer (nullable = false)
 |-- hypertension_eclampsia_encoded: integer (nullable = false)
 |-- gestational_hypertension_encoded: integer (nullable = false)
 |-- prepregnancy_hypertension_encoded: integer (nullable = false)
 |-- gestational_diabetes_encoded: integer (nullable = false)
 |-- prepregnancy_diabetes_encoded: integer (nullable = false)
 |-- infant_live_encoded: integer (nullable = false)
 |-- birth_place: integer (nullable = true)
 |-- mothers_age: integer (nullable = true)
 |-- fathers_age: integer (nullable = true)
 |-- prental_care: integer (nullable = true)
 |-- cigarettes_before_pregnancy: integer (nullable = true)
 |-- cigarettes_1_trimester: integer (nullable = true)
 |-- cigarettes_2_trimester: integer (nullable = true)
 |-- cigarettes_3_trimester: integer (nullable = true)
 |-- mothers_height: integer (nullable = true)
 |-- bmi: float (nullable = true)
 |-- prepregnancy_wei

In [25]:
#Baseline model: logistic model
import pyspark.ml.feature as ft
from pyspark.ml import Pipeline
feature_pipe = ft.VectorAssembler(inputCols=['birth_place', 'mothers_age', 'fathers_age', 'prental_care', 'cigarettes_before_pregnancy', 'cigarettes_1_trimester'
                                    ,'cigarettes_2_trimester','cigarettes_3_trimester','mothers_height'
                                    ,'bmi','prepregnancy_weight','delivery_weight','weight_gain','prepregnancy_diabetes_encoded'
                                    ,'gestational_diabetes_encoded','prepregnancy_hypertension_encoded','gestational_hypertension_encoded'
                                ,'hypertension_eclampsia_encoded','previous_preterm_birth_encoded','infant_sex_encoded'], outputCol = 'features')


In [26]:
import pyspark.ml.classification as cl
logistic = cl.LogisticRegression(
                    featuresCol= 'features',
                    labelCol ='infant_live_encoded')

In [27]:
#build a simple pipeline
pipeline = Pipeline(stages = [feature_pipe, logistic])

In [28]:
#split dataset into training and testing sets
births_train, births_test = births_v1.randomSplit([0.75,0.25],seed = 666)
#fit and transform 
model = pipeline.fit(births_train)
test_model = model.transform(births_test)
#check the model details
test_model.take(1)

[Row(infant_sex_encoded=0, previous_preterm_birth_encoded=0, hypertension_eclampsia_encoded=0, gestational_hypertension_encoded=0, prepregnancy_hypertension_encoded=0, gestational_diabetes_encoded=0, prepregnancy_diabetes_encoded=0, infant_live_encoded=0, birth_place=1, mothers_age=25, fathers_age=25, prental_care=1, cigarettes_before_pregnancy=0, cigarettes_1_trimester=0, cigarettes_2_trimester=0, cigarettes_3_trimester=0, mothers_height=68, bmi=28.700000762939453, prepregnancy_weight=189, delivery_weight=216, weight_gain=27, features=SparseVector(20, {0: 1.0, 1: 25.0, 2: 25.0, 3: 1.0, 8: 68.0, 9: 28.7, 10: 189.0, 11: 216.0, 12: 27.0}), rawPrediction=DenseVector([-7.5138, 7.5138]), probability=DenseVector([0.0005, 0.9995]), prediction=1.0)]

In [29]:
import pyspark.ml.evaluation as ev
#Binary Classification evaluation
evaluator = ev.BinaryClassificationEvaluator(
                rawPredictionCol = 'probability',
                labelCol = 'infant_live_encoded')
#rawPredictionCol can be either rawPredictionCol or probability
print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderROC'}))
print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderPR'}))

0.5984844382239075
0.999449061898995


In [30]:
#As we noticed before, our dataset is imbalanced, so in the following steps, we are goning to fix this issue
#step1. make a copy and calculate the label ratio
births_v2 = births
major_df = births_v2.filter(fn.col("infant_live_encoded")==1)
minor_df = births_v2.filter(fn.col("infant_live_encoded")==0)
ratio = int(major_df.count()/minor_df.count())
print("ratio: {}".format(ratio))

ratio: 1457


In [31]:
#we have two options: oversampling or undersampling
#for our project, we tried oversampling:
#Oversampling:  duplicate the samples from under-represented class, 
#to inflate the numbers till it reaches the same level as the dominant class
r = range(ratio)
## duplicate the minority rows
oversampled_df = minor_df.withColumn("dummy", fn.explode(fn.array([fn.lit(x) for x in r]))).drop('dummy')
# combine both oversampled minority rows and previous majority rows 
combined_df = major_df.unionAll(oversampled_df)

In [32]:
#We added the number of target label 0 almost equals to the number of label 1
combined_df.groupby('infant_live_encoded').count().show()

+-------------------+-------+
|infant_live_encoded|  count|
+-------------------+-------+
|                  1|3224846|
|                  0|3224341|
+-------------------+-------+



In [33]:
#start new pipe to see if oversampling imporve our model performance or not
train, test = combined_df.randomSplit([0.75,0.25],seed = 666)
model2 = pipeline.fit(train)
test_model2 = model.transform(test)
#print out the results:
print(evaluator.evaluate(test_model2, {evaluator.metricName: 'areaUnderROC'}))
print(evaluator.evaluate(test_model2, {evaluator.metricName: 'areaUnderPR'}))

0.6160859082989266
0.5848010544284474


In [37]:
#the above results show oversampling improve the ROC about 2% and significently decreased our
#PR by 41%. Now we are going to do more feature engineering
#build vector pipe:
vectorizer = ft.VectorAssembler(inputCols=['birth_place', 'mothers_age', 'fathers_age', 'prental_care', 'cigarettes_before_pregnancy', 'cigarettes_1_trimester'
                                    ,'cigarettes_2_trimester','cigarettes_3_trimester','mothers_height'
                                    ,'bmi','prepregnancy_weight','delivery_weight','weight_gain','prepregnancy_diabetes_encoded'
                                    ,'gestational_diabetes_encoded','prepregnancy_hypertension_encoded','gestational_hypertension_encoded'
                                ,'hypertension_eclampsia_encoded','previous_preterm_birth_encoded','infant_sex_encoded'], outputCol = 'features')
#StandardScaler pipe:
normalizer = ft.StandardScaler(
                            inputCol = 'features',
                            outputCol = 'norm_features',
                            withMean = True,
                            withStd = True)
#build a new lr 
lr = cl.LogisticRegression(
                    maxIter = 10,
                    regParam = 0.3,
                    elasticNetParam=0.8,
                    labelCol ='infant_live_encoded')
pipe3 = Pipeline(stages = [vectorizer, normalizer, lr])

In [40]:
train, val, test = combined_df.randomSplit([0.7,0.2,0.1],seed = 666)
lr_model = pipe3.fit(train)
val_model = lr_model.transform(val)

In [39]:
print(evaluator.evaluate(val_model, {evaluator.metricName: 'areaUnderROC'}))
print(evaluator.evaluate(val_model, {evaluator.metricName: 'areaUnderPR'}))

0.6155965088091023
0.584430343629382
