# Imports

### Fundamental imports

In [None]:
import pandas as pd
import numpy as np

### Visualization

In [None]:
import matplotlib.pyplot as plt

### Separating training data and testing data

In [None]:
from sklearn.model_selection import train_test_split

### Modeling

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

### Installing `eli5` because it is not available on Colab by default. It will be used to decide which features have an effect on the target.

In [None]:
!pip install eli5

Collecting eli5
  Downloading eli5-0.11.0-py2.py3-none-any.whl (106 kB)
[?25l[K     |███                             | 10 kB 18.2 MB/s eta 0:00:01[K     |██████▏                         | 20 kB 23.0 MB/s eta 0:00:01[K     |█████████▎                      | 30 kB 12.0 MB/s eta 0:00:01[K     |████████████▍                   | 40 kB 4.9 MB/s eta 0:00:01[K     |███████████████▌                | 51 kB 4.7 MB/s eta 0:00:01[K     |██████████████████▌             | 61 kB 5.4 MB/s eta 0:00:01[K     |█████████████████████▋          | 71 kB 5.8 MB/s eta 0:00:01[K     |████████████████████████▊       | 81 kB 5.4 MB/s eta 0:00:01[K     |███████████████████████████▉    | 92 kB 6.0 MB/s eta 0:00:01[K     |███████████████████████████████ | 102 kB 5.4 MB/s eta 0:00:01[K     |████████████████████████████████| 106 kB 5.4 MB/s 
Installing collected packages: eli5
Successfully installed eli5-0.11.0


### Tools for finding out which features are important

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

# Reading details about the file

In [None]:
# Load the dataset from the CSV file next to the notebook, then show it
# transposed so that each column name appears as a row label.
data = pd.read_csv('FetalHealth.csv')
data.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2116,2117,2118,2119,2120,2121,2122,2123,2124,2125
baseline value,120.0,132.0,133.0,134.0,132.0,134.0,134.0,122.0,122.0,122.0,...,140.0,140.0,140.0,140.0,140.0,140.0,140.0,140.0,140.0,142.0
accelerations,0.0,0.006,0.003,0.003,0.007,0.001,0.001,0.0,0.0,0.0,...,0.004,0.0,0.0,0.0,0.0,0.0,0.001,0.001,0.001,0.002
fetal_movement,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002
uterine_contractions,0.0,0.006,0.008,0.008,0.008,0.01,0.013,0.0,0.002,0.003,...,0.004,0.008,0.006,0.007,0.005,0.007,0.007,0.007,0.006,0.008
light_decelerations,0.0,0.003,0.003,0.003,0.0,0.009,0.008,0.0,0.0,0.0,...,0.0,0.0,0.001,0.001,0.001,0.0,0.0,0.0,0.0,0.0
severe_decelerations,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
prolongued_decelerations,0.0,0.0,0.0,0.0,0.0,0.002,0.003,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abnormal_short_term_variability,73.0,17.0,16.0,16.0,16.0,26.0,29.0,83.0,84.0,86.0,...,80.0,79.0,79.0,79.0,77.0,79.0,78.0,79.0,78.0,74.0
mean_value_of_short_term_variability,0.5,2.1,2.1,2.4,2.4,5.9,6.3,0.5,0.5,0.3,...,0.2,0.3,0.5,0.6,0.7,0.2,0.4,0.4,0.4,0.4
percentage_of_time_with_abnormal_long_term_variability,43.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,5.0,6.0,...,36.0,20.0,26.0,27.0,17.0,25.0,22.0,20.0,27.0,36.0


In [None]:
# Summary statistics per column, transposed so each feature is one row.
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
baseline value,2126.0,133.303857,9.840844,106.0,126.0,133.0,140.0,160.0
accelerations,2126.0,0.003178,0.003866,0.0,0.0,0.002,0.006,0.019
fetal_movement,2126.0,0.009481,0.046666,0.0,0.0,0.0,0.003,0.481
uterine_contractions,2126.0,0.004366,0.002946,0.0,0.002,0.004,0.007,0.015
light_decelerations,2126.0,0.001889,0.00296,0.0,0.0,0.0,0.003,0.015
severe_decelerations,2126.0,3e-06,5.7e-05,0.0,0.0,0.0,0.0,0.001
prolongued_decelerations,2126.0,0.000159,0.00059,0.0,0.0,0.0,0.0,0.005
abnormal_short_term_variability,2126.0,46.990122,17.192814,12.0,32.0,49.0,61.0,87.0
mean_value_of_short_term_variability,2126.0,1.332785,0.883241,0.2,0.7,1.2,1.7,7.0
percentage_of_time_with_abnormal_long_term_variability,2126.0,9.84666,18.39688,0.0,0.0,0.0,11.0,91.0


In [None]:
# Print the frame's summary: index range, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2126 entries, 0 to 2125
Data columns (total 22 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   baseline value                                          2126 non-null   float64
 1   accelerations                                           2126 non-null   float64
 2   fetal_movement                                          2126 non-null   float64
 3   uterine_contractions                                    2126 non-null   float64
 4   light_decelerations                                     2126 non-null   float64
 5   severe_decelerations                                    2126 non-null   float64
 6   prolongued_decelerations                                2126 non-null   float64
 7   abnormal_short_term_variability                         2126 non-null   float64
 8   mean_value_of_short_term_variability  

No missing values according to `data.info()`


---



In [None]:
# Preview the first five rows of the dataset.
data.head(n=5)

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,73.0,0.5,43.0,...,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,2.0
1,132.0,0.006,0.0,0.006,0.003,0.0,0.0,17.0,2.1,0.0,...,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,1.0
2,133.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.1,0.0,...,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,1.0
3,134.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,1.0
4,132.0,0.007,0.0,0.008,0.0,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,1.0


In [None]:
# Preview the last five rows of the dataset.
data.tail(n=5)

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
2121,140.0,0.0,0.0,0.007,0.0,0.0,0.0,79.0,0.2,25.0,...,137.0,177.0,4.0,0.0,153.0,150.0,152.0,2.0,0.0,2.0
2122,140.0,0.001,0.0,0.007,0.0,0.0,0.0,78.0,0.4,22.0,...,103.0,169.0,6.0,0.0,152.0,148.0,151.0,3.0,1.0,2.0
2123,140.0,0.001,0.0,0.007,0.0,0.0,0.0,79.0,0.4,20.0,...,103.0,170.0,5.0,0.0,153.0,148.0,152.0,4.0,1.0,2.0
2124,140.0,0.001,0.0,0.006,0.0,0.0,0.0,78.0,0.4,27.0,...,103.0,169.0,6.0,0.0,152.0,147.0,151.0,4.0,1.0,2.0
2125,142.0,0.002,0.002,0.008,0.0,0.0,0.0,74.0,0.4,36.0,...,117.0,159.0,2.0,1.0,145.0,143.0,145.0,1.0,0.0,1.0


# Making the first model

This first model is the baseline that `eli5` will inspect to find out which features are **important** and which are **not**. By keeping only the **important** features, a second model may reach a **higher accuracy** when predicting the **target**.

### Separating the columns into features and target

In [None]:
# Name of the column to predict.
target = 'fetal_health'

# Features: every column except the target. Target: the `fetal_health` column.
X = data.drop(columns=[target])
y = data[target]

#### Features

In [None]:
# Display the feature frame (every column of `data` except the target).
X

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_width,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency
0,120.0,0.000,0.000,0.000,0.000,0.0,0.0,73.0,0.5,43.0,...,64.0,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0
1,132.0,0.006,0.000,0.006,0.003,0.0,0.0,17.0,2.1,0.0,...,130.0,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0
2,133.0,0.003,0.000,0.008,0.003,0.0,0.0,16.0,2.1,0.0,...,130.0,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0
3,134.0,0.003,0.000,0.008,0.003,0.0,0.0,16.0,2.4,0.0,...,117.0,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0
4,132.0,0.007,0.000,0.008,0.000,0.0,0.0,16.0,2.4,0.0,...,117.0,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2121,140.0,0.000,0.000,0.007,0.000,0.0,0.0,79.0,0.2,25.0,...,40.0,137.0,177.0,4.0,0.0,153.0,150.0,152.0,2.0,0.0
2122,140.0,0.001,0.000,0.007,0.000,0.0,0.0,78.0,0.4,22.0,...,66.0,103.0,169.0,6.0,0.0,152.0,148.0,151.0,3.0,1.0
2123,140.0,0.001,0.000,0.007,0.000,0.0,0.0,79.0,0.4,20.0,...,67.0,103.0,170.0,5.0,0.0,153.0,148.0,152.0,4.0,1.0
2124,140.0,0.001,0.000,0.006,0.000,0.0,0.0,78.0,0.4,27.0,...,66.0,103.0,169.0,6.0,0.0,152.0,147.0,151.0,4.0,1.0


#### Target

In [None]:
# Display the target series (the `fetal_health` column).
y

0       2.0
1       1.0
2       1.0
3       1.0
4       1.0
       ... 
2121    2.0
2122    2.0
2123    2.0
2124    2.0
2125    1.0
Name: fetal_health, Length: 2126, dtype: float64

#### Splitting into training and testing sets, then fitting and scoring the first model

In [None]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=0
)

# Baseline model: standardise the features, then fit a logistic regression.
pipe = make_pipeline(StandardScaler(), LogisticRegression())

# `fit` returns the fitted pipeline, so scoring can be chained directly.
old_model_accuracy = pipe.fit(X_train, y_train).score(X_test, y_test)
old_model_accuracy

0.886039886039886

# Finding Important Features

Use Permutation Importance and eli5

In [None]:
# Measure how much the test score of the fitted pipeline changes when each
# feature column is shuffled; the seed makes the shuffles repeatable.
perm = PermutationImportance(pipe, random_state=0)
perm.fit(X_test, y_test)

# Render the per-feature weights, labelled with the original column names.
eli5.show_weights(perm, feature_names=list(X.columns))

Weight,Feature
0.0561  ± 0.0145,histogram_mean
0.0499  ± 0.0220,accelerations
0.0382  ± 0.0049,abnormal_short_term_variability
0.0311  ± 0.0061,histogram_median
0.0251  ± 0.0073,prolongued_decelerations
0.0179  ± 0.0145,histogram_variance
0.0142  ± 0.0018,percentage_of_time_with_abnormal_long_term_variability
0.0120  ± 0.0053,uterine_contractions
0.0094  ± 0.0061,baseline value
0.0066  ± 0.0196,histogram_mode


#### Usable features

In [None]:
# Columns kept for the reduced dataset: the selected features followed by the
# target column. Each name must appear exactly once, because a repeated name
# would make `data[usable_features]` return that column twice.
usable_features = [
    'histogram_mean',
    'accelerations',
    'abnormal_short_term_variability',
    'histogram_median',
    'prolongued_decelerations',
    'histogram_variance',
    'percentage_of_time_with_abnormal_long_term_variability',
    'baseline value',
    'uterine_contractions',
    'histogram_mode',
    'histogram_number_of_peaks',
    'histogram_max',
    'mean_value_of_short_term_variability',
    'histogram_min',
    'fetal_movement',
    'histogram_number_of_zeroes',
    'histogram_width',
    'fetal_health',  # target column, kept last
]

In [None]:
# Restrict the frame to the selected columns (target included), then preview
# it transposed so that the column names read as rows.
data = data.loc[:, usable_features]
data.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2116,2117,2118,2119,2120,2121,2122,2123,2124,2125
histogram_mean,137.0,136.0,135.0,134.0,136.0,107.0,107.0,122.0,122.0,122.0,...,148.0,143.0,142.0,141.0,143.0,150.0,148.0,148.0,147.0,143.0
accelerations,0.0,0.006,0.003,0.003,0.007,0.001,0.001,0.0,0.0,0.0,...,0.004,0.0,0.0,0.0,0.0,0.0,0.001,0.001,0.001,0.002
abnormal_short_term_variability,73.0,17.0,16.0,16.0,16.0,26.0,29.0,83.0,84.0,86.0,...,80.0,79.0,79.0,79.0,77.0,79.0,78.0,79.0,78.0,74.0
histogram_median,121.0,140.0,138.0,137.0,138.0,107.0,106.0,123.0,123.0,123.0,...,149.0,145.0,145.0,145.0,145.0,152.0,151.0,152.0,151.0,145.0
prolongued_decelerations,0.0,0.0,0.0,0.0,0.0,0.002,0.003,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
histogram_variance,73.0,12.0,13.0,13.0,11.0,170.0,215.0,3.0,3.0,1.0,...,1.0,1.0,2.0,1.0,2.0,2.0,3.0,4.0,4.0,1.0
percentage_of_time_with_abnormal_long_term_variability,43.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,5.0,6.0,...,36.0,20.0,26.0,27.0,17.0,25.0,22.0,20.0,27.0,36.0
baseline value,120.0,132.0,133.0,134.0,132.0,134.0,134.0,122.0,122.0,122.0,...,140.0,140.0,140.0,140.0,140.0,140.0,140.0,140.0,140.0,142.0
uterine_contractions,0.0,0.006,0.008,0.008,0.008,0.01,0.013,0.0,0.002,0.003,...,0.004,0.008,0.006,0.007,0.005,0.007,0.007,0.007,0.006,0.008
histogram_mode,120.0,141.0,141.0,137.0,137.0,76.0,71.0,122.0,122.0,122.0,...,147.0,144.0,145.0,144.0,145.0,153.0,152.0,153.0,152.0,145.0


#### Making new variables for features and target

In [None]:
# Target series and feature frame taken from the reduced dataset.
new_y = data[target]
new_X = data.drop(columns=[target])

#### Training a new model on the reduced feature set

In [None]:
# Same split settings as the first model (33% test rows, seed 0) so the two
# scores are measured on the same rows.
new_X_train, new_X_test, new_y_train, new_y_test = train_test_split(
    new_X, new_y, test_size=0.33, random_state=0
)

# Fresh pipeline with the same steps; note that this rebinds `pipe`.
pipe = make_pipeline(StandardScaler(), LogisticRegression())

# `fit` returns the fitted pipeline, so scoring can be chained directly.
new_model_accuracy = pipe.fit(new_X_train, new_y_train).score(new_X_test, new_y_test)
new_model_accuracy

0.8888888888888888

#### Comparison of old and new models

In [None]:
# Print both test accuracies one after the other for comparison.
scores = {
    'score of old:': old_model_accuracy,
    'score of new:': new_model_accuracy,
}
for label, score in scores.items():
    print(label, score)

score of old: 0.886039886039886
score of new: 0.8888888888888888


# Saving the reduced dataset

In [None]:
# Save the reduced dataset. The frame's index is only the default 0..n-1 row
# range, so it is left out (`index=False`); writing it would add an extra
# unnamed column when the file is read back.
data.to_csv('reduced_FetalHealth.csv', index=False)