In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report

In [2]:
import os
os.chdir('C:\\Users\\admin\\Downloads')

In [3]:
df = pd.read_csv("Maternal Health Risk Data Set.csv",sep=';')

In [4]:
df.head(10)

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,RiskLevel
0,25,130,80,15.0,98.0,86,high risk
1,35,140,90,13.0,98.0,70,high risk
2,29,90,70,8.0,100.0,80,high risk
3,30,140,85,7.0,98.0,70,high risk
4,35,120,60,6.1,98.0,76,low risk
5,23,140,80,7.01,98.0,70,high risk
6,23,130,70,7.01,98.0,78,mid risk
7,35,85,60,11.0,102.0,86,high risk
8,32,120,90,6.9,98.0,70,mid risk
9,42,130,80,18.0,98.0,70,high risk


In [5]:
df.dtypes

Age              int64
SystolicBP       int64
DiastolicBP      int64
BS             float64
BodyTemp       float64
HeartRate        int64
RiskLevel       object
dtype: object

In [6]:
df.shape

(1014, 7)

Missing Values


In [7]:
#Miss Values
df.isnull().sum()

Age            0
SystolicBP     0
DiastolicBP    0
BS             0
BodyTemp       0
HeartRate      0
RiskLevel      0
dtype: int64

In [8]:
#Duplicated value 
data_dup = df.duplicated().any()
data_dup


True

In [9]:
df = df.drop_duplicates()

In [10]:
data_dup = df.duplicated().any()
data_dup

False

In [11]:
ca_val=[]
co_val=[]

for column in df.columns:
    if df[column].nunique() <=10:
        ca_val.append(column)
    else:
        co_val.append(column)

In [12]:
#Categorical Data
ca_val

['BodyTemp', 'RiskLevel']

Feature Engineering

In [13]:
#Data Scaling

In [14]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler() # Standardization

In [15]:
# Standardization

In [16]:
df['Age'] = ss.fit_transform(df[['Age']])
df['SystolicBP'] = ss.fit_transform(df[['SystolicBP']])
df['DiastolicBP'] =ss.fit_transform(df[['DiastolicBP']])
df['BS'] = ss.fit_transform(df[['BS']])
df['BodyTemp'] = ss.fit_transform(df[['BodyTemp']])
df['HeartRate'] = ss.fit_transform(df[['HeartRate']])
df.head()

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,RiskLevel
0,-0.305021,1.08931,0.333484,2.354439,-0.491351,1.479009,high risk
1,0.422139,1.649455,1.061321,1.646744,-0.491351,-0.484676,high risk
2,-0.014157,-1.151273,-0.394352,-0.122492,0.927758,0.742627,high risk
3,0.058559,1.649455,0.697402,-0.47634,-0.491351,-0.484676,high risk
4,0.422139,0.529164,-1.122188,-0.794802,-0.491351,0.251706,low risk


In [17]:
df['BodyTemp'].unique()

array([-0.49135062,  0.92775787,  2.34686636,  1.63731212,  3.05642061,
       -0.20752892,  0.21820363, -0.06561807])

In [18]:
df['RiskLevel'].unique()

array(['high risk', 'low risk', 'mid risk'], dtype=object)

In [19]:
RiskLevel = {'low risk':1, 
        'mid risk':2, 
        'high risk':3}

# apply using map
df['RiskLevel'] = df['RiskLevel'].map(RiskLevel).astype(float)
df


Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,RiskLevel
0,-0.305021,1.089310,0.333484,2.354439,-0.491351,1.479009,3.0
1,0.422139,1.649455,1.061321,1.646744,-0.491351,-0.484676,3.0
2,-0.014157,-1.151273,-0.394352,-0.122492,0.927758,0.742627,3.0
3,0.058559,1.649455,0.697402,-0.476340,-0.491351,-0.484676,3.0
4,0.422139,0.529164,-1.122188,-0.794802,-0.491351,0.251706,1.0
...,...,...,...,...,...,...,...
673,-1.250328,-0.591127,-1.850025,-0.688648,-0.491351,-0.484676,2.0
674,-1.032180,-0.591127,-1.122188,-0.830187,-0.491351,0.742627,1.0
703,-1.032180,-0.591127,-1.922808,-0.264031,-0.491351,0.374436,1.0
704,-1.250328,-0.591127,-1.850025,-0.830187,-0.491351,-0.484676,2.0


In [20]:
df['RiskLevel'].value_counts()

1.0    234
3.0    112
2.0    106
Name: RiskLevel, dtype: int64

In [21]:
df.describe()

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,RiskLevel
count,452.0,452.0,452.0,452.0,452.0,452.0,452.0
mean,-2.019034e-16,1.49831e-16,6.022714e-16,3.758056e-17,-1.002394e-15,-3.910343e-16,1.730088
std,1.001108,1.001108,1.001108,1.001108,1.001108,1.001108,0.833169
min,-1.39576,-2.271564,-1.922808,-0.8301869,-0.4913506,-8.216687,1.0
25%,-0.7413166,-1.151273,-0.7582701,-0.5117243,-0.4913506,-0.4846762,1.0
50%,-0.3050209,0.5291641,0.3334843,-0.2994159,-0.4913506,0.2517058,1.0
75%,0.4221386,0.5291641,0.770186,-0.157877,-0.4913506,0.7426271,2.0
max,2.967197,2.769747,1.789157,3.769828,3.056421,1.969931,3.0


Corrélations

In [22]:
df[['RiskLevel','Age']].corr()

Unnamed: 0,RiskLevel,Age
RiskLevel,1.0,0.18301
Age,0.18301,1.0


In [23]:
df[['RiskLevel','SystolicBP']].corr()

Unnamed: 0,RiskLevel,SystolicBP
RiskLevel,1.0,0.327365
SystolicBP,0.327365,1.0


In [24]:
X = df.drop('RiskLevel',axis=1)
y = df['RiskLevel']

In [25]:
X

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate
0,-0.305021,1.089310,0.333484,2.354439,-0.491351,1.479009
1,0.422139,1.649455,1.061321,1.646744,-0.491351,-0.484676
2,-0.014157,-1.151273,-0.394352,-0.122492,0.927758,0.742627
3,0.058559,1.649455,0.697402,-0.476340,-0.491351,-0.484676
4,0.422139,0.529164,-1.122188,-0.794802,-0.491351,0.251706
...,...,...,...,...,...,...
673,-1.250328,-0.591127,-1.850025,-0.688648,-0.491351,-0.484676
674,-1.032180,-0.591127,-1.122188,-0.830187,-0.491351,0.742627
703,-1.032180,-0.591127,-1.922808,-0.264031,-0.491351,0.374436
704,-1.250328,-0.591127,-1.850025,-0.830187,-0.491351,-0.484676


In [26]:
y

0      3.0
1      3.0
2      3.0
3      3.0
4      1.0
      ... 
673    2.0
674    1.0
703    1.0
704    2.0
705    1.0
Name: RiskLevel, Length: 452, dtype: float64

Splitting The dataset

In [27]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,
                                               random_state=42)
y_test

425    3.0
39     1.0
492    2.0
484    1.0
192    3.0
      ... 
29     1.0
171    3.0
377    1.0
26     1.0
7      3.0
Name: RiskLevel, Length: 136, dtype: float64

In [28]:
print(f'Training Shape x:',X_train.shape)
print(f'Testing Shape x:',X_test.shape)
print('*****___________*****___________*****')
print(f'Training Shape y:',y_train.shape)
print(f'Testing Shape y:',y_test.shape)

Training Shape x: (316, 6)
Testing Shape x: (136, 6)
*****___________*****___________*****
Training Shape y: (316,)
Testing Shape y: (136,)


In [29]:
#StandardScaler

ss = StandardScaler()

X_train = ss.fit_transform(X_train)

X_test= ss.transform(X_test)

**XGBoost**

In [30]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.7.1-py3-none-win_amd64.whl (89.1 MB)
     -                                        4.0/89.1 MB 3.6 kB/s eta 6:31:13


ERROR: Exception:
Traceback (most recent call last):
  File "C:\Users\admin\anaconda3\Nouveau dossier\lib\site-packages\pip\_vendor\urllib3\response.py", line 435, in _error_catcher
    yield
  File "C:\Users\admin\anaconda3\Nouveau dossier\lib\site-packages\pip\_vendor\urllib3\response.py", line 516, in read
    data = self._fp.read(amt) if not fp_closed else b""
  File "C:\Users\admin\anaconda3\Nouveau dossier\lib\site-packages\pip\_vendor\cachecontrol\filewrapper.py", line 90, in read
    data = self.__fp.read(amt)
  File "C:\Users\admin\anaconda3\Nouveau dossier\lib\http\client.py", line 463, in read
    n = self.readinto(b)
  File "C:\Users\admin\anaconda3\Nouveau dossier\lib\http\client.py", line 507, in readinto
    n = self.fp.readinto(b)
  File "C:\Users\admin\anaconda3\Nouveau dossier\lib\socket.py", line 704, in readinto
    return self._sock.recv_into(b)
  File "C:\Users\admin\anaconda3\Nouveau dossier\lib\ssl.py", line 1242, in recv_into
    return self.read(nbytes, buffer

In [31]:
#Applying XGBoost
import xgboost as xgb

ModuleNotFoundError: No module named 'xgboost'

In [32]:
#Create XGBClassifier
xgb_clf = xgb.XGBClassifier(objective='multi:softmax')

NameError: name 'xgb' is not defined

In [None]:
#train model
xgb_clf = xgb_clf.fit(X_train, y_train)

In [None]:
print("Train accuracy:",xgb_clf.score(X_train,y_train))
print("Test accuracy:",xgb_clf.score(X_test,y_test))

In [None]:
y_pred = xgb_clf.predict(X_test)
print(y_pred)
cm = confusion_matrix(y_test, y_pred)
print(f'CM:',cm)
print(f'Accuracy:',accuracy_score(y_test, y_pred)* 100 ,'%')
print(classification_report(y_test, xgb_clf.predict(X_test)))

RandomForest

In [33]:
#Applying RandomForest
from sklearn.ensemble import RandomForestClassifier

In [34]:
#Create Random Forest object
random_forest = RandomForestClassifier()

In [35]:
#Train model
random_forest.fit(X_train, y_train)

RandomForestClassifier()

In [36]:
print("Train accuracy:",random_forest.score(X_train,y_train))
print("Test accuracy:",random_forest.score(X_test,y_test))

Train accuracy: 0.9430379746835443
Test accuracy: 0.6470588235294118


In [37]:
y_pred = random_forest.predict(X_test)
print(y_pred)
cm = confusion_matrix(y_test, y_pred)
print(f'CM:',cm)
print(f'Accuracy:',accuracy_score(y_test, y_pred)* 100 ,'%')
print(classification_report(y_test, random_forest.predict(X_test)))

[3. 1. 2. 2. 2. 1. 3. 3. 1. 1. 3. 3. 1. 1. 1. 1. 1. 1. 1. 1. 1. 2. 1. 3.
 3. 1. 1. 1. 2. 1. 1. 1. 1. 2. 1. 1. 1. 2. 3. 1. 1. 2. 1. 2. 1. 3. 2. 3.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 3. 1. 1. 1. 1. 1. 1. 1. 2. 3. 3.
 1. 2. 3. 1. 1. 1. 1. 3. 3. 2. 2. 1. 1. 1. 1. 1. 1. 1. 2. 1. 3. 3. 2. 2.
 2. 1. 3. 1. 2. 3. 2. 1. 3. 1. 1. 1. 3. 2. 2. 1. 3. 3. 1. 1. 3. 3. 1. 2.
 3. 2. 1. 1. 1. 1. 3. 1. 3. 1. 1. 1. 3. 1. 1. 3.]
CM: [[54  9  2]
 [24 10  5]
 [ 3  5 24]]
Accuracy: 64.70588235294117 %
              precision    recall  f1-score   support

         1.0       0.67      0.83      0.74        65
         2.0       0.42      0.26      0.32        39
         3.0       0.77      0.75      0.76        32

    accuracy                           0.65       136
   macro avg       0.62      0.61      0.61       136
weighted avg       0.62      0.65      0.62       136



Logistic regression


In [38]:
from sklearn.linear_model import LogisticRegression

In [39]:
lr = LogisticRegression()
lr.fit(X_train , y_train)

LogisticRegression()

In [40]:
lr.score(X_train , y_train)

0.6835443037974683

In [41]:
lr.score(X_test , y_test)

0.6397058823529411

In [42]:
y_pred =lr.predict(X_test)
print(y_pred)
cm = confusion_matrix(y_test, y_pred)
print(f'CM:',cm)
print(f'Accuracy:',accuracy_score(y_test, y_pred)* 100 ,'%')
print(classification_report(y_test, lr.predict(X_test)))

[3. 1. 2. 1. 1. 1. 3. 1. 1. 1. 3. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 3. 3.
 3. 1. 1. 1. 3. 1. 1. 3. 1. 3. 1. 1. 1. 1. 3. 1. 1. 1. 1. 1. 1. 3. 2. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 3. 1. 3. 1. 1. 1. 1. 1. 1. 3. 1.
 1. 1. 3. 1. 1. 1. 1. 3. 3. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 3. 1. 2.
 1. 1. 3. 3. 1. 3. 2. 1. 3. 3. 1. 1. 3. 1. 2. 1. 2. 3. 1. 1. 3. 3. 1. 3.
 1. 3. 1. 1. 1. 1. 3. 1. 3. 1. 1. 1. 1. 1. 1. 3.]
CM: [[62  0  3]
 [27  4  8]
 [ 9  2 21]]
Accuracy: 63.970588235294116 %
              precision    recall  f1-score   support

         1.0       0.63      0.95      0.76        65
         2.0       0.67      0.10      0.18        39
         3.0       0.66      0.66      0.66        32

    accuracy                           0.64       136
   macro avg       0.65      0.57      0.53       136
weighted avg       0.65      0.64      0.57       136



GaussianNB

In [43]:
#Applying GaussianNB
from sklearn.naive_bayes import GaussianNB

In [44]:
nb = GaussianNB()

In [45]:
nb.fit(X_train , y_train)

GaussianNB()

In [46]:
print("Train accuracy:",nb.score(X_train,y_train))
print("Test accuracy:",nb.score(X_test,y_test))

Train accuracy: 0.6708860759493671
Test accuracy: 0.6323529411764706


In [47]:
y_pred = nb.predict(X_test)
print(y_pred)
cm = confusion_matrix(y_test, y_pred)
print(f'CM:',cm)
print(f'Accuracy:',accuracy_score(y_test, y_pred)* 100 ,'%')
print(classification_report(y_test, nb.predict(X_test)))

[3. 1. 1. 1. 1. 1. 3. 1. 1. 1. 3. 3. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 3. 3.
 2. 1. 1. 1. 3. 1. 1. 3. 1. 3. 1. 1. 1. 1. 3. 1. 1. 1. 1. 1. 1. 3. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 3. 1. 2. 1. 1. 1. 1. 1. 1. 3. 1.
 1. 1. 3. 1. 1. 1. 1. 3. 3. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 3. 1. 1.
 1. 1. 3. 2. 1. 3. 1. 1. 3. 1. 1. 1. 3. 1. 1. 1. 2. 2. 1. 1. 3. 3. 1. 1.
 1. 1. 1. 1. 1. 1. 3. 1. 3. 1. 1. 1. 1. 1. 1. 3.]
CM: [[63  0  2]
 [32  3  4]
 [10  2 20]]
Accuracy: 63.23529411764706 %
              precision    recall  f1-score   support

         1.0       0.60      0.97      0.74        65
         2.0       0.60      0.08      0.14        39
         3.0       0.77      0.62      0.69        32

    accuracy                           0.63       136
   macro avg       0.66      0.56      0.52       136
weighted avg       0.64      0.63      0.56       136



Deep Learning


<h1>visualisation</h1>

In [49]:
input_data = (25,144,87,18,90,88)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = random_forest.predict(input_data_reshaped)
print(prediction)

if (prediction[0] == 1):
  print('The risk level is low risk')
elif (prediction[0] == 2):
  print('The risk level is mid risk')
else:
  print('The risk level is high risk')

[3.]
The risk level is high risk


In [50]:
import pickle

In [None]:

filename = 'trained_model.sav'
pickle.dump(random_forest, open(filename, 'wb'))
os.get_exec_path()

In [None]:
import os
os.getcwd()

In [None]:
# loading the saved model
loaded_model = pickle.load(open('trained_model.sav', 'rb'))
location

In [None]:
input_data = (5,166,72,19,175,25.8,0.587,51)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = loaded_model.predict(input_data_reshaped)
print(prediction)

if (prediction[0] == 1):
  print('The risk level is low risk')
elif (prediction[0] == 2):
  print('The risk level is mid risk')
else:
  print('The risk level is high risk')