Import the required libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Fetch training data

In [2]:
# Load the training table from the CSV file in the working directory.
training_csv_path = 'training_data.csv'
murmur_data = pd.read_csv(training_csv_path)

# Preview the first rows to sanity-check the parsed columns.
murmur_data.head()

Unnamed: 0,Patient ID,Recording locations:,Age,Sex,Height,Weight,Pregnancy status,Murmur,Murmur locations,Most audible location,...,Systolic murmur pitch,Systolic murmur quality,Diastolic murmur timing,Diastolic murmur shape,Diastolic murmur grading,Diastolic murmur pitch,Diastolic murmur quality,Outcome,Campaign,Additional ID
0,2530,AV+PV+TV+MV,Child,Female,98.0,15.9,False,Absent,,,...,,,,,,,,Abnormal,CC2015,
1,9979,AV+PV+TV+MV,Child,Female,103.0,13.1,False,Present,AV+MV+PV+TV,TV,...,High,Harsh,,,,,,Abnormal,CC2015,
2,9983,AV+PV+TV+MV,Child,Male,115.0,19.1,False,Unknown,,,...,,,,,,,,Abnormal,CC2015,
3,13918,AV+PV+TV+MV,Child,Male,98.0,15.9,False,Present,TV,TV,...,Low,Blowing,,,,,,Abnormal,CC2015,
4,14241,AV+PV+TV+MV,Child,Male,87.0,11.2,False,Present,AV+MV+PV+TV,PV,...,Low,Harsh,,,,,,Abnormal,CC2015,


Data processing/filtering

In [3]:
# Columns treated as unnecessary for predicting the murmur label:
#   - identifiers / bookkeeping: Patient ID, Campaign, Additional ID
#   - where the recording was taken and where the murmur is most audible;
#     only whether a murmur exists matters here
murmur_data.drop(
    columns=[
        "Patient ID",
        "Campaign",
        "Additional ID",
        "Recording locations:",
        "Most audible location",
    ],
    inplace=True,
)

# Fill missing height/weight values with the column mean.
for numeric_column in ('Height', 'Weight'):
    column_mean = murmur_data[numeric_column].mean()
    murmur_data[numeric_column] = murmur_data[numeric_column].fillna(column_mean)

# Boolean pregnancy flag -> 1 / 0.
pregnancy_status = murmur_data['Pregnancy status'].astype(int)


def _one_hot(column, prefix=None):
    """Return 0/1 indicator columns for `column`, dropping the first category."""
    indicators = pd.get_dummies(murmur_data[column], drop_first=True, prefix=prefix)
    return indicators.astype(int)


# Demographics and outcome keep the raw category names as column names.
sex = _one_hot('Sex')
age = _one_hot('Age')
outcome = _one_hot('Outcome')

# Murmur descriptors are prefixed so systolic and diastolic categories with the
# same name (e.g. "Harsh") end up in distinct columns.
systolic_murmur_timing = _one_hot('Systolic murmur timing', prefix='Systolic')
systolic_murmur_shape = _one_hot('Systolic murmur shape', prefix='Systolic')
systolic_murmur_grading = _one_hot('Systolic murmur grading', prefix='Systolic')
systolic_murmur_pitch = _one_hot('Systolic murmur pitch', prefix='Systolic')
systolic_murmur_quality = _one_hot('Systolic murmur quality', prefix='Systolic')
diastolic_murmur_timing = _one_hot('Diastolic murmur timing', prefix='Diastolic')
diastolic_murmur_shape = _one_hot('Diastolic murmur shape', prefix='Diastolic')
diastolic_murmur_grading = _one_hot('Diastolic murmur grading', prefix='Diastolic')
diastolic_murmur_pitch = _one_hot('Diastolic murmur pitch', prefix='Diastolic')
diastolic_murmur_quality = _one_hot('Diastolic murmur quality', prefix='Diastolic')

# Remove the raw columns now that their encoded versions exist.
encoded_source_columns = [
    'Sex', 'Age', 'Outcome', 'Pregnancy status',
    'Diastolic murmur quality', 'Systolic murmur timing',
    'Diastolic murmur pitch', 'Diastolic murmur shape',
    'Diastolic murmur grading', 'Systolic murmur shape',
    'Systolic murmur grading', 'Diastolic murmur timing',
    'Systolic murmur pitch', 'Systolic murmur quality',
]
murmur_data.drop(encoded_source_columns, axis=1, inplace=True)

# Append the encoded columns; the order below fixes the final column order.
encoded_frames = [
    sex, age, outcome, pregnancy_status,
    diastolic_murmur_quality, diastolic_murmur_pitch,
    systolic_murmur_timing,
    diastolic_murmur_grading, diastolic_murmur_shape, diastolic_murmur_timing,
    systolic_murmur_shape, systolic_murmur_grading,
    systolic_murmur_pitch, systolic_murmur_quality,
]
murmur_data = pd.concat([murmur_data] + encoded_frames, axis=1)


In [4]:
def convert_abbreviations(value):
    """Turn a murmur-location string into a list of 0/1 flags.

    Parameters
    ----------
    value : str or missing
        Location string such as ``'AV+MV+PV+TV'``. Anything that is not a
        string (``None``, ``NaN``) is treated as "no locations".

    Returns
    -------
    list of int
        One flag per abbreviation, in the fixed order
        ``['PV', 'TV', 'AV', 'MV', 'Phc']``; 1 if the abbreviation occurs in
        ``value``, otherwise 0.
    """
    abbreviations = ['PV', 'TV', 'AV', 'MV', 'Phc']
    # A missing cell arrives as float NaN / None; `abbr in value` would raise
    # TypeError on those, so map them to the empty string first.
    if not isinstance(value, str):
        value = ''
    return [1 if abbr in value else 0 for abbr in abbreviations]

# Rows without a location string become '' so they yield all-zero flags.
# The result is assigned back rather than using fillna(..., inplace=True) on the
# column selection: pandas deprecates that chained in-place form, and under
# copy-on-write it leaves the frame unchanged.
murmur_data['Murmur locations'] = murmur_data['Murmur locations'].fillna('')

# Column order must match the flag order returned by convert_abbreviations.
location_columns = ['PV', 'TV', 'AV', 'MV', 'Phc']

# Build all flag columns in one DataFrame instead of creating a pd.Series per row.
location_flags = pd.DataFrame(
    murmur_data['Murmur locations'].map(convert_abbreviations).tolist(),
    index=murmur_data.index,
    columns=location_columns,
)
murmur_data[location_columns] = location_flags

# The raw string is no longer needed once it has been expanded into flags.
murmur_data.drop(columns=['Murmur locations'], inplace=True)

murmur_data.head()

Unnamed: 0,Height,Weight,Murmur,Male,Child,Infant,Neonate,Normal,Pregnancy status,Diastolic_Harsh,...,Systolic_III/VI,Systolic_Low,Systolic_Medium,Systolic_Harsh,Systolic_Musical,PV,TV,AV,MV,Phc
0,98.0,15.9,Absent,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,103.0,13.1,Present,0,1,0,0,0,0,0,...,1,0,0,1,0,1,1,1,1,0
2,115.0,19.1,Unknown,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,98.0,15.9,Present,1,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
4,87.0,11.2,Present,1,1,0,0,0,0,0,...,0,1,0,1,0,1,1,1,1,0


Split into training and test sets

In [5]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the target column ('Murmur').
feature_frame = murmur_data.drop(columns='Murmur')
murmur_labels = murmur_data['Murmur']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    feature_frame,
    murmur_labels,
    test_size=0.3,
    random_state=42,
)

Scaling Data

In [6]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Learn the per-feature min/max from the training split only. Calling fit() again
# on x_test would discard the training statistics (each fit() call resets the
# scaler) and scale both splits with test-set ranges, leaking test information.
scaler.fit(x_train)

# Apply the same training-derived scaling to both splits.
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

Logistic Regression

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Multiclass logistic regression on the scaled features.
logistic_regression = LogisticRegression()

logistic_regression.fit(x_train_scaled, y_train)
logistic_predictions = logistic_regression.predict(x_test_scaled)

# zero_division=0 reports 0.0 for classes the model never predicts instead of
# emitting an UndefinedMetricWarning for each undefined precision/F-score.
print(classification_report(y_test, logistic_predictions, zero_division=0))

              precision    recall  f1-score   support

      Absent       0.92      1.00      0.96       214
     Present       1.00      1.00      1.00        50
     Unknown       0.00      0.00      0.00        19

    accuracy                           0.93       283
   macro avg       0.64      0.67      0.65       283
weighted avg       0.87      0.93      0.90       283



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


SVM Model

In [8]:
from sklearn.svm import SVC

# y_train is already one-dimensional; expose it as a plain NumPy array for SVC.
y_train_array = y_train.to_numpy()

# gamma='auto' sets the RBF kernel coefficient to 1 / n_features.
svm_model = SVC(gamma='auto')

svm_model.fit(x_train_scaled, y_train_array)
svm_predictions = svm_model.predict(x_test_scaled)

# zero_division=0 reports 0.0 for classes the model never predicts instead of
# emitting an UndefinedMetricWarning for each undefined precision/F-score.
print(classification_report(y_test, svm_predictions, zero_division=0))

              precision    recall  f1-score   support

      Absent       0.92      1.00      0.96       214
     Present       1.00      1.00      1.00        50
     Unknown       0.00      0.00      0.00        19

    accuracy                           0.93       283
   macro avg       0.64      0.67      0.65       283
weighted avg       0.87      0.93      0.90       283



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Decision Tree

In [9]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# Seed the tree so repeated runs give the same model and the same report;
# 42 matches the seed used for the train/test split.
decision_tree = DecisionTreeClassifier(random_state=42)

decision_tree.fit(x_train_scaled, y_train)
decision_tree_predictions = decision_tree.predict(x_test_scaled)

# zero_division=0 reports 0.0 for any class that is never predicted instead of
# emitting an UndefinedMetricWarning.
print(classification_report(y_test, decision_tree_predictions, zero_division=0))

NameError: name 'predictions' is not defined

Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 200 trees; seeded so the bootstrap samples and feature choices are repeatable
# (42 matches the seed used for the train/test split).
random_forest = RandomForestClassifier(n_estimators=200, random_state=42)

random_forest.fit(x_train_scaled, y_train)
random_forest_predictions = random_forest.predict(x_test_scaled)

# zero_division=0 reports 0.0 for any class that is never predicted instead of
# emitting an UndefinedMetricWarning.
print(classification_report(y_test, random_forest_predictions, zero_division=0))

Neural Network

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

# The target holds string class labels (Absent / Present / Unknown), which Keras
# cannot train on directly. Encode them as integer ids; `murmur_classes[i]` is the
# label for id i, so predictions can be mapped back to names later.
y_train_codes, murmur_classes = pd.factorize(y_train, sort=True)

# Take the input width from the data instead of hard-coding it, so the network
# stays in sync with whatever columns the preprocessing produced.
n_features = x_train_scaled.shape[1]
n_classes = len(murmur_classes)

neural_network = Sequential()

neural_network.add(Dense(units=6, kernel_initializer='uniform', activation='relu', input_dim=n_features))
# Dropout is not the first layer, so it needs no input shape of its own.
neural_network.add(Dropout(0.1))
neural_network.add(Dense(units=6, kernel_initializer='uniform', activation='relu'))
# One output unit per class with softmax, so the outputs form a probability
# distribution over the mutually exclusive murmur labels.
neural_network.add(Dense(units=n_classes, activation='softmax', kernel_initializer='uniform'))

# sparse_categorical_crossentropy matches integer-encoded multiclass targets;
# binary_crossentropy with two sigmoid units cannot represent three classes.
neural_network.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

neural_network.fit(x_train_scaled, y_train_codes, batch_size=10, epochs=100)
