# Task 1: A classification example: fetal heart condition diagnosis

In [1]:
#Imports
import pandas as pd
from pandas import Series, DataFrame
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler


## Step 1. Reading the data

In [2]:
data= pd.read_csv('CTG.csv', skiprows=1)
column_names = list((data.columns))
selected_cols = ['LB', 'AC', 'FM', 'UC', 'DL', 'DS', 'DP', 'ASTV', 'MSTV', 'ALTV',
                 'MLTV', 'Width', 'Min', 'Max', 'Nmax', 'Nzeros', 'Mode', 'Mean',
                 'Median', 'Variance', 'Tendency', 'NSP']
data_selected = data[selected_cols].dropna()  
data_selected.sample(frac = 1.0, random_state=0)
X = data_selected.drop('NSP', axis=1)
def to_label(y):
    return [None, 'normal', 'suspect', 'pathologic'][(int(y))]
y = data_selected['NSP'].apply(to_label)
X_train, X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2, random_state = 0)


20% - 80% splitting technique was used.

## Step 2. Training the baseline classifier

In [3]:
#Cross validation of baseline tree classifier
clf_freq = DummyClassifier(strategy = 'most_frequent')
cross_valid_values = cross_val_score(clf_freq, X_train, y_train)
print(np.mean(cross_valid_values))
print(cross_valid_values)


0.7817647058823529
[0.78235294 0.78235294 0.78235294 0.78235294 0.77941176]


We implemented cross-validation on the Dummy Classifier model. cross_val_score() function calculates the accuracy of the training data with 5 different splits. The average of these accuracies is given above.

In [4]:
#DummyClassifier
clf_freq.fit(X_train,y_train)
Yguess = clf_freq.predict(X_test)
clf_freq.score(X_test,y_test)  #I am not sure this gives the accuray. 
print(accuracy_score(y_test, Yguess)) # Accuracy of Test
#Accuracy of the train set is 76% with most_frequent strategy

y_train_predict = clf_freq.predict(X_train)
clf_freq.score(X_train,y_train) 
print(accuracy_score(y_train, y_train_predict)) # Accuracy of Test


0.7652582159624414
0.7817647058823529


The accuracy of the train set is 78% with the most_frequent strategy. We split the prediction for the Train and test because we want to measure the accuracy of test dataset and training dataset

## Step 3. Trying out some different classifiers

## Tree-based classifiers:

In [5]:
#Cross validation of Decision tree classifier
clf_tree= DecisionTreeClassifier(criterion= 'gini',splitter='best',max_depth=None,min_samples_split=2,random_state=0)
decision_tree_cross_val = cross_val_score(clf_tree, X_train, y_train)
print(np.mean(decision_tree_cross_val))
print(decision_tree_cross_val)

0.9352941176470588
[0.92941176 0.94117647 0.90882353 0.95588235 0.94117647]


In [6]:
#Cross validation of random forest classifier

randomforest_model = RandomForestClassifier()
random_forest_cross_val = cross_val_score(randomforest_model, X_train, y_train)
print(np.mean(random_forest_cross_val))
print(random_forest_cross_val)


0.9476470588235294
[0.95294118 0.95588235 0.92941176 0.95882353 0.94117647]


In [7]:
grad_model = GradientBoostingClassifier(n_estimators = 100 , learning_rate = 0.1 , max_depth = 2 ,random_state = 0 )
grad_model_cross_val = cross_val_score(grad_model, X_train, y_train)
print(np.mean(grad_model_cross_val))
print(grad_model_cross_val)

0.951764705882353
[0.96176471 0.95588235 0.94705882 0.95       0.94411765]


The tree-based classifiers are types of classification models that predict the categorial class based on the decision trees. The decision tree classifier works by splitting the data into samples to predict the values. On the other hand, the Random forest and Gradient boosting classifiers are the ensemble method that forms multiple decision trees to improve the accuracy. In the Random Forest classifier, the model is trained based on the prediction of each decision tree. Gradient boosting classifier works by training the model at each level of the decision tree by correcting the errors of the previous tree in the next level to improve the accuracy of the model.

## Linear classifiers:

Perceptron makes predictions based on assigning weights to the input to categorize the output. Linear SVC works by find the boundary by maximizing the margin to divide the classes to improve the performance.

In [8]:
#Perceptron cross validation 
percep_clf = Perceptron(random_state=0)
print(cross_val_score(percep_clf,X_train, y_train))
print(np.mean(cross_val_score(percep_clf,X_train, y_train)))

[0.85294118 0.86176471 0.83823529 0.83235294 0.75588235]
0.8282352941176472


In [9]:
#Logistic Regression model creating and cross validation
s_scalar = StandardScaler()
X_val = X.copy()
X_var1 = s_scalar.fit_transform(X_val)
X_train = s_scalar.fit_transform(X_train)
X_test = s_scalar.fit_transform(X_test)
lr_model = LogisticRegression( max_iter = 10000)
lr_cross_val=cross_val_score(lr_model,X_var1,y,cv= 5)
print(np.mean(lr_cross_val))
print(lr_cross_val)

0.85230930682132
[0.842723   0.87058824 0.92       0.89882353 0.72941176]


In [10]:
#we also need to do the same thing for the Linear SVC as you will do in logistic regression
s_scalar = StandardScaler()
X_val = X.copy()
svc_clf = LinearSVC(C=1.0, max_iter = 10000)
X_var1 = s_scalar.fit_transform(X_val)
X_train = s_scalar.fit_transform(X_train)
X_test = s_scalar.fit_transform(X_test)
svc_cross_val =cross_val_score(svc_clf,X_var1,y,cv= 5)
print(np.mean(svc_cross_val))
print(svc_cross_val)

0.8471361502347419
[0.83568075 0.87058824 0.91529412 0.90352941 0.71058824]


## Neural network classifier:

The neural network works by inputting the data in the input layer and passing the data to the hidden layers and the output layer provides the probability distribution of the classes.

In [11]:
# Neural networks
neural_clf = MLPClassifier(max_iter = 3000)
X_value = s_scalar.fit_transform(X_val)
X_train = s_scalar.fit_transform(X_train)
X_test = s_scalar.fit_transform(X_test)
nn_cross_val =cross_val_score(neural_clf,X_value,y,cv= 5)

In [12]:
print(np.mean(nn_cross_val))
print(nn_cross_val)

0.8344324772162386
[0.83098592 0.88470588 0.90117647 0.87764706 0.67764706]


## Step 4 Final evaluation

The general performance of tree-based classifiers is better than others. Maybe we can get better performance by tuning the hyperparameters from the MLP classifier but with the default parameters, as we observed above, the highest accuracy of cross-validation is the gradient boosting classifier model.

In [13]:
grad_model = GradientBoostingClassifier(n_estimators = 100 , learning_rate = 0.1 , max_depth = 2 ,random_state = 0 ).fit(X_train,y_train)
grad_model.predict(X_test)
print(grad_model.score(X_test,y_test))

0.9436619718309859


After fitting the model, the accuracy of the test dataset is 94%. 

Gradient Boosting Classifier is a boosting algorithm that involves building decision trees in iterations, where the predicted values of each subsequent tree aim to reduce the loss function of the previous tree, with its own set of predicted values. The model's overall prediction accuracy is achieved by combining the predictions of all trees at each level of iteration.

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=82bb48ff-b539-4abb-b8e9-17b011c6b53b' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>