# Part V. Semi-supervised Classification on the Cardiotocography Data Set

In [75]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

In [33]:
def report(model,Xttest,yttest):
    """Print accuracy, misclassification error and confusion matrix for a model.

    Parameters
    ----------
    model : fitted estimator exposing ``score(X, y)`` and ``predict(X)``.
    Xttest : feature matrix to evaluate on.
    yttest : true labels matching ``Xttest``.

    Notes
    -----
    The confusion matrix is computed as ``confusion_matrix(predictions, yttest)``,
    i.e. predictions are passed first, so rows correspond to predicted labels
    and columns to true labels.
    """
    # Evaluate on the data that was passed in rather than on the notebook-level
    # Xtest/ytest globals, so the report always matches its arguments.
    accuracy = model.score(Xttest, yttest)
    print("Model perform on test dataset")
    print("classification accuracy:", accuracy)
    print("mis-classification error:", 1 - accuracy)
    print("confusion matrix:\n", confusion_matrix(model.predict(Xttest), yttest))

Load all three datasets and preprocess them by dropping rows that contain missing values (NA).

In [84]:
# Read the labeled-train, unlabeled-train and test CSV files, then discard
# every row that contains a missing value.
train_data_labeled = pd.read_csv("../data/CTG/CTG_Train_labeled.csv")
train_data_labeled = train_data_labeled.dropna()

train_data_unlabeled = pd.read_csv("../data/CTG/CTG_Train_unlabeled.csv")
train_data_unlabeled = train_data_unlabeled.dropna()

test_data = pd.read_csv("../data/CTG/CTG_Test.csv")
test_data = test_data.dropna()

## 1. Try three classification methods 
using only labelled training data, and test their performances on 
the test data. Please try to optimize the tuning parameters as best as you can.  Report the results 
and conclusions.

I will use LDA, SVM, and MLP in this section.

Standardize the features with StandardScaler: the scaler is fitted on the labeled training data and then applied to the test data.

In [8]:
# Targets: the NSP column of the labeled training set and of the test set.
y = train_data_labeled["NSP"]
ytest = test_data["NSP"]

# Features: every column up to and including "Tendency". The scaler is fitted
# on the labeled training features and reused unchanged for the test features.
scaler = StandardScaler().fit(train_data_labeled.loc[:, :"Tendency"])
X = scaler.transform(train_data_labeled.loc[:, :"Tendency"])
Xtest = scaler.transform(test_data.loc[:, :"Tendency"])

### (1) LDA

In [34]:
# (1) Linear discriminant analysis with default settings, trained on the
# labeled data and evaluated on the test set.
lda = LDA().fit(X, y)

report(lda, Xtest, ytest)

Model perform on test dataset
classification accuracy: 0.8937875751503006
mis-classification error: 0.1062124248496994
confusion matrix:
 [[366  25   4]
 [ 11  46   6]
 [  3   4  34]]


### (2) SVM

In [71]:
# 2 SVM with radial (RBF) kernel: tune C and gamma by 5-fold cross-validation
params = {'C': (5, 10), 'gamma': (0.1, 0.05, 0.01)}
svms = GridSearchCV(SVC(kernel='rbf'), param_grid=params,cv=5, n_jobs=-1)
svms.fit(X, y)
# mean_test_score is the mean cross-validated score (accuracy for a classifier
# under the default scoring), so higher is better: label it as accuracy rather
# than error.
pd.DataFrame({'params': svms.cv_results_['params'],
              '5-fold CV Accuracy': svms.cv_results_['mean_test_score']})


Unnamed: 0,params,5-fold CV Error
0,"{'C': 5, 'gamma': 0.1}",0.892
1,"{'C': 5, 'gamma': 0.05}",0.892
2,"{'C': 5, 'gamma': 0.01}",0.888
3,"{'C': 10, 'gamma': 0.1}",0.892
4,"{'C': 10, 'gamma': 0.05}",0.894
5,"{'C': 10, 'gamma': 0.01}",0.886


In [72]:
# Keep the best estimator selected by the grid search and display it.
svm_best = svms.best_estimator_
svm_best

In [73]:
# Evaluate the selected SVM on the test data.
report(svm_best,Xtest,ytest)

Model perform on test dataset
classification accuracy: 0.8877755511022044
mis-classification error: 0.11222444889779559
confusion matrix:
 [[360  25   2]
 [ 17  46   5]
 [  3   4  37]]


### (3) Multi-layer perceptron (MLP) algorithm

In [67]:
# (3) Multi-layer perceptron with two hidden layers of 32 units each; the
# random seed is fixed so the fit is repeatable.
mlp = MLPClassifier(
    hidden_layer_sizes=(32, 32),
    alpha=5e-3,
    learning_rate_init=1e-3,
    max_iter=1000,
    random_state=5054,
).fit(X, y)

report(mlp, Xtest, ytest)

Model perform on test dataset
classification accuracy: 0.8917835671342685
mis-classification error: 0.10821643286573146
confusion matrix:
 [[363  21   3]
 [ 15  47   6]
 [  2   7  35]]


Compared with SVM and MLP, LDA performs slightly better on the test dataset (accuracy 0.894 vs. 0.888 and 0.892), possibly because the other two models overfit the training dataset more easily and their hyperparameters are harder to tune.

## 2. Try two semi-supervised classification methods 
using both labelled training data and unlabelled 
training data,  and test their performances on the test data. Please try to optimise the tuning 
parameters as best as you can.  Report the results and compare with the results using only 
labelled data. 

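Preprocess the training dataset: mark the unlabeled samples with `NSP = -1` and combine them with the labeled samples into a single training set.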

In [92]:
# Tag every unlabeled row with NSP = -1, then stack the labeled and unlabeled
# rows into a single training frame and display it.
train_data_unlabeled['NSP'] = -1
train_data = pd.concat(
    objs=[train_data_labeled, train_data_unlabeled],
    axis=0,
)
train_data

Unnamed: 0,AC,FM,UC,DL,DS,DP,ASTV,MSTV,ALTV,MLTV,...,Min,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency,NSP
0,0.000,0.0,0.007,0.000,0.0,0.0,60.0,0.4,17.0,6.8,...,142.0,163.0,0.0,0.0,153.0,152.0,153.0,1.0,0.0,2
1,0.002,0.0,0.005,0.000,0.0,0.0,40.0,0.8,7.0,10.3,...,119.0,154.0,2.0,0.0,138.0,139.0,140.0,2.0,0.0,1
2,0.000,0.0,0.005,0.003,0.0,0.0,31.0,1.4,17.0,14.6,...,82.0,164.0,4.0,0.0,147.0,135.0,139.0,30.0,1.0,1
3,0.002,0.0,0.003,0.000,0.0,0.0,49.0,0.7,3.0,9.8,...,140.0,184.0,3.0,0.0,162.0,158.0,162.0,4.0,0.0,2
4,0.009,0.0,0.003,0.006,0.0,0.0,19.0,2.3,0.0,4.0,...,76.0,176.0,6.0,1.0,145.0,138.0,143.0,36.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1122,0.000,0.0,0.005,0.001,0.0,0.0,77.0,0.7,17.0,6.0,...,124.0,155.0,2.0,0.0,145.0,143.0,145.0,2.0,0.0,-1
1123,0.000,0.0,0.007,0.000,0.0,0.0,79.0,0.2,25.0,7.2,...,137.0,177.0,4.0,0.0,153.0,150.0,152.0,2.0,0.0,-1
1124,0.001,0.0,0.007,0.000,0.0,0.0,78.0,0.4,22.0,7.1,...,103.0,169.0,6.0,0.0,152.0,148.0,151.0,3.0,1.0,-1
1125,0.001,0.0,0.007,0.000,0.0,0.0,79.0,0.4,20.0,6.1,...,103.0,170.0,5.0,0.0,153.0,148.0,152.0,4.0,1.0,-1


In [93]:
# Targets: NSP of the combined (labeled + unlabeled) training set and of the
# test set.
y = train_data["NSP"]
ytest = test_data["NSP"]

# Features: every column up to and including "Tendency". The scaler is now
# fitted on the combined training features and reused for the test features.
scaler = StandardScaler().fit(train_data.loc[:, :"Tendency"])
X = scaler.transform(train_data.loc[:, :"Tendency"])
Xtest = scaler.transform(test_data.loc[:, :"Tendency"])

### (1) KNN with the self-training method

In [98]:
# Self-training wrapper around a 4-nearest-neighbours classifier, fitted on
# the combined training data and evaluated on the test set.
semi_knn = SelfTrainingClassifier(
    KNeighborsClassifier(n_neighbors=4),
).fit(X, y)

report(semi_knn, Xtest, ytest)

Model perform on test dataset
classification accuracy: 0.8777555110220441
mis-classification error: 0.12224448897795592
confusion matrix:
 [[374  39   6]
 [  5  32   6]
 [  1   4  32]]


### (2) SVM with the self-training method

In [104]:
# Self-training wrapper around an RBF-kernel SVC. probability=True enables the
# SVC's probability estimates, which are produced with an internal randomized
# cross-validation; random_state is fixed (same seed as the MLP above) so the
# fit and the reported metrics are reproducible across runs.
semi_svm = SelfTrainingClassifier(
    SVC(kernel='rbf', probability=True, gamma="auto", C=10, random_state=5054))
semi_svm.fit(X, y)

report(semi_svm, Xtest, ytest)

Model perform on test dataset
classification accuracy: 0.8837675350701403
mis-classification error: 0.11623246492985972
confusion matrix:
 [[361  27   2]
 [ 16  43   5]
 [  3   5  37]]


Neither of the two semi-supervised classification methods performs better than the three supervised classification methods. The labeled training dataset appears to be large enough for training and to share approximately the same data distribution as the test dataset, so in this situation adding the unlabeled training data does not bring much additional information for this classification task. Semi-supervised learning is expected to show its advantage only when the labeled training dataset is very small and biased with respect to the data distribution.