# Malaria Parasite Detection

In [1]:
import cv2
import os
import numpy as np
import csv
import glob
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import tensorflow as tf

##### Function to load the dataset

In [2]:
def load_dataset(path, classes) :
    class_images = []
    for cls in classes :
        cls_imgs = []
        img_names = os.listdir(path + cls + "/")
        for img_name in img_names :
                try :
                    img = cv2.imread(path + cls + "/" + img_name)
                    if not (img is None):
                        cls_imgs.append(img)
                except Exception as e :
                    pass
        class_images.append(np.array(cls_imgs))
    return np.concatenate(class_images)

In [3]:
DATADIR = "/home/ajayrr/Parasite/Parasite/train/"
dataset = load_dataset(DATADIR, ["Uninfected", "Parasitized"])

#### Function to apply gamma correction 

In [4]:
def adjust_gamma(image, gamma=1.0):
    	# build a lookup table mapping the pixel values [0, 255] to
    # their adjusted gamma values
    invGamma = 1.0 / gamma
    table = np.array([((i / 255.0) ** invGamma) * 255
    for i in np.arange(0, 256)]).astype("uint8")
    # apply gamma correction using the lookup table
    return cv2.LUT(image, table)

#### Blob Detection hyper parameters

In [5]:
# Blob detection (Hyperparameters)
params = cv2.SimpleBlobDetector_Params()
params.filterByArea = True
params.maxArea = 200
# Filter by Convexity
params.filterByConvexity = True
params.minConvexity = 0.3
# Filter by Circularity
params.filterByCircularity = True
params.minCircularity = 0.0
# Change thresholds
params.minThreshold = -3;
params.maxThreshold = 150;
# Filter by Inertia
params.filterByInertia = True
params.minInertiaRatio = 0.0
detector = cv2.SimpleBlobDetector_create(params)
# other hyperparameters
IMG_SIZE = 90
gamma = 0.95
smoothening_kernel_size = 5
smoothening_degree = 50

#### Blob Feature Extraction

In [6]:
# takes in the path of the image files and returns a dataframe with the number of blobs as feature
def get_blob_features(dataset):
    dataframe = [[],[],[]]
    for img in dataset:
        if not (img is None):
            img_ = cv2.resize(img,(IMG_SIZE, IMG_SIZE))
            kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,(2,2))
            img_ = cv2.dilate(img_,kernel,iterations = 1)
            img_ = adjust_gamma(img_, gamma)
            keypoints = detector.detect(img_)
            dataframe[0].append(len(keypoints))
            if(len(keypoints) >= 2):
                dataframe[1].append(keypoints[0].size)
                dataframe[2].append(keypoints[1].size)
            elif(len(keypoints) == 1):
                dataframe[1].append(keypoints[0].size)
                dataframe[2].append(0)
            else:
                dataframe[1].append(0)
                dataframe[2].append(0)
    return np.asarray(dataframe).T

#### Contour Feature Extraction

In [7]:
# takes in the path of the image files and returns a dataframe with the 5 features
def get_contour_features(dataset):
    dataframe = [[],[],[],[],[]]
    for img in dataset:
        img_ = cv2.GaussianBlur(img, (3,3), 2)
        if not (img_ is None):
            img_gray = cv2.cvtColor(img_, cv2.COLOR_BGR2GRAY)
            ret, thresh = cv2.threshold(img_gray,127,255,0)
            _,contours,_ = cv2.findContours(thresh,1,2)

            for i in range(5):
                try:
                    area = cv2.contourArea(contours[i])
                    dataframe[i].append(area)
                except:
                    dataframe[i].append(0)

    return (np.asarray(dataframe).T)

In [8]:
def get_number_of_contours(dataset):
    dataframe = []
    for img in dataset:
        if not (img is None):
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            edged = cv2.Canny(gray, 30 , 200)
            _,contours, hierarchy = cv2.findContours(edged, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
            dataframe.append(len(contours))
    return (np.asarray(dataframe).T)

#### Bag of Visual Words 
##### 1. Feature extraction

In [9]:
def generate_LUCID_features(train_img):
    lucid_keypoints = []
    count = 0
    rmv_index_train = []
    for image in train_img :
         if not (image is None):
            image = cv2.resize(image, (50, 50))
            # image =cv2.cvtColor(image,cv2.COLOR_BGR2GRAY)
            detector = cv2.FastFeatureDetector_create()
            kp = detector.detect(image,None)
            lucid = cv2.xfeatures2d.LUCID_create()
            kp, descriptors = lucid.compute(image,kp)
            if descriptors is not None:
                descriptors = np.array(descriptors)
                lucid_keypoints.append(descriptors)

                temp = descriptors
            else:

                lucid_keypoints.append(temp)

           
    lucid_keypoints = np.concatenate(lucid_keypoints, axis=0)
    kmeans = KMeans(n_clusters = 16).fit(lucid_keypoints)
    print(lucid_keypoints.shape)
    print("--------Computed descriptors--------")

    x_Siftfeat_train = calculate_lucid_histogram(train_img, kmeans)
    print("------Computed Histogram-----")

    return x_Siftfeat_train

The FAST (Features from Accelerated Segment Test) algorithm was used to detect the keypoints of the images.
The LUCID (Locally Uniform Comparision Image Descriptor) is used to get the descriptors for the keypoints. The points are clustered using the K-means clustering algorithm. The value of K was determined using the elbow method.

In [10]:
def calculate_lucid_histogram(images, model):
    feature_vectors=[]
    rmv_index_test = []
    for image in images :
        if not (image is None):
            # image = cv2.cvtColor(image,cv2.COLOR_BGR2GRAY)
            #feature extraction
            detector = cv2.FastFeatureDetector_create()
            kp = detector.detect(image,None)
            lucid = cv2.xfeatures2d.LUCID_create()
            kp, descriptors = lucid.compute(image,kp)
            #classification of all descriptors in the model
            if descriptors is not None :
                predict_kmeans = model.predict(descriptors)
                #calculates the histogram
                hist, bin_edges = np.histogram(predict_kmeans, bins = 16)
                #histogram is the feature vector
                feature_vectors.append(hist)
                temp = hist
            else :
                feature_vectors.append(temp)

    feature_vectors=np.asarray(feature_vectors)

    return np.array(feature_vectors)

The vocabulary is constructed by creating a histogram and using the histogram as a feature vector.

#### Obtaining contour features

In [11]:
data1 = get_contour_features(dataset)
df1 = pd.DataFrame(data = data1)

#### Obtaining blob features

In [12]:
data2 = get_blob_features(dataset)
df2 = pd.DataFrame(data = data2)

#### Obtaining Bag of Visual Words Features

In [13]:
data3 = generate_LUCID_features(dataset)
df3 = pd.DataFrame(data = data3)

(480177, 27)
--------Computed descriptors--------
------Computed Histogram-----


#### Obtaining number of contours feature

In [14]:
data4 = get_number_of_contours(dataset)
df4 = pd.DataFrame(data = data4)

In [15]:
training_label = np.concatenate([np.zeros(int(dataset.shape[0]/2.)), np.ones(int(dataset.shape[0]/2.))])

creating the training label

In [16]:
dfT = pd.DataFrame(data = training_label)
df = pd.concat([dfT, df2, df1, df3, df4], axis = 1)

In [17]:
df.to_csv("features_combined.csv")

In [18]:
data = pd.read_csv("features_combined.csv")
data = data.iloc[:,1:]

""" Building the train_set and the test_set """

# seperating the training and validation test
X = data.iloc[:,1:]
y = data.iloc[:,0]

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

#### Scaling the features

In [20]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

#### Using the Random Forest Classifier

In [28]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
model = RandomForestClassifier(n_estimators=115,max_depth=10)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
from sklearn.metrics import accuracy_score
x = accuracy_score(y_pred, y_test)
print("Random Forest accuracy = ",x)

Random Forest accuracy =  0.9529785303900816


In [29]:
from sklearn.neighbors import KNeighborsClassifier
# K-Nearest neighbour classifier
knn_classifier = KNeighborsClassifier(n_neighbors = 5)
knn_classifier.fit(X_train, y_train)
knn_predictions = knn_classifier.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_predictions)
print("knn accuracy = " + str(knn_accuracy))

knn accuracy = 0.9384638645297853


In [36]:
#SVM
from sklearn import svm
clf = svm.SVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("SVM accuracy = ",accuracy_score(y_pred, y_test))

SVM accuracy =  0.9467795585122467
