# Binary Classification with Logistic Regression

In [249]:
# import modules for preprocessing 
import os
import skimage
from skimage import data
from skimage import io
from skimage.io import imread, imshow
import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import classification_report,accuracy_score
import natsort





## Preprocess the data

### Function image_to_vec() will:
####  - Read in the image file names in the directory and sort them in ascending order.
####  - Add the directory path to each image name, so all images can be found.
#### - Find the images using the paths.
#### - Read in the images as grayscale
#### - Convert the images into arrays
#### - Return a list of image arrays and a list of each file name.


##### 1. Read in the binary labels.

In [250]:
# Loading the csv file
tumors = pd.read_csv('./dataset/sample_labels_0_1.csv', sep=',')
print(tumors)

          file_name  label
0    IMAGE_0000.jpg      1
1    IMAGE_0001.jpg      0
2    IMAGE_0002.jpg      1
3    IMAGE_0003.jpg      1
4    IMAGE_0004.jpg      1
..              ...    ...
195  IMAGE_0195.jpg      1
196  IMAGE_0196.jpg      1
197  IMAGE_0197.jpg      1
198  IMAGE_0198.jpg      1
199  IMAGE_0199.jpg      1

[200 rows x 2 columns]


#### 2. Split the file names and label data into their own lists. Run the file names and labels through the function that will find the images in the directory, and convert them into an array.

In [251]:

# Split the data
im_files = list(tumors['file_name']) # converting to list in order to access each filename
labels = tumors['label']
#print(labels)

im_vecs,filenames = image_to_vec(im_files,directory) 

# plt.imshow(im_vecs[0], cmap='gray') #check that im_vecs has image vectors in correct order


#### 3. Merge the image and label data.

In [252]:
data = list(zip(im_vecs,labels)) # join the image vectors and their respective labels
data # output should show that each array is paired with its label
#print('data type',type(data))
#for i in range(len(data)):
    #print('index:',i,'label:',labels[i])


[(array([[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]]),
  1),
 (array([[0.1372549 , 0.14117647, 0.14509804, ..., 0.21960784, 0.39215686,
          0.47843137],
         [0.10980392, 0.10980392, 0.11372549, ..., 0.2       , 0.36470588,
          0.44705882],
         [0.05490196, 0.05882353, 0.05882353, ..., 0.15294118, 0.31372549,
          0.39215686],
         ...,
         [0.00784314, 0.00784314, 0.01176471, ..., 0.11764706, 0.28235294,
          0.36470588],
         [0.00784314, 0.01176471, 0.01176471, ..., 0.11764706, 0.28627451,
          0.36862745],
         [0.00784314, 0.01176471, 0.01176471, ..., 0.11764706, 0.29019608,
          0.37254902]]),
  0),
 (array([[0.08235294, 0.04705882, 0.00784314, ..., 0.        , 0.        ,
          0.        ],
         [0.07843137, 0.054901

#### 4. Convert the image and label data into arrays X and Y.

In [253]:
# Loading the data file: 

def list_to_array(list_of_tuples): # Convert the data from list type to array
    X=[]
    Y=[]
    count=0
    for pair in list_of_tuples:
        temp=pair[0]
        temp2=pair[1]
        X.append(temp)
        Y.append(temp2)
        count+=1
        #print(count,'tuples converted to array') #comment out to see check function is running
    X=np.asarray(X)
    Y=np.asarray(Y)
    
    return X,Y

X,Y=list_to_array(data)
X=X.reshape(200,262144) # MAKE SURE THIS MATCHES COUNT OF DATA SAMPLES


# Shuffle
X, Y = shuffle(X,Y)
#print('x shape',X.shape, 'y shape',Y.shape)

#Split the data into training and test (validation) set, if there's no real test data 
##x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.7, random_state=0)

#print('x_train shape',x_train.shape,'y_train shape',y_train.shape)
#print('x_test shape',x_test.shape,'y_test shape',y_test.shape)

### Preprocess the test data

In [254]:
imagefile_list_test = os.listdir('test/image/') # test dataset of 200
directory_test = 'test/image/'
tumors_test = pd.read_csv('test/labels_0_1.csv', sep=',') # binary class labels
#print(tumors_test)


# Split the data
im_files_test = list(tumors_test['file_name']) # converting to list in order to access each filename
labels_test = tumors_test['label2']
#print(labels_test)

# Convert image to array
im_vecs_test,filenames_test = image_to_vec(im_files_test,directory_test) 

#plt.imshow(im_vecs_test[0], cmap='gray') #check that im_vecs has image vectors in correct order

# Join image vectors with labels
data_test = list(zip(im_vecs_test,labels_test))

# Convert list of arrays into array of arrays
X_test,Y_test=list_to_array(data_test)
X_test=X_test.reshape(200,262144) # MAKE SURE THIS MATCHES COUNT OF DATA SAMPLES

print('X_test.shape',X_test.shape,'Y_test.shape',Y_test.shape)
print('X.shape',X.shape,'Y.shape',Y.shape)


# extracting training and validation data from training dataset 
x_train, x_val, y_train, y_val = train_test_split(X, Y, train_size=0.7, random_state=0)

X_test,Y_test = shuffle(X_test,Y_test)
# extracting test data from test dataset and discarding training portion by asssigning
# it to dummy variables that won't be used.
x_dummy, x_test, y_dummy, y_test = train_test_split(X_test, Y_test, test_size=0.3, random_state=0)





X_test.shape (200, 262144) Y_test.shape (200,)
X.shape (200, 262144) Y.shape (200,)


### Define the logistic regression model

In [255]:
print('y_train.shape[0]',y_train.shape[0]) # count of samples


def predict(z):
    return 1. / (1. + np.exp(-z))  # z=feature vector xTrain times parameter vector theta  


def param_update(xTrain, yTrain):
    print('xTrain shape before',xTrain.shape) 
    new_col = np.ones((xTrain.shape[0], 1)) 
    xTrain = np.append(xTrain, new_col, axis=1)
    print('xTrain shape after',xTrain.shape)
    
    #initialise parameters
    theta = np.zeros(xTrain.shape[1])
    epoch = 300
    alpha = 0.01 
    for i in range(epoch): 
        z = np.dot(xTrain, theta) # feature vector times parameter vector
        h = predict(z)  
        gradient = theta - alpha * np.dot(xTrain.T,(h-yTrain))/yTrain.shape[0]
        theta = gradient       
    return theta

def train_predict(xTrain, yTrain, xVal,yTest):
    #print('xVal shape before',xVal.shape) 
    theta = param_update(xTrain, yTrain)
    new_col = np.ones((xVal.shape[0], 1)) 
    xVal = np.append(xVal, new_col, axis=1)
    #print('xVal shape after',xVal.shape)
    z = np.dot(xVal,theta)
    h = predict(z)
    y_pred = h >= 0.5 # true or false assignment 
    score = accuracy_score(yTest,y_pred)
    return y_pred, score


y_train.shape[0] 140


### Train, validate and test the model

In [257]:
  
y_pred,score = train_predict(x_train, y_train, x_val,y_test) 
print('Accuracy score',score)
print('y_pred', y_pred)
print(classification_report(y_test,y_pred)) # main classification metrics




xTrain shape before (140, 262144)
xTrain shape after (140, 262145)
Accuracy score 0.7333333333333333
y_pred [ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
 False  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        15
           1       0.75      0.98      0.85        45

    accuracy                           0.73        60
   macro avg       0.37      0.49      0.42        60
weighted avg       0.56      0.73      0.63        60

