# Binary Classification with Logistic Regression

In [13]:
# import modules for preprocessing 
import os
import skimage
from skimage import data
from skimage import io
from skimage.io import imread, imshow
import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import classification_report,accuracy_score
import natsort





## Preprocess the data

### Function image_to_vec() will:
####  - Read in the image file names in the directory and sort them in ascending order.
####  - Add the directory path to each image name, so all images can be found.
#### - Find the images using the paths.
#### - Read in the images as grayscale
#### - Convert the images into arrays
#### - Return a list of image arrays and a list of each file name.

In [14]:
imagefile_list = os.listdir('dataset/sample_images/') # dataset of 200
directory = 'dataset/sample_images/'

# Function to take image files in dir, convert them into vectors and save them to a list

def image_to_vec(file_list, directory):    
    vec_list = []
    files = []
    for filename in file_list:
        if filename.endswith('.jpg'):
            files.append(filename)
    files = natsort.natsorted(files) #sort file names in ascending order, so it matches labels
    #print(files)
    for file in files:
      image_path = directory+file
      #print(image_path)
      image = np.array(imread(image_path, as_gray=True)) #read in as grayscale
      vec_list.append(image)

    return vec_list,files 
   
# Calling the function
vectors,image_files = image_to_vec(imagefile_list, directory)


# Checking that first vector on the list looks right
# print('First vector', vectors[0])

# Checkng shape of the first vector
# print('Shape of first vector', vectors[0].shape)

# more tests
#print('vectors type', type(vectors))
# plt.imshow(vectors[0], cmap='gray') # test that first array produces first image
# print(image_files[0]) 



##### 1. Read in the binary labels.

In [15]:
# Loading the csv file
tumors = pd.read_csv('./dataset/sample_labels_0_1.csv', sep=',')
print(tumors)

          file_name  label
0    IMAGE_0000.jpg      1
1    IMAGE_0001.jpg      0
2    IMAGE_0002.jpg      1
3    IMAGE_0003.jpg      1
4    IMAGE_0004.jpg      1
..              ...    ...
195  IMAGE_0195.jpg      1
196  IMAGE_0196.jpg      1
197  IMAGE_0197.jpg      1
198  IMAGE_0198.jpg      1
199  IMAGE_0199.jpg      1

[200 rows x 2 columns]


#### 2. Split the file names and label data into their own lists. Run the file names and labels through the function that will find the images in the directory, and convert them into an array.

In [16]:

# Split the data
im_files = list(tumors['file_name']) # converting to list in order to access each filename
labels = tumors['label']
# print(labels)

im_vecs,filenames = image_to_vec(im_files,directory) 

# plt.imshow(im_vecs[0], cmap='gray') #check that im_vecs has image vectors in correct order


#### 3. Merge the image and label data.

In [17]:
data = list(zip(im_vecs,labels)) # join the image vectors and their respective labels
data # output should show that each array is paired with its label
#print('data type',type(data))


[(array([[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]]),
  1),
 (array([[0.1372549 , 0.14117647, 0.14509804, ..., 0.21960784, 0.39215686,
          0.47843137],
         [0.10980392, 0.10980392, 0.11372549, ..., 0.2       , 0.36470588,
          0.44705882],
         [0.05490196, 0.05882353, 0.05882353, ..., 0.15294118, 0.31372549,
          0.39215686],
         ...,
         [0.00784314, 0.00784314, 0.01176471, ..., 0.11764706, 0.28235294,
          0.36470588],
         [0.00784314, 0.01176471, 0.01176471, ..., 0.11764706, 0.28627451,
          0.36862745],
         [0.00784314, 0.01176471, 0.01176471, ..., 0.11764706, 0.29019608,
          0.37254902]]),
  0),
 (array([[0.08235294, 0.04705882, 0.00784314, ..., 0.        , 0.        ,
          0.        ],
         [0.07843137, 0.054901

#### 4. Convert the image and label data into arrays X and Y.

In [18]:
# Loading the data file: 

def list_to_array(list_of_tuples): # Convert the data from list type to array
    X=[]
    Y=[]
    count=0
    for pair in list_of_tuples:
        temp=pair[0]
        temp2=pair[1]
        X.append(temp)
        Y.append(temp2)
        count+=1
        print(count,'tuples converted to array')
    X=np.asarray(X)
    Y=np.asarray(Y)
    
    return X,Y

X,Y=list_to_array(data)
X=X.reshape(200,262144) # MAKE SURE THIS MATCHES COUNT OF DATA SAMPLES
#print(X)
#print(Y)

# Shuffle and split the data into training and test set
X, Y = shuffle(X,Y)
print('x shape',X.shape, 'y shape',Y.shape)
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.7, random_state=0)
print('x_train shape',x_train.shape,'y_train shape',y_train.shape)
print('x_test shape',x_test.shape,'y_test shape',y_test.shape)

1 tuples converted to array
2 tuples converted to array
3 tuples converted to array
4 tuples converted to array
5 tuples converted to array
6 tuples converted to array
7 tuples converted to array
8 tuples converted to array
9 tuples converted to array
10 tuples converted to array
11 tuples converted to array
12 tuples converted to array
13 tuples converted to array
14 tuples converted to array
15 tuples converted to array
16 tuples converted to array
17 tuples converted to array
18 tuples converted to array
19 tuples converted to array
20 tuples converted to array
21 tuples converted to array
22 tuples converted to array
23 tuples converted to array
24 tuples converted to array
25 tuples converted to array
26 tuples converted to array
27 tuples converted to array
28 tuples converted to array
29 tuples converted to array
30 tuples converted to array
31 tuples converted to array
32 tuples converted to array
33 tuples converted to array
34 tuples converted to array
35 tuples converted to 

## Run the data through a logistic regression model

In [19]:
print('y_train.shape[0]',y_train.shape[0]) # count of samples


def predict(z):
    return 1. / (1. + np.exp(-z))  # z=feature vector xTrain times parameter vector theta  


 
def param_update(xTrain, yTrain):
    print('xTrain shape before',xTrain.shape) 
    new_col = np.ones((xTrain.shape[0], 1)) 
    xTrain = np.append(xTrain, new_col, axis=1)
    print('xTrain shape after',xTrain.shape)
    
    #initialise parameters
    theta = np.zeros(xTrain.shape[1])
    epoch = 300
    alpha = 0.01 
    for i in range(epoch): 
        z = np.dot(xTrain, theta) # feature vector times parameter vector
        h = predict(z)  
        gradient = theta - alpha * np.dot(xTrain.T,(h-yTrain))/yTrain.shape[0]
        theta = gradient       
    return theta


def train_validate(xTrain, yTrain,xTest):
    #print('xTest shape before',xTest.shape) 
    theta = param_update(xTrain, yTrain)
    new_col = np.ones((xTest.shape[0], 1)) 
    xTest = np.append(xTest, new_col, axis=1)
    #print('xTest shape after',xTest.shape)
    z = np.dot(xTest,theta)
    h = predict(z)
    y_pred = h >= 0.5 # true or false assignment    
    return y_pred  



y_train.shape[0] 140


In [20]:
  
y_pred = train_validate(x_train, y_train,x_test) # train_validate was called predict before
print('y_pred', y_pred)
print('Accuracy on test set: '+str(accuracy_score(y_test,y_pred)))
print(classification_report(y_test,y_pred)) #text report showing the main classification metrics





xTrain shape before (140, 262144)
xTrain shape after (140, 262145)
y_pred [ True False  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True False  True  True  True  True  True  True  True]
Accuracy on test set: 0.9333333333333333
              precision    recall  f1-score   support

           0       1.00      0.33      0.50         6
           1       0.93      1.00      0.96        54

    accuracy                           0.93        60
   macro avg       0.97      0.67      0.73        60
weighted avg       0.94      0.93      0.92        60

