In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import skimage.io as io
import skimage.color as color
import time
from scipy.special import expit as sigmoid
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA

In [2]:
def labelprocessing(label_trainpath,label_testpath):
    #Valid data split for read-in training csv, rename table headers, replace all -1's with 0's in order to cater for the 0-1 classification of the formula
    raw_data_train=pd.read_csv(label_trainpath,names=["mixed_information_train"])
    data_unmixed_train = raw_data_train["mixed_information_train"].str.split("\t",expand=True)
    data_unmixed_train.columns=['index','img_name','gender','smiling']
    data_unmixed_train.drop(columns=['index','img_name','smiling'],inplace=True)
    data_unmixed_train.drop(index=0,inplace=True)
    data_unmixed_train[data_unmixed_train<"1"]=0
    
    #Perform the same processing on the read-in test set results
    raw_data_test=pd.read_csv(label_testpath,names=["mixed_information_test"])
    data_unmixed_test = raw_data_test["mixed_information_test"].str.split("\t",expand=True)
    data_unmixed_test.columns=['index','img_name','gender','smiling']
    data_unmixed_test.drop(columns=['index','img_name','smiling'],inplace=True)
    data_unmixed_test.drop(index=0,inplace=True)
    data_unmixed_test[data_unmixed_test<"1"]=0
    
    #Converting label information into a form that is actually used 
    YTrain=data_unmixed_train
    YTest=data_unmixed_test
    YTrain=YTrain.values
    YTest=YTest.values
    YTrain=np.array([int(y) for y in YTrain])
    YTest=np.array([int(y) for y in YTest])
    YTrain=YTrain.reshape(len(YTrain),1)
    YTest=YTest.reshape(len(YTest),1)
    
    return(YTrain,YTest)

def data_preprocessing(data_trainpath,data_testpath):
    XTrain=[]
    XTest=[]
    n_components=300
    coll_train = io.ImageCollection(data_trainpath)
    coll_test = io.ImageCollection(data_testpath)

    for img in coll_train:
        intern_train=change_shape(color.rgb2gray(img))
        XTrain.append(intern_train)
    XTrain=np.array(XTrain)/255.
    pca=PCA(n_components=n_components,svd_solver='randomized',whiten=True).fit(XTrain)
    XTrain = pca.transform(XTrain)
    for img in coll_test:
        intern_test=change_shape(color.rgb2gray(img))
        XTest.append(intern_test)
    XTest=np.array(XTest)/255.
    XTest = pca.transform(XTest)
    XTrain=np.insert(XTrain,0,values=1,axis=1)
    XTest=np.insert(XTest,0,values=1,axis=1)
    return(XTrain,XTest)

def change_shape(img):
    [row,col]=img.shape
    img=img.reshape(row*col,)
    return img

def costfunction(X,Y,theta,lamda):
    h_theta=sigmoid(X@theta)
    firstpart=Y*np.log(h_theta + 1e-6)
    secondpart=(1-Y)*np.log(1-h_theta + 1e-6)
    thirdpart=(lamda/(2*len(X)))*np.sum(np.power(theta[1:],2))
    return (-np.sum(firstpart+secondpart)/len(X))+thirdpart

def gradient_descent(X,Y,theta,rounds,alpha,lamda):
    costs=[]
    for i in range(rounds):
        thirdpart_derivative=theta[1:]*(lamda/len(X))
        thirdpart_derivative=np.insert(thirdpart_derivative,0,values=0,axis=0)
        h_theta=sigmoid(X@theta)
        theta=theta-(alpha/len(X))*X.T@(h_theta-Y)-thirdpart_derivative
        cost=costfunction(X,Y,theta,lamda)
        costs.append(cost)
    return costs,theta

def average_num(array):
    sum=0
    for i in range (len(array)):
        sum=sum+array[i]
    return sum/len(array)

def find_best_lamda(theta_inital,rounds,alpha,lamda,Trainingset):
    costs_all=[]
    theta_all=[]
    accurancy_all=[]
    for train, test in KF.split(Trainingset):
        XTrain=Trainingset[train][:,:-1]
        YTrain=Trainingset[train][:,-1]
        XTest_crossvalidation=Trainingset[test][:,:-1]
        YTest_crossvalidatio=Trainingset[test][:,-1]
    
        YTest_crossvalidatio=YTest_crossvalidatio.reshape(len(YTest_crossvalidatio),1)
        YTrain=YTrain.reshape(len(YTrain),1)
    
        costs,theta=gradient_descent(XTrain,YTrain,theta_inital,rounds,alpha,lamda)
        accurancy=Judge(XTest_crossvalidation,theta,YTest_crossvalidatio)
        costs_all.append(costs)
        theta_all.append(theta)
        accurancy_all.append(accurancy)
    return lamda,average_num(accurancy_all),accurancy_all

def Judge(X,theta,Label):
    h=sigmoid(X@theta)
    TP=0
    TN=0
    FN=0
    FP=0
    for i in range(len(h)):
        if h[i]>0.5:
            h[i]=1
        else:
            h[i]=0
    for i in range(len(Label)):
        if h[i]==1 and Label[i]==1:
            TP=TP+1
        elif h[i]==0 and Label[i]==0:
            TN=TN+1
        elif h[i]==1 and Label[i]==0:
            FN=FN+1
        elif h[i]==0 and Label[i]==1:
            FP=FP+1
    precision=TP/(TP+FP)
    recall=TP/(TP+FN)
    accurancy=(TP+TN)/(TP+TN+FP+FN)
    F1_score=(2*precision*recall)/(precision+recall)
    #return precision,recall,accurancy,F1_score
    return accurancy

def crossvalidation(data_trainpath,data_testpath,label_trainpath,label_testpath):
    import time
    start=time.perf_counter()
    alpha=2
    rounds=150
    lamda=1
    costs_all=[]
    accurancy_all=[]
    theta_all=[]
    YTrain,YTest=labelprocessing(label_trainpath,label_testpath)
    XTrain,XTest=data_preprocessing(data_trainpath,data_testpath)
    #After the hstack, the last column of the Trainingset is the YTrain, which can then be scaled for splitting.
    Trainingset=np.hstack((XTrain,YTrain))
    Trainingset=np.array(Trainingset)
    KF=KFold(n_splits=10)
    theta_inital=np.zeros((len(XTrain.T),1))
    for train, test in KF.split(Trainingset):
        XTrain=Trainingset[train][:,:-1]
        YTrain=Trainingset[train][:,-1]
        XTest_crossvalidation=Trainingset[test][:,:-1]
        YTest_crossvalidatio=Trainingset[test][:,-1]

        YTest_crossvalidatio=YTest_crossvalidatio.reshape(len(YTest_crossvalidatio),1)
        YTrain=YTrain.reshape(len(YTrain),1)

        costs,theta=gradient_descent(XTrain,YTrain,theta_inital,rounds,alpha,lamda)
        accurancy=Judge(XTest_crossvalidation,theta,YTest_crossvalidatio)
        costs_all.append(costs)
        theta_all.append(theta)
        accurancy_all.append(accurancy)
    end=time.perf_counter()
    time=end-start
    accurancy_all_average=np.sum(accurancy_all)/len(accurancy_all)
    return accurancy_all_average,theta_all,costs_all,time

def main(data_trainpath,data_testpath,label_trainpath,label_testpath):
    YTrain,YTest=labelprocessing(label_trainpath,label_testpath)
    XTrain,XTest=data_preprocessing(data_trainpath,data_testpath)
    theta=theta_all[0]
    accurancy=Judge(XTest,theta,YTest)
    return accurancy

In [5]:
data_trainpath= r".\Datasets\celeba\img\*.jpg"
data_testpath= r".\Datasets\celeba_test\img\*.jpg"
label_trainpath=r".\Datasets\celeba\labels.csv"
label_testpath=r".\Datasets\celeba_test\labels.csv"

In [4]:
accurancy_all,theta_all,costs_all,time=crossvalidation(data_trainpath,data_testpath,label_trainpath,label_testpath)
accurancy=main(data_trainpath,data_testpath,label_trainpath,label_testpath)
print("TaskA1: Accuracy of Logistic Regression is {}".format(accurancy))

0.877
