In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import skimage.io as io
import skimage.color as color
import scipy.special
import time
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA

In [11]:
def _load_gender_labels(label_path):
    """Read one label file and return its gender column as a 0/1 column vector.

    The file is read one physical line per row and each line is split on tabs
    into the four fields ``index``, ``img_name``, ``gender`` and ``smiling``;
    only ``gender`` is kept.

    Args:
        label_path: Path of the tab-separated label file.

    Returns:
        numpy.ndarray: Integer array of shape ``(n_samples, 1)`` in which every
        value below 1 (the ``-1`` class) has been replaced by 0.

    Raises:
        ValueError: If a gender entry cannot be parsed as a number, or a line
            does not split into exactly four fields.
    """
    raw = pd.read_csv(label_path, names=["mixed_information"])
    fields = raw["mixed_information"].str.split("\t", expand=True)
    fields.columns = ['index', 'img_name', 'gender', 'smiling']
    # Because names= is passed, the header line of the file is read as data
    # and ends up at index 0; discard it.
    gender = fields['gender'].drop(index=0)
    # Parse to real integers before relabelling. Comparing the raw strings
    # against "1" would be a lexicographic comparison and would leave a
    # column of mixed str/int objects behind.
    labels = pd.to_numeric(gender).astype(int).to_numpy()
    labels = np.where(labels < 1, 0, labels)
    return labels.reshape(len(labels), 1)


def labelprocessing(label_trainpath,label_testpath):
    """Load the training and test gender labels as 0/1 column vectors.

    Both files receive identical treatment: all ``-1`` labels are mapped to
    ``0`` so the targets suit a 0/1 classifier.

    Args:
        label_trainpath: Path of the training label file.
        label_testpath: Path of the test label file.

    Returns:
        tuple: ``(YTrain, YTest)``, each an integer ``numpy.ndarray`` of shape
        ``(n_samples, 1)``.
    """
    YTrain = _load_gender_labels(label_trainpath)
    YTest = _load_gender_labels(label_testpath)
    return (YTrain, YTest)

def _load_flattened_gray_images(image_pattern):
    """Load every image matching ``image_pattern`` as one flattened grayscale row.

    Args:
        image_pattern: Glob pattern handed to ``skimage.io.ImageCollection``.

    Returns:
        numpy.ndarray: One row per image, divided by 255.

    Raises:
        ValueError: If the pattern matches no image at all.
    """
    rows = [change_shape(color.rgb2gray(img)) for img in io.ImageCollection(image_pattern)]
    if not rows:
        raise ValueError("No images matched pattern: {!r}".format(image_pattern))
    # NOTE(review): rgb2gray presumably already yields floats in [0, 1] for
    # 8-bit input, which would make this division redundant. It is kept so
    # the numerical results stay the same as before -- confirm before removing.
    return np.array(rows) / 255.


def data_preprocessing(data_trainpath,data_testpath,n_components=300,random_state=None):
    """Turn the training and test images into PCA features with a leading bias column.

    Each image is converted to grayscale and flattened. A whitened PCA is
    fitted on the training images only and then applied to both sets, so no
    information from the test images enters the projection.

    Args:
        data_trainpath: Glob pattern of the training images.
        data_testpath: Glob pattern of the test images.
        n_components: Number of principal components to keep (default 300).
        random_state: Seed forwarded to ``PCA``. The randomized SVD solver is
            stochastic, so pass an integer for reproducible features. The
            default ``None`` leaves it unseeded.

    Returns:
        tuple: ``(XTrain, XTest)``, each of shape
        ``(n_images, n_components + 1)``; column 0 is a constant 1.

    Raises:
        ValueError: If either pattern matches no image.
    """
    XTrain = _load_flattened_gray_images(data_trainpath)
    pca = PCA(
        n_components=n_components,
        svd_solver='randomized',
        whiten=True,
        random_state=random_state,
    ).fit(XTrain)
    XTrain = pca.transform(XTrain)
    XTest = pca.transform(_load_flattened_gray_images(data_testpath))
    # Prepend a constant-1 (bias) column to both feature matrices.
    XTrain = np.insert(XTrain, 0, values=1, axis=1)
    XTest = np.insert(XTest, 0, values=1, axis=1)
    return (XTrain, XTest)

def change_shape(img):
    """Flatten a 2-D image array into a 1-D vector of length rows * cols.

    Raises ValueError (from tuple unpacking) when ``img`` is not 2-D.
    """
    n_rows, n_cols = img.shape
    return img.reshape(n_rows * n_cols,)

def Judge(X,theta,Label,return_all=False):
    """Score a logistic model ``sigmoid(X @ theta)`` against 0/1 labels.

    A sample is predicted positive when its sigmoid output is strictly greater
    than 0.5. Only samples whose label is exactly 0 or 1 are counted.

    Args:
        X: Feature matrix, one row per sample.
        theta: Weight vector (or column vector) compatible with ``X @ theta``.
        Label: True labels, as a flat array or an ``(n, 1)`` column.
        return_all: When True, return
            ``(precision, recall, accuracy, F1_score)`` instead of the
            accuracy alone.

    Returns:
        float: The accuracy, or the 4-tuple described above when
        ``return_all`` is True. Any ratio whose denominator is zero is
        reported as 0.0 instead of raising.

    Raises:
        ValueError: If the number of predictions and labels differ.
    """
    # scipy.special.expit is the logistic sigmoid 1 / (1 + exp(-z)).
    probabilities = np.asarray(scipy.special.expit(X @ theta)).reshape(-1)
    predicted_pos = probabilities > 0.5
    actual = np.asarray(Label).reshape(-1)
    if predicted_pos.shape[0] != actual.shape[0]:
        raise ValueError(
            "Got {} predictions for {} labels".format(predicted_pos.shape[0], actual.shape[0])
        )

    actual_pos = actual == 1
    actual_neg = actual == 0
    TP = int(np.sum(predicted_pos & actual_pos))
    TN = int(np.sum(~predicted_pos & actual_neg))
    # False positive: predicted 1 while the label is 0.
    FP = int(np.sum(predicted_pos & actual_neg))
    # False negative: predicted 0 while the label is 1.
    FN = int(np.sum(~predicted_pos & actual_pos))

    def _ratio(numerator, denominator):
        # An empty denominator (e.g. no positive predictions) yields 0.0.
        return numerator / denominator if denominator else 0.0

    precision = _ratio(TP, TP + FP)
    recall = _ratio(TP, TP + FN)
    accuracy = _ratio(TP + TN, TP + TN + FP + FN)
    F1_score = _ratio(2 * precision * recall, precision + recall)

    if return_all:
        return precision, recall, accuracy, F1_score
    return accuracy

In [12]:
def chose_kernel(data_trainpath,data_testpath,label_trainpath,label_testpath):
    """Fit one SVC per kernel type and print its training time and test accuracy.

    Args:
        data_trainpath: Glob pattern of the training images.
        data_testpath: Glob pattern of the test images.
        label_trainpath: Path of the training label file.
        label_testpath: Path of the test label file.

    Returns:
        tuple: ``(elapsed, acc)`` of the last kernel evaluated, where
        ``elapsed`` is its fit time in seconds and ``acc`` its test accuracy.
        One ``kernel elapsed acc`` line is printed for every kernel.
    """
    # 'poly' takes the place of an empty string, which is not a kernel name
    # and would make SVC fail.
    Kernel = ['linear', 'rbf', 'sigmoid', 'poly']
    YTrain, YTest = labelprocessing(label_trainpath, label_testpath)
    XTrain, XTest = data_preprocessing(data_trainpath, data_testpath)
    YTrain = YTrain.ravel()
    YTest = YTest.ravel()

    elapsed = None
    acc = None
    for kernel in Kernel:
        # Restart the clock for each kernel so the timings are comparable.
        # The result goes in `elapsed`, never in a local called `time`, which
        # would hide the time module on the next iteration.
        start = time.perf_counter()
        svc = SVC(kernel=kernel, gamma='auto', cache_size=10000)
        svc.fit(XTrain, YTrain)
        elapsed = time.perf_counter() - start
        acc = svc.score(XTest, YTest)
        print(kernel, elapsed, acc)
    return elapsed, acc
def crossvalidation(data_trainpath,data_testpath,label_trainpath,label_testpath):
    """Grid-search ``C`` and ``gamma`` of an RBF SVC with 10-fold cross-validation.

    Ten values of ``C`` (starting at 0.1, each 1.5x the previous) are combined
    with ten values of ``gamma`` (starting at 1e-5, each 2x the previous).
    Every pair is scored on all ten folds of the training set, and the pair
    with the highest accuracy averaged over the folds is selected. This
    amounts to 1000 SVC fits.

    The test images and labels are loaded only because the preprocessing
    helpers need both paths; they play no part in the search.

    Args:
        data_trainpath: Glob pattern of the training images.
        data_testpath: Glob pattern of the test images.
        label_trainpath: Path of the training label file.
        label_testpath: Path of the test label file.

    Returns:
        tuple: ``(best_param, best_score, elapsed)`` where ``best_param`` is
        the list ``[best_C, best_gamma]``, ``best_score`` is that pair's mean
        validation accuracy over the folds, and ``elapsed`` is the total run
        time in seconds, data loading included.

    Raises:
        ValueError: If the number of labels and training images differ.
    """
    n_splits = 10

    # Build both geometric grids by repeated multiplication.
    C_grid = []
    gamma_grid = []
    c = 0.1
    g = 1 / 100000
    for _ in range(10):
        gamma_grid.append(g)
        C_grid.append(c)
        g = 2 * g
        c = c * 1.5

    start = time.perf_counter()
    YTrain, _ = labelprocessing(label_trainpath, label_testpath)
    XTrain, _ = data_preprocessing(data_trainpath, data_testpath)
    labels = YTrain.ravel()  # SVC expects 1-D targets
    if len(labels) != len(XTrain):
        raise ValueError(
            "Got {} labels for {} training images".format(len(labels), len(XTrain))
        )

    # fold_scores[f, i, j] = accuracy on fold f with C_grid[i] and gamma_grid[j]
    fold_scores = np.zeros((n_splits, len(C_grid), len(gamma_grid)))
    for fold, (fit_idx, val_idx) in enumerate(KFold(n_splits=n_splits).split(XTrain)):
        X_fit, y_fit = XTrain[fit_idx], labels[fit_idx]
        X_val, y_val = XTrain[val_idx], labels[val_idx]
        for i, C_value in enumerate(C_grid):
            for j, gamma_value in enumerate(gamma_grid):
                svc = SVC(C=C_value, gamma=gamma_value, kernel="rbf")
                svc.fit(X_fit, y_fit)
                fold_scores[fold, i, j] = svc.score(X_val, y_val)

    # Average over the folds first, then pick the best parameter pair. Taking
    # the maximum over individual fold scores would reward a lucky split
    # rather than a good setting.
    mean_scores = fold_scores.mean(axis=0)
    best_i, best_j = np.unravel_index(np.argmax(mean_scores), mean_scores.shape)
    best_param = [C_grid[best_i], gamma_grid[best_j]]
    best_score = float(mean_scores[best_i, best_j])

    elapsed = time.perf_counter() - start
    return best_param, best_score, elapsed

In [13]:
def main(data_trainpath,data_testpath,label_trainpath,label_testpath,C=3.32,gamma=0.000202):
    """Train the final RBF-kernel SVC and return its accuracy on the test set.

    Args:
        data_trainpath: Glob pattern of the training images.
        data_testpath: Glob pattern of the test images.
        label_trainpath: Path of the training label file.
        label_testpath: Path of the test label file.
        C: Regularisation parameter passed to ``SVC`` (default 3.32).
        gamma: RBF kernel coefficient passed to ``SVC`` (default 0.000202).

    Returns:
        float: Mean accuracy of the fitted classifier on the test set.
    """
    # No sklearn.metrics import here: none of its names are used, and
    # importing plot_roc_curve fails on scikit-learn releases that removed it.
    YTrain, YTest = labelprocessing(label_trainpath, label_testpath)
    XTrain, XTest = data_preprocessing(data_trainpath, data_testpath)
    svc = SVC(C=C, kernel='rbf', gamma=gamma)
    svc.fit(XTrain, YTrain.ravel())  # SVC expects 1-D targets
    return svc.score(XTest, YTest.ravel())

In [1]:
# Dataset locations, relative to the notebook's working directory.
# Forward slashes are used because they work on Windows, Linux and macOS
# alike; backslash separators only resolve on Windows.
data_trainpath = "./Datasets/celeba/img/*.jpg"
data_testpath = "./Datasets/celeba_test/img/*.jpg"
label_trainpath = "./Datasets/celeba/labels.csv"
label_testpath = "./Datasets/celeba_test/labels.csv"

In [15]:
# Train the final SVM and report its accuracy on the held-out test set.
# The hyper-parameter grid search (crossvalidation) is not run here.
acc = main(
    data_trainpath=data_trainpath,
    data_testpath=data_testpath,
    label_trainpath=label_trainpath,
    label_testpath=label_testpath,
)
print("TaskA1: Accuracy of Support Vector Machine is {}".format(acc))