In [304]:
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import math
import os

# --- Global configuration / state shared by the cells below ---
# maxAcc, maxIter, C_Lambda and PHI are not referenced by any other code visible
# in this part of the notebook (NOTE(review): possibly leftovers - confirm
# before removing).
maxAcc = 0.0
maxIter = 0
C_Lambda = 0.03
# Percentages of the samples used for the training / validation / test splits;
# train_model passes them to the Generate* split helpers.
TrainingPercent = 80
ValidationPercent = 10
TestPercent = 10
# Number of KMeans clusters requested in train_model (one radial basis function
# per cluster centre) and length of the weight vector in generate_ERMS.
M = 10
PHI = []
# Forwarded to GenerateBigSigma, where it selects the factor applied to the
# variance matrix (3 when True, 200 otherwise).
IsSynthetic = False

In [305]:
def create_pair_data_concat(df, df_pair):
    """Attach the features of both images of every pair to the pair table.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-image features, keyed by an ``img_id`` column.
    df_pair : pandas.DataFrame
        Pair table with ``img_id_A`` and ``img_id_B`` key columns.

    Returns
    -------
    pandas.DataFrame
        ``df_pair`` inner-merged with ``df`` twice: once on ``img_id_A`` and
        once on ``img_id_B``. Feature columns present in both merges receive
        pandas' default ``_x`` (image A) / ``_y`` (image B) suffixes.

    Notes
    -----
    ``df`` is never modified. The key column is renamed on copies, so a
    failing merge can no longer leave the caller's frame with a renamed
    ``img_id`` column.
    """
    features_a = df.rename(columns={'img_id': 'img_id_A'})
    features_b = df.rename(columns={'img_id': 'img_id_B'})
    df_new = pd.merge(df_pair, features_a, on="img_id_A")
    df_new = pd.merge(df_new, features_b, on="img_id_B")
    return df_new

In [306]:
def create_pair_data_subtract(df, dataType):
    """Replace every ``f<i>_x`` / ``f<i>_y`` column pair by their difference ``f<i>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``f<i>_x`` and ``f<i>_y`` columns.
    dataType : str
        ``"GSC"`` processes features ``f1`` .. ``f512``; any other value
        processes ``f1`` .. ``f9``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the remaining columns of ``df`` followed by the
        difference columns ``f1``, ``f2``, ... (``_x`` minus ``_y``).

    Raises
    ------
    KeyError
        If one of the expected ``_x`` / ``_y`` columns is missing.

    Notes
    -----
    ``df`` itself is left untouched. The subtraction is done in one vectorised
    step and the source columns are dropped once, instead of one insert and
    one full-frame copy per feature.
    """
    if dataType == "GSC":
        range_loop = 513
    else:
        range_loop = 10
    names = ["f" + str(i) for i in range(1, range_loop)]
    left_cols = [name + "_x" for name in names]
    right_cols = [name + "_y" for name in names]

    # Positional (row-by-row) subtraction, exactly like the former per-column
    # "frame - ndarray" expression; the index is copied from df unchanged.
    diff = pd.DataFrame(
        df[left_cols].values - df[right_cols].values,
        index=df.index,
        columns=names,
    )
    remaining = df.drop(columns=left_cols + right_cols)
    return pd.concat([remaining, diff], axis=1)

In [307]:
def GenerateTrainingTarget(rawTraining, TrainingPercent=80):
    """Return the leading ``TrainingPercent`` percent of the target sequence.

    The number of kept entries is ``ceil(len(rawTraining) * TrainingPercent%)``.
    """
    fraction = TrainingPercent*0.01
    cutoff = int(math.ceil(len(rawTraining)*fraction))
    return rawTraining[:cutoff]

def GenerateTrainingDataMatrix(rawData, TrainingPercent=80):
    """Return the leading ``TrainingPercent`` percent of the columns of ``rawData``.

    ``rawData`` is indexed as ``rawData[:, a:b]``, i.e. it must be a 2-D array;
    the number of columns is taken from the length of its first row.
    """
    column_count = len(rawData[0])
    keep = int(math.ceil(column_count*0.01*TrainingPercent))
    return rawData[:, :keep]

# Slice a validation (or test) split out of the full 2-D data matrix.
def GenerateValData(rawData, ValPercent, TrainingCount):
    """Return the columns of ``rawData`` that follow the first ``TrainingCount`` columns.

    Parameters
    ----------
    rawData : 2-D array, indexed as ``rawData[:, a:b]``
    ValPercent : percentage of all columns that the split should contain
    TrainingCount : number of columns already consumed by earlier splits

    Returns
    -------
    ``rawData[:, TrainingCount:TrainingCount + ceil(columns * ValPercent%)]``.
    The slice starts AT ``TrainingCount``: the earlier splits end at index
    ``TrainingCount - 1``, so starting at ``TrainingCount + 1`` (as this code
    used to) silently dropped one sample and made the split one short.
    """
    valSize = int(math.ceil(len(rawData[0])*ValPercent*0.01))
    V_End = TrainingCount + valSize
    dataMatrix = rawData[:, TrainingCount:V_End]
    return dataMatrix

# Slice the matching validation (or test) split out of the full target vector.
def GenerateValTargetVector(rawData, ValPercent, TrainingCount):
    """Return the targets that follow the first ``TrainingCount`` targets.

    Uses the same bounds as ``GenerateValData`` so that the returned targets
    line up one-to-one with the columns returned by that function:
    ``rawData[TrainingCount:TrainingCount + ceil(len(rawData) * ValPercent%)]``.
    """
    # Size of the split is ValPercent % of the entire target data.
    valSize = int(math.ceil(len(rawData)*ValPercent*0.01))
    V_End = TrainingCount + valSize

    # Start at TrainingCount (not TrainingCount+1) so no target is skipped.
    t = rawData[TrainingCount:V_End]
    return t

def GenerateBigSigma(Data, MuMatrix,TrainingPercent,IsSynthetic):
    """Build the scaled diagonal variance matrix used by the radial basis functions.

    Parameters
    ----------
    Data : 2-D array-like, one row per feature and one column per sample
    MuMatrix : unused; kept so existing callers keep working
    TrainingPercent : percentage of the samples (leading columns) that
        contribute to the variance estimate
    IsSynthetic : when equal to ``True`` the variances are multiplied by 3,
        otherwise by 200

    Returns
    -------
    numpy.ndarray
        Square matrix with one row/column per feature. The diagonal holds the
        (population) variance of each feature over the training columns,
        multiplied by the scale factor; every off-diagonal entry is 0.

    Notes
    -----
    NOTE(review): a feature that is constant over the training columns yields
    a zero on the diagonal, which makes the matrix singular; GetPhiMatrix
    inverts this matrix, so such a feature needs to be removed beforehand.
    """
    features = np.asarray(Data)

    # Number of leading samples that belong to the training split.
    TrainingLen = int(math.ceil(features.shape[1]*(TrainingPercent*0.01)))

    # One variance per feature (row), computed over the training columns only.
    varVect = np.var(features[:, :TrainingLen], axis=1)

    if IsSynthetic == True:
        scale = 3
    else:
        scale = 200
    BigSigma = scale * np.diag(varVect)
    print ("BigSigma Generated..")
    return BigSigma

def GetScalar(DataRow,MuRow, BigSigInv):
    """Return the quadratic form ``d . (BigSigInv . d)`` with ``d = DataRow - MuRow``."""
    offset = np.subtract(DataRow, MuRow)
    weighted = np.dot(BigSigInv, np.transpose(offset))
    return np.dot(offset, weighted)

# Gaussian radial basis function used to fill one entry of the PHI matrix.
def GetRadialBasisOut(DataRow,MuRow, BigSigInv):
    """Return ``exp(-0.5 * GetScalar(DataRow, MuRow, BigSigInv))``."""
    exponent = -0.5*GetScalar(DataRow,MuRow,BigSigInv)
    return math.exp(exponent)

def GetPhiMatrix(Data, MuMatrix, BigSigma, TrainingPercent = 80):
    """Evaluate every Gaussian radial basis function on the leading samples of ``Data``.

    Parameters
    ----------
    Data : 2-D array-like, one row per feature and one column per sample
    MuMatrix : 2-D array-like, one row per basis-function centre
    BigSigma : square matrix; its inverse weights the distance to a centre
    TrainingPercent : percentage of the samples (leading columns) to evaluate

    Returns
    -------
    numpy.ndarray
        Matrix with one row per evaluated sample and one column per centre;
        entry ``[r, c]`` is ``exp(-0.5 * d . (inv(BigSigma) . d))`` with
        ``d = sample_r - centre_c``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``BigSigma`` is singular.

    Notes
    -----
    The quadratic form is computed for a whole block of samples at once,
    instead of one Python call per (sample, centre) pair; the values agree
    with the element-wise formula up to floating-point rounding. Samples are
    processed in row blocks so temporaries stay small for wide data.
    """
    # Samples as rows.
    DataT = np.transpose(Data)
    TrainingLen = int(math.ceil(len(DataT)*(TrainingPercent*0.01)))

    centers = np.asarray(MuMatrix, dtype=float)
    BigSigInv = np.linalg.inv(BigSigma)
    PHI = np.zeros((TrainingLen, len(centers)))

    row_block = 4096
    for start in range(0, TrainingLen, row_block):
        stop = min(start + row_block, TrainingLen)
        rows = np.asarray(DataT[start:stop], dtype=float)
        for C in range(len(centers)):
            diff = rows - centers[C]
            # Row-wise d . (BigSigInv . d): (diff . BigSigInv^T)[r] equals BigSigInv . d_r.
            quad = np.einsum('ij,ij->i', diff, np.dot(diff, BigSigInv.T))
            PHI[start:stop, C] = np.exp(-0.5*quad)
    print ("PHI Generated..")

    return PHI

# Model output: the weights applied to every row of a PHI (design) matrix.
def GetValTest(VAL_PHI,W):
    """Return ``W . VAL_PHI^T`` - one prediction per row of ``VAL_PHI``."""
    design_transposed = np.transpose(VAL_PHI)
    return np.dot(W, design_transposed)

# Compare model outputs with the actual targets: accuracy (after rounding the
# outputs) and root-mean-square error, packed into one comma-separated string.
def GetErms(VAL_TEST_OUT,ValDataAct):
    """Return ``"<accuracy>,<erms>"`` for the given outputs and targets.

    * accuracy - percentage of outputs that equal the target once rounded to
      the nearest integer with ``np.around``
    * erms - square root of the mean squared difference between target and output

    The loop runs over ``len(VAL_TEST_OUT)`` entries; a ZeroDivisionError is
    raised when that length is 0.
    """
    n_outputs = len(VAL_TEST_OUT)
    squared_error = 0.0
    hits = 0
    for idx in range(n_outputs):
        predicted = VAL_TEST_OUT[idx]
        actual = ValDataAct[idx]
        squared_error = squared_error + math.pow((actual - predicted), 2)
        if int(np.around(predicted, 0)) == actual:
            hits += 1
    accuracy = float(hits*100)/float(n_outputs)
    erms = math.sqrt(squared_error/n_outputs)
    return str(accuracy) + ',' + str(erms)

In [308]:
def generate_raw_data(df, df_diff, df_same, feature_operation, dataType, random_state=None):
    """Build a shuffled pair data set with as many "different" pairs as "same" pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-image features (passed to ``create_pair_data_concat``).
    df_diff, df_same : pandas.DataFrame
        Pair tables of different-writer / same-writer pairs.
    feature_operation : str
        ``"Subtract"`` additionally runs ``create_pair_data_subtract`` on the
        result; any other value keeps the concatenated feature columns.
    dataType : str
        Forwarded to ``create_pair_data_subtract``.
    random_state : int, numpy.random.RandomState or None, optional
        Seed (or generator) for the down-sampling of the "different" pairs and
        for the final shuffle. ``None`` (default) keeps the previous behaviour
        of drawing from NumPy's global random state.

    Returns
    -------
    pandas.DataFrame
        All merged "same" pairs plus an equally sized random sample of the
        merged "different" pairs, in random row order.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when there are fewer merged "different"
        pairs than merged "same" pairs.
    """
    df_same_concat = create_pair_data_concat(df, df_same)
    df_diff_concat = create_pair_data_concat(df, df_diff)

    if random_state is None:
        sample_state = None
        permute = np.random.permutation
    else:
        if isinstance(random_state, np.random.RandomState):
            rng = random_state
        else:
            rng = np.random.RandomState(random_state)
        sample_state = rng
        permute = rng.permutation

    # Balance the classes: keep only as many "different" pairs as "same" pairs.
    df_diff_concat = df_diff_concat.sample(n=df_same_concat.shape[0], random_state=sample_state)
    df_final = pd.concat([df_same_concat, df_diff_concat])
    # Shuffle the rows so both classes are spread over all splits.
    df_final = df_final.iloc[permute(len(df_final))]
    if feature_operation == "Subtract":
        df_final = create_pair_data_subtract(df_final, dataType)
    return df_final

In [309]:
# ------------- Implementation for Stochastic Gradient Descent --------------- #
# The weights start as a vector of ones and are updated once per training sample.
def generate_ERMS(TrainingTarget, TRAINING_PHI, VAL_PHI, TEST_PHI, ValDataAct, TestDataAct, dataType):
    """Run stochastic gradient descent and print the best E_rms of each split.

    Parameters
    ----------
    TrainingTarget : sequence of training targets
    TRAINING_PHI, VAL_PHI, TEST_PHI : design matrices (one row per sample, one
        column per basis function) of the training / validation / test split
    ValDataAct, TestDataAct : targets of the validation / test split
    dataType : ``"GSC"`` limits the run to the first 200 training samples (or
        fewer, if fewer exist); any other value makes one update per training
        sample

    Returns
    -------
    None. The minimum E_rms seen on each split is printed.

    Notes
    -----
    For every visited sample ``i`` the weights are updated as
    ``w <- w - learningRate * (-(t_i - w . phi_i) * phi_i + La * w)``, after
    which the E_rms of all three splits is evaluated with the new weights.
    The weight vector has one entry per column of ``TRAINING_PHI``.
    """
    sample_count = len(TrainingTarget)
    if dataType == "GSC":
        # Never index past the available samples.
        range_loop = min(200, sample_count)
    else:
        range_loop = sample_count
    W_Now = np.ones(np.shape(TRAINING_PHI)[1])
    La           = 2
    learningRate = 0.01
    L_Erms_Val   = []
    L_Erms_TR    = []
    L_Erms_Test  = []

    for i in range(0,range_loop):
        Delta_E_D     = -np.dot((TrainingTarget[i] - np.dot(np.transpose(W_Now),TRAINING_PHI[i])),TRAINING_PHI[i])
        La_Delta_E_W  = np.dot(La,W_Now)
        Delta_E       = np.add(Delta_E_D,La_Delta_E_W)
        Delta_W       = -np.dot(learningRate,Delta_E)
        W_T_Next      = W_Now + Delta_W
        W_Now         = W_T_Next

        # GetErms returns "accuracy,erms"; only the erms part is tracked here.
        #-----------------TrainingData Accuracy---------------------#
        TR_TEST_OUT   = GetValTest(TRAINING_PHI,W_T_Next)
        Erms_TR       = GetErms(TR_TEST_OUT,TrainingTarget)
        L_Erms_TR.append(float(Erms_TR.split(',')[1]))

        #-----------------ValidationData Accuracy---------------------#
        VAL_TEST_OUT  = GetValTest(VAL_PHI,W_T_Next)
        Erms_Val      = GetErms(VAL_TEST_OUT,ValDataAct)
        L_Erms_Val.append(float(Erms_Val.split(',')[1]))

        #-----------------TestingData Accuracy---------------------#
        TEST_OUT      = GetValTest(TEST_PHI,W_T_Next)
        Erms_Test = GetErms(TEST_OUT,TestDataAct)
        L_Erms_Test.append(float(Erms_Test.split(',')[1]))

    print ('----------Gradient Descent Solution--------------------')
    print ("E_rms Training   = " + str(np.around(min(L_Erms_TR),5)))
    print ("E_rms Validation = " + str(np.around(min(L_Erms_Val),5)))
    print ("E_rms Testing    = " + str(np.around(min(L_Erms_Test),5)))

In [310]:
def train_model(RawData, RawTarget, dataType):
    """Split the data, build the radial-basis design matrices and run gradient descent.

    ``RawData`` holds one row per feature and one column per sample;
    ``RawTarget`` holds one target per sample. The split sizes come from the
    module-level ``TrainingPercent`` / ``ValidationPercent`` / ``TestPercent``,
    the number of KMeans clusters from ``M``. Results are printed by
    ``generate_ERMS``; nothing is returned.
    """
    # Training split.
    train_targets = np.array(GenerateTrainingTarget(RawTarget, TrainingPercent))
    train_features = GenerateTrainingDataMatrix(RawData, TrainingPercent)
    n_train = len(train_targets)

    # Validation split starts after the training samples.
    val_targets = np.array(GenerateValTargetVector(RawTarget, ValidationPercent, n_train))
    val_features = GenerateValData(RawData, ValidationPercent, n_train)

    # Test split starts after the training and validation samples.
    test_offset = n_train + len(val_targets)
    test_targets = np.array(GenerateValTargetVector(RawTarget, TestPercent, test_offset))
    test_features = GenerateValData(RawData, TestPercent, test_offset)

    # Basis-function centres: KMeans cluster centres of the training samples
    # (KMeans expects samples as rows, hence the transpose).
    clustering = KMeans(n_clusters=M, random_state=0).fit(np.transpose(train_features))
    centers = clustering.cluster_centers_

    big_sigma = GenerateBigSigma(RawData, centers, TrainingPercent, IsSynthetic)
    training_phi = GetPhiMatrix(RawData, centers, big_sigma, TrainingPercent)
    # The validation / test matrices are already split, so use 100 % of them.
    test_phi = GetPhiMatrix(test_features, centers, big_sigma, 100)
    val_phi = GetPhiMatrix(val_features, centers, big_sigma, 100)

    generate_ERMS(train_targets, training_phi, val_phi, test_phi, val_targets, test_targets, dataType)

In [311]:
def run_experiment(banner, df, df_diff, df_same, feature_operation, dataType, drop_constant_columns=False):
    """Run one data-set / feature-operation experiment end to end.

    Prints ``banner``, builds the pair data with ``generate_raw_data``,
    separates the ``target`` column from the features, optionally removes
    feature columns that hold a single distinct value, and calls
    ``train_model`` with the features transposed to one row per feature.

    Returns the tuple ``(df_final, df_target, RawData, RawTarget)`` so the
    caller can keep inspecting the data of the last run.
    """
    print(banner)
    df_final = generate_raw_data(df, df_diff, df_same, feature_operation, dataType)
    df_target = df_final['target']
    df_final = df_final.drop(['img_id_A','img_id_B','target' ], axis=1)
    if drop_constant_columns:
        uniques = df_final.nunique()
        df_final = df_final.drop(uniques[uniques==1].index, axis=1)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is available in
    # every pandas version and returns the same ndarray.
    RawData = np.transpose(df_final.values)
    RawTarget = df_target.values
    train_model(RawData, RawTarget, dataType)
    return df_final, df_target, RawData, RawTarget


df_hof = pd.read_csv("HOF/HumanObserved-Features-Data.csv",index_col=0)
df_diff_hof = pd.read_csv("HOF/diffn_pairs.csv",index_col=0)
df_same_hof = pd.read_csv("HOF/same_pairs.csv",index_col=0)
df_final, df_target, RawData, RawTarget = run_experiment(
    "-----------------------HOF : Concat--------------------",
    df_hof, df_diff_hof, df_same_hof, "Concat", "HOF")

df_final, df_target, RawData, RawTarget = run_experiment(
    "-----------------------HOF : Subtract--------------------",
    df_hof, df_diff_hof, df_same_hof, "Subtract", "HOF")

df_gsc = pd.read_csv("GSC/GSC-Features.csv")
df_diff_gsc = pd.read_csv("GSC/diffn_pairs.csv")
df_same_gsc = pd.read_csv("GSC/same_pairs.csv")
# GSC feature columns with a single distinct value are removed before training.
df_final, df_target, RawData, RawTarget = run_experiment(
    "-----------------------GSC : Concat--------------------",
    df_gsc, df_diff_gsc, df_same_gsc, "Concat", "GSC", drop_constant_columns=True)

df_final, df_target, RawData, RawTarget = run_experiment(
    "-----------------------GSC : Subtract--------------------",
    df_gsc, df_diff_gsc, df_same_gsc, "Subtract", "GSC", drop_constant_columns=True)

-----------------------HOF : Concat--------------------


  
  if __name__ == '__main__':


TypeError: train_model() missing 1 required positional argument: 'dataType'