In [1]:
from sklearn.cluster import KMeans
import numpy as np
import csv
import math
import matplotlib.pyplot
from matplotlib import pyplot as plt

In [2]:
maxAcc = 0.0
maxIter = 0
C_Lambda = 0.03
TrainingPercent = 80
ValidationPercent = 10
TestPercent = 10
M = 10
PHI = []
IsSynthetic = False

In [3]:
# This method iterates over each line in the Querylevelnorm_t.csv which contains the target data, 
# reads and stores it into an array
def GetTargetVector(filePath):
    t = []
    with open(filePath, 'rU') as f:
        reader = csv.reader(f)
        for row in reader:  
            t.append(int(row[0]))
    #print("Raw Training Generated..")
    return t

def GenerateRawData(filePath, IsSynthetic):    
    dataMatrix = [] 
    with open(filePath, 'rU') as fi:
        reader = csv.reader(fi)
        for row in reader:
            dataRow = []
            for column in row:
                dataRow.append(float(column))
            dataMatrix.append(dataRow)   
    
    # We are deleting the 5 columns mentioned below,to eliminate values which will give the covariance 
    # as 0 , inverse cannot becalculated, does not add anything to get the data. 
    if IsSynthetic == False :
        dataMatrix = np.delete(dataMatrix, [5,6,7,8,9], axis=1)
    dataMatrix = np.transpose(dataMatrix)     
    print ("Data Matrix Generated..")
    return dataMatrix

def GenerateTrainingTarget(rawTraining,TrainingPercent = 80):
    TrainingLen = int(math.ceil(len(rawTraining)*(TrainingPercent*0.01)))
    t           = rawTraining[:TrainingLen]
    return t

def GenerateTrainingDataMatrix(rawData, TrainingPercent = 80):
    T_len = int(math.ceil(len(rawData[0])*0.01*TrainingPercent))
    d2 = rawData[:,0:T_len]
    return d2

# This method accepts the entire target data and splits it into the valPercent passed into it.
def GenerateValData(rawData, ValPercent, TrainingCount): 
    valSize = int(math.ceil(len(rawData[0])*ValPercent*0.01))
    V_End = TrainingCount + valSize
    dataMatrix = rawData[:,TrainingCount+1:V_End]
    return dataMatrix

# This method accepts the entire target data and splits it into the valPercent passed into it.
def GenerateValTargetVector(rawData, ValPercent, TrainingCount): 
    
    # size of the array is 10% of the entire target data.
    valSize = int(math.ceil(len(rawData)*ValPercent*0.01))
    V_End = TrainingCount + valSize
    
    # The entire target data is sliced from row index 55699+1 to 55699 + 69623*10*0.01
    t =rawData[TrainingCount+1:V_End]
    return t

def GenerateBigSigma(Data, MuMatrix,TrainingPercent,IsSynthetic):
    # Initialize the BigSigma into a 2-d numpy array of size 41 x 41
    BigSigma    = np.zeros((len(Data),len(Data)))
    
    # We transpose the data back to the original matrix 69623 X 41
    DataT       = np.transpose(Data)
    
    # The original data is divided and the length of 20% of it is assigned to the training length = 55699
    TrainingLen = math.ceil(len(DataT)*(TrainingPercent*0.01))        
    varVect     = []
    
    # The for loop takes in each row of the original data and loops in 41 times and stores the variance 
    # of all 69623 values accross each feature inside the array.
    # For each row accross the 65000 values it calculates the variance using np.var().
    # varVect is a 41 X 1 vector consisting the variance for each feature accross the dataset.
    for i in range(0,len(DataT[0])):
        vct = []
        for j in range(0,int(TrainingLen)):
            vct.append(Data[i][j])    
        varVect.append(np.var(vct))
    
    # The below block creates a diagonal matrix for Bigsigma with the variance values.It represents the 
    # feature vector with respect to itself.
    for j in range(len(Data)):
        BigSigma[j][j] = varVect[j]
    if IsSynthetic == True:
        BigSigma = np.dot(3,BigSigma)
    else:
        BigSigma = np.dot(200,BigSigma)
    print ("BigSigma Generated..")
    return BigSigma

def GetScalar(DataRow,MuRow, BigSigInv):  
    R = np.subtract(DataRow,MuRow)
    T = np.dot(BigSigInv,np.transpose(R))  
    L = np.dot(R,T)
    return L

# This method formulates the Gaussian radial basis functions to compute each PHI
def GetRadialBasisOut(DataRow,MuRow, BigSigInv):    
    phi_x = math.exp(-0.5*GetScalar(DataRow,MuRow,BigSigInv))
    return phi_x

def GetPhiMatrix(Data, MuMatrix, BigSigma, TrainingPercent = 80):
    # We transpose the data back to the original matrix 65000 X 41
    DataT = np.transpose(Data) 
    
    # We reduce the raw data to 80% of it and store the length of training data into TrainingLen = 55699
    TrainingLen = math.ceil(len(DataT)*(TrainingPercent*0.01)) 
    
    # We create a PHI matrix of dimension 55699 X 10 (Number of datasets) X (Number of basis functions)
    PHI = np.zeros((int(TrainingLen),len(MuMatrix)))
    BigSigInv = np.linalg.inv(BigSigma) 
    
    # We formulate the Gaussian Radial Bassis function which converts each row of the vectors - Data and Mu 
    # multiplied by inverse of BigSigma into a single scalar value and store it in the PHI matrix.
    for  C in range(0,len(MuMatrix)):
        for R in range(0,int(TrainingLen)):
            PHI[R][C] = GetRadialBasisOut(DataT[R], MuMatrix[C], BigSigInv)
    print ("PHI Generated..")
    
    return PHI

def GetWeightsClosedForm(PHI, T, Lambda):
    
    # We use the hyper parameter lambda to bound the weights within a certain range , so that even if 
    # one of the weight is spiked upto a large value, the lambda brings down the weight, 
    # so that no particular feature dominates and result in overfitting.
    Lambda_I = np.identity(len(PHI[0]))
    for i in range(0,len(PHI[0])):
        Lambda_I[i][i] = Lambda
    
    PHI_T       = np.transpose(PHI)
    PHI_SQR     = np.dot(PHI_T,PHI)
    PHI_SQR_LI  = np.add(Lambda_I,PHI_SQR)
    PHI_SQR_INV = np.linalg.inv(PHI_SQR_LI)
    INTER       = np.dot(PHI_SQR_INV, PHI_T)
    W           = np.dot(INTER, T)
    return W

#This method returns the output of the model which is comparable to the target values.
def GetValTest(VAL_PHI,W):
    Y = np.dot(W,np.transpose(VAL_PHI))
    print ("Test Out Generated..")
    return Y

# This method returns the root mean quare errors when comparing the training / validation 
# dataset with the target value.
def GetErms(VAL_TEST_OUT,ValDataAct):
    sum = 0.0
    t=0
    accuracy = 0.0
    counter = 0
    val = 0.0
    for i in range (0,len(VAL_TEST_OUT)):
        sum = sum + math.pow((ValDataAct[i] - VAL_TEST_OUT[i]),2)
        if(int(np.around(VAL_TEST_OUT[i], 0)) == ValDataAct[i]):
            counter+=1
    accuracy = (float((counter*100))/float(len(VAL_TEST_OUT)))
    return (str(accuracy) + ',' +  str(math.sqrt(sum/len(VAL_TEST_OUT))))

## Fetch and Prepare Dataset

In [4]:
# Every row of the Querylevelnorm_t is fetched and stored in an array. Numberof rows: 69623
RawTarget = GetTargetVector('Querylevelnorm_t.csv')

# Every row of the Querylevelnorm_X is fetched and stored in an array. Numberof rows: 41
RawData   = GenerateRawData('Querylevelnorm_X.csv',IsSynthetic)

  """
  


Data Matrix Generated..


## Prepare Training Data

In [5]:
# The training target data is divided and a percentage of 80% of it is stored in the numpy array TrainingTarget
# with dimensions 59699 X 1
TrainingTarget = np.array(GenerateTrainingTarget(RawTarget,TrainingPercent))

# The training data is divided and a percentage of 80% of it is stored in the numpy array TrainingData
# with dimensions 41 * 55699
TrainingData   = GenerateTrainingDataMatrix(RawData,TrainingPercent)
print(TrainingTarget.shape)
print(TrainingData.shape)

(55699,)
(41, 55699)


## Prepare Validation Data

In [6]:
# The validation target data is 10% of the actual target data.
ValDataAct = np.array(GenerateValTargetVector(RawTarget,ValidationPercent, (len(TrainingTarget))))
ValData    = GenerateValData(RawData,ValidationPercent, (len(TrainingTarget)))
print(ValDataAct.shape)
print(ValData.shape)

(6962,)
(41, 6962)


## Prepare Test Data

In [7]:
TestDataAct = np.array(GenerateValTargetVector(RawTarget,TestPercent, (len(TrainingTarget)+len(ValDataAct))))
TestData = GenerateValData(RawData,TestPercent, (len(TrainingTarget)+len(ValDataAct)))
print(ValDataAct.shape)
print(ValData.shape)

(6962,)
(41, 6962)


## Closed Form Solution [Finding Weights using Moore- Penrose pseudo- Inverse Matrix]

In [8]:
ErmsArr = []
AccuracyArr = []

# This is a clustering method which takes in 2parameters: the number of basis functions we want to generate 
# and the random state. We can input an array of centroids if we have previous knowledge of the centroids, 
# in our case we do not have prior knowledge so just initialize it to 0 and let K means calculate it.
kmeans = KMeans(n_clusters=M, random_state=0).fit(np.transpose(TrainingData))

# kmeans.cluster_centers_ returns the means or centroid of the clusters.
Mu = kmeans.cluster_centers_

# Returns a diagonal matrix representing a feature vector with respect to itself.
BigSigma     = GenerateBigSigma(RawData, Mu, TrainingPercent,IsSynthetic)

# Get the design matrix for the training dataset.
TRAINING_PHI = GetPhiMatrix(RawData, Mu, BigSigma, TrainingPercent)
W            = GetWeightsClosedForm(TRAINING_PHI,TrainingTarget,(C_Lambda)) 

# Get the design matrix for the testing dataset.
TEST_PHI     = GetPhiMatrix(TestData, Mu, BigSigma, 100) 

# Get the design matrix for the validation dataset.
VAL_PHI      = GetPhiMatrix(ValData, Mu, BigSigma, 100)

BigSigma Generated..
PHI Generated..
PHI Generated..
PHI Generated..


In [9]:
print(Mu.shape)
print(BigSigma.shape)
print(TRAINING_PHI.shape)
print(W.shape)
print(VAL_PHI.shape)
print(TEST_PHI.shape)

(10, 41)
(41, 41)
(55699, 10)
(10,)
(6962, 10)
(6961, 10)


## Finding Erms on training, validation and test set 

In [10]:
# The GetValTest() returns the training output from the linear regression function 
# y = transpose(W)*PHI(x) for training, testing and validation dataset.
TR_TEST_OUT  = GetValTest(TRAINING_PHI,W)
VAL_TEST_OUT = GetValTest(VAL_PHI,W)
TEST_OUT     = GetValTest(TEST_PHI,W)

# Computes the root mean square error for the training, testing and validation dataset
TrainingAccuracy   = str(GetErms(TR_TEST_OUT,TrainingTarget))
ValidationAccuracy = str(GetErms(VAL_TEST_OUT,ValDataAct))
TestAccuracy       = str(GetErms(TEST_OUT,TestDataAct))

Test Out Generated..
Test Out Generated..
Test Out Generated..


In [11]:
print ('UBITname      = atrayeen')
print ('Person Number = 50288651')
print ('----------------------------------------------------')
print ("------------------LeToR Data------------------------")
print ('----------------------------------------------------')
print ("-------Closed Form with Radial Basis Function-------")
print ('----------------------------------------------------')
print ("M = 10 \nLambda = 0.9")
print ("E_rms Training   = " + str(float(TrainingAccuracy.split(',')[1])))
print ("E_rms Validation = " + str(float(ValidationAccuracy.split(',')[1])))
print ("E_rms Testing    = " + str(float(TestAccuracy.split(',')[1])))

UBITname      = atrayeen
Person Number = 50288651
----------------------------------------------------
------------------LeToR Data------------------------
----------------------------------------------------
-------Closed Form with Radial Basis Function-------
----------------------------------------------------
M = 10 
Lambda = 0.9
E_rms Training   = 0.5494694067137861
E_rms Validation = 0.5384281741389029
E_rms Testing    = 0.6279788453856765


## Gradient Descent solution for Linear Regression

In [12]:
print ('----------------------------------------------------')
print ('--------------Please Wait for 2 mins!----------------')
print ('----------------------------------------------------')

----------------------------------------------------
--------------Please Wait for 2 mins!----------------
----------------------------------------------------


In [13]:
# ------------- Implementation for Stochastic Gradient Descent --------------- #
# we randomize the weights by multiplying a scalar value with the weights obtained from the closed form solution.

W_Now        = np.dot(220, W)
La           = 2
learningRate = 0.01
L_Erms_Val   = []
L_Erms_TR    = []
L_Erms_Test  = []
W_Mat        = []

# We train the model in a loop of 400 instead of the total training data because it has been observed after 400 
# datasets the weights are stagnant and do not change. This method is called Early Stopping.

for i in range(0,400):
    
    # In Stochastic gradient descent we first choose random weights and we process the entire training set in 
    # one go and update all the weights by calculating the error multiplied by the learning rate.
    Delta_E_D     = -np.dot((TrainingTarget[i] - np.dot(np.transpose(W_Now),TRAINING_PHI[i])),TRAINING_PHI[i])
    La_Delta_E_W  = np.dot(La,W_Now)
    Delta_E       = np.add(Delta_E_D,La_Delta_E_W)    
    Delta_W       = -np.dot(learningRate,Delta_E)
    W_T_Next      = W_Now + Delta_W
    W_Now         = W_T_Next
    
    # We are displaying Training, Validation and testing data on the fly during iteration.
    # In a real world scenario, training a model could take multiple days,
    # so it's the norm to printing out changes during the iteration as they can
    # monitored in real time
    #-----------------TrainingData Accuracy---------------------#
    TR_TEST_OUT   = GetValTest(TRAINING_PHI,W_T_Next) 
    Erms_TR       = GetErms(TR_TEST_OUT,TrainingTarget)
    L_Erms_TR.append(float(Erms_TR.split(',')[1]))
    
    #-----------------ValidationData Accuracy---------------------#
    VAL_TEST_OUT  = GetValTest(VAL_PHI,W_T_Next) 
    Erms_Val      = GetErms(VAL_TEST_OUT,ValDataAct)
    L_Erms_Val.append(float(Erms_Val.split(',')[1]))
    
    #-----------------TestingData Accuracy---------------------#
    TEST_OUT      = GetValTest(TEST_PHI,W_T_Next) 
    Erms_Test = GetErms(TEST_OUT,TestDataAct)
    L_Erms_Test.append(float(Erms_Test.split(',')[1]))

Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Gene

Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Gene

Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Gene

Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..
Test Out Generated..


In [14]:
print ('----------Gradient Descent Solution--------------------')
print ("M = 15 \nLambda  = 0.0001\neta=0.01")
print ("E_rms Training   = " + str(np.around(min(L_Erms_TR),5)))
print ("E_rms Validation = " + str(np.around(min(L_Erms_Val),5)))
print ("E_rms Testing    = " + str(np.around(min(L_Erms_Test),5)))

----------Gradient Descent Solution--------------------
M = 15 
Lambda  = 0.0001
eta=0.01
E_rms Training   = 0.54964
E_rms Validation = 0.53846
E_rms Testing    = 0.62372
