In [2]:
import numpy as np

In [3]:
# Load data
X_tr = np.reshape(np.load("age_regression_Xtr.npy"), (-1, 48*48))
ytr = np.load("age_regression_ytr.npy")
X_te = np.reshape(np.load("age_regression_Xte.npy"), (-1, 48*48))
y_te = np.load("age_regression_yte.npy")


# splitting the dataset into training and validation - lets keep a 80-20 split as suggested 

# If using just numpy then : 

num_datapoints = X_tr.shape[0] # Determine the number of samples  - here 5000

split_index = int(0.8 * num_datapoints) # Calculate the index to split at (80% for training, 20% for validation)

indices = np.arange(num_datapoints)  # Shuffle the data (important to avoid any potential order biases)
np.random.shuffle(indices)

# Split the indices
train_indices = indices[:split_index]   # first 80% of data goes into the training dataset
val_indices = indices[split_index:]     # remaining 20% of data goes into the validation dataset

# Split the data according to the indices
X_train, X_val = X_tr[train_indices], X_tr[val_indices]   
y_train, y_val = ytr[train_indices], ytr[val_indices]


# Alternate approach to split dataset would be 

#from sklearn.model_selection import train_test_split
#X_train, X_val, y_train, y_val = train_test_split(X_tr, ytr, test_size=0.2, random_state=42)

X_train.shape

(4000, 2304)

In [3]:
def train_age_regressor (X_train,y_train,mini_batch_size,learning_rate,num_epochs):
    
    #normalizing data
    X_train = (X_train - np.mean(X_train, axis=0)) / (np.std(X_train, axis=0) + 1e-8)
    y_train = (y_train - np.mean(y_train, axis=0)) / (np.std(y_train, axis=0) + 1e-8)
    
    # lets start training
    
    # step1 - initialize weights
    
    weights = np.zeros(X_train.shape[1])  # equal to 48*48 = 2304
    bias = 0.0 # initializing bias to zero -- we compute bias gradient by mean of error values across the batch samples
    
    # step 2 - randomize order of training set
    num_tr_samples = X_train.shape[0]   #4000 images
    tr_indices = np.arange(num_tr_samples)   # indices for entire training set - from this we create mini batches
    
    mse_list = []
    
    for epoch in range(num_epochs):
        #print("Initial weights:", weights)
        #print("Initial bias:", bias)

        
        # shuffle training data before each epoch
        np.random.shuffle(tr_indices)
        
        for num in range(0,num_tr_samples,mini_batch_size):
            
            batch_index = tr_indices[num: num + mini_batch_size]

            
            x_batch = X_train[batch_index]
            y_batch = y_train[batch_index]
            
            # computing gradient on selected batch
            
            error_value = (np.dot(x_batch,weights) + bias) - y_batch 
            
            error_value = np.nan_to_num(error_value, nan=0.0, posinf=np.finfo(np.float64).max, neginf=np.finfo(np.float64).min)

            #error_value = np.clip(error_value, -1e5, 1e5)
            
            mse = np.mean(error_value ** 2)
            
            # Check for inf or NaN in MSE and skip the batch if encountered
            if np.isinf(mse) or np.isnan(mse):
                print(f"Invalid MSE detected at Epoch {epoch+1}, Batch {num//mini_batch_size + 1}. Skipping batch...")
                continue
            
            mse_list.append(mse)
            print(f"Epoch {epoch+1}, Batch {num//mini_batch_size + 1}, MSE: {mse}")
      #     weight_grad = np.dot(x_batch.T, error_value)/x_batch.shape[0]
            weight_grad = np.dot(x_batch.T,(np.dot(x_batch,weights) + bias) - y_batch )/x_batch.shape[0]
            
            bias_grad = np.mean(error_value)
            
            # updating weights and bias
            
            weights -= learning_rate * weight_grad 
            bias -= learning_rate * bias_grad
        

    #average_training_mse = np.mean(mse_list)
    #print(f"Average Training MSE: {average_training_mse}")
    #print("Last 10 MSE values of gradient descent:")
    #print(mse_list[-10:])
    return weights,bias
    

    

In [1]:
# learning_rates = 1e-5 
# mini_batch_sizes = 64
# num_epochs_testing = 150

# wts,bias = train_age_regressor(X_train,y_train,mini_batch_size=mini_batch_sizes, learning_rate=learning_rates,num_epochs=num_epochs_testing)

In [7]:
# validation an hyperparameter tuning using grid-search

def validation(X_train,y_train,X_val,y_val):
    
    # tune atleast 2 values for each hyperparameter
    learning_rates = [1e-5,1e-4,1e-3] 
    mini_batch_sizes = [32, 64,128]
    num_epochs_testing = [50, 100,150]
    
    best_mse = float('inf')  # setting mse to positive infinity to ensure the first mse calculated becomes the default best value after first iteration and gets updated in the process
    best_hyperparams = {}   # dictionary to store the three HP parameters
    best_weights, best_bias = None, None
    
    # normalizing validaton data
    X_val = (X_val - np.mean(X_val, axis=0)) / np.std(X_val, axis=0) 
    y_val = (y_val - np.mean(y_val, axis=0)) / np.std(y_val, axis=0)     
    
    for rate in learning_rates:
        for batch in mini_batch_sizes:
            for epoch in num_epochs_testing:
                
                weights, bias = train_age_regressor(X_train,y_train,mini_batch_size=batch, learning_rate=rate,num_epochs=epoch)
                
                # once we have the trained weights we validate the model
                
                y_val_pred = np.dot(X_val, weights) + bias
                mse = np.mean((y_val_pred - y_val) ** 2)  # mean squared error to validate prediction
                #print(mse)
                print(f"Num_Epoch {epoch}, Batch_size {batch}, Learning_rate {rate}, MSE: {mse}")
                if mse < best_mse:  
                    best_mse = mse
                    best_hyperparameters = {'num_epochs': epoch,'learning_rate': rate,'mini_batch': batch}
                    best_weights,best_bias = weights,bias
    
    #print(best_mse)
    return best_hyperparameters,best_weights,best_bias,best_mse
                    



In [9]:
best_hyp,best_weights,best_bias,best_mse= validation(X_train,y_train,X_val,y_val)

Num_Epoch 50, Batch_size 32, Learning_rate 1e-05, MSE: 0.7980858284148201
Num_Epoch 100, Batch_size 32, Learning_rate 1e-05, MSE: 0.7817600759463128
Num_Epoch 150, Batch_size 32, Learning_rate 1e-05, MSE: 0.7764923913809932
Num_Epoch 50, Batch_size 64, Learning_rate 1e-05, MSE: 0.825059311597272
Num_Epoch 100, Batch_size 64, Learning_rate 1e-05, MSE: 0.7982439602620542
Num_Epoch 150, Batch_size 64, Learning_rate 1e-05, MSE: 0.7871683315511885
Num_Epoch 50, Batch_size 128, Learning_rate 1e-05, MSE: 0.858354726422686
Num_Epoch 100, Batch_size 128, Learning_rate 1e-05, MSE: 0.8248488083114183
Num_Epoch 150, Batch_size 128, Learning_rate 1e-05, MSE: 0.8079652447471504
Num_Epoch 50, Batch_size 32, Learning_rate 0.0001, MSE: 0.768266541636217
Num_Epoch 100, Batch_size 32, Learning_rate 0.0001, MSE: 0.7662357760753514
Num_Epoch 150, Batch_size 32, Learning_rate 0.0001, MSE: 0.7726601133474943
Num_Epoch 50, Batch_size 64, Learning_rate 0.0001, MSE: 0.7692581662422355
Num_Epoch 100, Batch_size 

In [10]:
np.save("best_model_weights2.npy", best_weights)
np.save("best_model_bias2.npy", best_bias)
np.save("best_model_hyperparameters2",best_hyp)


In [17]:
# Test on testing data
X_te = np.reshape(np.load("age_regression_Xte.npy"), (-1, 48*48))
y_te = np.load("age_regression_yte.npy")

# normalizing test data to maintain model consistency
X_te = (X_te - np.mean(X_te, axis=0)) / np.std(X_te, axis=0)
y_te = (y_te - np.mean(y_te, axis=0)) / np.std(y_te, axis=0)

# load the best performing weights and bias

test_weights = np.load("best_model_weights2.npy")
test_bias = np.load("best_model_bias2.npy")



In [18]:
y_test_pred = np.dot(X_te, test_weights) + test_bias


test_mse = np.mean((y_test_pred - y_te) ** 2)
print(f"Test MSE: {test_mse}")

Test MSE: 0.7690748379706631
