In [None]:
import copy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
import wandb

In [None]:
# NOTE(review): each `!` line is executed in its own subshell, so these exports
# are not expected to reach the kernel process; the os.environ assignments in
# the next cell are what configure wandb. The value here (1000) also differs
# from the one set through os.environ (100) - confirm which one is intended.
!export WANDB_START_METHOD=thread
!export WANDB_AGENT_MAX_INITIAL_FAILURES=1000

In [None]:
import os
# Configure wandb through the kernel's own environment, then open the default run.
os.environ.update({
    "WANDB_START_METHOD": "thread",
    "WANDB_AGENT_MAX_INITIAL_FAILURES": "100",
})

wandb.init(project="neural_regressor", entity="aswanth_kumar_m")

In [None]:
def make_dir(path):
    """
    Make directories recursively till the path.

    Calling it again for a directory that already exists is a no-op.

    Args:
        path: directory path to create (intermediate directories included).

    Raises:
        FileExistsError: if `path` exists but is not a directory.
    """
    # exist_ok=True replaces the separate exists() check, so there is no window
    # in which another process can create the directory between check and create.
    os.makedirs(path, exist_ok=True)


In [None]:
import seaborn as sns
import sys
import json


In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Bare expression: shows the selected device as the cell output.
device

In [None]:
from constants import *

# CONSTANTS
# NOTE(review): these are assigned after `from constants import *`, so any
# same-named value imported from that module is overridden here - confirm the
# duplication is intentional.

# Column names; most of them are collected into the `features` list below.
COMET_SCORE = 'comet_score'
BLEU_SCORE = 'bleu_score'
NO_OF_TOKENS_IN_QUERY = 'no_of_tokens_in_query'
NO_OF_TOKENS_IN_SRC_SENT = 'no_of_tokens_in_src_sent'
NO_OF_TOKENS_IN_DST_SENT = 'no_of_tokens_in_dst_sent'
LABSE_SCORE_QUERY_SRC = 'labse_score_query_src'
LABSE_SCORE_QUERY_DST = 'labse_score_query_dst'
LABSE_SCORE_SRC_DST = 'labse_score_src_dst'
CHRF_SCORE = 'chrf_score'

# COMET-QE and perplexity column names (also part of `features`).
COMET_QE_QUERY_SRC_SCORE = 'comet_qe_query_src_score'
COMET_QE_QUERY_DST_SCORE = 'comet_qe_query_dst_score'
COMET_QE_SRC_DST_SCORE = 'comet_qe_src_dst_score'
SRC_DST_PPL = 'src_dst_ppl'
SRC_DST_QUERY_PPL = 'src_dst_query_ppl'

# Data source identifiers (FLORES is used as the testing source below).
IN22_OTHER_SOURCES = 'in22_other_sources'
SAMANANTAR = 'samanantar'
FLORES = 'flores'

# Directory names used when building model and dataset paths.
SAVED_MODELS = 'saved_models'
DATASET_TRAIN = 'dataset_train'
DATASET_TEST = 'dataset_test'
 

In [None]:
# Ordered list of input columns fed to the regressor. The order matters: the
# scaler and the network's input layer both follow it.
features = [
    # token counts
    NO_OF_TOKENS_IN_QUERY,
    NO_OF_TOKENS_IN_SRC_SENT,
    NO_OF_TOKENS_IN_DST_SENT,
    # LaBSE scores
    LABSE_SCORE_QUERY_SRC,
    LABSE_SCORE_QUERY_DST,
    LABSE_SCORE_SRC_DST,
    # chrF
    CHRF_SCORE,
    # COMET-QE scores
    COMET_QE_QUERY_SRC_SCORE,
    COMET_QE_QUERY_DST_SCORE,
    COMET_QE_SRC_DST_SCORE,
    # perplexities (SRC_PPL and DST_PPL are deliberately left out)
    SRC_DST_PPL,
    SRC_DST_QUERY_PPL,
]


### Inputs

In [None]:
# Experiment selectors. They are used below to build dataset/model paths, to tag
# the wandb sweep, and to name the rankings output.
# EUROPARL, FRA_LATN and ENG_LATN are not defined in this notebook; presumably
# they come from `from constants import *` - verify against that module.
training_source = EUROPARL
testing_source = FLORES
src_lang = FRA_LATN
dst_lang = ENG_LATN
# Also used (lower-cased) as the sub-directory name under rankings_custom/.
approach = 'comet_qe_20_regression'

### Load and process data

In [None]:
# The training CSV is produced by bloom.ipynb (see its get_prompt_scores function).
dataset_path = '{}/{}_{}_{}.csv'.format(DATASET_TRAIN, training_source, src_lang, dst_lang)
dataset = (
    pd.read_csv(dataset_path)
    .replace([np.inf], 99999)                # swap +inf for a large finite sentinel
    .drop(columns=['qid_tmp', 'index_tmp'])  # bookkeeping columns, not needed here
)
dataset

In [None]:
# Separate the regression target (comet_qe_20_score) from the candidate inputs;
# the other score columns are removed from X so they cannot be used as inputs.
df = dataset.copy()
y = df[['comet_qe_20_score']]
X = df.drop(columns=['comet_score', 'bleu_score', 'comet_qe_20_score', 'comet_da_22_score'])

# 80/10/10 split: hold out 20% of the rows, then cut the held-out part in half
# to obtain the validation and test sets.
X_train, X_rem, y_train, y_rem = train_test_split(X, y, test_size=0.2, random_state=10)
X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=10)

# X_train_raw = X_train.copy()
# X_val_raw = X_val.copy()
# X_test_raw = X_test.copy()


In [None]:
# Restrict train/val inputs to the model's feature columns.
# X_test is left untouched on purpose: get_prompt_scores selects `features` itself.
X_train, X_val = X_train[features], X_val[features]

In [None]:
# Display the training frame after the feature-column selection.
X_train

In [None]:
# Min-max scale the inputs; the scaler is fitted on the training split only.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
# X_test stays unscaled here: get_prompt_scores applies `scaler` itself.

# Same treatment for the target, again fitted on the training split only.
y_scalar = MinMaxScaler()
y_train = y_scalar.fit_transform(y_train)
y_val = y_scalar.transform(y_val)
y_test = y_scalar.transform(y_test)

In [None]:
# Sanity check (displays True/False): a scaled validation row should hold one
# value per entry in `features`.
len(features) == len(X_val[0])

In [None]:
# Peek at the first scaled training row.
X_train[0]

In [None]:
# Turn the scaled arrays into float32 tensors; targets are reshaped to one column.
X_train, X_val = (
    torch.tensor(split, dtype=torch.float32) for split in (X_train, X_val)
)
y_train, y_val = (
    torch.tensor(split, dtype=torch.float32).reshape(-1, 1) for split in (y_train, y_val)
)
# The test split is not converted here; get_prompt_scores builds its own tensors.


In [None]:
# Move the train/val tensors to the selected device (the test split stays as-is).
X_train, X_val, y_train, y_val = (
    tensor.to(device) for tensor in (X_train, X_val, y_train, y_val)
)

In [None]:
# set seed
def set_seed(seed=42):
    """Seed the torch and numpy global random generators.

    Args:
        seed: integer seed applied to both libraries. Defaults to 42, the value
            that was previously hard-coded, so existing `set_seed()` calls
            behave exactly as before.
    """
    torch.manual_seed(seed)
    np.random.seed(seed)

set_seed()

### Train Model

In [None]:
class CustomizableNet(nn.Module):
    """Fully connected network with configurable depth, width and activation.

    Layer order:
        Linear(input_size, hidden_size)
        `hidden_layers` x [activation_func(), Linear(hidden_size, hidden_size)]
        Linear(hidden_size, output_size)

    NOTE(review): no activation sits between the last hidden Linear and the
    output Linear - confirm this is intended.

    Args:
        input_size: number of input features.
        output_size: number of outputs.
        hidden_layers: how many (activation, Linear) pairs to stack.
        hidden_size: width of every hidden Linear layer.
        activation_func: zero-argument callable returning an activation module
            (e.g. the nn.ReLU class); it is called once per hidden layer.
    """

    def __init__(self, input_size, output_size, hidden_layers, hidden_size, activation_func):
        super().__init__()

        # input projection
        stack = [nn.Linear(input_size, hidden_size)]

        # repeated hidden blocks
        for _ in range(hidden_layers):
            stack += [activation_func(), nn.Linear(hidden_size, hidden_size)]

        # output projection
        stack.append(nn.Linear(hidden_size, output_size))

        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Pass `x` through the sequential stack and return the result."""
        return self.net(x)

In [None]:
def get_prompt_scores(model, input_X, batch_size=1024):
    """Predict a score for every row of `input_X` with `model`.

    The rows are restricted to the notebook-level `features` columns, scaled
    with the notebook-level `scaler`, and pushed through the model on `device`.
    Predictions are returned exactly as the model emits them (no inverse
    transform of the target scaling is applied here), rounded to 4 decimals.

    Args:
        model: trained network; switched to eval mode by this call.
        input_X: DataFrame containing at least the `features` columns.
        batch_size: number of rows scored per forward pass. Rows used to be
            scored one at a time; batching gives the same predictions up to
            floating-point noise while avoiding a per-row scaler/model call.

    Returns:
        list of float: one rounded prediction per input row, in row order.
    """
    prompt_scores = []
    model.eval()

    # scaler.transform rejects empty input, so short-circuit like the old
    # per-row loop did (it simply never iterated).
    if len(input_X) == 0:
        return prompt_scores

    feature_frame = input_X[features]
    with torch.no_grad():
        for start in range(0, len(feature_frame), batch_size):
            chunk = feature_frame.iloc[start:start + batch_size]
            X_batch = torch.tensor(scaler.transform(chunk), dtype=torch.float32).to(device)
            y_pred = model(X_batch)
            # first (only) output column -> plain Python floats
            prompt_scores.extend(round(score, 4) for score in y_pred[:, 0].tolist())

    return prompt_scores

In [None]:
def train_model(model, batch_size, n_epochs, optimizer, loss_fn, log_to_wandb=True):
    """Train `model`, keep the weights with the best validation loss, report test MSE.

    Relies on notebook-level state: X_train / y_train / X_val / y_val (tensors
    on `device`), X_test / y_test (raw test inputs and scaled targets), and
    get_prompt_scores for test-time inference.

    Args:
        model: network to train; moved to `device` and modified in place.
        batch_size: number of training rows per optimisation step.
        n_epochs: number of passes over the training tensors.
        optimizer: optimizer already bound to `model.parameters()`.
        loss_fn: callable (prediction, target) -> scalar loss tensor.
        log_to_wandb: when True, per-epoch and final metrics are sent to
            wandb.log and the tqdm progress bar is disabled.
    """
    batch_start = torch.arange(0, len(X_train), batch_size)
    model.to(device)

    # Hold the best model
    best_mse = np.inf   # init to infinity
    best_weights = None
    history = []        # validation loss per epoch (kept for optional plotting)

    for epoch in range(n_epochs):
        model.train()
        with tqdm.tqdm(batch_start, unit="batch", mininterval=0, disable=log_to_wandb) as bar:
            bar.set_description(f"Epoch {epoch}")
            for start in bar:
                # take a batch
                X_batch = X_train[start:start+batch_size]
                y_batch = y_train[start:start+batch_size]
                # forward pass
                y_pred = model(X_batch)
                loss = loss_fn(y_pred, y_batch)
                # backward pass
                optimizer.zero_grad()
                loss.backward()
                # update weights
                optimizer.step()
                # print progress
                bar.set_postfix(mse=float(loss))

        # Evaluate at the end of each epoch. no_grad() avoids building an
        # autograd graph over the full train/val sets just to read two numbers.
        model.eval()
        with torch.no_grad():
            train_mse = float(loss_fn(model(X_train), y_train))
            val_mse = float(loss_fn(model(X_val), y_val))

        history.append(val_mse)
        if val_mse < best_mse:
            best_mse = val_mse
            best_weights = copy.deepcopy(model.state_dict())

        if log_to_wandb:
            wandb.log({"train_loss": train_mse, "val_loss": val_mse, "epoch": epoch, "best_mse": best_mse})

    # Restore the best weights. best_weights stays None when no epoch ran or the
    # validation loss never became finite; keep the current weights in that case
    # instead of crashing inside load_state_dict.
    if best_weights is not None:
        model.load_state_dict(best_weights)
    else:
        print('No improving epoch recorded; keeping the current model weights.')

    # Evaluate on the test split using the restored weights. The `squared`
    # keyword is omitted: plain MSE is the default and the keyword has been
    # deprecated/removed in recent scikit-learn releases.
    y_test_pred = get_prompt_scores(model, X_test)
    test_mse = mean_squared_error(y_test, y_test_pred)

    if log_to_wandb:
        wandb.log({"actual_test_loss": test_mse})
    print('Actual test loss: {}'.format(test_mse))

    print("Val MSE: %.5f" % best_mse)

In [None]:
# Hyperparameter search space for the wandb sweep.
# The optimised metric must be a key that train_model actually passes to
# wandb.log. 'test_loss' is never logged, so the sweep targets 'best_mse'
# (best validation loss so far), which also keeps model selection off the
# test split.
sweep_config = {
    "name" : "general_sweeps",
    'method': "bayes",
    'metric': {
        'name': 'best_mse',
        'goal': 'minimize'
    },
    "parameters" : {
        "neurons_hidden_layer" : {
            "values" : [128, 256, 512]
        },
        "number_of_epochs" : {
            "values" : [20, 30, 40]
        },
        "activation" : {
            "values" : ["sigmoid" , "relu" , "tanh"]
        },
        "no_of_hidden_layer" : {
            "values" : [3, 4, 5]
        },
        "batch_size" :{
            "values" : [16, 32, 64]
        },
        "optimizer" : {
            "values" : ['adam', 'rmsprop', 'sgd']
        },
        "weight_decay" : {
            "values" : [0]
        },
        "learning_rate" : {
            "values" : [0.001, 0.005, 0.01]
        },
        "output_size": {
            "values" : [1]
        },
        # Single-valued entries below only tag each run with its experiment setup.
        "src_lang": {
            "values": [src_lang]
        },
        "dst_lang": {
            "values": [dst_lang]
        },
        "dataset_used": {
            "values": [training_source]
        },
        "approach": {
            "values": [approach]
        }

    }
}
sweep_id = wandb.sweep(sweep_config, project="neural_regressor", entity="aswanth_kumar_m")

In [None]:
def train(config=None):
    """Entry point for one wandb sweep run.

    Opens a wandb run, reads the hyperparameters from `wandb.config`, builds the
    network and optimizer they describe, and hands both to train_model.

    Args:
        config: optional config forwarded to `wandb.init`.

    Raises:
        ValueError: if the configured activation or optimizer name is not one
            of the supported choices (previously this surfaced later as an
            unbound-variable error).
    """
    activations = {'relu': nn.ReLU, 'sigmoid': nn.Sigmoid, 'tanh': nn.Tanh}
    optimizers = {'adam': optim.Adam, 'rmsprop': optim.RMSprop, 'sgd': optim.SGD}

    with wandb.init(config=config) as run:
        # load the config
        config = wandb.config

        set_seed()
        sweep_name = 'hl_{}_bs_{}_ac_{}_{}'.format(config.no_of_hidden_layer, config.batch_size, config.activation, config.optimizer)
        run.name = sweep_name
        print(sweep_name)

        # Validate the names up front so a bad sweep value fails with a clear message.
        if config.activation not in activations:
            raise ValueError('Unsupported activation: {!r}'.format(config.activation))
        if config.optimizer not in optimizers:
            raise ValueError('Unsupported optimizer: {!r}'.format(config.optimizer))

        # Create custom network using the above config
        model = CustomizableNet(len(features), config.output_size, config.no_of_hidden_layer,
                                config.neurons_hidden_layer, activations[config.activation])

        # loss function and optimizer
        optimizer = optimizers[config.optimizer](model.parameters(), lr=config.learning_rate,
                                                 weight_decay=config.weight_decay)
        loss_fn = nn.MSELoss()  # mean square error

        train_model(model, config.batch_size, config.number_of_epochs, optimizer, loss_fn)

    wandb.finish()


### Generate best model

In [None]:
def use_best_model():
    """Train a CustomizableNet with the hand-picked hyperparameters below.

    Training runs through train_model with wandb logging switched off.

    Returns:
        The trained model.
    """
    activation_choices = {'relu': nn.ReLU, 'sigmoid': nn.Sigmoid, 'tanh': nn.Tanh}
    optimizer_choices = {'adam': optim.Adam, 'rmsprop': optim.RMSprop, 'sgd': optim.SGD}

    # selected configuration
    chosen_activation = 'relu'
    chosen_optimizer = 'adam'
    hidden_width = 128
    hidden_depth = 4
    step_size = 0.01
    l2_penalty = 0
    epochs = 40             # number of epochs to run
    samples_per_batch = 64  # size of each batch

    # seed before the layers are created so the initial weights are reproducible
    set_seed()
    model = CustomizableNet(len(features), 1, hidden_depth, hidden_width,
                            activation_choices[chosen_activation])

    optimizer = optimizer_choices[chosen_optimizer](
        model.parameters(), lr=step_size, weight_decay=l2_penalty
    )
    criterion = nn.MSELoss()  # mean square error

    train_model(model, samples_per_batch, epochs, optimizer, criterion, log_to_wandb=False)
    return model

### Generate prompt scores for test data

In [None]:
# load test dataset and compute prompt scores
def get_test_prompt_scores(training_source, testing_source, src_lang, dst_lang, model=None):
    """Score the test CSV with `model` and write per-query rankings to JSON.

    Reads `{DATASET_TEST}/{training_source}_{testing_source}_{src_lang}_{dst_lang}.csv`,
    predicts a prompt score for each row, groups the rows by `qid`, orders every
    group by descending score and dumps the result under
    `rankings_custom/<approach>/`.

    Args:
        training_source: training data source name (part of the file names).
        testing_source: test data source name (part of the file names).
        src_lang: source language code (part of the file names).
        dst_lang: target language code (part of the file names).
        model: trained model; when None the model saved under
            `{SAVED_MODELS}/{src_lang}_{dst_lang}` is loaded.
    """
    # `is None` rather than truthiness: a module that defines __len__ could be
    # falsy and would wrongly trigger a reload.
    if model is None:
        # load the saved model
        model_path = '{}/{}_{}'.format(SAVED_MODELS, src_lang, dst_lang)
        # map_location lets a model saved on GPU be loaded on a CPU-only machine.
        # NOTE(review): this unpickles a full model object - only load trusted
        # files; newer PyTorch versions may additionally require
        # weights_only=False here - confirm against the installed version.
        model = torch.load(model_path, map_location=device)

    # load the test dataset
    test_data_path = '{}/{}_{}_{}_{}.csv'.format(DATASET_TEST, training_source, testing_source, src_lang, dst_lang)
    X_test_raw = pd.read_csv(test_data_path)
    X_test_raw.replace([np.inf], 99999, inplace=True)

    # generate prompt scores (already rounded to 4 decimals by get_prompt_scores)
    X_test_raw['prompt_score'] = get_prompt_scores(model, X_test_raw)
    ranked_frame = X_test_raw.sort_values(by=['qid'])

    # group the predicted scores by query id
    result = {}
    for qid, index, score in zip(ranked_frame['qid'], ranked_frame['index'], ranked_frame['prompt_score']):
        result.setdefault(int(qid), []).append({"index": int(index), "score": score})

    # sort each query's candidates by predicted prompt score, best first
    for ranking in result.values():
        ranking.sort(key=lambda entry: entry['score'], reverse=True)

    # write score to a JSON file
    make_dir('rankings_custom/{}'.format(approach.lower()))
    with open('rankings_custom/{}/recommendations_{}_{}_{}_{}.json'.format(approach.lower(), training_source, testing_source, src_lang, dst_lang), 'w') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)

### Main functions

In [None]:
# # Code to train model
# try:
#     wandb.agent(sweep_id, train, count=40)
# except:
#     pass

In [None]:
# Generate best model
model = use_best_model()

# torch.save does not create missing parent directories, so make sure the
# target folder exists before writing (needed on a fresh checkout).
make_dir(SAVED_MODELS)
model_path = '{}/{}_{}'.format(SAVED_MODELS, src_lang, dst_lang)
torch.save(model, model_path)


In [None]:
# Use best model to predict prompt_scores for the test dataset.
# The two selectors are re-assigned to the same values chosen in the "Inputs"
# cell; the in-memory `model` is passed so nothing is reloaded from disk.
training_source=EUROPARL
testing_source=FLORES
get_test_prompt_scores(training_source, testing_source, src_lang, dst_lang, model)