In [1]:
dataset_author = "rmisra"
dataset_name = "news-headlines-dataset-for-sarcasm-detection"
dataset_path = dataset_author + "/" + dataset_name

# Mount your Google Drive.
from google.colab import drive
drive.mount("/content/drive")

kaggle_creds_path = "/content/drive/MyDrive/my_kaggle/kaggle.json"

# remove kaggle directory if already exists, then crate it
!pwd
!rm -r ~/.kaggle
! mkdir ~/.kaggle
! cp {kaggle_creds_path} ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

! pip install kaggle --quiet
! kaggle datasets download -d {dataset_path}

import zipfile
zip_ref = zipfile.ZipFile(f'{dataset_name}.zip', 'r')

! mkdir dataset
zip_ref.extractall('/content/dataset')
zip_ref.close()
! rm {dataset_name}.zip

# Unmount your Google Drive
drive.flush_and_unmount()

Mounted at /content/drive
/content
Downloading news-headlines-dataset-for-sarcasm-detection.zip to /content
  0% 0.00/3.30M [00:00<?, ?B/s]
100% 3.30M/3.30M [00:00<00:00, 38.4MB/s]
mkdir: cannot create directory ‘dataset’: File exists


In [1]:
import pandas as pd
try:
    df = pd.read_json("Sarcasm_Headlines_Dataset_v2.json", lines = True)
except:
    df = pd.read_json("/content/dataset/Sarcasm_Headlines_Dataset_v2.json", lines = True)
df.head()
df.drop(columns="article_link", axis=1, inplace=True)
df.head(5)
df

Unnamed: 0,is_sarcastic,headline
0,1,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...
2,0,eat your veggies: 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...
...,...,...
28614,1,jews to celebrate rosh hashasha or something
28615,1,internal affairs investigator disappointed con...
28616,0,the most beautiful acceptance speech this week...
28617,1,mars probe destroyed by orbiting spielberg-gat...


In [2]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
import os

In [3]:
def tokenize_and_encode(texts, tokenizer):
    return tokenizer(texts, padding=True, truncation=True, return_tensors='tf')

def df_to_dataset(dataframe, tokenizer, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    labels = dataframe.pop('is_sarcastic')
    encoded_texts = tokenize_and_encode(dataframe['headline'].tolist(), tokenizer)
    ds = tf.data.Dataset.from_tensor_slices((encoded_texts['input_ids'], labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds


In [4]:
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    tpu_strategy = tf.distribute.TPUStrategy(tpu)
    BATCH_SIZE = 32 * tpu_strategy.num_replicas_in_sync
except:
    print("No TPU available!")
    BATCH_SIZE = 32

AUTOTUNE = tf.data.experimental.AUTOTUNE

No TPU available!


In [5]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

train_df, test_df = train_test_split(df, test_size=0.2)
train_data = df_to_dataset(train_df, tokenizer, shuffle=True, batch_size=BATCH_SIZE)
test_data = df_to_dataset(test_df, tokenizer, shuffle=False, batch_size=BATCH_SIZE)
train_data = train_data.prefetch(buffer_size=AUTOTUNE)
test_data = test_data.prefetch(buffer_size=AUTOTUNE)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

# Import fine-tuned RoBERTa versions

In [6]:
mnli_model = TFAutoModelForSequenceClassification.from_pretrained(
            "textattack/roberta-base-MNLI", from_pt=True
        )

sst2_model = TFAutoModelForSequenceClassification.from_pretrained(
            "textattack/roberta-base-SST-2", from_pt=True
        )

### add RoBERTa for sentiment analysis

All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


# Fine-tune Base RoBERTa



In [7]:
try:
    with tpu_strategy.scope():
        model_name = "roberta-base"
        model = TFAutoModelForSequenceClassification.from_pretrained(model_name, from_pt=True, num_labels=2)
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5, clipnorm=1.),
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            metrics=[tf.metrics.SparseCategoricalAccuracy()],
            )
except:
    print("No TPU available!")
    model_name = "roberta-base"
    model = TFAutoModelForSequenceClassification.from_pretrained(model_name, from_pt=True, num_labels=2)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5, clipnorm=1.),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.metrics.SparseCategoricalAccuracy()],
        )

try:
    model.load_weights(f'.\\models\\{model_name.replace("-","_")}_ft.h5')
except:
    history=model.fit(train_data, validation_data=test_data, epochs=6, verbose=1)
    model.save_weights(f'.\\models\\{model_name.replace("-","_")}_ft.h5')

No TPU available!


pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
model.load_weights(f'.\\models\\{model_name.replace("-","_")}_ft.h5')
model.summary()

Model: "tf_roberta_for_sequence_classification_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
roberta (TFRobertaMainLayer) multiple                  124055040 
_________________________________________________________________
classifier (TFRobertaClassif multiple                  592130    
Total params: 124,647,170
Trainable params: 124,647,170
Non-trainable params: 0
_________________________________________________________________


# Take the weights from my models

In [9]:
import numpy as np

def fetch_mergeable_vars(mod, to_np=True):
    """Fetches trainable variables of a given model and converts them to NumPy arrays.

    This function supports both models running under a TensorFlow distribution strategy (e.g., TPUs)
    and standard models not using distribution strategies (e.g., models downloaded from Hugging Face).

    Args:
        model: TensorFlow or Hugging Face model whose trainable variables are to be fetched.
        strategy: (Optional) The TensorFlow distribution strategy under which the model is running.
                  If None, the model is assumed not to be under a distribution strategy.

    Returns:
        A list of NumPy arrays corresponding to the model's trainable variables.
    """
    def fetch_variables():
        body, *head = mod.layers
        return [v.numpy() for v in body.trainable_variables]

    def distributed_fetch():
        strategy = tf.distribute.get_strategy()
        body, *head = mod.layers
        return strategy.run(lambda: body.trainable_variables)

    if to_np:

        try:
            per_replica_variables = distributed_fetch()
            numpy_variables = []
            strategy = tf.distribute.get_strategy()
            for v in strategy.experimental_local_results(per_replica_variables):
                numpy_variables.extend([var.numpy() for var in v])
        except:
            numpy_variables = fetch_variables()

        vars = np.array(numpy_variables, dtype=object)

    else:
        body, *head = mod.layers
        vars = body.trainable_variables

    return vars

def clone_model(model):
    cloned = model.__class__(model.config)
    cloned(model.dummy_inputs)
    cloned.set_weights(model.get_weights())
    return cloned


# def assign_params(models_list):
#     """Assigns values from a list of NumPy arrays to the model's trainable variables.

#     This function supports both models running under a TensorFlow distribution strategy (e.g., TPUs)
#     and standard models not using distribution strategies (e.g., models downloaded from Hugging Face).

#     Args:
#         model: TensorFlow or Hugging Face model whose trainable variables are to be updated.
#         numpy_variables: A list of NumPy arrays containing the new values for the model's trainable variables.
#         strategy: (Optional) The TensorFlow distribution strategy under which the model is running.
#                   If None, the model is assumed not to be under a distribution strategy.
#     """
#     # just clone the first model in the list to initialize the parameters
#     new_model = clone_model(models_list[0])

#     vars_arr = np.array([fetch_mergeable_vars(mod) for mod in models_list], dtype=object)
#     # then we have to multiply each matrix in this vars_arr by a corresponding fisher matrix (different for each matrix for each model)
#     # the first goal: create a fishers_arr of the same dim of vars_arr (3, 197) with

#     def assign_variables():
#         body, *head = new_model.layers
#         for var, new_val in zip(body.trainable_variables, numpy_variables):
#             var.assign(new_val)

#     @tf.function
#     def distributed_assign():
#         strategy = tf.distribute.get_strategy()
#         body, *head = model.layers
#         for var, new_val in zip(body.trainable_variables, numpy_variables):
#             strategy.run(lambda: var.assign(new_val))

#     try:
#         distributed_assign()
#     except:
#         assign_variables()



# numpy_variables = fetch_trainable_vars(model)
numpy_variables_distributed = fetch_mergeable_vars(model)
numpy_variables = fetch_mergeable_vars(mnli_model)

In [61]:
def _fisher_for_batch(batch, model, variables):
    num_labels = model.num_labels
    batch = tf.expand_dims(batch, axis=0)
    with tf.GradientTape(persistent=True, watch_accessed_variables=False) as tape:
        tape.watch(variables)
        logits = model(batch, training=False).logits

        log_probs = tf.nn.log_softmax(logits, axis=-1)
        probs = tf.nn.softmax(logits, axis=-1)

        print(batch.shape)
        print(probs.shape)
        #print(batch.shape)

        with tape.stop_recording():
            # sq_grads = tf.square(tape.gradient(log_probs, variables))
            sq_grads = [tf.square(tape.gradient(log_probs[:,i], variables)) for i in range(num_labels)]
        #tmp=[probs[:,i]*sq_grads[i] for i in range(num_labels)]
        # tmp = tf.math.multply(probs, sr_grad)
        # logits = tf.squeeze(logits, axis=0)
        # log_probs = tf.nn.log_softmax(logits, axis=-1)
        # probs = tf.nn.softmax(logits, axis=-1)

        # grads = tape.gradient(log_probs, variables)
        # sq_grads = []
        # for i in range(num_labels):
        #     log_prob = log_probs[:, i]
        #     grad = tape.gradient(log_prob, variables)
        #     sq_grad = [probs[:, i] * tf.square(g) for g in grad]
        #     sq_grads.append(sq_grad)

        # example_fisher = [tf.reduce_sum(g, axis=0) for g in zip(*sq_grads)]

    return probs, log_probs

In [11]:
model.num_labels

2

In [12]:
def preprocess(input, label):
    return input

tr = train_data.map(preprocess)
mergeable_vars = fetch_mergeable_vars(model, to_np=False)

In [25]:
mergeable_vars[0]

<tf.Variable 'tf_roberta_for_sequence_classification_2/roberta/encoder/layer_._0/attention/self/query/kernel:0' shape=(768, 768) dtype=float32, numpy=
array([[ 0.0723473 , -0.05138723,  0.08780038, ..., -0.18677771,
        -0.2538248 , -0.05162003],
       [-0.00268799,  0.2061689 ,  0.06961331, ...,  0.01774677,
         0.04328064, -0.08617006],
       [-0.09013141,  0.07399632, -0.05160036, ..., -0.03148533,
         0.06424539,  0.10267198],
       ...,
       [ 0.10370311,  0.06530357, -0.04302413, ..., -0.05038806,
         0.07030558, -0.18979007],
       [ 0.09016812,  0.06345156, -0.00833152, ...,  0.10257896,
        -0.10443079,  0.00326744],
       [-0.10294539,  0.12811744,  0.10991048, ..., -0.11664649,
         0.01233165, -0.05385955]], dtype=float32)>

In [42]:
for example in tr:
   print(example.shape)
   break

(32, 195)


In [43]:
for example in tr.unbatch():
   print(example.shape)
   break

(195,)


In [74]:
for example in tr.unbatch():
   probs, sq = _fisher_for_batch(example, model, mergeable_vars)
   break

(1, 195)
(1, 2)


ValueError: Attempt to convert a value (None) with an unsupported type (<class 'NoneType'>) to a Tensor.

In [75]:
num_labels = model.num_labels
example = tf.expand_dims(example, axis=0)
with tf.GradientTape(persistent=True) as tape:
    tape.watch(mergeable_vars)
    logits = model(example, training=False).logits

    log_probs = tf.nn.log_softmax(logits, axis=-1)
    probs = tf.nn.softmax(logits, axis=-1)

    with tape.stop_recording():
        # sq_grads = tf.square(tape.gradient(log_probs, variables))
        sq_grads = [tf.square(tape.gradient(log_probs[:,i], mergeable_vars)) for i in range(num_labels)]
    #tmp=[probs[:,i]*sq_grads[i] for i in range(num_labels)]

ValueError: Attempt to convert a value (None) with an unsupported type (<class 'NoneType'>) to a Tensor.

In [69]:
tape.gradient(log_probs[:,1], mergeable_vars)

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [None]:
sq2 = np.array(sq, dtype=object)

In [None]:
probs2 = probs.numpy()

In [None]:
probs2.shape

(256, 2)

In [None]:
sq2.shape

(197,)

In [None]:
np.multiply(probs2, sq2)

ValueError: operands could not be broadcast together with shapes (256,2) (197,) 

In [None]:
len(numpy_variables_distributed)

197

In [None]:
prova = np.array(numpy_variables, dtype=object)
prova2 = np.array(numpy_variables_distributed, dtype=object)

In [None]:
models_list = [model, mnli_model, sst2_model]
vars_list = np.array([fetch_trainable_vars(mod) for mod in models_list], dtype=object)
pp=np.sum(vars_list, axis=0)

In [None]:
vars_list.shape

(3, 197)

In [None]:
pp=np.sum(vars_list, axis=0)
pp.shape

(197,)

# Fisher Averaging implementation

### hf_util.py

In [None]:
def get_body_and_head(
    model: Union[TFBertPreTrainedModel, TFRobertaPreTrainedModel]
) -> Tuple[tf.keras.layers.Layer, tf.keras.layers.Layer]:
    body, *head = model.layers
    if not head:
        head = None
    elif len(head) > 1:
        raise ValueError(
            f"Expected model to have a single 'head' layer. Instead found {len(head)}. TODO: Support this."
        )
    else:
        head = head[0]
    return body, head


def get_body(model):
    return get_body_and_head(model)[0]


def get_mergeable_variables(model):
    return get_body_and_head(model)[0].trainable_variables


def clone_model(model):
    cloned = model.__class__(model.config)
    cloned(model.dummy_inputs)
    cloned.set_weights(model.get_weights())
    return cloned

### evaluation.py

In [None]:
# import datasets as hfds

# def load_metric_for_glue_task(task: str):
#     return hfds.load_metric("glue", task)


def evaluate_model(model, dataset: tf.data.Dataset, metric: hfds.Metric):
    for model_input, gold_references in dataset:
        model_predictions = model(model_input).logits
        model_predictions = tf.argmax(model_predictions, axis=-1)
        metric.add_batch(predictions=model_predictions, references=gold_references)
    return metric.compute()


def average_score(score):
    return sum(score.values()) / len(score.values())

### fisher.py

In [None]:
def _batch_size(batch):
    return tf.shape(batch["input_ids"])[0]


@tf.function
def _compute_exact_fisher_for_batch(batch, model, variables, expectation_wrt_logits):
    assert expectation_wrt_logits, "TODO: Handle sampling from logits."
    num_labels = model.num_labels

    @tf.function
    def fisher_single_example(single_example_batch):
        """
        NOTE: I wrote this with Hugging Face classifiers in mind. There is
        probably a good way to do the same thing but with more customizability
        to support alternate forms of models.
        """
        with tf.GradientTape(persistent=True, watch_accessed_variables=False) as tape:
            tape.watch(variables)

            logits = model(single_example_batch, training=False).logits
            # The batch dimension must be 1 to call the model, so we remove it
            # here.
            logits = tf.squeeze(logits, axis=0)

            log_probs = tf.nn.log_softmax(logits, axis=-1)
            probs = tf.nn.softmax(logits, axis=-1)

            sq_grads = []
            for i in range(num_labels):
                log_prob = log_probs[i]
                with tape.stop_recording():
                    grad = tape.gradient(log_prob, variables)
                    sq_grad = [probs[i] * tf.square(g) for g in grad]
                    sq_grads.append(sq_grad)
            # Take the average across logits. The per-logit weight was added
            # earlier as each per-logit square gradient was weighted by the
            # probability of the class according to the output distribution.
            example_fisher = [tf.reduce_sum(g, axis=0) for g in zip(*sq_grads)]

        return example_fisher

    batch = {k: tf.expand_dims(v, axis=1) for k, v in batch.items()}

    fishers = tf.vectorized_map(fisher_single_example, batch)
    return [tf.reduce_sum(f, axis=0) for f in fishers]


def compute_fisher_for_model(
    model, dataset: tf.data.Dataset, expectation_wrt_logits=True
):
    variables = hf_util.get_mergeable_variables(model)

    fishers = [
        tf.Variable(tf.zeros(w.shape), trainable=False, name=f"fisher/{w.name}")
        for w in variables
    ]

    n_examples = 0
    for batch, _ in dataset:
        n_examples += _batch_size(batch)
        batch_fishers = _compute_exact_fisher_for_batch(
            batch, model, variables, expectation_wrt_logits=expectation_wrt_logits
        )
        for f, bf in zip(fishers, batch_fishers):
            f.assign_add(bf)

    for fisher in fishers:
        fisher.assign(fisher / float(n_examples))

    return fishers

### merging.py

In [None]:
"""The code for actually performing the merge."""
import collections
from typing import Optional, Sequence
import datasets as hfds
import tensorflow as tf
import tensorflow_probability as tfp
from model_merging import hf_util
from model_merging import evaluation

MergeResult = collections.namedtuple("MergeResult", ["coefficients", "score"])


def print_merge_result(result: MergeResult):
    print(f"Merging coefficients: {result.coefficients}")
    print("Scores:")
    for name, value in result.score.items():
        print(f"  {name}: {value}")


def create_pairwise_grid_coeffs(n_weightings: int):
    n_weightings -= 2
    denom = n_weightings + 1
    weightings = [((i + 1) / denom, 1 - (i + 1) / denom) for i in range(n_weightings)]
    weightings = [(0.0, 1.0)] + weightings + [(1.0, 0.0)]
    weightings.reverse()
    return weightings


def create_random_coeffs(n_models: int, n_weightings: int, seed: Optional[int] = None):
    if seed is not None:
        tf.random.set_seed(seed)
    dist = tfp.distributions.Dirichlet(tf.ones([n_models]))
    return dist.sample(n_weightings, seed=seed).numpy().tolist()


def _merge_with_coeffs(
    output_variables: Sequence[tf.Variable],
    variables_to_merge: Sequence[Sequence[tf.Variable]],
    coefficients: Sequence[float],
    fishers=None,
    fisher_floor: float = 1e-6,
    favor_target_model=True,
    normalization_constants=None,
):
    n_models = len(variables_to_merge)
    assert len(coefficients) == n_models

    if fishers is None:
        fishers = n_models * [1.0]
    else:
        assert len(fishers) == n_models

    if normalization_constants is not None:
        assert len(normalization_constants) == n_models
        coefficients = [w / n for w, n in zip(coefficients, normalization_constants)]

    for i, var in enumerate(output_variables):
        lhs, rhs = [], []
        for j, (mvars, coeff, fisher) in enumerate(
            zip(variables_to_merge, coefficients, fishers)
        ):
            diag = fisher if isinstance(fisher, float) else fisher[i]
            if not favor_target_model or j == 0:
                diag = tf.maximum(diag, fisher_floor)
            mvar = mvars[i]
            tmp = coeff * diag
            lhs.append(tmp)
            rhs.append(tmp * mvar)
        rhs = tf.reduce_sum(rhs, axis=0)
        lhs = tf.reduce_sum(lhs, axis=0)
        var.assign(rhs / lhs)


def _l2_norm_of_fisher(fisher):
    norm_const = tf.reduce_sum([tf.reduce_sum(tf.square(d)) for d in fisher])
    return tf.sqrt(norm_const)


def generate_merged_for_coeffs_set(
    mergeable_models,
    coefficients_set: Sequence[Sequence[float]],
    fishers=None,
    fisher_floor: float = 1e-6,
    favor_target_model=True,
    normalize_fishers=True,
):
    # Create the model to yield, then handle the norm_constants
    if normalize_fishers and fishers is not None:
        norm_constants = [_l2_norm_of_fisher(f) for f in fishers]
    else:
        norm_constants = None

    # The first model in the list of mergeable models is the "target" model and
    # the rest are "donor" models.
    output_model = hf_util.clone_model(mergeable_models[0])
    output_variables = hf_util.get_mergeable_variables(output_model)

    variables_to_merge = [hf_util.get_mergeable_variables(m) for m in mergeable_models]

    # Make sure that all of the variable lists contain exactly the same number
    # of variables.
    assert len({len(output_variables)} | set(len(v) for v in variables_to_merge)) == 1

    for coefficients in coefficients_set:
        _merge_with_coeffs(
            output_variables,
            variables_to_merge,
            coefficients=coefficients,
            fishers=fishers,
            fisher_floor=fisher_floor,
            favor_target_model=favor_target_model,
            normalization_constants=norm_constants,
        )
        yield coefficients, output_model


def merging_coefficients_search(
    mergeable_models,
    coefficients_set: Sequence[Sequence[float]],
    dataset: tf.data.Dataset,
    metric: hfds.Metric,
    fishers=None,
    fisher_floor: float = 1e-6,
    favor_target_model=True,
    normalize_fishers=True,
    print_results=True,
):
    merged_models = generate_merged_for_coeffs_set(
        mergeable_models,
        coefficients_set,
        fishers,
        fisher_floor=fisher_floor,
        favor_target_model=favor_target_model,
        normalize_fishers=normalize_fishers,
    )
    results = []
    for coeffs, merged_model in merged_models:
        score = evaluation.evaluate_model(merged_model, dataset, metric)
        result = MergeResult(coefficients=coeffs, score=score)
        results.append(result)
        if print_results:
            print_merge_result(result)
    return results

### main.py

In [None]:
def load_models():
    models = []
    for i, model_str in enumerate(FLAGS.models):
        model_str = os.path.expanduser(model_str)
        model = TFAutoModelForSequenceClassification.from_pretrained(
            model_str, from_pt=FLAGS.from_pt
        )
        models.append(model)
        if i == 0:
            tokenizer = AutoTokenizer.from_pretrained(model_str)
    return models, tokenizer


def load_fishers():
    if not FLAGS.fishers:
        return None
    fishers = []
    for fisher_str in FLAGS.fishers:
        fisher_str = os.path.expanduser(fisher_str)
        fisher = hdf5_util.load_variables_from_hdf5(fisher_str, trainable=False)
        fishers.append(fisher)
    return fishers


def get_coeffs_set():
    n_models = len(FLAGS.models)
    if FLAGS.coeff_mode == "grid":
        assert n_models == 2
        return merging.create_pairwise_grid_coeffs(FLAGS.n_coeffs)
    elif FLAGS.coeff_mode == "random":
        return merging.create_random_coeffs(n_models, FLAGS.n_coeffs)
    else:
        raise ValueError


def get_best_results(results):
    return max(results, key=lambda r: evaluation.average_score(r.score))


def main(_):
    if FLAGS.fishers:
        assert len(FLAGS.fishers) == len(FLAGS.models)

    models, tokenizer = load_models()

    fishers = load_fishers()

    ds = data.load_glue_dataset(
        task=FLAGS.glue_task,
        split=FLAGS.split,
        tokenizer=tokenizer,
        max_length=FLAGS.sequence_length,
    )
    ds = ds.take(FLAGS.n_examples).batch(FLAGS.batch_size)

    metric = evaluation.load_metric_for_glue_task(FLAGS.glue_task)

    coefficients_set = get_coeffs_set()

    results = merging.merging_coefficients_search(
        models,
        coefficients_set=coefficients_set,
        dataset=ds,
        metric=metric,
        fishers=fishers,
        fisher_floor=FLAGS.fisher_floor,
        favor_target_model=FLAGS.favor_target_model,
        normalize_fishers=FLAGS.normalize_fishers,
    )

    best = get_best_results(results)
    print(80 * "*")
    print(" Best Merge")
    print(80 * "*")
    merging.print_merge_result(best)