In [1]:
import tensorflow as tf
import pandas as pd

def _decode_bytes_value(raw):
    """Decode one bytes value as UTF-8, returning the raw bytes if it is not text."""
    try:
        return raw.decode("utf-8")
    except UnicodeDecodeError:
        # Binary payloads (e.g. encoded images) are not text; keep them intact
        # rather than aborting the whole conversion.
        return raw


def tfrecord_to_dataframe(tfrecord_path, max_records=None):
    """
    Read a TFRecord file of serialized ``tf.train.Example`` protos into a
    pandas DataFrame (one row per record, one column per feature name).

    Args:
        tfrecord_path: Path passed to ``tf.data.TFRecordDataset``.
        max_records: Maximum number of records to read. ``None`` reads the
            whole file; ``0`` returns an empty DataFrame.

    Returns:
        A ``pandas.DataFrame``. A feature holding exactly one value is stored
        as a scalar, otherwise as a list. Bytes values are decoded as UTF-8
        when possible and kept as ``bytes`` otherwise. A feature with no
        values is omitted from its row (it shows up as NaN if other rows
        have that column).

    Raises:
        ValueError: If ``max_records`` is negative.
    """
    if max_records is not None:
        if max_records < 0:
            raise ValueError(
                f"max_records must be None or >= 0, got {max_records!r}"
            )
        if max_records == 0:
            # The limit check below runs only after a record has been
            # appended, so a zero limit must be handled up front.
            return pd.DataFrame([])

    dataset = tf.data.TFRecordDataset(tfrecord_path)
    records = []

    for i, raw_record in enumerate(dataset):
        example = tf.train.Example()
        example.ParseFromString(raw_record.numpy())

        row = {}
        for name, feature in example.features.feature.items():
            # A Feature is a oneof over bytes_list / float_list / int64_list;
            # WhichOneof names the populated one (None if unset).
            kind = feature.WhichOneof("kind")
            if kind is None:
                continue
            values = list(getattr(feature, kind).value)
            if not values:
                continue
            if kind == "bytes_list":
                values = [_decode_bytes_value(item) for item in values]
            # Unwrap single-element lists so scalar features become scalars.
            row[name] = values[0] if len(values) == 1 else values

        records.append(row)

        if max_records is not None and i + 1 >= max_records:
            break

    return pd.DataFrame(records)


# === USAGE ===
# Load only the first 100 records of the PeerRead arXiv TFRecord file.
df = tfrecord_to_dataframe(
    tfrecord_path="../dat/PeerRead/proc/arxiv-all.tf_record",
    max_records=100,
)

# Preview the first five rows, then report (rows, columns).
print(df.head(5))
print("Shape:", df.shape)


   avg_sentence_length  venue   
0            24.723810      5  \
1            28.226257      4   
2            14.513618      5   
3            25.314816      5   
4            25.141975      6   

                                          token_mask  arxiv   
0  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...      0  \
1  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...      0   
2  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...      0   
3  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...      0   
4  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...      1   

   abstract_contains_outperform  most_recent_reference_year   
0                             0                           6  \
1                             0                           7   
2                             0                           7   
3                             0                           7   
4                             0                           6   

   title_contains_neural  num_authors  num_