In [2]:
from pdfminer.high_level import extract_text

text = extract_text("survey.pdf")
print(text)

Transformers for Tabular Data Representation:
A Survey of Models and Applications

Gilbert Badaro Mohammed Saeed Paolo Papotti
EURECOM, France

{gilbert.badaro,mohammed.saeed,paolo.papotti}@eurecom.fr

Abstract

In the last few years, the natural language pro-
cessing community has witnessed advances
in neural representations of free texts with
transformer-based language models (LMs).
Given the importance of knowledge available
in tabular data, recent research efforts extend
LMs by developing neural representations for
structured data. In this article, we present a
survey that analyzes these efforts. We first
abstract the different systems according to a
traditional machine learning pipeline in terms
of training data, input representation, model
training, and supported downstream tasks. For
each aspect, we characterize and compare the
proposed solutions. Finally, we discuss future
work directions.

1

Introduction

Many researchers are studying how to represent
tabular data with neural

In [1]:
import nltk
from rake_nltk import Rake
import string

nltk.download('punkt')  # Download the punkt tokenizer data if not already installed
nltk.download('stopwords')  # Download the punkt tokenizer data if not already installed

rake_nltk_var = Rake()

def get_sentences(paragraph):
    paragraph = paragraph.replace("\n", " ")
    sentences = nltk.sent_tokenize(paragraph)
    return sentences

def get_bullets_from_sents(sentences):
    bullets_text = ""
    for sentence in sentences:
        sentence = sentence.strip().lstrip(string.punctuation).strip()
        
        bullets_text += "* " + sentence  + "\n"
    return bullets_text

def get_keywords(sentence):
    rake_nltk_var.extract_keywords_from_text(sentence)
    keywords_extracted = rake_nltk_var.get_ranked_phrases()
    keywords_extracted.sort(key=lambda x: sentence.lower().find(x))
    return keywords_extracted
    
# Example usage:
paragraph = """ 
While the Transformer architecture has become the de-facto standard for natural
language processing tasks, its applications to computer vision remain limited. In
vision, attention is either applied in conjunction with convolutional networks, or
used to replace certain components of convolutional networks while keeping their
overall structure in place. We show that this reliance on CNNs is not necessary
and a pure transformer applied directly to sequences of image patches can perform
very well on image classification tasks. When pre-trained on large amounts of
data and transferred to multiple mid-sized or small image recognition benchmarks
(ImageNet, CIFAR-100, VTAB, etc.), Vision Transformer (ViT) attains excellent
results compared to state-of-the-art convolutional networks while requiring substantially fewer computational resources to train.
"""
# remove new lines from paragraph
paragraph = paragraph.replace("\n", " ")
sents = get_sentences(paragraph)

print(get_bullets_from_sents(sents))

* While the Transformer architecture has become the de-facto standard for natural language processing tasks, its applications to computer vision remain limited.
* In vision, attention is either applied in conjunction with convolutional networks, or used to replace certain components of convolutional networks while keeping their overall structure in place.
* We show that this reliance on CNNs is not necessary and a pure transformer applied directly to sequences of image patches can perform very well on image classification tasks.
* When pre-trained on large amounts of data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.
* Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring substantially fewer computational resources to train.



[nltk_data] Downloading package punkt to /Users/essam/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/essam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [40]:
import re

def get_sentences(paragraph):
    paragraph = paragraph.replace("\n", " ")
    sentences = nltk.sent_tokenize(paragraph)
    return sentences

def get_bullets_from_sents(sentences):
    bullets_text = ""
    for sentence in sentences:
        sentence = colorize(sentence)
        bullets_text += "- " + sentence + "\n"
    return bullets_text

def colorize(sentence, classes=["color-1", "color-2", "color-3", "color-4"]):
    sentence = sentence.strip().lstrip(string.punctuation).strip()
    keywords_extracted = get_keywords(sentence)
    print(keywords_extracted)
    for i, keyword in enumerate(keywords_extracted):
        if keyword.casefold() in sentence.casefold():
            pattern = re.compile(re.escape(keyword), flags=re.IGNORECASE)
            sentence = re.sub(pattern, lambda match: f"<span class='{classes[i % 4]}'>{match.group()}</span>", sentence)
    return sentence

def get_keywords(sentence):
    rake_nltk_var.extract_keywords_from_text(sentence)
    keywords_extracted = rake_nltk_var.get_ranked_phrases()
    keywords_extracted.sort(key=lambda x: sentence.lower().find(x))
    
   # Create a pattern to match any word that has a hyphen in it
    pattern = re.compile(r"\b\w+-\w+\b")
    words_with_hyphen = pattern.findall(sentence)
    for i, keyword in enumerate(keywords_extracted):
        for word in words_with_hyphen:
            for single_keyword in keyword.split(" "):
                if single_keyword in word:
                    keywords_extracted[i] = word
    
    return list(set(keywords_extracted))


sents = get_sentences(paragraph)

print(get_bullets_from_sents(sents))

['natural language processing tasks', 'computer vision remain limited', 'de-facto', 'transformer architecture', 'applications', 'become']
['vision', 'convolutional networks', 'either applied', 'keeping', 'attention', 'used', 'overall structure', 'replace certain components', 'place', 'conjunction']
['pure transformer applied directly', 'image patches', 'well', 'image classification tasks', 'cnns', 'perform', 'sequences', 'necessary', 'show', 'reliance']
['transferred', 'imagenet', 'data', 'large amounts', 'small image recognition benchmarks', 'pre-trained', 'etc', 'mid-sized', 'vtab', 'cifar', 'CIFAR-100']
['attains excellent results compared', 'state-of', 'the-art', 'vit', 'train', 'vision transformer', 'requiring substantially fewer computational resources']
- While the <span class='color-4'>Transformer architecture</span> has <span class='color-2'>become</span> the <span class='color-3'>de-facto</span> standard for <span class='color-1'>natural language processing tasks</span>, its 

In [23]:
from torch.nn.functional import softmax
from transformers import BertForNextSentencePrediction, BertTokenizer


seq_B = 'We show that this reliance on CNNs is not necessary and a pure transformer applied directly to sequences of image patches can perform very well on image classification tasks.'
seq_A = 'In vision, attention is either applied in conjunction with convolutional networks, or used to replace certain components of convolutional networks while keeping their overall structure in place'

# load pretrained model and a pretrained tokenizer
model = BertForNextSentencePrediction.from_pretrained('bert-base-cased')
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# encode the two sequences. Particularly, make clear that they must be 
# encoded as "one" input to the model by using 'seq_B' as the 'text_pair'
encoded = tokenizer.encode_plus(seq_A, text_pair=seq_B, return_tensors='pt')
print(encoded)
# {'input_ids': tensor([[  101,   146,  1176, 18621,   106,   102,  2091,  1128,  1176,  1172, 136,   102]]),
#  'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]]),
#  'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
# NOTE how the token_type_ids are 0 for all tokens in seq_A and 1 for seq_B, 
# this way the model knows which token belongs to which sequence

# a model's output is a tuple, we only need the output tensor containing
# the relationships which is the first item in the tuple
seq_relationship_logits = model(**encoded)[0]

# we still need softmax to convert the logits into probabilities
# index 0: sequence B is a continuation of sequence A
# index 1: sequence B is a random sequence
probs = softmax(seq_relationship_logits, dim=1)

print(probs[0][0].item())

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForNextSentencePrediction: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'input_ids': tensor([[  101,  1130,  4152,   117,  2209,  1110,  1719,  3666,  1107,  9342,
          1114, 14255,  6005, 18404,  1348,  6379,   117,  1137,  1215,  1106,
          4971,  2218,  5644,  1104, 14255,  6005, 18404,  1348,  6379,  1229,
          3709,  1147,  2905,  2401,  1107,  1282,   102,  1284,  1437,  1115,
          1142, 24727,  1113, 13597,  1116,  1110,  1136,  3238,  1105,   170,
          5805, 11303,  1200,  3666,  2626,  1106, 10028,  1104,  3077, 14879,
          1169,  3870,  1304,  1218,  1113,  3077,  5393,  8249,   119,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [11]:
original_string = "This is a \ sample \ string \n with \ backslashes."

# Remove all backslashes
modified_string = original_string.replace("\\", "")

print(modified_string)


This is a  sample  string 
 with  backslashes.
