In [None]:
#!pip install bertopic
#!pip install -U sentence-transformers

In [None]:
from datasets import load_dataset

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP

from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

# Load the "hate" configuration of the "tweet_eval" dataset. The repr displayed
# further down shows train / test / validation splits with 'text' and 'label' features.
dataset = load_dataset("tweet_eval", "hate")

### Evaluating the dataset (5 points)

In [3]:
# Bare expression: the cell output is the repr of `dataset` (splits, features, row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2970
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1000
    })
})

The dataset is split into 3 subsets:<br>
train 9000<br>
validation 1000<br> 
test 2970<br>

In [6]:
# For each split, report sum(labels) / len(labels), i.e. the mean of the label column.
# Each label column is read once and reused for both the sum and the length.
train_labels = dataset['train']['label']
print(f"hate entires in the train set is {sum(train_labels)/len(train_labels)}")

validation_labels = dataset['validation']['label']
print(f"Hate entires in the validation set is {sum(validation_labels)/len(validation_labels)}")

test_labels = dataset['test']['label']
print(f"Hate entires in the test set is {sum(test_labels)/len(test_labels)}")

hate entires in the train set is 0.42033333333333334
Hate entires in the validation set is 0.427
Hate entires in the test set is 0.42154882154882156


In all three splits, the proportion of hateful entries is roughly the same (about 42%).<br>

In [None]:
# Topic model fitted on the text of every training entry.
# UMAP gets a fixed random_state so the reduction step is seeded.
umap_model = UMAP(random_state=42)
model = BERTopic(
    embedding_model="all-MiniLM-L6-v2",
    n_gram_range=(2, 5),
    umap_model=umap_model,
)

train_texts = dataset['train']['text']
topics, probs = model.fit_transform(train_texts)

# Final expression of the cell: its return value becomes the cell output.
model.visualize_topics()

In [None]:
# Topic model fitted only on the training entries whose label is 0
# (presumably 'not-hate', going by the label mapping printed later in this notebook).
umap_model = UMAP(random_state=42)
model = BERTopic(embedding_model="all-MiniLM-L6-v2", n_gram_range=(2, 5), umap_model=umap_model)

# Iterating over the split yields one record per row; keep the text of label-0 rows.
non_hate_texts = [row['text'] for row in dataset['train'] if row['label'] == 0]
topics, probs = model.fit_transform(non_hate_texts)

# Only the last expression of a cell is rendered automatically, so the first
# figure has to be displayed explicitly or its return value is silently dropped.
# `display` is provided by the IPython kernel that runs the notebook.
display(model.visualize_topics())
model.visualize_barchart()

In [122]:
from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification, AutoTokenizer
import csv
import urllib.request

# Load the tokenizer, the label names and the classification model for the task.
task = 'hate'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
print('downloaded')

# Fetch the label mapping file and keep the second tab-separated field of every
# row that has at least two fields (blank lines are skipped by the length check).
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
# `html` is already a fully materialised list of lines, so parsing does not need
# the HTTP response to stay open.
csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]
print(labels)

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)


downloaded
['not-hate', 'hate']


### Evaluate a model (5 points)

In [145]:
# Load the tokenizer, the label names and the classification model, then store
# a local copy of the model and tokenizer.
task = 'hate'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Keep the second tab-separated field of every mapping row that has at least two fields.
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# save_pretrained(MODEL) writes into a local directory whose path equals the hub
# id. from_pretrained(MODEL) prefers an existing local directory over the hub,
# so the tokenizer must be saved into the same directory; otherwise the next
# run finds a directory without tokenizer files and fails to load the tokenizer.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

def pred(model, tokenizer, inputs):
    """Predict a class index for every text in ``inputs['text']``.

    Args:
        model: callable sequence-classification model; called with the encoded
            input as keyword arguments, its first output element must hold the
            logits with the batch dimension first.
        tokenizer: callable that encodes one text into a mapping of PyTorch
            tensors (called with ``return_tensors='pt'`` and ``truncation=True``).
        inputs: mapping-like object whose ``'text'`` entry is an iterable of strings.

    Returns:
        list[int]: index of the highest-scoring class for each text, in input order.
    """
    import torch  # local import: only needed to switch off gradient tracking

    preds = []
    # Inference only: no autograd graph is needed, which saves memory and time.
    with torch.no_grad():
        for text in inputs['text']:
            # truncation=True keeps over-long texts within the model's maximum
            # input length instead of failing inside the model.
            encoded_input = tokenizer(text, return_tensors='pt', truncation=True)
            output = model(**encoded_input)
            scores = output[0][0].detach().numpy()
            # softmax is monotonic, so the argmax of the raw scores is the same
            # class as the argmax of the probabilities.
            preds.append(int(np.argmax(scores)))

    return preds
# Predict every entry of the test split, then score the predictions against the
# split's 'label' column.
preds = pred(model, tokenizer, dataset['test'])

from sklearn.metrics import f1_score, precision_score, recall_score

# Read the gold labels once and reuse them for the three metrics.
y_true = dataset['test']['label']
print(f"F1 score : {f1_score(y_true, preds)}")
print(f"precision score : {precision_score(y_true, preds)}")
print(f"recall score : {recall_score(y_true, preds)}")


F1 score : 0.6538143762049022
precision score : 0.4989491382934006
recall score : 0.9480830670926518


### Theoretical questions (16 points)

#### (2 point) What is the purpose of subword tokenization used by transformer models?
#### Part of the answer is in the first part of the course (lesson 2).
#### What is the effect on the vocabulary size?
#### How does it impact out-of-vocabulary words (words which are not in the training data, but appear in the test data, or production environment)?

The model is more comprehensive because it can directly handle some morphological features of words, such as plural or feminine/masculine forms, etc.

#### (2 point) When building an encoder-decoder model using an RNN, what is the purpose of adding attention?
#### What problem are we trying to solve?
#### How does attention solve the problem?

One limitation of a classical RNN such as a seq2seq approach is that the RNN tries to remember the entire input sequence via a single hidden unit before translating it. Compressing all the information into a single hidden unit can result in information loss, especially for long sequences. Thus, similar to the way humans translate sentences, it may be beneficial to have access to the entire input sequence at each time step. 

Unlike an ordinary RNN, an attention mechanism allows the RNN to access all input elements at each given time step. However, having access to all elements of the input sequence at each time step can be overwhelming. Thus, to help the RNN focus on the most relevant elements of the input sequence, the attention mechanism assigns different attention weights to each input element. 

####  (2 point) In a transformer model what is the multihead attention used for?
#### What are we trying to achieve with self-attention?
#### Why do we use multiple heads instead of one?

Intuitively, multiple attention heads allow us to attend differently to certain parts of the sequence (e.g., long-term versus short-term dependencies). This allows us to reach a higher level of understanding 

####  (2 point) In a transformer model, what is the purpose of positional embedding?
#### What would be the problem if we didn't use it?

Embedding is a way to represent the meaning and importance of a word in a sentence in a vector space in order to group and match words of similar or common meaning. Positional embedding does this while also encoding the position of the word in the sentence, which adds more meaning to it. Without it, self-attention would have no information about word order and would treat the sentence as an unordered set of words.

#### (2 point) What is the purpose of benchmarks?
#### And are they reliable? Why?

Benchmarks allow us to compare different models and innovations and to show the effect of technological changes. Their reliability rests on the results of all the other models that have already been evaluated on them, so yes.

#### (4 points) What are the differences between BERT and GPT?
#### What kind of transformer-based model are they?
#### How are they pretrained?
#### How are they fine-tuned?

They are fundamentally different in that BERT has just the encoder blocks from the transformer, whilst GPT has just the decoder blocks from the transformer. 

BERT was pretrained on two tasks: language modeling (15% of tokens were masked and BERT was trained to predict them from context) and next sentence prediction (BERT was trained to predict if a chosen next sentence was probable or not given the first sentence). 

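GPT uses unsupervised pre-training: it is trained as a left-to-right language model that predicts the next token from the preceding ones.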

BERT is fine-tuned by simply adding one additional layer after the final BERT layer and training the entire network for just a few epochs.

Fine-tuning in GPT-3 is the process of adjusting the parameters of a pre-trained model to better suit a specific task. This can be done by providing GPT-3 with a data set that is tailored to the task at hand, or by manually adjusting the parameters of the model itself.

#### (2 points) How are zero-shot and few-shots learning different from fine-tuning?
#### How do fine-tuning, zero-shot, and few-shot learning affect the model's weights?

Zero-shot and few-shot learning let the model pick up the features of a task from a small dataset specific to that task, whereas fine-tuning directly modifies the model's weights.

They work by forcing the model to learn on small datasets created to learn specific features.

#### (2 point) In a few paragraphs, explain how the triplet loss is used to train a bi-encoder model for semantic similarity?

#### (2 point) What is the purpose of using an Approximate Nearest Neighbour method to speed up search?
#### What does it really reduce?