In [None]:
!pip install datasets

# HuggingFace Models Usage and Tokenizer explanation

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Device string for model/tensor placement; "cuda:0" names the first CUDA GPU,
# so this notebook assumes a GPU runtime.
DEVICE = "cuda:0"
# Checkpoint name for this section (presumably passed to the AutoTokenizer /
# AutoModelForCausalLM loaders imported above — the loading code is not shown).
MODEL = "gpt2"

?
?

In [None]:
?

### Attention mask

In [None]:
text = "The most famous city of Italy is "
?

# Print each stage of the tokenizer/model round trip.
# NOTE(review): `tokens`, `encoded_input`, `encoded_output` and `output` are not
# assigned in the visible part of this cell — presumably the placeholder line
# above creates them; confirm before running.
print("tokens", tokens)
print("encoded_input", encoded_input)
print("encoded_output", encoded_output)
print("output: ", output)

### Pipeline usage

In [None]:
?

# Sentiment Analysis 

In [None]:
from datasets import load_dataset

?

In [None]:
# Rebinds the notebook-wide MODEL name to the checkpoint id used for the
# sentiment-analysis section.
MODEL = 'google/flan-t5-large'

In [None]:
from datasets import load_dataset
from transformers import pipeline

# Twitter sentiment data. Later cells index these objects with ["test"] and
# ["train"], so each call is expected to return a mapping of splits.
# NOTE(review): the second positional argument of load_dataset is the
# configuration name, not the split — TODO confirm "test"/"train" are intended
# here; if both calls return every split, one load would be enough.
test_dataset = load_dataset("carblacac/twitter-sentiment-analysis", "test")
train_dataset = load_dataset("carblacac/twitter-sentiment-analysis", "train")

In [None]:
?

In [None]:
import random
def build_example(elem):
    """Render one labelled sample as a two-line few-shot demonstration.

    `elem` must provide 'text' and 'feeling'. 'feeling' is used as an index
    into ["NEGATIVE", "POSITIVE"], so 0 maps to NEGATIVE and 1 to POSITIVE.
    The result is a "TEXT: ..." line followed by a "SENTIMENT: ..." line, each
    terminated by a newline.
    """
    sentiment_names = ["NEGATIVE", "POSITIVE"]
    text_line = f"TEXT: {elem['text']}"
    sentiment_line = "SENTIMENT: " + sentiment_names[elem['feeling']]
    return text_line + "\n" + sentiment_line + "\n"

def get_build_prompt(dataset, shots=3):
    """Create a prompt builder that prepends `shots` labelled demonstrations.

    The demonstrations are dataset["train"][0] .. dataset["train"][shots - 1],
    each rendered with build_example and followed by an empty line. They are
    re-read from `dataset` on every call of the returned function.

    Returns:
        A function taking the sentence to classify and returning the full
        prompt, which ends with an open "SENTIMENT:" line.
    """
    def build_prompt(sent):
        demos = [build_example(dataset["train"][idx]) + "\n" for idx in range(shots)]
        query = f"TEXT: {sent}\n" + "SENTIMENT:"
        return "".join(demos) + query

    return build_prompt

# Build a 3-shot prompt function from the training data and show the prompt
# it produces for a sample sentence.
prompt = get_build_prompt(train_dataset, shots=3)
print(prompt("This is my sentence"))

In [None]:
?

In [None]:
import torch
from tqdm import tqdm
from transformers import pipeline

?

In [None]:
# NOTE(review): `eval` is presumably an evaluation helper defined in the
# placeholder cell above, shadowing the built-in. The built-in eval() called
# with no arguments raises TypeError — TODO confirm the helper exists and
# consider giving it a non-builtin name.
results = eval()

In [None]:
?

## Feature extraction

In [None]:
?

In [None]:
import numpy as np

def get_feature_extractor(model):
    """Build a sentence-level feature extractor for the given model key.

    Args:
        model: Key of the backbone. Only "bert" (bert-base-uncased, run on
            "cuda") is supported.

    Returns:
        A `(feature_extractor, feature_size)` tuple. `feature_extractor` takes
        a list of strings and returns a NumPy array with one mean-pooled
        vector per string; `feature_size` is the declared vector length (768).

    Raises:
        ValueError: If `model` is not a supported key. Without this, the
            function would fall through and return None, and callers that
            unpack the result would fail with an unrelated error.
    """
    if model == "bert":
        pipe = pipeline("feature-extraction", framework="pt", model="bert-base-uncased", device="cuda")

        def feature_extractor(text):
            # One pipeline entry per input text; entry[0] is averaged over
            # axis 0 (assumes a [1, tokens, hidden] nesting — TODO confirm).
            return np.array([np.array(full[0]).mean(axis=0) for full in pipe(text)])

        return feature_extractor, 768

    raise ValueError(f"Unsupported feature extractor model: {model!r}")

In [None]:
from datasets import load_dataset

# Key understood by get_feature_extractor (not a Hub id).
MODEL = "bert"

# `extractor` maps a list of texts to one feature vector per text;
# `feature_size` is the vector length reported by the factory.
extractor, feature_size = get_feature_extractor(MODEL)
# Same two load_dataset calls as in the earlier sentiment cells; later cells
# read test_dataset['test'] and train_dataset['train'].
test_dataset = load_dataset("carblacac/twitter-sentiment-analysis", "test")
train_dataset = load_dataset("carblacac/twitter-sentiment-analysis", "train")

In [None]:
from tqdm import tqdm

from itertools import islice


def collect_features(split, limit):
    """Embed the first `limit` samples of `split` with the global `extractor`.

    Args:
        split: Iterable of samples exposing 'text' and 'feeling'.
        limit: Maximum number of samples to read.

    Returns:
        `(features, classes)`: a NumPy array with one feature vector per
        sample, and the matching list of 'feeling' labels.
    """
    features, classes = [], []
    # islice stops after exactly `limit` items. A check of `i >= limit` placed
    # after the append would let one extra sample through (limit + 1 rows).
    for elem in tqdm(islice(split, limit), total=limit):
        classes.append(elem['feeling'])
        features.append(extractor([elem['text']])[0])
    return np.array(features), classes


test_total = 100
test_features, test_classes = collect_features(test_dataset['test'], test_total)

train_total = 1000
train_features, train_classes = collect_features(train_dataset['train'], train_total)

print("train", train_features.shape)
print("test", test_features.shape)

In [None]:
from itertools import islice

# Number of test samples to embed for the t-SNE visualisation.
total = 1000

features = []
classes = []
# islice caps the loop at exactly `total` samples; breaking on `i >= total`
# after the append would collect total + 1 of them.
for elem in tqdm(islice(test_dataset['test'], total), total=total):
    classes.append(elem['feeling'])
    features.append(extractor([elem['text']])[0])

features = np.array(features)
print(features.shape)

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

# Fix the seed so the 2-D embedding (and therefore the figure) is the same on
# every Restart & Run All; t-SNE is stochastic otherwise.
tsne = TSNE(n_components=2, random_state=42)
features_2d = tsne.fit_transform(features)

# One point per sample, coloured by its 'feeling' class.
plt.title(MODEL)
plt.scatter(features_2d[:, 0], features_2d[:, 1], c=classes, s=5)
plt.xlabel("t-SNE dimension 1")
plt.ylabel("t-SNE dimension 2")
plt.show()

In [None]:
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier


# Candidate classifiers trained on the extracted features, keyed by the short
# name used in the results table.
classifiers = {
    "lr": LogisticRegression(max_iter=10000),
    "mlp": MLPClassifier(max_iter=10000),
    "knn": KNeighborsClassifier(n_neighbors=5),
}

# name -> classification_report dict for the held-out test features.
reports = {}

for name, clf in classifiers.items():
    # fit() returns the fitted estimator, so prediction can be chained onto it.
    predictions = clf.fit(train_features, train_classes).predict(test_features)
    reports[name] = classification_report(test_classes, predictions, output_dict=True)

In [None]:
# Tab-separated summary: one row per classifier with its accuracy and
# weighted-average F1, both rounded to two decimals.
print("name\tacc\tf1")
for name, report in reports.items():
    print(f"{name}\t{report['accuracy']:.2f}\t{report['weighted avg']['f1-score']:.2f}")

# Sentence Transformers

In [None]:
!pip install sentence_transformers

In [None]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model for this section. NOTE: rebinds the notebook-global
# name `model`.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Two near-paraphrases and one unrelated sentence to compare.
sentences = [
    "I think dogs are the best pets.",
    "I think cats are the best pets.",
    "Deep learning is a subfield of machine learning.",
]

?

In [None]:
?
?

# Fill Mask

In [None]:
?

# Generate Keywords

In [None]:
?


In [None]:
# Passage to extract keywords from.
llm_wikipedia = "A large language model (LLM) is a type of artificial intelligence (AI) algorithm that uses deep learning techniques and massively large data sets to understand, summarize, generate and predict new content. The term generative AI also is closely connected with LLMs, which are, in fact, a type of generative AI that has been specifically architected to help generate text-based content."

# Instruction + passage + open "keywords: " cue, encoded as PyTorch tensors.
# NOTE(review): `tokenizer` is not defined in this cell — presumably the
# placeholder cell above creates it; confirm before running.
inputs = tokenizer(f"Generate keywords for this text:\n{llm_wikipedia}\nkeywords: ", return_tensors="pt")
?

# Few Shot Prompting

In [None]:
# Short label for this section. The loader functions defined below hard-code
# the "google/flan-t5-large" checkpoint and do not read this value.
MODEL = "flan-t5"

In [None]:
from datasets import load_dataset

# Same two load_dataset calls as in the earlier sentiment cells; the few-shot
# code below reads train_dataset["train"] for demonstrations and
# test_dataset['test'] for evaluation.
test_dataset = load_dataset("carblacac/twitter-sentiment-analysis", "test")
train_dataset = load_dataset("carblacac/twitter-sentiment-analysis", "train")


def label_to_text(label):
    """Convert a numeric class id into its prompt word.

    `label` is coerced with int(); 0 gives "NEGATIVE" and 1 gives "POSITIVE".

    Raises:
        Exception: If the coerced value is neither 0 nor 1.
    """
    names = {0: "NEGATIVE", 1: "POSITIVE"}
    key = int(label)
    if key not in names:
        raise Exception("Unknown label")
    return names[key]

def text_to_label(text):
    """Map generated text to a class id (0 = NEGATIVE, 1 = POSITIVE).

    Matching is case-insensitive, so outputs such as "positive" or "Negative"
    are recognised instead of falling through to the error branch. "NEGATIVE"
    is checked first, so text containing both words maps to 0.

    Args:
        text: Decoded model output.

    Returns:
        0 or 1. Unrecognised text is reported on stdout and mapped to 0.
    """
    normalized = text.upper()
    if "NEGATIVE" in normalized:
        return 0
    if "POSITIVE" in normalized:
        return 1
    # Best-effort fallback: keep the evaluation loop running, but make the
    # unparseable output visible.
    print(f"ERROR: unknown label {text}")
    return 0

In [None]:
def get_model(shots=3):
    """Load google/flan-t5-large on "cuda" and return a few-shot classifier.

    The returned callable takes a text, builds a prompt whose demonstrations
    are the first `shots` samples of train_dataset["train"], generates up to
    50 new tokens, and converts the decoded output with text_to_label.
    """
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

    checkpoint = "google/flan-t5-large"
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to("cuda")
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    def render_demo(sample):
        # One labelled demonstration: the text and its sentiment word.
        return f"text: {sample['text']}\nsentiment: {label_to_text(sample['feeling'])}"

    def inference(text):
        header = "Classify the sentiment of the following texts in one of these two categories: POSITIVE, NEGATIVE.\n\n"
        demos = "".join(render_demo(train_dataset["train"][idx]) + "\n" for idx in range(shots))
        prompt = header + demos + f"text: {text}\nsentiment:"

        encoded = tokenizer(prompt, return_tensors="pt").to("cuda")
        generated = model.generate(**encoded, max_new_tokens=50)
        decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
        return text_to_label(decoded[0])

    return inference

In [None]:
from tqdm import tqdm
from sklearn.metrics import classification_report

from itertools import islice

# shots -> classification_report dict for that number of demonstrations.
overall_results = dict()

for shots in [0, 5, 10]:
    print("Running shot", shots)
    # get_model accepts only `shots`; also passing MODEL positionally raises
    # TypeError ("got multiple values for argument 'shots'").
    model = get_model(shots=shots)

    total = 100
    targets = []
    predictions = []

    # islice yields exactly `total` samples; breaking on `i >= total` after
    # the append would evaluate total + 1 of them.
    for elem in tqdm(islice(test_dataset['test'], total), total=total):
        targets.append(elem['feeling'])
        predictions.append(model(elem['text']))

    report = classification_report(targets, predictions, output_dict=True)
    overall_results[shots] = report

In [None]:
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd

# One record per shot count: accuracy plus the macro-averaged metrics
# (the keys of report["macro avg"] are spread into the record).
dfs = []

for shot, report in overall_results.items():
    dfs.append(dict(
        shot=shot,
        accuracy=report["accuracy"],
        **report["macro avg"]
    ))

df = pd.DataFrame(dfs)
print(df)

# Label every curve and both axes so the figure can be read on its own.
plt.plot(df["shot"], df["precision"], c="k", label="precision")
plt.plot(df["shot"], df["recall"], c="r", label="recall")
plt.plot(df["shot"], df["f1-score"], c="b", label="f1-score")
plt.xlabel("shots")
plt.ylabel("macro-averaged score")
plt.legend()

plt.show()

In [None]:
?

# Translator

In [None]:
!pip install evaluate

In [None]:
from datasets import load_dataset

# Both subsets are slices of the opus_books "en-it" train split: 1%-2% for the
# few-shot examples and 2%-100% for validation. NOTE: rebinds `train_dataset`,
# which earlier sections use for the sentiment data.
train_dataset = load_dataset("opus_books", "en-it", split="train[1%:2%]")
val_dataset = load_dataset("opus_books", "en-it", split="train[2%:100%]")

In [None]:
# Show the summary representation of both subsets.
print(train_dataset)
print(val_dataset)

In [None]:
# Preview the first samples. The break sits after the print, so indices 0..5
# (six samples) are shown.
for i, elem in enumerate(train_dataset):
    print(elem)
    if i >= 5:
        break

In [None]:
def get_translator():
    """Load google/flan-t5-large on "cuda" and return an English-to-Italian translator.

    The returned callable takes an English text, builds a few-shot prompt
    from the first six samples of the global `train_dataset` (fewer if the
    dataset is shorter), generates up to 50 new tokens, and returns the
    decoded output string.
    """
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

    checkpoint = "google/flan-t5-large"
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to("cuda")
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    def translator(text):
        pieces = ["Translate this text from English into Italian.\n\n"]
        for position, sample in enumerate(train_dataset):
            pair = sample['translation']
            # Each demonstration is followed by an empty line.
            pieces.append(f"text: {pair['en']}\ntranslation: {pair['it']}\n" + "\n")
            if position == 5:
                break
        pieces.append(f"text: {text}\ntranslation:")
        prompt = "".join(pieces)

        encoded = tokenizer(prompt, return_tensors="pt").to("cuda")
        generated = model.generate(**encoded, max_new_tokens=50)
        decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
        return decoded[0]

    return translator


In [None]:
# get_translator() takes no parameters (the checkpoint is fixed inside it);
# passing MODEL raises TypeError.
translator = get_translator()
print(translator("Hello, how are you?"))

In [None]:
from tqdm import tqdm

# Number of validation samples to translate.
count = 100
predicted = []  # model outputs
actual = []     # reference Italian texts, same order as `predicted`

for i, elem in enumerate(tqdm(val_dataset, total=count)):
    # The limit is checked before any work, so exactly `count` samples are used.
    if i >= count:
        break

    text = elem["translation"]["en"]
    translation = elem["translation"]["it"]
    prediction = translator(text)
    predicted.append(prediction)
    actual.append(translation)


In [None]:
import pandas as pd

# Side-by-side table of reference and generated translations (displayed as the
# cell output).
pd.DataFrame(dict(actual=actual, predicted=predicted))

In [None]:
import evaluate

# Score the generated translations against the reference Italian texts with
# the "bleu" metric from the evaluate library.
bleu = evaluate.load("bleu")
# Predictions and references are paired by position (one reference each).
bleu.add_batch(predictions=predicted, references=actual)
# NOTE: rebinds `results`, which an earlier cell also assigns.
results = bleu.compute()
print(results)