In [None]:
pip install transformers



# Translation Model

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch
model_name = "facebook/nllb-200-distilled-600M"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

if torch.cuda.is_available():
    model.to("cuda")

translator = pipeline("translation", model=model, tokenizer=tokenizer)

# You need to set source and target language codes manually
text =  """New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York.
A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband.
Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other.
In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage.
Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the
2010 marriage license application, according to court documents"""
#tokenizer.src_lang = "eng_Latn"
encoded = tokenizer(text, return_tensors="pt")
# Move the input tensors to the same device as the model
if torch.cuda.is_available():
    encoded = encoded.to("cuda")

output = model.generate(**encoded,forced_bos_token_id=tokenizer.convert_tokens_to_ids("hin_Deva"))
output = tokenizer.batch_decode(output, skip_special_tokens=True)
output


Device set to use cuda:0


['न्यूयॉर्क (CNN) जब लीना बैरींटस 23 साल की थी, तो उन्होंने न्यूयॉर्क के वेस्टचेस्टर काउंटी में शादी की। एक साल बाद, उन्होंने वेस्टचेस्टर काउंटी में फिर से शादी की, लेकिन एक अलग आदमी से और अपने पहले पति से तलाक के बिना। केवल 18 दिनों बाद उस शादी के बाद, उन्होंने फिर से शादी की। फिर, बैरींटस ने पांच बार और घोषणा की "मैं करता हूं", कभी-कभी केवल दो सप्ताह के भीतर। 2010 में, उन्होंने एक बार फिर से शादी की, इस बार ब्रोंक्स में। विवाह लाइसेंस के लिए एक आवेदन में, उन्होंने कहा कि यह उनकी "पहली और एकमात्र" शादी थी। बैरींटस, अब 39, को "पहली डिग्री में दाखिल करने के लिए एक झूठा उपकरण की पेशकश करने" के दो आपराधिक आरोपों का सामना करना पड़ रहा है, "अदालत के दस्तावेजों के अनुसार']

# Summarization Model

In [None]:
from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

ARTICLE = """ New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York.
A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband.
Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other.
In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage.
Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the
2010 marriage license application, according to court documents.
"""
output = summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False)

Device set to use cuda:0


In [None]:
output

[{'summary_text': 'Liana Barrientos has been married five times, sometimes within two weeks of each other. She is facing two criminal counts of "offering a false instrument for filing in the first degree"'}]

# FineTuning

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [4]:
from datasets import load_dataset
dataset = load_dataset('imdb')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [6]:
dataset['train'].features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['neg', 'pos'], id=None)}

In [7]:
label_id = dataset['train']['label'][0]
print(label_id)
label_name = dataset['train'].features['label'].int2str(label_id)
print(label_name)

0
neg


In [8]:
import pandas as pd

Parameter	Description
# pretrained_model_name_or_path	The name of the model (e.g., "bert-base-uncased") or a local path to a tokenizer directory.
# cache_dir	Optional path to a directory where the tokenizer will be cached.
# use_fast	Boolean (default: True). Whether to load the fast version (written in Rust). It’s much faster and recommended.
# tokenizer_type	Used if loading from a custom tokenizer directory.
# revision	Specific model version or branch to load (useful for version control on Hugging Face Hub).
# trust_remote_code	If you're loading a tokenizer from an external repo that uses custom code, set this to True. Be cautious — only use this with trusted sources.
# local_files_only	If set to True, will only load the tokenizer from local files and not try to download from the internet.
# force_download	If True, forces re-download of the tokenizer files even if they are cached.
# proxies	A dictionary of proxy servers to use for downloading files.
# resume_download	If True, resumes downloading in case of an interrupted download.
# token	Optional authentication token if the model is private.

In [9]:
from transformers import AutoTokenizer
model_checkpoint = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Parameter	Type	Description
# text or text_pair	str or List[str]	Input sentence(s). Can be a string or list of strings.
# truncation	bool or str	Whether to truncate inputs longer than max_length. Use True or 'longest_first'.
# padding	bool or str	If True, pad to the longest sequence in the batch. If 'max_length', pad to max_length.
# max_length	int	Max sequence length. Required when padding='max_length'.
# return_tensors	"pt" / "tf" / "np"	Return PyTorch, TensorFlow, or NumPy tensors instead of Python lists.
# return_attention_mask	bool	Whether to return the attention mask. Default is True.
# return_token_type_ids	bool	Return segment IDs (used in models like BERT).
# add_special_tokens	bool	Whether to add [CLS], [SEP], etc. Default: True.

In [10]:
def tokenizer_fn(data):
  return tokenizer(data['text'],truncation=True,padding="max_length")
tokenized_data = dataset.map(tokenizer_fn,batched=True, remove_columns=["text"])

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [11]:
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 50000
    })
})

In [51]:
tonized_dataset = tokenized_data.remove_columns(['text'])


In [13]:
tokenized_data.set_format('torch')

In [14]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint,num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Training

In [15]:
from transformers import TrainingArguments,Trainer

In [16]:
training_args = TrainingArguments(
                   output_dir='./results',
                   eval_strategy="epoch",
                   save_strategy="epoch",
                   learning_rate=2e-5,
                   per_device_train_batch_size=5,
                   per_device_eval_batch_size=5,
                   num_train_epochs=3,
                   weight_decay=0.01,

)

In [17]:
import wandb
wandb.init(mode="offline")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [19]:
trainer = Trainer(model= model,
                  args=training_args,
                  train_dataset=tokenized_data['train'].select(range(50)),
                  eval_dataset=tokenized_data['test'].select(range(50)))
trainer.train()




Epoch,Training Loss,Validation Loss
1,No log,0.175896
2,No log,0.063756
3,No log,0.04453


TrainOutput(global_step=30, training_loss=0.19232433636983234, metrics={'train_runtime': 704.5175, 'train_samples_per_second': 0.213, 'train_steps_per_second': 0.043, 'total_flos': 19870109798400.0, 'train_loss': 0.19232433636983234, 'epoch': 3.0})

# Saving the model and predicting

In [20]:
trainer.save_model("fine_tuned_model")
tokenizer.save_pretrained("fine_tuned_model")

('fine_tuned_model/tokenizer_config.json',
 'fine_tuned_model/special_tokens_map.json',
 'fine_tuned_model/vocab.txt',
 'fine_tuned_model/added_tokens.json',
 'fine_tuned_model/tokenizer.json')

In [21]:
model_path = r"/content/fine_tuned_model"
token_path = r"/content/fine_tuned_model"
token = AutoTokenizer.from_pretrained(token_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

In [22]:
model.eval()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [61]:
text = dataset['unsupervised']['text'][56]


In [53]:
text

'I love how I can relate to the main characters. They\'re all just screwed up enough in the head that I feel like I\'m watching my own life on the screen(except my life isn\'t quite that exciting.) I don\'t know anybody who couldn\'t relate to this movie, or find it hilarious! They did a great job on this one. I wanna see more movies like this one. It makes me feel like I\'m not as far off as I thought. The main characters put it all on the line. They\'re like open books. The producers of this one did a great job putting it all on the line for the audience to see. I haven\'t seen a movie like this since "Kids" and "Clerks". Great job guys!'

In [62]:
inputs = token(text, return_tensors="pt", truncation=True, padding="max_length")

In [63]:
inputs.keys()

dict_keys(['input_ids', 'attention_mask'])

In [83]:
pip install torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [64]:
model.config.id2label

{0: 'Negative', 1: 'Positive'}

In [65]:
for key, value in model.config.id2label.items():
  if key == 0:
    model.config.id2label[key] = "Negative"
  else:
    model.config.id2label[key] = "Positive"

In [66]:
model.config.id2label

{0: 'Negative', 1: 'Positive'}

In [67]:
import torch.nn.functional as F
import numpy as np

outputs = model(**inputs)
print(outputs.logits.shape)

# Logits to probabilities
probs = F.softmax(outputs.logits, dim=1).squeeze().tolist()
print(probs)

torch.Size([1, 2])
[0.9582871198654175, 0.04171282425522804]


In [68]:
# Get predicted class
pred_class_id = np.argmax(probs)
# Map id to label (if available)
label_name = model.config.id2label[pred_class_id]
print(f"Prediction: {label_name}")

Prediction: Negative
