In [None]:
# Attach Google Drive to the Colab filesystem; later cells read the dataset from
# it and write the saved model/tokenizer files to it
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Install transformers if not already installed
!pip install transformers==4.25.1


Collecting transformers==4.25.1
  Downloading transformers-4.25.1-py3-none-any.whl.metadata (93 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/93.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.9/93.9 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.25.1)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m54.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m56.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: to

In [None]:
# Import necessary libraries
import json
import torch
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from tqdm.auto import tqdm
import os

In [None]:
# Explicitly set TOKENIZERS_PARALLELISM before any tokenizer is created.
# NOTE(review): the stated goal is to quiet the tokenizers parallelism logging;
# confirm that "true" (parallelism left on) rather than "false" is the value wanted.
os.environ.update({"TOKENIZERS_PARALLELISM": "true"})

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Central place for the dataset path and model name used by the notebook
class CFG:
    """Notebook-wide settings, read as class attributes (e.g. ``CFG.TRANS_MODEL``)."""

    # Identifier passed to `from_pretrained` for the translation model and tokenizer
    TRANS_MODEL = 'facebook/nllb-200-distilled-600M'

    # CSV file with the Bengali hate-speech data on the mounted Drive.
    # Update this path as needed; note the file name contains a space before '.csv'.
    INPUT = '/content/drive/MyDrive/dataset/Bengali hate speech(1) .csv'

In [None]:
# Load the dataset from the path configured in CFG, so that changing CFG.INPUT
# is the only edit needed to point the notebook at a different file
df = pd.read_csv(CFG.INPUT)
# Preview the first rows to check that the columns were parsed as expected
display(df.head())

Unnamed: 0,sentence,hate,category
0,যত্তসব পাপন শালার ফাজলামী!!!!!,1,sports
1,পাপন শালা রে রিমান্ডে নেওয়া দরকার,1,sports
2,জিল্লুর রহমান স্যারের ছেলে এতো বড় জারজ হবে এটা...,1,sports
3,শালা লুচ্চা দেখতে পাঠার মত দেখা যায়,1,sports
4,তুই তো শালা গাজা খাইছচ।তুর মার হেডায় খেলবে সাকিব,1,sports


In [None]:
# Load the tokenizer and the seq2seq model named in CFG
tokenizer = AutoTokenizer.from_pretrained(CFG.TRANS_MODEL)
# Move the model to the selected device and put it in evaluation mode
model = AutoModelForSeq2SeqLM.from_pretrained(CFG.TRANS_MODEL).to(device).eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

  return torch.load(checkpoint_file, map_location="cpu")


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

In [None]:
# Persist the model weights and the tokenizer files to Drive (optional)
weights_path = '/content/drive/MyDrive/nllb-200-distilled-600M.pth'
tokenizer_dir = '/content/drive/MyDrive/tokenizer/'

# Only the state dict (parameter tensors) is written, not the full model object
torch.save(model.state_dict(), weights_path)
# Kept as the last expression so the paths of the written files show as cell output
tokenizer.save_pretrained(tokenizer_dir)

('/content/drive/MyDrive/tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/tokenizer/sentencepiece.bpe.model',
 '/content/drive/MyDrive/tokenizer/added_tokens.json',
 '/content/drive/MyDrive/tokenizer/tokenizer.json')

In [None]:
# Define the translation function
def translate_by_nllb(text, tokenizer, model, device,
                      src_lang="ben_Beng", tgt_lang="eng_Latn", max_length=64):
    """Translate one piece of text with an NLLB model and return the decoded string.

    Args:
        text: The source text to translate.
        tokenizer: Tokenizer for the model. It must be callable, expose
            ``lang_code_to_id`` and ``batch_decode``, and accept a ``src_lang``
            attribute.
        model: Seq2seq model exposing ``generate``; expected to already be on
            ``device``.
        device: Torch device the tokenized inputs are moved to.
        src_lang: NLLB language code of ``text``. Defaults to Bengali
            (``"ben_Beng"``), the language of the dataset used in this notebook.
            Pass ``None`` to leave the tokenizer's current source language
            untouched.
        tgt_lang: NLLB language code to translate into (default English).
        max_length: Upper bound passed to ``model.generate`` for the output.

    Returns:
        The first (and only) decoded translation, with special tokens removed.

    Raises:
        KeyError: If ``tgt_lang`` is not a key of ``tokenizer.lang_code_to_id``.

    Note:
        When ``src_lang`` is not ``None`` this sets ``tokenizer.src_lang``, so
        the tokenizer keeps that source language after the call.
    """
    # Without this the tokenizer tags the input with its default source language
    # (English for the Hugging Face NLLB tokenizer) even though the text is Bengali.
    if src_lang is not None:
        tokenizer.src_lang = src_lang

    # Tokenize the text and move every input tensor to the target device
    encoded = tokenizer(text, return_tensors="pt")
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Force the first generated token to the target-language code so the model
    # decodes into tgt_lang
    translated_tokens = model.generate(
        **encoded,
        forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang],
        max_length=max_length,
    )
    return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

In [None]:
# Translate a small slice of the dataset and collect the results in a new DataFrame
# holding the translated 'sentence' and the original 'hate' value (stored as 'labels')
START_ROW, END_ROW = 15, 30  # inclusive range of index labels to translate

# Select the rows up front instead of iterating over all of df and skipping most of it.
# `.loc` slicing is label-based and inclusive; this matches the previous
# `15 <= i <= 30` filter for the default integer index produced by read_csv.
rows_to_translate = df.loc[START_ROW:END_ROW]

# Collect plain records and build the DataFrame once; concatenating a one-row frame
# inside the loop re-copies the accumulated frame on every iteration
records = [
    {
        'sentence': translate_by_nllb(row['sentence'], tokenizer, model, device),
        'labels': row['hate'],
    }
    for _, row in tqdm(rows_to_translate.iterrows(), total=len(rows_to_translate))
]
# Passing `columns` keeps the expected schema even if no rows were selected
translated_df = pd.DataFrame(records, columns=['sentence', 'labels'])

# Display the first few rows of the new DataFrame with translated sentences
display(translated_df.head(9))

  0%|          | 0/30000 [00:00<?, ?it/s]

Unnamed: 0,sentence,labels
0,I'm not sure.,1
1,I don't want to get the shoes off the bet. I c...,1
2,The stomach of the son of Papan Maghir is full...,1
3,It is necessary to give the shoes to your sins,1
4,I would have fucked the guy with the jersey.,1
5,Yeshir Ali has no idea how to play,1
6,You son of a bitch.,1
7,The polarity of the Papan Magir is doing it.,1
8,The name of the company,1


In [None]:
# Show the size of the translated frame as (rows, columns)
translated_df.shape

(16, 2)

In [None]:
# Write the translated rows to 'hello.csv' (relative to the working directory),
# leaving the DataFrame index out of the file
translated_df.to_csv(path_or_buf='hello.csv', index=False)