In [13]:
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

import os

os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_PROJECT'] = 'advanced-rag'
os.environ['LANGCHAIN_API_KEY'] = os.getenv("LANGCHAIN_API_KEY")
os.environ['GROQ_API_KEY'] = os.getenv("GROQQ_API_KEY")


In [47]:
from langchain_core.output_parsers import StrOutputParser
from langchain_groq import ChatGroq
from langchain.prompts import ChatPromptTemplate
import pandas as pd
from tqdm import tqdm

# Compile the template and set up the pipeline once
template = """You are a helpful assistant that translates the comment passed in mix of english + Indian languages to their original language phrase. 
Following are the conditions:
If you find any English word just leave it as it is in English.
I don't need  any word by word explanation, just need output in the translated language script.
If you find any inappropriate or disrespectful word just output as "Profanity/Disrespectful word/words included"
Please give only the original language phrase as shown in the example. 
Eg: comment: "mere naam hai Roshan",  "मेरा नाम है रोशन"
comment: "nenu UST lo Panichestunna", "nenu UST లో పనిచేస్తున్నా"
{comment} 
Output"""

prompt = ChatPromptTemplate.from_template(template)
pipeline = (
    prompt
    | ChatGroq(temperature=0)
    | StrOutputParser()
    | (lambda x: x.split("\n"))  
)

# Define the translation function
def translation_to_source_script(comment):
    print(pipeline.invoke({"comment": comment}))
    return pipeline.invoke({"comment": comment})

# Load the comments and apply the translation function
comments = pd.read_excel("updated_comments.xlsx", sheet_name='Comments')
comments=comments.iloc[:10,:]
tqdm.pandas()
comments['translated_text'] = comments['Original_comment - "Text"'].apply(translation_to_source_script)


prompt = ChatPromptTemplate.from_template(template)
pipeline = (
    prompt
    | ChatGroq(temperature=0)
    | StrOutputParser()
    | (lambda x: x.split("\n")[0])  # Directly access the first line of the output
)

template = """You are a helpful assistant that extracts the comment passed which has mix of language scripts, You have to just identify and get the Indian script part from it.
Please give only the Indian language phrase as shown in the example. 
Eg: comment: "mere naam hai Roshan मेरा नाम है रोशन ", original language phrase: "मेरा नाम है रोशन"
comment: "nenu hero ga Panichestunna నేను హీరోగా పనిచేస్తున్నాను", original language phrase: "నేను హీరోగా పనిచేస్తున్నాను"
{comment} 
Output"""

comments['re-translated_text'] = comments['translated_text'].apply(translation_to_source_script)



['"Mileage bataani thi" (Hindi)']
['"Purana hi accha nigga" (Hindi and English)', '', "Note: I've included the profanity/disrespectful word in the translation, as per your instructions. However, I strongly advise against using such language, as it can be offensive and disrespectful to others."]
['"Daddy" का नाम "Pulsar 220" 😂😂😂', '', 'In this case, "Daddy" is being used as a term of endearment or admiration towards the Pulsar 220 motorcycle, so I have translated the rest of the sentence accordingly.']
['For the given comment "New one looks like ninja 400", I will leave the English words as they are and provide the rest in Hindi:', '', '"Naya ek jaisa hai ninja 400"']
["Sure, I'll do my best to translate the comments from a mix of English and Indian languages to their original language script. Here are some examples:", '', 'Comment: "mere naam hai Roshan"', 'Output: "मेरा नाम है रोशन"', '', 'Comment: "nenu UST lo Panichestunna"', 'Output: "నేను UST లో పనిచేస్తున్నా"', '', 'Comment: "my 

In [42]:
comments

Unnamed: 0,CID,"Original_comment - ""Text""",comment,models,sentiment,aspects,translated_text,re-translated_text
0,Ugxw3xwVOCGgtX_WCqN4AaABAg,Mileage bi btani thi,mileage bi tell to be,['No model mentioned'],neutral,{'Mileage'},"""मीलेज बतानी थी""","""मीलेज बतानी थी"" should be translated to ""मेरा..."
1,Ugy5CyM64xUhGNGZBNN4AaABAg,Old is gold nigga,old is gold nigga,['No model mentioned'],positive,(),"I cannot provide a translation for the term ""n...","Sure, I can help with that. Here's the transla..."
2,Ugxh01ZYZ75etrRPXix4AaABAg,Daddy name Pulsar 220😂😂😂,daddy name bajaj-pulsar,['bajaj-pulsar'],neutral,(),"Sure, I can help with that. The original langu...","""Daddy का नाम Pulsar 220😂😂😂"""
3,UgxOAJMxPG2s_u2p4FB4AaABAg,New one looks like ninja 400,new one looks like kawasaki-ninja,['kawasaki-ninja'],positive,{'Design'},"original language phrase: ""निन्जा 400 का नाम है""","""निन्जा 400 का नाम है"""
4,UgwXN3os9eUvfpS04Vt4AaABAg,Old,old,['No model mentioned'],neutral,(),"Sure, I'd be happy to help you translate comme...",Example 1:
5,Ugz0A-HC2Z15ktYp5-B4AaABAg,"These ""old is gold"" NPCs will always choose th...",these old is gold npcs will always choose the ...,['No model mentioned'],positive,(),"""ये बहुत पुराने हैं लेकिन ये हमेशा आपको बेचने ...","Original language phrase: ""ये बहुत पुराने हैं ..."
6,UgzRgB1WHok3gW68IKF4AaABAg,Old i dope new is doop😅,old i dope new is doop,['No model mentioned'],neutral,(),"""Old अब तक मujhe doped था नew अब doop हो गया है""","""पुराने अब तक मujhe doped था नew अब doop हो गय..."
7,UgxKn5fM5tdc8nV9Bs54AaABAg,The old Karizma has something the new one does...,the old hero-karizma has something the new one...,['hero-karizma'],neutral,(),The old Karizma has something the new one does...,"करिजमा की बुजुर्गी नए में कुछ नहीं है, एक आत्म..."
8,UgxRKzLzhj750uNpqsJ4AaABAg,"Jaso engine vs Indo-Euro engine, everyone know...",like engine vs indoeuro engine everyone knows ...,['No model mentioned'],positive,{'Engine'},"Original language phrase: ""जाने विद्यमान है कि...","Original language phrase: ""जाने विद्यमान है कि..."
9,Ugy5YOjzmXvg6PBrrYt4AaABAg,Abb bhai purani to le nhi skte jo ha abb yahi ...,abb bhai purani to le nhi skte jo ha abb yahi ...,['No model mentioned'],neutral,(),"Abb bhai purani ""old"" toys ""toys"" le ""take"" nh...","अब भाई पुरानी ""old"" तोयz ""toys"" ले ""take"" नहीं..."


In [16]:
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM

tokenizer = LlamaTokenizer.from_pretrained('sarvamai/OpenHathi-7B-Hi-v0.1-Base')
model = LlamaForCausalLM.from_pretrained('sarvamai/OpenHathi-7B-Hi-v0.1-Base', torch_dtype=torch.bfloat16)

prompt = "मैं एक अच्छा हाथी हूँ"
inputs = tokenizer(prompt, return_tensors="pt")

# Generate
generate_ids = model.generate(inputs.input_ids, max_length=30)
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

Downloading shards: 100%|██████████| 3/3 [01:42<00:00, 34.06s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00,  4.59it/s]


"मैं एक अच्छा हाथी हूँ।\n\nI'm a good elephant, I'm a good elephant,"

In [None]:
# # Install required libraries
# !pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
# !pip install -q datasets bitsandbytes einops
# !pip install -q wandb

# Import required libraries
from datasets import load_dataset, Dataset
from random import randrange
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, AutoPeftModelForCausalLM
from trl import SFTTrainer
from huggingface_hub import login
import wandb
from huggingface_hub import notebook_login


import torch
from transformers import LlamaTokenizer, LlamaForCausalLM

tokenizer = LlamaTokenizer.from_pretrained('sarvamai/OpenHathi-7B-Hi-v0.1-Base')
model = LlamaForCausalLM.from_pretrained('sarvamai/OpenHathi-7B-Hi-v0.1-Base', torch_dtype=torch.bfloat16)


# Define a function to extract and tokenize the input text and response
def extract_and_tokenize(row):
    input_text = row['text'].split('\\n')[0]
    response = row['text'].split('\\n')[1]
    
    input_ids = tokenizer.encode_plus(
        input_text,
        max_length=169,  # adjust the max length as needed
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )['input_ids'].flatten()
    
    response_ids = tokenizer.encode_plus(
        response,
        max_length=158,  # adjust the max length as needed
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )['input_ids'].flatten()
    
    return input_ids, response_ids

import pandas as pd
df=pd.read_csv("test_sft.csv")
# Apply the function to each row in the dataframe
df['input_ids'], df['response_ids'] = zip(*df.apply(extract_and_tokenize, axis=1))

df['input_ids'] = df['input_ids'].apply(lambda x: x.tolist())
df['response_ids'] = df['response_ids'].apply(lambda x: x.tolist())



# Load dataset

dataset = Dataset.from_pandas(df)  # assuming your dataset is stored in a Pandas dataframe df

# # Apply the function to each row in the dataset
# dataset = dataset.map(extract_and_tokenize)

# Set model and tokenizer
# model_name = "sarvamai/OpenHathi-7B-Hi-v0.1-Base"
# tokenizer = LlamaTokenizer.from_pretrained(model_name)
# model = LlamaForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)

# Set LoRA and BitsAndBytes configurations
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

# Log in to HF Hub
notebook_login()

# Log in to W&B
# wandb.login()

# # Set environment variable for W&B project
# wandb.init(
#     # set the wandb project where this run will be logged
#     project="my-awesome-project",

#     # track hyperparameters and run metadata
#     config={
#     "learning_rate": 0.02,
#     "architecture": "CNN",
#     "dataset": "CIFAR-100",
#     "epochs": 10,
#     }
# )
# %env WANDB_PROJECT=your_wandb_project_name  # replace with your W&B project name

# Set training arguments
trainingArgs = TrainingArguments(
    output_dir="finetuned_model",  # replace with your desired output dir
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    logging_steps=5,
    save_strategy="epoch",
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    group_by_length=False,
    lr_scheduler_type="cosine",
    disable_tqdm=True,
    #report_to="wandb",
    seed=42,
)

# Create the trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length=2048,
    tokenizer=tokenizer,
    packing=True,
    formatting_func=lambda x: x,  # no formatting function needed in this case
    args=trainingArgs,
)

# Train the model
trainer.train()


In [None]:
import torch

from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, AutoPeftModelForCausalLM

from trl import SFTTrainer

from huggingface_hub import login

import wandb

# Create a custom dataset class to handle our tokenized data
class AspectSentimentDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        input_ids = self.df.iloc[idx, 1]
        response_ids = self.df.iloc[idx, 2]

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'labels': torch.tensor(response_ids, dtype=torch.long)
        }

# Create a dataset instance and data loader
dataset = AspectSentimentDataset(df, tokenizer, max_len=169)
batch_size = 16
data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Load the pre-trained LlamaForCausalLM model

# Add LoRA layers to the model
lora_dim = 8
lora_module = LoRAModule(r=lora_dim)
model.transformer.output.LayerNorm = LoRALayer(lora_module)

# Add adapters to the model
adapter_dim = 64
adapter_module = AdapterModule(adapter_dim)
model.transformer.output.adapter = adapter_module

# Set the device (CPU)
device = torch.device('cpu')
model.to(device)

# Define the optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Train the model
for epoch in range(5):  # train for 5 epochs
    model.train()
    total_loss = 0
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, labels=labels)
        loss = loss_fn(outputs.logits.view(-1, 50264), labels.view(-1))

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss / len(data_loader)}')

    model.eval()


ModuleNotFoundError: No module named 'pylora'

In [40]:
(df['text'].apply(lambda x: (x.split('\\n'))[0])).apply(len).max()
#(df['text'].apply(lambda x: (x.split('\\n'))[1])).apply(len).max()

169

In [25]:
import pandas as pd
df=pd.read_csv("test_sft.csv")
df.text[0] 
df.columns

Index(['text'], dtype='object')

In [41]:



# Define a function to extract and tokenize the input text and response
def extract_and_tokenize(row):
    input_text = row['text'].split('\\n')[0]
    response = row['text'].split('\\n')[1]
    
    input_ids = tokenizer.encode_plus(
        input_text,
        max_length=169,  # adjust the max length as needed
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )['input_ids'].flatten()
    
    response_ids = tokenizer.encode_plus(
        response,
        max_length=158,  # adjust the max length as needed
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )['input_ids'].flatten()
    
    return input_ids, response_ids

# Apply the function to each row in the dataframe
df['input_ids'], df['response_ids'] = zip(*df.apply(extract_and_tokenize, axis=1))


In [None]:
import torch

# Create a custom dataset class to handle our tokenized data
class AspectSentimentDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        input_ids = self.df.iloc[idx, 1]
        response_ids = self.df.iloc[idx, 2]

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'labels': torch.tensor(response_ids, dtype=torch.long)
        }

# Create a dataset instance and data loader
dataset = AspectSentimentDataset(df, tokenizer, max_len=169)
batch_size = 16
data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)


# Set the device (GPU or CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define the optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Train the model
for epoch in range(5):  # train for 5 epochs
    model.train()
    total_loss = 0
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, labels=labels)
        loss = loss_fn(outputs.logits.view(-1, 50264), labels.view(-1))

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss / len(data_loader)}')

    model.eval()


In [43]:
import torch

from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, AutoPeftModelForCausalLM

from trl import SFTTrainer

from huggingface_hub import login

import wandb

# Create a custom dataset class to handle our tokenized data
class AspectSentimentDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        input_ids = self.df.iloc[idx, 1]
        response_ids = self.df.iloc[idx, 2]

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'labels': torch.tensor(response_ids, dtype=torch.long)
        }

# Create a dataset instance and data loader
dataset = AspectSentimentDataset(df, tokenizer, max_len=169)
batch_size = 16
data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Load the pre-trained LlamaForCausalLM model

# Add LoRA layers to the model
lora_dim = 8
lora_module = LoRAModule(r=lora_dim)
model.transformer.output.LayerNorm = LoRALayer(lora_module)

# Add adapters to the model
adapter_dim = 64
adapter_module = AdapterModule(adapter_dim)
model.transformer.output.adapter = adapter_module

# Set the device (CPU)
device = torch.device('cpu')
model.to(device)

# Define the optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Train the model
for epoch in range(5):  # train for 5 epochs
    model.train()
    total_loss = 0
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, labels=labels)
        loss = loss_fn(outputs.logits.view(-1, 50264), labels.view(-1))

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss / len(data_loader)}')

    model.eval()


ModuleNotFoundError: No module named 'pylora'

In [4]:

from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

model = AutoPeftModelForCausalLM.from_pretrained(
    "Telugu-LLM-Labs/Indic-gemma-2b-finetuned-sft-Navarasa"
)
hf_token= "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"
tokenizer = AutoTokenizer.from_pretrained("Telugu-LLM-Labs/Indic-gemma-2b-finetuned-sft-Navarasa-2.0",load_in_4bit = False,
    token = hf_token)

input_prompt = """
### Instruction:
{}

### Input:
{}

### Response:
{}"""

input_text = input_prompt.format(
        "Tranlsate following sentence to Hindi.", # instruction
        "This model is developed by Telugu LLM Labs", # input
        "", # output - leave this blank for generation!
    )

inputs = tokenizer([input_text], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 300, use_cache = True)
response = tokenizer.batch_decode(outputs)[0]


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/google/gemma-2b.
403 Client Error. (Request ID: Root=1-66c340be-5fa340a93bc174ce500717f6;2f7be0d1-8407-4f48-8565-5957bdf6d22b)

Cannot access gated repo for url https://huggingface.co/google/gemma-2b/resolve/unsloth/config.json.
Access to model google/gemma-2b is restricted and you are not in the authorized list. Visit https://huggingface.co/google/gemma-2b to ask for access.

In [5]:
PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0
from transformers import pipeline
pipe = pipeline(model='sarvamai/sarvam-2b-v0.5', device=0)
pipe('भारत के प्रथम प्रधानमंत्री', max_new_tokens=15, temperature=0.1, repetition_penalty=1.2)[0]['generated_text']
# 'भारत के प्रथम प्रधानमंत्री जवाहरलाल नेहरू थे।\n\n'

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

RuntimeError: MPS backend out of memory (MPS allocated: 8.90 GB, other allocations: 704.00 KB, max allowed: 9.07 GB). Tried to allocate 501.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).