In [1]:
import csv
import torch
import pandas as pd
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import AdamW, BertTokenizer, BertForSequenceClassification

In [2]:
from google.colab import files

uploaded = files.upload()

Saving english_telugu_data.txt to english_telugu_data (2).txt


In [3]:

# Specify input file path
input_file_path = 'english_telugu_data.txt'

# Specify output file path
output_file_path = 'teluguenglishseparate11.csv'

# Read data from the input file
with open(input_file_path, 'r', encoding='utf-8') as infile:
    telugudata = [line.strip().split('++++$++++') for line in infile]

# Write data to CSV with separate columns for Telugu and English
with open(output_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)

    # Write header
    csv_writer.writerow(['English Text', 'Telugu Text'])

    # Write data rows
    csv_writer.writerows(telugudata)



print(f'CSV file "{output_file_path}" created successfully.')

CSV file "teluguenglishseparate11.csv" created successfully.


In [4]:
!pip install transformers
!pip install sentencepiece



In [5]:
# Load the dataset
dataset = pd.read_csv('teluguenglishseparate11.csv')


dataset = dataset.head(140000)

In [6]:
# Display the first few rows to check the structure
print(dataset.tail())

                                   English Text  \
139995                         Black suits you.   
139996           Wasn't Tom in Boston with you?   
139997  Tom didn't understand a word Mary said.   
139998                  Tom wasn't quite happy.   
139999      We had a good time in the open air.   

                                   Telugu Text  
139995                  నలుపు మీకు సరిపోతుంది.  
139996              టామ్ మీతో బోస్టన్‌లో లేరా?  
139997  మేరీ చెప్పిన మాట టామ్‌కు అర్థం కాలేదు.  
139998                టామ్ చాలా సంతోషంగా లేడు.  
139999   మాకు బహిరంగ ప్రదేశంలో మంచి సమయం ఉంది.  


In [7]:
from transformers import MarianMTModel, MarianTokenizer, AdamW
import torch
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
import pandas as pd

In [8]:
# Load the saved model and tokenizer
model_dir = '/content/fine_tuned_model3'
model = MarianMTModel.from_pretrained(model_dir)
tokenizer = MarianTokenizer.from_pretrained(model_dir)



In [9]:
# Define training parameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
loss_fn = CrossEntropyLoss()




In [10]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=128):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        english_text = str(self.data.iloc[idx]['English Text'])
        telugu_text = str(self.data.iloc[idx]['Telugu Text'])

        # Tokenize and encode the inputs
        inputs = self.tokenizer(
            english_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            return_attention_mask=True
        )

        # Tokenize and encode the labels
        labels = self.tokenizer(
            telugu_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            return_attention_mask=True
        )

        # Return the tokenized inputs and labels
        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "labels": labels["input_ids"].squeeze()  # Treat Telugu text as labels
        }


# Split the dataset into training and validation sets
train_data, temp_data = train_test_split(dataset, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Use your CustomDataset class
train_dataset = CustomDataset(train_data, tokenizer)
val_dataset = CustomDataset(val_data, tokenizer)
test_dataset = CustomDataset(test_data, tokenizer)

# Create PyTorch DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False)



In [11]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

In [12]:
from tqdm import tqdm

num_epochs = 2

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    # Use tqdm to display a progress bar
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}", unit="batch"):
        optimizer.zero_grad()
        inputs = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

print("Training finished.")


Epoch 1/2:   0%|          | 0/28000 [00:00<?, ?batch/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Epoch 1/2: 100%|██████████| 28000/28000 [49:34<00:00,  9.41batch/s]


Epoch 1/2, Average Loss: 0.0108


Epoch 2/2: 100%|██████████| 28000/28000 [49:35<00:00,  9.41batch/s]

Epoch 2/2, Average Loss: 0.0074
Training finished.





In [13]:
model.eval()
with torch.no_grad():
    for val_batch in val_dataloader:  # Assuming you have a DataLoader for your validation set
        val_inputs = val_batch["input_ids"].to(device)
        val_labels = val_batch["labels"].to(device)

        val_outputs = model(val_inputs, labels=val_labels)
        val_loss = val_outputs.loss

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Validation Loss: {avg_val_loss}")


Validation Loss: 8.374591402571241e-07


In [14]:

# Save the updated model
updated_model_dir = '/content/updated_fine_tuned_model'
model.save_pretrained(updated_model_dir)
tokenizer.save_pretrained(updated_model_dir)


('/content/updated_fine_tuned_model/tokenizer_config.json',
 '/content/updated_fine_tuned_model/special_tokens_map.json',
 '/content/updated_fine_tuned_model/vocab.json',
 '/content/updated_fine_tuned_model/source.spm',
 '/content/updated_fine_tuned_model/target.spm',
 '/content/updated_fine_tuned_model/added_tokens.json')

In [15]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
# Load the fine-tuned model and tokenizer
loaded_model_dir = '/content/updated_fine_tuned_model'
loaded_model = MarianMTModel.from_pretrained(loaded_model_dir)
loaded_tokenizer = MarianTokenizer.from_pretrained(loaded_model_dir)

# Move the loaded model to the desired device (CPU or GPU)
loaded_model.to(device)




MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(61950, 512, padding_idx=61949)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(61950, 512, padding_idx=61949)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLUActivation()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,),

In [2]:
input_text = "Its pretty amazing"
input_ids = loaded_tokenizer.encode(input_text, return_tensors="pt")

# Move input tensor to the same device as the model
input_ids = input_ids.to(loaded_model.device)

# Generate output on the same device
output_ids = loaded_model.generate(input_ids)

# Move output tensor to CPU if needed
output_ids = output_ids.to("cpu")

output_text = loaded_tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(output_text)



చాల అద్బుతంగా


In [None]:
#import matplotlib.pyplot as plt

# Given data
#epochs = list(range(1, 11))
#training_losses = [0.0239, 0.0085, 0.0071, 0.0061, 0.0053, 0.0046, 0.0040, 0.0035, 0.0030, 0.0027]
#validation_loss = 3.9389281170088e-08  # This remains constant

# Plotting the training loss
#plt.plot(epochs, training_losses, marker='o', label='Training Loss', linestyle='-')

# Plotting a line for the validation loss
#plt.axhline(y=validation_loss, color='r', linestyle='--', label='Validation Loss')

# Adding labels and title
#plt.xlabel('Epochs')
#plt.ylabel('Loss')
#plt.title('Training and Validation Loss Over Epochs')
#plt.legend()

# Showing grid
#plt.grid(True)

# Display the plot
#plt.show()


In [19]:
!pip install nltk




In [3]:
import nltk
from nltk.translate.bleu_score import corpus_bleu

# Example model translations into Telugu
model_translations_telugu = [
    'చాల అద్బుతంగా',

]

# Corresponding reference translations in Telugu
reference_translations_telugu = [
    [['ఇది', 'అద్భుతం'], ['ఇది', 'చాలా', 'మంచిది']],
    # ... more references
]

nltk.download('punkt')

# Tokenize the model translations
model_translations_telugu_tokens = [nltk.word_tokenize(sent) for sent in model_translations_telugu]

reference_translations_telugu_tokens = reference_translations_telugu

# Calculate BLEU scores
bleu_score_telugu = corpus_bleu(reference_translations_telugu_tokens, model_translations_telugu_tokens)

print(f"BLEU score for Telugu: {bleu_score_telugu}")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


BLEU score for Telugu: 0


In [26]:
import os
print(os.listdir("/content"))


['.config', 'teluguenglishseparate11.csv', 'english_telugu_data.txt', 'english_telugu_data (1).txt', 'english_telugu_data (2).txt', 'updated_fine_tuned_model', 'fine_tuned_model3', 'drive', 'sample_data']
