<a href="https://colab.research.google.com/github/Abhishekredy1289/DL-2/blob/main/DL2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
import random


def read_tsv(path):
    """Load transliteration pairs from a tab-separated lexicon file.

    Every non-blank line must contain exactly three tab-separated fields,
    read here as ``dev``, ``lat`` and ``freq``. Each line contributes the
    tuple ``(lat, dev)`` repeated ``int(freq)`` times, so frequent pairs
    appear proportionally more often in the returned list.

    Args:
        path: Path of the UTF-8 encoded TSV file.

    Returns:
        A list of ``(lat, dev)`` string tuples.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields
            or its third field is not an integer.
    """
    data = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (typically a trailing empty line) carry no
                # pair; skipping them avoids a crash in the unpack below.
                continue
            dev, lat, freq = line.split('\t')
            data.extend([(lat, dev)] * int(freq))
    return data

def build_vocab(sequences):
    """Map every distinct symbol in ``sequences`` to an integer id.

    Ids 0, 1 and 2 are reserved for ``<pad>``, ``<sos>`` and ``<eos>``.
    Remaining symbols receive consecutive ids in order of first appearance.

    Args:
        sequences: Iterable of iterables of hashable symbols (e.g. strings).

    Returns:
        A dict from symbol to integer id.
    """
    vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2}
    for symbol in (item for seq in sequences for item in seq):
        # setdefault only inserts unseen symbols; len(vocab) is the next id.
        vocab.setdefault(symbol, len(vocab))
    return vocab

class TransliterationDataset(Dataset):
    """Dataset of (source, target) string pairs encoded as id tensors.

    Each item is a pair of 1-D ``torch.long`` tensors: the source encoded
    with ``input_vocab`` as-is, and the target encoded with ``target_vocab``
    and wrapped in ``<sos>`` / ``<eos>`` ids.
    """

    def __init__(self, data, input_vocab, target_vocab):
        """Store the pair list and the two symbol-to-id vocabularies."""
        self.data = data
        self.input_vocab = input_vocab
        self.target_vocab = target_vocab

    def __len__(self):
        """Return the number of stored pairs."""
        return len(self.data)

    def encode_seq(self, seq, vocab, add_tokens=False):
        """Convert ``seq`` to a 1-D long tensor of ids looked up in ``vocab``.

        When ``add_tokens`` is true the ids are prefixed with ``<sos>`` and
        suffixed with ``<eos>``. A symbol missing from ``vocab`` raises
        ``KeyError``.
        """
        body = [vocab[symbol] for symbol in seq]
        if not add_tokens:
            return torch.tensor(body, dtype=torch.long)
        framed = [vocab['<sos>'], *body, vocab['<eos>']]
        return torch.tensor(framed, dtype=torch.long)

    def __getitem__(self, idx):
        """Return the encoded ``(source, target)`` tensors for pair ``idx``."""
        source, target = self.data[idx]
        encoded_source = self.encode_seq(source, self.input_vocab)
        encoded_target = self.encode_seq(target, self.target_vocab, add_tokens=True)
        return encoded_source, encoded_target

def collate_fn(batch):
    """Pad a list of ``(source, target)`` tensor pairs into two batch tensors.

    Sources and targets are padded separately, each to the longest sequence
    on its side, using padding value 0 and a batch-first layout.

    Args:
        batch: Sequence of ``(source_tensor, target_tensor)`` pairs.

    Returns:
        A ``(padded_sources, padded_targets)`` tuple.
    """
    sources, targets = zip(*batch)
    pad = nn.utils.rnn.pad_sequence
    padded_sources = pad(sources, batch_first=True, padding_value=0)
    padded_targets = pad(targets, batch_first=True, padding_value=0)
    return padded_sources, padded_targets


class Encoder(nn.Module):
    """Embed a batch of source ids and run them through a recurrent layer.

    Only the recurrent layer's final state is returned; the per-step outputs
    are discarded.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, rnn_type='gru'):
        """Build the embedding and the recurrent stack.

        Args:
            input_dim: Number of rows in the embedding table.
            emb_dim: Embedding size.
            hid_dim: Hidden size of the recurrent layer.
            n_layers: Number of stacked recurrent layers.
            rnn_type: One of ``'rnn'``, ``'lstm'`` or ``'gru'``; any other
                value raises ``KeyError``.
        """
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        recurrent_layers = {'rnn': nn.RNN, 'lstm': nn.LSTM, 'gru': nn.GRU}
        make_rnn = recurrent_layers[rnn_type]
        self.rnn = make_rnn(emb_dim, hid_dim, n_layers, batch_first=True)
        self.rnn_type = rnn_type

    def forward(self, src):
        """Return the recurrent layer's final state for batch-first ``src``.

        For ``'lstm'`` the returned value is the ``(hidden, cell)`` tuple.
        """
        _, final_state = self.rnn(self.embedding(src))
        return final_state

class Decoder(nn.Module):
    """Single-step recurrent decoder producing scores over the output vocabulary."""

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, rnn_type='gru'):
        """Build the embedding, the recurrent stack and the output projection.

        Args:
            output_dim: Size of the output vocabulary (embedding rows and
                number of scores produced per step).
            emb_dim: Embedding size.
            hid_dim: Hidden size of the recurrent layer.
            n_layers: Number of stacked recurrent layers.
            rnn_type: One of ``'rnn'``, ``'lstm'`` or ``'gru'``; any other
                value raises ``KeyError``.
        """
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        recurrent_layers = {'rnn': nn.RNN, 'lstm': nn.LSTM, 'gru': nn.GRU}
        make_rnn = recurrent_layers[rnn_type]
        self.rnn = make_rnn(emb_dim, hid_dim, n_layers, batch_first=True)
        self.fc_out = nn.Linear(hid_dim, output_dim)

    def forward(self, input, hidden):
        """Advance decoding by one step.

        Args:
            input: 1-D tensor of token ids, one per batch element.
            hidden: Recurrent state from the previous step (or the encoder).

        Returns:
            ``(prediction, hidden)`` where ``prediction`` holds one row of
            vocabulary scores per batch element and ``hidden`` is the
            updated recurrent state.
        """
        # Add a length-1 time axis so the batch-first RNN sees one step.
        step_tokens = input.unsqueeze(1)
        rnn_out, hidden = self.rnn(self.embedding(step_tokens), hidden)
        return self.fc_out(rnn_out.squeeze(1)), hidden

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing."""

    def __init__(self, encoder, decoder, device):
        """Keep references to the encoder, the decoder and the target device."""
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg.shape[1] - 1`` steps starting from ``trg[:, 0]``.

        Args:
            src: Batch of source ids passed to the encoder.
            trg: Batch-first target ids; column 0 is the first decoder input.
            teacher_forcing_ratio: Per-step probability of feeding the
                ground-truth token instead of the model's own argmax.

        Returns:
            Tensor of shape ``(batch, trg_len, output_dim)`` with the decoder
            scores; position 0 along the time axis is left as zeros.
        """
        batch_size, trg_len = trg.shape
        vocab_size = self.decoder.fc_out.out_features
        outputs = torch.zeros(batch_size, trg_len, vocab_size).to(self.device)

        hidden = self.encoder(src)
        step_input = trg[:, 0]  # <sos>

        for t in range(1, trg_len):
            step_scores, hidden = self.decoder(step_input, hidden)
            outputs[:, t] = step_scores
            # One random draw per step decides the next input for the whole batch.
            if random.random() < teacher_forcing_ratio:
                step_input = trg[:, t]
            else:
                step_input = step_scores.argmax(1)

        return outputs


def train(model, data_loader, optimizer, criterion, clip=1):
    """Run one training pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: Module exposing a ``device`` attribute and callable as
            ``model(src, trg)``.
        data_loader: Sized iterable yielding ``(src, trg)`` batches.
        optimizer: Optimizer over the model parameters.
        criterion: Loss taking flattened scores and flattened target ids.
        clip: Maximum gradient norm applied before each optimizer step.

    Returns:
        Sum of the per-batch losses divided by ``len(data_loader)``.
    """
    model.train()
    batch_losses = []
    for src, trg in data_loader:
        src = src.to(model.device)
        trg = trg.to(model.device)

        optimizer.zero_grad()
        scores = model(src, trg)

        # Drop time step 0 (the start token) and flatten batch and time.
        vocab_size = scores.shape[-1]
        flat_scores = scores[:, 1:].reshape(-1, vocab_size)
        flat_targets = trg[:, 1:].reshape(-1)

        loss = criterion(flat_scores, flat_targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        batch_losses.append(loss.item())
    return sum(batch_losses) / len(data_loader)

def accuracy(model, data_loader):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for src, trg in data_loader:
            src, trg = src.to(model.device), trg.to(model.device)
            output = model(src, trg, 0)
            preds = output.argmax(-1)
            for pred, true in zip(preds, trg):
                if torch.equal(pred[1:], true[1:]):
                    correct += 1
                total += 1
    return correct / total


def predict(model, seq, input_vocab, output_vocab, max_len=30):
    """Greedily decode the output string for a single source sequence.

    Decoding starts from ``<sos>`` and stops at the first ``<eos>`` or after
    ``max_len`` steps, whichever comes first. The whole pass runs under
    ``torch.no_grad()`` so no autograd graph is built during inference.

    Args:
        model: Module exposing ``encoder``, ``decoder`` and ``device``.
        seq: Source symbols; each must be a key of ``input_vocab``.
        input_vocab: Mapping from source symbol to id.
        output_vocab: Mapping from output symbol to id, including ``<sos>``
            and ``<eos>``.
        max_len: Maximum number of decoding steps.

    Returns:
        The concatenated output symbols (ids missing from ``output_vocab``
        contribute an empty string). An empty ``seq`` yields ``''``.

    Raises:
        KeyError: If ``seq`` contains a symbol absent from ``input_vocab``.
    """
    model.eval()
    if not seq:
        # Nothing to encode; avoid feeding a zero-length sequence to the RNN.
        return ''

    inv_vocab = {v: k for k, v in output_vocab.items()}
    eos_id = output_vocab['<eos>']
    src_ids = [input_vocab[c] for c in seq]
    src_tensor = torch.tensor(src_ids, dtype=torch.long, device=model.device).unsqueeze(0)

    output_seq = []
    with torch.no_grad():
        hidden = model.encoder(src_tensor)
        step_input = torch.tensor([output_vocab['<sos>']], device=model.device)

        for _ in range(max_len):
            out, hidden = model.decoder(step_input, hidden)
            top1 = out.argmax(1).item()
            if top1 == eos_id:
                break
            output_seq.append(inv_vocab.get(top1, ''))
            # Feed the chosen token back in as the next decoder input.
            step_input = torch.tensor([top1], device=model.device)

    return ''.join(output_seq)


# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Absolute paths to the train / dev lexicon files read by read_tsv.
train_path = "/hi.translit.sampled.train.tsv"
val_path = "/hi.translit.sampled.dev.tsv"
train_data = read_tsv(train_path)
val_data = read_tsv(val_path)

# Both vocabularies are built from the training pairs only: element 0 of each
# pair feeds the input vocabulary, element 1 the target vocabulary.
# NOTE(review): a symbol that occurs only in the validation file has no id
# and would raise KeyError when the validation set is encoded - confirm the
# dev file introduces no new characters.
input_vocab = build_vocab([d[0] for d in train_data])
target_vocab = build_vocab([d[1] for d in train_data])

# The validation set reuses the training vocabularies; only the training
# loader is shuffled.
train_dataset = TransliterationDataset(train_data, input_vocab, target_vocab)
val_dataset = TransliterationDataset(val_data, input_vocab, target_vocab)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

# Model hyperparameters; the vocabulary sizes set the embedding / output widths.
INPUT_DIM = len(input_vocab)
OUTPUT_DIM = len(target_vocab)
EMB_DIM = 64
HID_DIM = 128
N_LAYERS = 1
RNN_TYPE = 'gru'

# Encoder and decoder share hidden size and layer count so the encoder's
# final state can be passed directly to the decoder.
encoder = Encoder(INPUT_DIM, EMB_DIM, HID_DIM, N_LAYERS, RNN_TYPE)
decoder = Decoder(OUTPUT_DIM, EMB_DIM, HID_DIM, N_LAYERS, RNN_TYPE)
model = Seq2Seq(encoder, decoder, DEVICE).to(DEVICE)

optimizer = optim.Adam(model.parameters(), lr=0.001)
# Padding positions in the target do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=target_vocab['<pad>'])

# Train for five epochs, reporting the mean training loss and the
# validation accuracy after each one.
for epoch in range(5):
    loss = train(model, train_loader, optimizer, criterion)
    acc = accuracy(model, val_loader)
    print(f"Epoch {epoch+1} | Loss: {loss:.4f} | Val Accuracy: {acc:.4f}")


# Show greedy predictions for the first five validation pairs. read_tsv
# repeats each pair by its frequency, so consecutive samples may be identical.
print("\nSample Predictions:")
for i in range(5):
    src_sample, tgt_sample = val_data[i]
    pred = predict(model, src_sample, input_vocab, target_vocab)
    print(f"Input: {src_sample} | Target: {tgt_sample} | Predicted: {pred}")


Epoch 1 | Loss: 1.6382 | Val Accuracy: 0.0069
Epoch 2 | Loss: 0.8966 | Val Accuracy: 0.0111
Epoch 3 | Loss: 0.7341 | Val Accuracy: 0.0126
Epoch 4 | Loss: 0.6491 | Val Accuracy: 0.0187
Epoch 5 | Loss: 0.5887 | Val Accuracy: 0.0182

Sample Predictions:
Input: ankan | Target: अंकन | Predicted: आंकन
Input: ankan | Target: अंकन | Predicted: आंकन
Input: ankan | Target: अंकन | Predicted: आंकन
Input: angkor | Target: अंगकोर | Predicted: अंंककर
Input: angkor | Target: अंगकोर | Predicted: अंंककर


In [None]:
!pip install matplotlib-venn



In [None]:
!pip install datasets


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [None]:
!apt-get -qq install -y libfluidsynth1

E: Package 'libfluidsynth1' has no installation candidate


In [None]:
!apt-get -qq install -y libarchive-dev && pip install -U libarchive
import libarchive



In [None]:
!apt-get -qq install -y graphviz && pip install pydot
import pydot



In [None]:
!pip install cartopy
import cartopy



In [None]:
import pandas as pd
import re
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, pipeline

def clean_lyrics(lyric):
    """Normalise one lyric value into a stripped ASCII-only string.

    Missing values (as judged by ``pd.isna``) become ``""``. Otherwise the
    value is converted to ``str``, leading ``#`` characters are removed,
    each run of curly quotes is replaced by a single ``'``, every remaining
    non-ASCII character is dropped, and surrounding whitespace is stripped.

    Args:
        lyric: A lyric string, or a missing value such as ``None`` / NaN.

    Returns:
        The cleaned string.
    """
    if pd.isna(lyric):
        return ""

    # UTF-8 round trip, kept from the original cleaning pipeline.
    text = str(lyric).encode('utf-8').decode('utf-8', 'ignore')

    substitutions = (
        (r'^#+', ''),                              # leading hash marks
        (r'[\u2018\u2019\u201c\u201d]+', "'"),     # curly quotes -> apostrophe
        (r'[^\x00-\x7F]+', ''),                    # any other non-ASCII run
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Load the two lyric CSVs and stack them into one frame.
# NOTE: despite their names these frames hold the Drake and Coldplay files;
# the variable names are kept so other cells that use them keep working.
khalid_df = pd.read_csv('/Drake.csv')
gaga_df = pd.read_csv('/ColdPlay.csv')
lyrics_df = pd.concat([khalid_df, gaga_df])

# Clean every non-missing lyric and write them to one text file, separated
# by a blank line.
lyrics_texts = lyrics_df['Lyric'].dropna().apply(clean_lyrics).tolist()
with open("lyrics_dataset.txt", "w", encoding="utf-8") as f:
    f.write("\n\n".join(lyrics_texts))

dataset = load_dataset("text", data_files={"train": "lyrics_dataset.txt"})
# The "text" loader yields one example per line, so every blank separator
# line (and every lyric that cleaned down to nothing) becomes an empty
# example. Drop them: they contain only padding and carry no training signal.
dataset = dataset.filter(lambda example: bool(example["text"].strip()))

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# GPT-2 has no dedicated pad token, so the end-of-text token is reused.
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(example):
    """Tokenize a batch of lines to fixed length 512 and attach LM labels.

    Labels copy ``input_ids`` at real-token positions and are set to -100
    wherever ``attention_mask`` is 0, so padded positions are ignored by the
    language-modelling loss instead of training the model to emit padding.
    """
    tokens = tokenizer(example["text"], padding="max_length", truncation=True, max_length=512)
    tokens["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(tokens["input_ids"], tokens["attention_mask"])
    ]
    return tokens

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

model = GPT2LMHeadModel.from_pretrained("gpt2")

# Three epochs with a per-device batch of 2; keep only the latest checkpoint
# and disable external experiment reporting.
training_args = TrainingArguments(
    output_dir="./gpt2-lyrics",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_steps=100,
    save_steps=500,
    save_total_limit=1,
    prediction_loss_only=True,
    report_to="none",
    fp16=False,
)

# NOTE(review): recent transformers releases may warn that `tokenizer=` is
# deprecated in favour of `processing_class=`; confirm against the installed
# version before switching.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    tokenizer=tokenizer,
)

trainer.train()

# Sample one continuation from the fine-tuned model.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
output = generator("I remember those nights when", max_length=100, num_return_sequences=1)
print(output[0]["generated_text"])


Generating train split: 0 examples [00:00, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Map:   0%|          | 0/1593 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,1.3683
200,1.1197
300,1.1233
400,1.1067
500,1.0977
600,1.3059
700,1.2401
800,1.0305
900,1.264
1000,0.854


Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


I remember those nights when you had my head in your mouth when you were you got me so open it up you just had to let your emotions get me and let's hope go out to the city of the lights the thunder it could make your skin turn into a bird you just gotta let it go out to the sound i remember those nights when you were i remember you when you were i remember you we were gonna make it this way and make it last and i don't wanna lie oh yeah oh
