<a href="https://colab.research.google.com/github/Asha629399/Assignment_3/blob/main/Assignment3_Vanilla.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.autograd import Variable
from torch.utils.data import Dataset
!pip install wandb
import numpy as np
import random
import pandas as pd

# Visualization tools
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import clear_output

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
import wandb
wandb.login()
#c467a6693ff6ce5eff3b78a68a9fd6bed4d726cd

[34m[1mwandb[0m: Currently logged in as: [33mcs22m021[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
sweep_config = {
    'method':'random'    
}

In [None]:
metric = {
    'name':'val_accuracy',
    'goal':'maximize'
}
sweep_config['metric'] = metric

In [2]:
parameters_dict = {
    'cell_type' :{
        'values':['RNN','LSTM','GRU']
    },
    'hidden_layer_size':{
        'values':[16,32,64,128]
    },
    'lr':{
        'values':[0.001,0.01,0.1]
    },
    'n_batches':{
        'values':[10,100]
    },
    'num_layers':{
        'values':[1,2,3]
    },
        
    
    'batch_size':{
        'values':[16,32,64,128]
    }
}
sweep_config['parameters'] = parameters_dict

NameError: ignored

In [None]:
sweep_id = wandb.sweep(sweep_config, project = 'Attention')

Create sweep with ID: 1t1lq4yx
Sweep URL: https://wandb.ai/cs22m021/Attention/sweeps/1t1lq4yx


In [None]:
SEED = 33
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [None]:
PATH_TO_DATA = "./aksharantar_sampled/hin/"

### Data Preprocessing

In [None]:
# storing all the alphabets of English and the pad char to a dictionary to create OHE representation later.
eng_alphabets = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
pad_char = '-PAD-'

eng_alpha2index = {pad_char: 0}
for index, alpha in enumerate(eng_alphabets):
    eng_alpha2index[alpha] = index+1

print(eng_alpha2index) #0B80 - 0BFF = 2944 - 3071

{'-PAD-': 0, 'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'H': 8, 'I': 9, 'J': 10, 'K': 11, 'L': 12, 'M': 13, 'N': 14, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19, 'T': 20, 'U': 21, 'V': 22, 'W': 23, 'X': 24, 'Y': 25, 'Z': 26}


In [None]:
# Hindi Unicode Hex Range is 2304:2432. Source: https://en.wikipedia.org/wiki/Devanagari_(Unicode_block)

hindi_alphabets = [chr(alpha) for alpha in range(2304, 2432)]
hindi_alphabet_size = len(hindi_alphabets)

hindi_alpha2index = {pad_char: 0}
for index, alpha in enumerate(hindi_alphabets):
    hindi_alpha2index[alpha] = index+1

print(hindi_alpha2index)

{'-PAD-': 0, 'ऀ': 1, 'ँ': 2, 'ं': 3, 'ः': 4, 'ऄ': 5, 'अ': 6, 'आ': 7, 'इ': 8, 'ई': 9, 'उ': 10, 'ऊ': 11, 'ऋ': 12, 'ऌ': 13, 'ऍ': 14, 'ऎ': 15, 'ए': 16, 'ऐ': 17, 'ऑ': 18, 'ऒ': 19, 'ओ': 20, 'औ': 21, 'क': 22, 'ख': 23, 'ग': 24, 'घ': 25, 'ङ': 26, 'च': 27, 'छ': 28, 'ज': 29, 'झ': 30, 'ञ': 31, 'ट': 32, 'ठ': 33, 'ड': 34, 'ढ': 35, 'ण': 36, 'त': 37, 'थ': 38, 'द': 39, 'ध': 40, 'न': 41, 'ऩ': 42, 'प': 43, 'फ': 44, 'ब': 45, 'भ': 46, 'म': 47, 'य': 48, 'र': 49, 'ऱ': 50, 'ल': 51, 'ळ': 52, 'ऴ': 53, 'व': 54, 'श': 55, 'ष': 56, 'स': 57, 'ह': 58, 'ऺ': 59, 'ऻ': 60, '़': 61, 'ऽ': 62, 'ा': 63, 'ि': 64, 'ी': 65, 'ु': 66, 'ू': 67, 'ृ': 68, 'ॄ': 69, 'ॅ': 70, 'ॆ': 71, 'े': 72, 'ै': 73, 'ॉ': 74, 'ॊ': 75, 'ो': 76, 'ौ': 77, '्': 78, 'ॎ': 79, 'ॏ': 80, 'ॐ': 81, '॑': 82, '॒': 83, '॓': 84, '॔': 85, 'ॕ': 86, 'ॖ': 87, 'ॗ': 88, 'क़': 89, 'ख़': 90, 'ग़': 91, 'ज़': 92, 'ड़': 93, 'ढ़': 94, 'फ़': 95, 'य़': 96, 'ॠ': 97, 'ॡ': 98, 'ॢ': 99, 'ॣ': 100, '।': 101, '॥': 102, '०': 103, '१': 104, '२': 105, '३': 106, '४': 107, '५': 108, '६': 109, '७': 

In [None]:
# Funcitons used to do some pre-processing.
# removing all non-alphabetic char in English as well as Hindi.

import re
non_eng_letters_regex = re.compile('[^a-zA-Z ]')

# Remove all English non-letters
def cleanEnglishVocab(line):
    line = line.replace('-', ' ').replace(',', ' ').upper()
    line = non_eng_letters_regex.sub('', line)
    return line.split()

# Remove all Hindi non-letters
def cleanHindiVocab(line):
    line = line.replace('-', ' ').replace(',', ' ')
    cleaned_line = ''
    for char in line:
        if char in hindi_alpha2index or char == ' ':
            cleaned_line += char
    return cleaned_line.split()

In [None]:
class TransliterationDataLoader(Dataset):
    def __init__(self, filename):
        self.eng_words, self.hindi_words = self.readDataset(filename, cleanHindiVocab)
        self.shuffle_indices = list(range(len(self.eng_words)))
        random.shuffle(self.shuffle_indices)
        self.shuffle_start_index = 0
        
    def __len__(self):
        return len(self.eng_words)
    
    def __getitem__(self, idx):
        return self.eng_words[idx], self.hindi_words[idx]
    
    def readDataset(self, filename, lang_vocab_cleaner):
        '''Task : to read the csv file and store all the contents in a list.
                  Then we will do some pre-processing of data to remove noise as well as delimeters. '''
        transliterationCorpus = pd.read_csv(filename,header=None)
        lang1_words = []
        lang2_words = []

        for index,line in transliterationCorpus.iterrows():
            wordlist1 = cleanEnglishVocab(line[0]) # clean English words.
            wordlist2 = lang_vocab_cleaner(line[1])# clean hindi words.

            # Skip noisy data
            if len(wordlist1) != len(wordlist2):
                print('Skipping: ', line[0].text, ' - ', line[1].text)
                continue

            for word in wordlist1:
                lang1_words.append(word)
            for word in wordlist2:
                lang2_words.append(word)

        return lang1_words, lang2_words
    
    def get_random_sample(self):
        return self.__getitem__(np.random.randint(len(self.eng_words)))
    
    def get_batch_from_array(self, batch_size, array): # child function of get_batch() function.
        '''Given an array , and batch size , this fucntion will return some samples from the array i.e can be HindiWords or EnglishWords etc. '''
        end = self.shuffle_start_index + batch_size # what index till i want to go.
        batch = []
        if end >= len(self.eng_words): # if we overflow the words array , we have to loop back.
            batch = [array[i] for i in self.shuffle_indices[0:end%len(self.eng_words)]]
            end = len(self.eng_words)
        return batch + [array[i] for i in self.shuffle_indices[self.shuffle_start_index : end]]
    
    def get_batch(self, batch_size, postprocess = True):
        eng_batch = self.get_batch_from_array(batch_size, self.eng_words)
        hindi_batch = self.get_batch_from_array(batch_size, self.hindi_words)
        self.shuffle_start_index += batch_size + 1
        
        # Reshuffle if 1 epoch is complete
        if self.shuffle_start_index >= len(self.eng_words):
            random.shuffle(self.shuffle_indices)
            self.shuffle_start_index = 0
            
        return eng_batch, hindi_batch

In [None]:
from google.colab import drive
drive.mount('/content/drive')
train_data = TransliterationDataLoader('/content/drive/MyDrive/hin_train.csv')
val_data = TransliterationDataLoader('/content/drive/MyDrive/hin_train.csv')
test_data = TransliterationDataLoader('/content/drive/MyDrive/hin_train.csv') 

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#train and test data directory


train_data1 = pd.read_csv("./aksharantar_sampled/hin/hin_train.csv",header=None)
val_data1 =  pd.read_csv("./aksharantar_sampled/hin/hin_valid.csv",header=None)
test_data1 =  pd.read_csv("./aksharantar_sampled/hin/hin_test.csv",header=None)

train_data1.head()

Unnamed: 0,0,1
0,shastragaar,शस्त्रागार
1,bindhya,बिन्द्या
2,kirankant,किरणकांत
3,yagyopaveet,यज्ञोपवीत
4,ratania,रटानिया


In [None]:
# for index,line in train_data1.iterrows():
#     print(line[1])

In [None]:
print(f"Number of training examples: {len(train_data1)}")
print(f"Number of validation examples: {len(val_data1)}")
print(f"Number of test examples: {len(test_data1)}")

Number of training examples: 51200
Number of validation examples: 4096
Number of test examples: 4096


In [None]:
train_data.get_batch(10)

(['SUBCONTINENT',
  'HANKUNK',
  'LIBETADOR',
  'CHEKROVOLU',
  'VALLABHAVIDYANAGAR',
  'AVRODHMUKT',
  'BURO',
  'BOTAINI',
  'ALEKSANDAR',
  'BHAVMAYI'],
 ['सबकॉन्टीनेंट',
  'हंकुंक',
  'लिबेटाडोर',
  'चेकरोवोलू',
  'वल्लभविद्यानगर',
  'अवरोधमुक्त',
  'बुरो',
  'बोटैनी',
  'एलेक्जैंडर',
  'भावमयी'])

In [None]:
## Encoding the words

def word_rep(word, letter2index, device = 'cpu'):
    rep = torch.zeros(len(word)+1, 1, len(letter2index)).to(device)
    for letter_index, letter in enumerate(word):
        pos = letter2index[letter]
        rep[letter_index][0][pos] = 1
    pad_pos = letter2index[pad_char]
    rep[letter_index+1][0][pad_pos] = 1
    return rep

def gt_rep(word, letter2index, device = 'cpu'):
    gt_rep = torch.zeros([len(word)+1, 1], dtype=torch.long).to(device)
    for letter_index, letter in enumerate(word):
        pos = letter2index[letter]
        gt_rep[letter_index][0] = pos
    gt_rep[letter_index+1][0] = letter2index[pad_char]
    return gt_rep

In [None]:
eng, hindi = train_data.get_random_sample()
eng_rep = word_rep(eng, eng_alpha2index)
print(eng, eng_rep)

ANKASHASTRIYA tensor([[[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,

In [None]:
hindi_gt = gt_rep(hindi, hindi_alpha2index)
print(hindi, hindi_gt.shape[0])

अंकशास्त्रीय 13


## Encoder-Decoder

In [43]:
MAX_OUTPUT_CHARS = 30
class Transliteration_EncoderDecoder(nn.Module):
    
    def __init__(self, input_size, hidden_size, output_size,cell_type ='GRU',num_decoder_layer=1,num_encoder_layer=1, verbose=False):
        super(Transliteration_EncoderDecoder, self).__init__()
        
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.cell_type = cell_type
        self.num_decoder_layer = num_decoder_layer
        self.num_encoder_layer = num_encoder_layer
        if self.cell_type == 'RNN':
            self.encoder_rnn_cell = nn.RNN(input_size, hidden_size)
            self.decoder_rnn_cell = nn.RNN(output_size, hidden_size)
        elif self.cell_type == 'GRU':
            self.encoder_rnn_cell = nn.GRU(input_size, hidden_size)
            self.decoder_rnn_cell = nn.GRU(output_size, hidden_size)
        elif self.cell_type == 'LSTM':
            self.encoder_rnn_cell = nn.LSTM(input_size, hidden_size)
            self.decoder_rnn_cell = nn.LSTM(output_size, hidden_size
        self.encoder_rnn_cell = nn.GRU(input_size, hidden_size)
        self.decoder_rnn_cell = nn.GRU(output_size, hidden_size)
        
        self.h2o = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=2)
        
        self.verbose = verbose
        
    def forward(self, input, max_output_chars = MAX_OUTPUT_CHARS, device = 'cpu', ground_truth = None):
        
        # encoder
        out, hidden = self.encoder_rnn_cell(input)
        
        if self.verbose:
            print('Encoder input', input.shape)
            print('Encoder output', out.shape)
            print('Encoder hidden', hidden.shape)
        
        # decoder
        decoder_state = hidden
        decoder_input = torch.zeros(1, 1, self.output_size).to(device) # also can be variable i.e learned from data.
        outputs = []
        
        if self.verbose:
            print('Decoder state', decoder_state.shape)
            print('Decoder input', decoder_input.shape)
        
        for i in range(max_output_chars):
            
            out, decoder_state = self.decoder_rnn_cell(decoder_input, decoder_state)
            
            if self.verbose:
                print('Decoder intermediate output', out.shape)
            # Apply dropout to the intermediate decoder output
            
            out = self.h2o(decoder_state)
            out = self.softmax(out)
            outputs.append(out.view(1, -1))

           
            if self.verbose:
                print('Decoder output', out.shape)
                self.verbose = False
           
            max_idx = torch.argmax(out, 2, keepdim=True)
            
            if not ground_truth is None:
                max_idx = ground_truth[i].reshape(1, 1, 1)
                
            one_hot = torch.FloatTensor(out.shape).to(device)
            one_hot.zero_() # all the elements will be 0.
            
            one_hot.scatter_(2, max_idx, 1)
            
            decoder_input = one_hot.detach() # don't pass gradient with this tensor.
            
        return outputs

In [46]:
# inference routine for sequence model.
def infer(net, eng_word,shape,device ='cpu'):
    input_ = word_rep(eng_word,eng_alpha2index,device) # convert the name into one hot encoding.
    outputs = net(input_,shape,device) # initilise the hidden layer.
    
    return outputs

In [47]:
net = Transliteration_EncoderDecoder(len(eng_alpha2index), 256, len(hindi_alpha2index), verbose=True)

In [None]:
out = infer(net, 'INDIA', 30)

Encoder input torch.Size([6, 1, 27])
Encoder output torch.Size([6, 1, 256])
Encoder hidden torch.Size([1, 1, 256])
Decoder state torch.Size([1, 1, 256])
Decoder input torch.Size([1, 1, 129])
Decoder intermediate output torch.Size([1, 1, 256])
Decoder output torch.Size([1, 1, 129])


In [None]:
print(len(out))
for i in range(len(out)):
    print(out[i].shape, list(hindi_alpha2index.keys())[list(hindi_alpha2index.values()).index(torch.argmax(out[i]))])

30
torch.Size([1, 129]) ऊ
torch.Size([1, 129]) ऊ
torch.Size([1, 129]) ऍ
torch.Size([1, 129]) ऍ
torch.Size([1, 129]) ऍ
torch.Size([1, 129]) ऍ
torch.Size([1, 129]) ऍ
torch.Size([1, 129]) ऍ
torch.Size([1, 129]) ऍ
torch.Size([1, 129]) ऍ
torch.Size([1, 129]) ऍ
torch.Size([1, 129]) ऍ
torch.Size([1, 129]) ऍ
torch.Size([1, 129]) ऍ
torch.Size([1, 129]) ऍ
torch.Size([1, 129]) ऍ
torch.Size([1, 129]) ऍ
torch.Size([1, 129]) ऍ
torch.Size([1, 129]) ऍ
torch.Size([1, 129]) ऍ
torch.Size([1, 129]) ऍ
torch.Size([1, 129]) ऍ
torch.Size([1, 129]) ऍ
torch.Size([1, 129]) ऍ
torch.Size([1, 129]) ऍ
torch.Size([1, 129]) ऍ
torch.Size([1, 129]) ऍ
torch.Size([1, 129]) ऍ
torch.Size([1, 129]) ऍ
torch.Size([1, 129]) ऍ


In [35]:
def test(net_attn, word, device = 'cpu'):
    net_attn = net_attn.eval().to(device)
    outputs = infer(net_attn, word, 30, device)
    hindi_output = ''
    for out in outputs:
        val, indices = out.topk(1)
        index = indices.tolist()[0][0]
        if index == 0:
            break
        hindi_char = hindi_alphabets[index+1]
        hindi_output += hindi_char
    print(word + ' - ' + hindi_output)
    return hindi_output

In [36]:
def calc_accuracy(net_attn, device = 'cpu'):
    net_attn = net_attn.eval().to(device)
    predictions = []
    accuracy = 0
    for i in range(len(val_data)):
        eng, hindi = val_data[i]
        gt = gt_rep(hindi, hindi_alpha2index, device)
        outputs = infer(net, eng, gt.shape[0], device)
        correct = 0
        for index, out in enumerate(outputs):
            val, indices = out.topk(1)
            hindi_pos = indices.tolist()[0]
            print(hindi_pos[0],gt[index])
            if hindi_pos[0] == gt[index][0]:
                correct += 1
        
        accuracy += correct/gt.shape[0]
    accuracy /= len(val_data)
    return accuracy, outputs
     

## Training

In [44]:
def train_batch(net, opt, criterion, batch_size, device = 'cpu', teacher_force = False):
    
    net.train().to(device)
    opt.zero_grad()
    eng_batch, hindi_batch = train_data.get_batch(batch_size)
    
    total_loss = 0
    for i in range(batch_size):
        
        input = word_rep(eng_batch[i], eng_alpha2index, device)
        gt = gt_rep(hindi_batch[i], hindi_alpha2index, device)
        outputs = net(input, gt.shape[0], device, ground_truth = gt if teacher_force else None)
        
        for index, output in enumerate(outputs):
            loss = criterion(output, gt[index]) / batch_size
            loss.backward(retain_graph = True)
            total_loss += loss
        val_accuracy =  calc_accuracy(net, device = 'cpu') 
         
    opt.step()
    
    return total_loss/batch_size,val_accuracy

In [48]:
# Training Helper

def train_setup(net=net, config=None, momentum=0.9, display_freq=5, device='cpu'):
    with wandb.init(config=config):
        config = wandb.config
        net = net.to(device)
        criterion = nn.NLLLoss(ignore_index=-1)
        opt = optim.Adam(net.parameters(), lr=0.001)
        teacher_force_upto = config.n_batches // 3

        loss_arr = np.zeros(config.n_batches + 1)
        val_accuracy_arr = np.zeros(config.n_batches + 1)

        for i in range(config.n_batches):
            loss, val_accuracy = train_batch(net, opt, criterion, config.batch_size, device=device,
                                             teacher_force=i < teacher_force_upto)
            loss_arr[i + 1] = (loss_arr[i] * i + loss)
            val_accuracy_arr[i + 1] = val_accuracy
            wandb.log({'loss': loss_arr[i + 1], 'val_accuracy': val_accuracy_arr[i + 1], 'batch': i + 1})
            
            
        return loss_arr, val_accuracy_arr


In [49]:
def train_setup1(net, lr = 0.001, n_batches = 100, batch_size = 10, momentum = 0.9, display_freq=5, device = 'cpu'):
    
    net = net.to(device)
    criterion = nn.NLLLoss(ignore_index = -1)
    opt = optim.Adam(net.parameters(), lr=lr)
    teacher_force_upto = n_batches//3
    
    loss_arr = np.zeros(n_batches + 1)
    
    for i in range(n_batches):
        loss_arr[i+1] = (loss_arr[i]*i + train_batch(net_attn, opt, criterion, batch_size, device = device, teacher_force = i))
        
    return loss_arr

In [50]:
# Training without Attention

net = Transliteration_EncoderDecoder(len(eng_alpha2index), 256, len(hindi_alpha2index)).

In [None]:
wandb.agent(sweep_id,train_setup,count=15)

In [None]:
#used to create prediction_vanilla file
def test(net, word, device = 'cpu'):
    net = net.eval().to(device)
    outputs = infer(net, word, 30, device)
    hindi_output = ''
    for out in outputs:
        val, indices = out.topk(1)
        index = indices.tolist()[0][0]
        if index == 0:
            break
        hindi_char = hindi_alphabets[index+1]
        hindi_output += hindi_char
    print(word + ' - ' + hindi_output)
    return hindi_output

In [None]:
#calculate accuracy on test data
def calc_accuracy(net, device = 'cpu'):
    net = net.eval().to(device)
    predictions = []
    accuracy = 0
    for i in range(len(test_data)):
        eng, hindi = test_data[i]
        gt = gt_rep(hindi, hindi_alpha2index, device)
        outputs = infer(net, eng, gt.shape[0], device)
        correct = 0
        for index, out in enumerate(outputs):
            val, indices = out.topk(1)
            hindi_pos = indices.tolist()[0]
            print(hindi_pos[0],gt[index])
            if hindi_pos[0] == gt[index][0]:
                correct += 1
        
        accuracy += correct/gt.shape[0]
    accuracy /= len(test_data)
    return accuracy, outputs
     

In [None]:
accuracy,op = calc_accuracy(net) #* 100
# accuracy_attn = calc_accuracy(net_att) * 100
print('Accuracy w/o attention ', accuracy)
# print('Acurracy with attention', accuracy_attn)

57 tensor([38])
78 tensor([49])
49 tensor([47])
63 tensor([73])
0 tensor([22])
0 tensor([78])
0 tensor([57])
0 tensor([0])
57 tensor([57])
78 tensor([64])
49 tensor([23])
78 tensor([63])
49 tensor([16])
63 tensor([24])
0 tensor([63])
0 tensor([0])
57 tensor([51])
63 tensor([49])
49 tensor([78])
63 tensor([41])
0 tensor([0])
57 tensor([32])
78 tensor([78])
49 tensor([54])
78 tensor([64])
49 tensor([32])
63 tensor([49])
0 tensor([78])
0 tensor([57])
0 tensor([0])
57 tensor([37])
78 tensor([64])
49 tensor([49])
78 tensor([66])
49 tensor([41])
78 tensor([72])
49 tensor([51])
63 tensor([54])
0 tensor([72])
0 tensor([51])
0 tensor([65])
0 tensor([0])
57 tensor([8])
78 tensor([3])
49 tensor([34])
78 tensor([64])
49 tensor([43])
78 tensor([72])
49 tensor([3])
78 tensor([34])
0 tensor([72])
0 tensor([3])
0 tensor([57])
0 tensor([0])
57 tensor([57])
78 tensor([78])
49 tensor([43])
78 tensor([72])
49 tensor([55])
63 tensor([64])
0 tensor([48])
0 tensor([76])
0 tensor([3])
0 tensor([0])
57 tensor(

In [None]:
test_data1.head()

Unnamed: 0,0,1
0,thermax,थरमैक्स
1,sikhaaega,सिखाएगा
2,learn,लर्न
3,twitters,ट्विटर्स
4,tirunelveli,तिरुनेलवेली
