In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import spacy
import pickle
import json
import pandas as pd

### Preprocessing

In [15]:
train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./eval.csv')
train_data.describe()
test_data.describe()
train_data.head()

Unnamed: 0,input,target
0,grammar: So I think we can not live if old peo...,So I think we would not be alive if our ancest...
1,grammar: So I think we can not live if old peo...,So I think we could not live if older people d...
2,grammar: So I think we can not live if old peo...,So I think we can not live if old people could...
3,grammar: So I think we can not live if old peo...,So I think we can not live if old people can n...
4,grammar: For not use car .,Not for use with a car .


In [16]:
train_data['input'] = train_data['input'].apply(lambda x: x[8:-1])
train_data.head()       

Unnamed: 0,input,target
0,So I think we can not live if old people coul...,So I think we would not be alive if our ancest...
1,So I think we can not live if old people coul...,So I think we could not live if older people d...
2,So I think we can not live if old people coul...,So I think we can not live if old people could...
3,So I think we can not live if old people coul...,So I think we can not live if old people can n...
4,For not use car .,Not for use with a car .


In [17]:
test_data['input'] = test_data['input'].apply(lambda x: x[8:-1])
test_data.iloc[115]

input        I have an IBM computer and my laptop is DELL 
target    I have an IBM computer and my laptop is a Dell .
Name: 115, dtype: object

### Tokenization

In [18]:
nlp = spacy.load('en_core_web_sm')
def tokenize(text):
    res=[tok.text for tok in nlp.tokenizer(text)]
    return res
print(tokenize(train_data.iloc[0]['input']))

[' ', 'So', 'I', 'think', 'we', 'can', 'not', 'live', 'if', 'old', 'people', 'could', 'not', 'find', 'siences', 'and', 'tecnologies', 'and', 'they', 'did', 'not', 'developped', '.']


In [19]:
train_data['input'] = train_data['input'].apply(lambda x: ['<s>',*tokenize(x),'</s>'])
train_data['target'] = train_data['target'].apply(lambda x: ['<s>',*tokenize(x),'</s>'])
train_data.head()

Unnamed: 0,input,target
0,"[<s>, , So, I, think, we, can, not, live, if,...","[<s>, So, I, think, we, would, not, be, alive,..."
1,"[<s>, , So, I, think, we, can, not, live, if,...","[<s>, So, I, think, we, could, not, live, if, ..."
2,"[<s>, , So, I, think, we, can, not, live, if,...","[<s>, So, I, think, we, can, not, live, if, ol..."
3,"[<s>, , So, I, think, we, can, not, live, if,...","[<s>, So, I, think, we, can, not, live, if, ol..."
4,"[<s>, , For, not, use, car, ., </s>]","[<s>, Not, for, use, with, a, car, ., </s>]"


In [20]:
test_data['input'] = test_data['input'].apply(lambda x: ['<s>',*tokenize(x),'</s>'])
test_data['target'] = test_data['target'].apply(lambda x: ['<s>',*tokenize(x),'</s>'])
test_data.head()

Unnamed: 0,input,target
0,"[<s>, , New, and, new, technology, has, been,...","[<s>, New, technology, has, been, introduced, ..."
1,"[<s>, , New, and, new, technology, has, been,...","[<s>, New, technology, has, been, introduced, ..."
2,"[<s>, , New, and, new, technology, has, been,...","[<s>, Newer, and, newer, technology, has, been..."
3,"[<s>, , New, and, new, technology, has, been,...","[<s>, Newer, and, newer, technology, has, been..."
4,"[<s>, , One, possible, outcome, is, that, an,...","[<s>, One, possible, outcome, is, that, an, en..."


### Embedding

In [21]:
from gensim.models import Word2Vec
data = []
for i in range(len(train_data)):
    data.append(train_data.iloc[i]['input'])
    data.append(train_data.iloc[i]['target'])
for i in range(len(test_data)):
    data.append(test_data.iloc[i]['input'])
    data.append(test_data.iloc[i]['target'])
    
word2vec_model = Word2Vec(data, vector_size=100,\
                                 window=5, min_count=1, workers=4, sg=1,\
                                 epochs=100)

In [22]:
word2vec_model.wv.n_similarity(['<s>'],['</s>'])

-0.08922081

### Dataset and Dataloader

In [33]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

class CustomDataset(Dataset):
    def __init__(self, csv_file, tokenizer, max_length=128):
        self.df = pd.read_csv(csv_file)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        input_text = str(self.df.loc[index, 'input'])
        target_text = str(self.df.loc[index, 'target'])

        input_encoding = self.tokenizer(
            f"[CLS] {input_text} [SEP]",
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        target_encoding = self.tokenizer(
            f"[CLS] {target_text} [SEP]",
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        input_ids = input_encoding['input_ids'].squeeze()
        attention_mask = input_encoding['attention_mask'].squeeze()

        target_ids = target_encoding['input_ids'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'target_ids': target_ids,
        }

# Example usage
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
train_dataset = CustomDataset('train.csv', tokenizer)
test_dataset = CustomDataset('eval.csv', tokenizer)
# Create DataLoader
batch_size = 32
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
# Access a batch
for batch in train_dataloader:
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    target_ids = batch['target_ids']

    # Your training code goes here
    print("Input IDs:", input_ids.size())
    print("Attention Mask:", attention_mask.size())
    print("Target IDs:", target_ids.size())
    break  # For demonstration, only print the first batch

for batch in test_dataloader:
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']  
    target_ids = batch['target_ids']

    # Your training code goes here
    print("Input IDs:", input_ids.size())
    print("Attention Mask:", attention_mask.size())
    print("Target IDs:", target_ids.size())
    break  # For demonstration, only print the first batch

Input IDs: torch.Size([32, 128])
Attention Mask: torch.Size([32, 128])
Target IDs: torch.Size([32, 128])
Input IDs: torch.Size([32, 128])
Attention Mask: torch.Size([32, 128])
Target IDs: torch.Size([32, 128])
