In [90]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import spacy
import pickle
import json
import pandas as pd

### Preprocessing

In [91]:
train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./eval.csv')
train_data.describe()
test_data.describe()
train_data.head()

Unnamed: 0,input,target
0,grammar: So I think we can not live if old peo...,So I think we would not be alive if our ancest...
1,grammar: So I think we can not live if old peo...,So I think we could not live if older people d...
2,grammar: So I think we can not live if old peo...,So I think we can not live if old people could...
3,grammar: So I think we can not live if old peo...,So I think we can not live if old people can n...
4,grammar: For not use car .,Not for use with a car .


In [92]:
train_data['input'] = train_data['input'].apply(lambda x: x[8:-1])
train_data.head()       

Unnamed: 0,input,target
0,So I think we can not live if old people coul...,So I think we would not be alive if our ancest...
1,So I think we can not live if old people coul...,So I think we could not live if older people d...
2,So I think we can not live if old people coul...,So I think we can not live if old people could...
3,So I think we can not live if old people coul...,So I think we can not live if old people can n...
4,For not use car .,Not for use with a car .


In [93]:
test_data['input'] = test_data['input'].apply(lambda x: x[8:-1])
test_data.iloc[115]

input        I have an IBM computer and my laptop is DELL 
target    I have an IBM computer and my laptop is a Dell .
Name: 115, dtype: object

### Tokenization

In [94]:
nlp = spacy.load('en_core_web_sm')
def tokenize(text):
    return [tok.text for tok in nlp.tokenizer(text)]
print(['<s>',*tokenize(train_data.iloc[0]['input']),'</s>'])

['<s>', ' ', 'So', 'I', 'think', 'we', 'can', 'not', 'live', 'if', 'old', 'people', 'could', 'not', 'find', 'siences', 'and', 'tecnologies', 'and', 'they', 'did', 'not', 'developped', '.', '</s>']


In [95]:
train_data['input'] = train_data['input'].apply(lambda x: ['<s>',*tokenize(x),'</s>'])
train_data['target'] = train_data['target'].apply(lambda x: ['<s>',*tokenize(x),'</s>'])
train_data.head()

Unnamed: 0,input,target
0,"[<s>, , So, I, think, we, can, not, live, if,...","[<s>, So, I, think, we, would, not, be, alive,..."
1,"[<s>, , So, I, think, we, can, not, live, if,...","[<s>, So, I, think, we, could, not, live, if, ..."
2,"[<s>, , So, I, think, we, can, not, live, if,...","[<s>, So, I, think, we, can, not, live, if, ol..."
3,"[<s>, , So, I, think, we, can, not, live, if,...","[<s>, So, I, think, we, can, not, live, if, ol..."
4,"[<s>, , For, not, use, car, ., </s>]","[<s>, Not, for, use, with, a, car, ., </s>]"


In [96]:
test_data['input'] = test_data['input'].apply(lambda x: ['<s>',*tokenize(x),'</s>'])
test_data['target'] = test_data['target'].apply(lambda x: ['<s>',*tokenize(x),'</s>'])
test_data.head()

Unnamed: 0,input,target
0,"[<s>, , New, and, new, technology, has, been,...","[<s>, New, technology, has, been, introduced, ..."
1,"[<s>, , New, and, new, technology, has, been,...","[<s>, New, technology, has, been, introduced, ..."
2,"[<s>, , New, and, new, technology, has, been,...","[<s>, Newer, and, newer, technology, has, been..."
3,"[<s>, , New, and, new, technology, has, been,...","[<s>, Newer, and, newer, technology, has, been..."
4,"[<s>, , One, possible, outcome, is, that, an,...","[<s>, One, possible, outcome, is, that, an, en..."


### Embedding