<a href="https://colab.research.google.com/github/AnnaZhuravleva/compling/blob/master/exam_var_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Question

**6. Is it possible to train distributed representations for unstructured data (table dataset with categorical features)? How?** 

We should encode the data  and make normalization
- target mean coding
- select the most valuable parameters
- reduce the number of parameters

## Task
Develop a model for predicting review rating.


Binary classification:

- positive class: target = 5
- negative class: target = 1,2,3,4
- Score: binary F1

You are forbidden to use test dataset for any kind of training.
Remember proper training pipeline.

If you are not using default params in the models, you have to use some validation scheme to justify them.

Use random_state or seed params - your experiment must be reprodusible.

- 1 baseline = 0.720
- 2 baseline = 0.745

### Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD
from sklearn.externals import joblib
from sklearn import metrics
from tqdm import tqdm, tqdm_notebook
import nltk 
from nltk import PunktSentenceTokenizer
import gensim.downloader as api
nltk.download('punkt')
nltk.download('stopwords')
import re
import nltk
import gensim
import spacy
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torchtext.data import Field, LabelField, BucketIterator, TabularDataset, Iterator
from keras.utils.np_utils import to_categorical
from tqdm import tqdm_notebook as tqdm
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn import metrics
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
stopwords = stopwords.words('english')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Using TensorFlow backend.


### Data

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

df_train['target'] = (df_train['target'] == 5).astype(np.int)
df_test['target'] = (df_test['target'] == 5).astype(np.int)

df_train.shape

(48192, 3)

In [3]:
import spacy
spacy_en = spacy.load('en')
import nltk
nltk.download('stopwords')

def tokenizer(text): 
    return [tok.text for tok in spacy_en.tokenizer(text) if tok.text not in string.punctuation]

TEXT = Field(include_lengths=True, batch_first=True, 
             tokenize=tokenizer, 
             eos_token='<eos>',
             lower=True,
             stop_words=nltk.corpus.stopwords.words('english'))

LABEL = LabelField(dtype=torch.int64, use_vocab=False, preprocessing=lambda x: x == '5')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
batch_size = 32

dataset = TabularDataset('train.csv', format='csv', 
                         fields=[('review', TEXT), ('title', TEXT),('target', LABEL)], 
                         skip_header=True)
test_df = TabularDataset('test.csv', format='csv', 
                         fields=[('review', TEXT), ('title', TEXT), ('target', LABEL)], 
                         skip_header=True)
train, valid = dataset.split(0.7)

TEXT.build_vocab(dataset, min_freq=1)
LABEL.build_vocab(dataset)


train_iterator, valid_iterator, test_iter = BucketIterator.splits(
    (train, valid, test_df),
    batch_sizes=(batch_size, batch_size, batch_size),
    shuffle=True,
    sort_key=lambda x: len(x.review),
)
len(TEXT.vocab.itos)

48821

### Model

In [0]:
class MyModel(nn.Module):
    
    def __init__(self, vocab_size, embed_size, hidden_size, kernels):
        super(MyModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.convs = nn.ModuleList([nn.Conv1d(embed_size, hidden_size, k, padding=5) for k in kernels])
        self.fc = nn.Linear(hidden_size * len(kernels), 2)
        
    def forward(self, x):

        x = self.embedding(x)
        x = x.transpose(1,2)   
        concatenated = []

        for conv in self.convs:
            z = conv(x)
            z = F.avg_pool1d(z, kernel_size=z.size(2))
            z = z.squeeze(2)
            concatenated.append(z)
            
        x = torch.cat(concatenated, 1)
        x = self.fc(x)
        return x


def early_stopping(loss, patience=5):
    if len(loss) < patience:
        return False
    best = np.min(np.array(loss))
    return all(map(lambda x: x > best, loss[-patience:]))

def train_batch(model, batch_x, batch_y, optimizer=None):
    if optimizer:
        optimizer.zero_grad()
    pred = model(batch_x)
    loss = criterion(pred, batch_y)
    loss.backward()
    if optimizer:
        optimizer.step()
    return loss.data.detach().item()

def eval_batch(model, batch_x, batch_y):
    model.eval()
    with torch.no_grad():
        pred = model(batch_x)
        loss = criterion(pred, batch_y)
    return loss.data.item()

def train_model(model, train_iter, valid_iter, scheduler, 
                n_epochs, early_stopping_patience=5):
    history = {'train': [], 'valid': []}    
    for epoch in range(n_epochs):
        epoch_train_loss = 0
        for batch in tqdm(train_iter, desc=f'epoch:{epoch}', leave=True):
            loss = train_batch(model, batch.review[0], batch.target)
            epoch_train_loss += loss
            if scheduler:
                scheduler.step(loss)
        epoch_train_loss /= len(train_iter)
        history['train'].append(epoch_train_loss)
        epoch_valid_loss = 0
        for batch in valid_iter:
            loss = eval_batch(model, batch.review[0], batch.target)
            epoch_valid_loss += loss
            
        epoch_valid_loss /= len(valid_iter)
        print(f'train loss: {epoch_train_loss:.3f} \
                valid loss:{epoch_valid_loss:.3f}')
        history['valid'].append(epoch_valid_loss)
        
        if early_stopping(history['valid'], patience=5):
            print('early stopping')
            break

    print('history valid:', history['valid'])
    return history

### Train & test

In [45]:
torch.cuda.empty_cache()

model = MyModel(len(TEXT.vocab.itos),
                embed_size=100,
                hidden_size=64,
                kernels=[7,9]
               )


optimizer = optim.Adam(model.parameters())
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True, cooldown=5)
criterion = nn.CrossEntropyLoss()
history = train_model(model, train_iterator, valid_iterator, scheduler, 2, 5)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, description='epoch:0', max=1055, style=ProgressStyle(description_width='in…

Epoch    12: reducing learning rate of group 0 to 1.0000e-04.
Epoch    23: reducing learning rate of group 0 to 1.0000e-05.
Epoch    36: reducing learning rate of group 0 to 1.0000e-06.
Epoch    47: reducing learning rate of group 0 to 1.0000e-07.
Epoch    58: reducing learning rate of group 0 to 1.0000e-08.

train loss: 0.683                 valid loss:0.689


HBox(children=(IntProgress(value=0, description='epoch:1', max=1055, style=ProgressStyle(description_width='in…


train loss: 0.683                 valid loss:0.689
history valid: [0.6893594882129568, 0.6893594882129568]


In [0]:
def test_model(model, test_iter):
    model.eval()
    y_test = []
    y_pred = []
    for batch in tqdm(test_iter, desc='test'):
        with torch.no_grad():
            pred = model(batch.review[0])
            y_test.append(batch.target)
            y_pred.append(np.argmax(pred, axis=-1))
    y_test = np.concatenate(y_test)
    y_pred = np.concatenate(y_pred)

    return metrics.f1_score(y_test, y_pred)

In [47]:
test_model(model, test_iter)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(IntProgress(value=0, description='test', max=168, style=ProgressStyle(description_width='initia…




0.36868686868686873

In [0]:
from sklearn.pipeline import Pipeline
model = Pipeline([
    ('est',RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = SEED))
])

model = Pipeline([
    ('enc', OneHotEncoder()),
    ('est',RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = SEED))
    #('est', LogisticRegressionCV(Cs=5, cv=5, n_jobs=-1, scoring='f1_macro', 
    #                         penalty='l2', solver='newton-cg', multi_class='multinomial', random_state=SEED)),
])

model.fit(df_train[columns], y_train)