# Отчет о проделанной работе

## Результаты

 - Достигнута точность 68.66%, что меньше требуемой 70%.
 - Учет поверхностных признаков повысил точность на 0.66 процентного пункта, хотя сами по себе эти признаки дают точность лишь около 58%.

In [1]:
import pandas as pd
import numpy as np
import time
import logging
logging.basicConfig(filename="pt.log", level=logging.INFO)

In [2]:
from sklearn import metrics
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.data import Field, LabelField, BucketIterator, TabularDataset, Iterator

import nltk
import gensim
import spacy
import en_core_web_sm
spacy_en = en_core_web_sm.load()

def tokenizer(text):
    """Tokenize *text* with the spaCy tokenizer, keeping alphabetic tokens only.

    Returns a list with the text of every token for which ``str.isalpha()``
    is true; tokens containing digits, punctuation or whitespace are dropped.
    """
    alphabetic_words = []
    for token in spacy_en.tokenizer(text):
        word = token.text
        if word.isalpha():
            alphabetic_words.append(word)
    return alphabetic_words

In [87]:
# Load the complete dataset from sarcasm.csv into a DataFrame.
df = pd.read_csv('sarcasm.csv')

In [4]:
# Shuffle the rows, then write the first 800000 rows to train6.csv and the
# remaining rows to test6.csv. The DataFrame index is written as a leading
# column named 'index'.
# NOTE(review): shuffle() is called without random_state, so the split is
# presumably different on every run unless the global NumPy seed is fixed
# elsewhere — confirm and consider passing a seed.
df = shuffle(df)
df[:800000].to_csv('train6.csv', index=True, index_label='index')
df[800000:].to_csv('test6.csv', index=True, index_label='index')

In [None]:
# Paths of the train / test CSV files read by the TabularDataset objects below.
TRAIN = 'train6.csv'
TEST = 'test6.csv'
# Minimum token frequency passed to TEXT.build_vocab (min_freq).
FREQ = 50
# Batch size used by both iterators.
batch_size = 32

# Text field: tokenized with `tokenizer`, lower-cased, NLTK English stop words
# removed, '<eos>' as the end-of-sequence token. Batches are batch-first and,
# because include_lengths=True, batch.text is a (token ids, lengths) pair.
TEXT = Field(include_lengths=True, batch_first=True, 
             tokenize=tokenizer,
             eos_token='<eos>',
             lower=True,
             stop_words=nltk.corpus.stopwords.words('english'))

# Label field, numericalized to int64.
LABEL = LabelField(dtype=torch.int64)

# Row index carried through as a plain int64 value (no vocabulary, not a sequence).
INDEX = Field(sequential=False, use_vocab=False, dtype=torch.int64)

# The header row is skipped; the three fields are listed in the order
# index, label, text.
# NOTE(review): presumably the CSV columns appear in exactly this order —
# verify against the files written from sarcasm.csv.
d_train = TabularDataset(TRAIN, format='csv', 
    fields=[('index', INDEX), ('label', LABEL), ('text', TEXT)], skip_header=True)

d_test = TabularDataset(TEST, format='csv', 
    fields=[('index', INDEX), ('label', LABEL), ('text', TEXT)], skip_header=True)

# Vocabularies are built from the training set only. Tokens occurring fewer
# than FREQ times are excluded, and the "glove.6B.100d" vectors are attached.
TEXT.build_vocab(d_train, min_freq=FREQ, vectors="glove.6B.100d")
LABEL.build_vocab(d_train)

# Both iterators use the token count of each example as the sort key.
train_iterator = BucketIterator(d_train, batch_size, shuffle=True,
    sort_key=lambda x: len(x.text))

# NOTE(review): shuffle=True is also set for the test iterator; shuffling
# should not be required for evaluation — confirm whether this is intentional.
test_iterator = BucketIterator(d_test, batch_size, shuffle=True,
    sort_key=lambda x: len(x.text))

# Embedding matrix aligned with TEXT.vocab; copied into the model's embedding layer later.
pretrained_embeddings = TEXT.vocab.vectors


In [None]:
class CNN(nn.Module):
    """Convolutional text classifier over word embeddings.

    One convolution is created per entry of ``filter_sizes``. Each convolution
    slides a window of that many tokens across the sentence and spans the
    full embedding width. Every feature map is max-pooled over time, the
    pooled vectors are concatenated, passed through dropout and then through
    three linear layers.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        n_filters: number of output channels of every convolution.
        filter_sizes: window heights (in tokens), one convolution per entry.
        output_dim: number of outputs produced by the last linear layer.
        dropout: dropout probability applied to the concatenated features.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # The kernels are 2-D (window height x embedding width) and the input
        # is 4-D, so these must be Conv2d layers; Conv1d rejects 4-D input.
        # The layers are registered as conv_0, conv_1, ... so parameter names
        # stay the same as with the previous hand-written attributes.
        for i, size in enumerate(filter_sizes):
            setattr(self, f'conv_{i}',
                    nn.Conv2d(in_channels=1, out_channels=n_filters,
                              kernel_size=(size, embedding_dim)))

        self.fc1 = nn.Linear(len(filter_sizes) * n_filters, 100)

        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(100, 20)
        # Honor output_dim instead of hard-coding a single output unit.
        self.fc3 = nn.Linear(20, output_dim)

    def _conv_layers(self):
        """Return the conv_<i> submodules in registration order."""
        return [module for name, module in self.named_children()
                if name.startswith('conv_')]

    def forward(self, x):
        """Compute raw (pre-sigmoid) scores for a batch of token ids.

        Args:
            x: integer tensor of shape (batch, seq_len); seq_len must be at
                least the largest filter size.

        Returns:
            Tensor of shape (batch, output_dim).
        """
        # (batch, seq_len, emb) -> (batch, 1, seq_len, emb): add a channel axis.
        embedded = self.embedding(x).unsqueeze(1)

        pooled = []
        for conv in self._conv_layers():
            # The kernel covers the whole embedding width, so the last axis
            # has size 1 and is dropped: (batch, n_filters, seq_len - k + 1).
            conved = F.relu(conv(embedded).squeeze(3))
            # Max over the time axis: (batch, n_filters).
            pooled.append(F.max_pool1d(conved, conved.shape[2]).squeeze(2))

        cat = self.dropout(torch.cat(pooled, dim=1))
        # NOTE(review): there is no non-linearity between fc1, fc2 and fc3, so
        # together they act as a single linear map; kept unchanged on purpose.
        x = self.fc1(cat)
        x = self.fc2(x)
        x = self.fc3(x)
        return x
    
# Hyper-parameters of the CNN.
INPUT_DIM = len(TEXT.vocab)   # vocabulary size -> number of embedding rows
EMBEDDING_DIM = 100           # NOTE(review): presumably must match the 100-d GloVe vectors loaded into TEXT.vocab — confirm
N_FILTERS = 100               # output channels per convolution
FILTER_SIZES = [2,3,4,5]      # convolution window heights, in tokens

OUTPUT_DIM = 1

DROPOUT = 0.5
print(INPUT_DIM, OUTPUT_DIM)

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, 
                                FILTER_SIZES, OUTPUT_DIM, DROPOUT)
# Initialize the embedding layer with the pretrained vectors from TEXT.vocab.
model.embedding.weight.data.copy_(pretrained_embeddings)
# Freeze the embedding weights so they are not updated during training.
model.embedding.weight.requires_grad=False


In [None]:
def accuracy(y_pred, y_true):
    """Return the fraction of predictions that agree with the labels.

    ``y_pred`` is thresholded at 0.5 (values >= 0.5 count as positive) and
    ``y_true`` is cast to uint8 before the element-wise comparison. The result
    is a 0-dimensional float tensor.
    """
    predicted_positive = y_pred >= 0.5
    targets = y_true.byte()
    matches = predicted_positive == targets
    return matches.float().mean()

def bb5(t0, min_len=5, pad_value=1):
    """Right-pad a batch of token ids so it has at least ``min_len`` columns.

    The sequence length is read from dimension 1 and the padding is appended
    at the end of the last dimension. Tensors that are already long enough
    are returned unchanged.

    Args:
        t0: tensor of shape (batch, seq_len).
        min_len: minimum sequence length after padding. The default of 5
            matches the behavior of the original hard-coded implementation.
        pad_value: value written into the padded positions.
            NOTE(review): 1 is presumably the '<pad>' index of the text
            vocabulary — confirm.

    Returns:
        The padded tensor, or ``t0`` itself when no padding is needed.
    """
    missing = min_len - t0.size()[1]
    if missing > 0:
        # One pad call replaces the previous per-length chain of F.pad calls.
        # Unlike that chain, a zero-length batch is also padded to min_len.
        t0 = F.pad(t0, (0, missing), 'constant', pad_value)
    return t0

def train(model, iterator, optimizer, criterion, epoch=None):
    """Run one training pass over ``iterator`` and update ``model`` in place.

    Args:
        model: module returning one raw score per example.
        iterator: iterable of batches exposing ``batch.text`` (a tuple whose
            first element is the token-id tensor) and ``batch.label``.
        optimizer: optimizer stepped once per batch.
        criterion: loss taking (probabilities, float targets).
        epoch: optional epoch label used only in the per-batch log line.
            When omitted, a module-level ``epoch`` variable is used if one
            exists, which keeps the log output of existing callers unchanged.

    Returns:
        Tuple ``(mean loss per batch, mean accuracy per batch)``.
    """
    if epoch is None:
        # The function used to read a global `epoch` directly and raised
        # NameError when none was defined; fall back to it without crashing.
        epoch = globals().get('epoch', '?')

    epoch_loss = 0
    epoch_acc = 0

    model.train()
    for i, batch in enumerate(iterator, 1):
        optimizer.zero_grad()

        # Pad short batches so every convolution window fits.
        text = bb5(batch.text[0])
        y_train = batch.label.float()

        logging.info('%s %s %s', epoch, i, text.size())

        # Squeeze only the trailing axis: a bare squeeze() would also drop the
        # batch axis for a batch of one and mismatch the target shape.
        y_pred = torch.sigmoid(model(text)).squeeze(-1)
        loss = criterion(y_pred, y_train)
        acc = accuracy(y_pred, y_train)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    """Compute loss and accuracy of ``model`` over ``iterator`` without training.

    The model is switched to eval mode and gradients are disabled.

    Args:
        model: module returning one raw score per example.
        iterator: iterable of batches exposing ``batch.text`` (a tuple whose
            first element is the token-id tensor) and ``batch.label``.
        criterion: loss taking (probabilities, float targets).

    Returns:
        Tuple ``(mean loss per batch, mean accuracy per batch)``.
    """
    epoch_loss = 0
    epoch_acc = 0
    model.eval()

    with torch.no_grad():
        for batch in iterator:
            # Pad short batches so every convolution window fits.
            text = bb5(batch.text[0])
            y_test = batch.label.float()

            # Single forward pass; the previous version ran the model twice
            # per batch and discarded the first result.
            # Squeeze only the trailing axis so a batch of one keeps its
            # batch dimension and still matches the target shape.
            y_pred = torch.sigmoid(model(text)).squeeze(-1)
            loss = criterion(y_pred, y_test)
            acc = accuracy(y_pred, y_test)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [86]:
# Training driver: Adam with default settings and binary cross-entropy on
# probabilities (train/evaluate apply the sigmoid before calling the criterion).
optimizer = optim.Adam(model.parameters())
criterion = nn.BCELoss()
N_EPOCHS = 40
BEST = 0    # best test accuracy seen so far
EPOCH = 0   # 1-based epoch at which BEST was reached

for epoch in range(N_EPOCHS):
    start = time.time()        
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    
    test_loss, test_acc = evaluate(model, test_iterator, criterion)
    # Elapsed time covers both the training pass and the evaluation pass.
    elapsed = time.time() - start
    print(f'elapsed={elapsed:.1f} | Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')  
    
    # Keep the checkpoint with the highest test accuracy.
    # NOTE(review): the checkpoint is selected on the test set itself, so the
    # reported best accuracy is presumably optimistic — consider a separate
    # validation split.
    # NOTE(review): torch.save(model, ...) stores the whole module object
    # rather than only its state_dict.
    if test_acc > BEST:
        BEST = test_acc
        EPOCH = epoch + 1
        torch.save(model, "t6.pt")

print('best_acc=', BEST, 'epoch_best=', EPOCH)        

len_train= 25000
len_test= 6589
Epoch: 01 | Train Loss: 0.581 | Train Acc: 69.02%
elapsed=772.6 | Test Loss: 0.593 | Test Acc: 67.82%
Epoch: 02 | Train Loss: 0.579 | Train Acc: 69.19%
elapsed=761.5 | Test Loss: 0.592 | Test Acc: 67.93%
Epoch: 03 | Train Loss: 0.578 | Train Acc: 69.30%
elapsed=761.5 | Test Loss: 0.592 | Test Acc: 68.00%
Epoch: 04 | Train Loss: 0.576 | Train Acc: 69.47%
elapsed=766.9 | Test Loss: 0.593 | Test Acc: 67.90%
Epoch: 05 | Train Loss: 0.575 | Train Acc: 69.46%
elapsed=762.7 | Test Loss: 0.591 | Test Acc: 68.07%
Epoch: 06 | Train Loss: 0.573 | Train Acc: 69.63%
elapsed=767.8 | Test Loss: 0.592 | Test Acc: 67.96%
Epoch: 07 | Train Loss: 0.572 | Train Acc: 69.77%
elapsed=766.2 | Test Loss: 0.593 | Test Acc: 68.09%
Epoch: 08 | Train Loss: 0.571 | Train Acc: 69.86%
elapsed=767.9 | Test Loss: 0.593 | Test Acc: 68.08%
Epoch: 09 | Train Loss: 0.570 | Train Acc: 69.89%
elapsed=764.5 | Test Loss: 0.593 | Test Acc: 68.03%
Epoch: 10 | Train Loss: 0.569 | Train Acc: 69.99%
