In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset_v2.json
/kaggle/input/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset.json


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from torch import nn
import torch
from torch.utils.data import DataLoader
from torch.optim import Adam
from tqdm.notebook import tqdm

In [3]:
data = pd.read_json('/kaggle/input/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset.json', 
                    lines=True)

In [4]:
data.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [5]:
data.iloc[0]['headline']

"former versace store clerk sues over secret 'black code' for minority shoppers"

In [6]:
data['is_sarcastic'].value_counts(normalize = True)

is_sarcastic
0    0.561047
1    0.438953
Name: proportion, dtype: float64

In [7]:
X = data['headline']
y = data['is_sarcastic'].values

X = X.str.lower()
X = X.str.split()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [8]:
unique_words = set(X_train.sum())
vocabulary = {x: i+1 for i, x in enumerate(unique_words)}

In [9]:
X.map(len).describe()

count    26709.000000
mean         9.845820
std          3.168955
min          2.000000
25%          8.000000
50%         10.000000
75%         12.000000
max         39.000000
Name: headline, dtype: float64

In [10]:
X.map(len).quantile(0.95)

15.0

In [11]:
MAX_LENGTH = 15

In [12]:
def encode(words, max_length = MAX_LENGTH):
    tokens = [vocabulary.get(word, 0) for word in words] #0 if we didn't see this word in train
    tokens += [0]*(max_length - len(tokens)) #add if < max_length
    tokens = tokens[:max_length] #cut if > max_length
    return tokens

In [13]:
X_train_tokens = np.stack(X_train.map(encode))
X_test_tokens = np.stack(X_test.map(encode))

In [14]:
X_train_tokens = torch.Tensor(X_train_tokens).to(torch.int64)
X_test_tokens = torch.Tensor(X_test_tokens).to(torch.int64)
y_train = torch.Tensor(y_train).type(torch.LongTensor)
y_test = torch.Tensor(y_test).type(torch.LongTensor)

In [15]:
X_train_tokens[:10]

tensor([[27390,  5702,  3330, 17045, 26679,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0],
        [ 7887, 26830, 22616, 22497, 18891, 12444, 17324, 21546,     0,     0,
             0,     0,     0,     0,     0],
        [11694,  4147, 18548, 21025,  4147,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0],
        [ 3905, 21925,  3844,  1269,  2743, 24424, 18601,  5880, 26885, 13297,
         17308,     0,     0,     0,     0],
        [28037, 17132, 23456,  7558, 10740, 22847,   905, 20362, 15033, 11748,
         10740, 12096,     0,     0,     0],
        [21958, 27071,  1505, 17838, 27325,  8381,     0,     0,     0,     0,
             0,     0,     0,     0,     0],
        [21842, 20901, 26830, 17838, 19749,  1301, 22393,     0,     0,     0,
             0,     0,     0,     0,     0],
        [30805, 24437, 25830, 29457, 15918, 12448,  2596,  4293, 23640,     0,
             0,     0,     0,     0,     0],
        

In [16]:
embedding = nn.Embedding(len(vocabulary)+1, 8)

In [17]:
batch = X_train_tokens[:8, :]
batch

tensor([[27390,  5702,  3330, 17045, 26679,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0],
        [ 7887, 26830, 22616, 22497, 18891, 12444, 17324, 21546,     0,     0,
             0,     0,     0,     0,     0],
        [11694,  4147, 18548, 21025,  4147,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0],
        [ 3905, 21925,  3844,  1269,  2743, 24424, 18601,  5880, 26885, 13297,
         17308,     0,     0,     0,     0],
        [28037, 17132, 23456,  7558, 10740, 22847,   905, 20362, 15033, 11748,
         10740, 12096,     0,     0,     0],
        [21958, 27071,  1505, 17838, 27325,  8381,     0,     0,     0,     0,
             0,     0,     0,     0,     0],
        [21842, 20901, 26830, 17838, 19749,  1301, 22393,     0,     0,     0,
             0,     0,     0,     0,     0],
        [30805, 24437, 25830, 29457, 15918, 12448,  2596,  4293, 23640,     0,
             0,     0,     0,     0,     0]])

In [18]:
embedding(batch)

tensor([[[-1.6595e+00,  1.5631e-01, -2.0175e+00,  1.0455e+00,  2.3566e-01,
          -1.9820e-01, -1.2759e+00, -9.2908e-01],
         [ 1.2807e-01, -1.6156e+00,  2.2920e+00,  4.2425e-01, -9.7808e-01,
           7.4626e-01,  9.4392e-01,  5.2310e-01],
         [ 7.6578e-01, -1.2106e+00, -6.9320e-01,  8.8116e-01, -1.3299e-02,
          -2.1785e+00,  1.1815e+00, -8.1894e-01],
         [-1.7198e+00, -3.0463e-01, -1.2328e+00,  3.8999e-01, -1.0775e+00,
           1.0065e+00, -8.3230e-02,  1.1191e+00],
         [ 8.4491e-01,  1.3797e+00,  8.4762e-02, -9.9734e-01, -9.1550e-02,
           4.7394e-01, -3.2732e-01, -1.1087e+00],
         [-3.8610e-01, -1.0702e+00,  3.6634e-03, -6.7629e-01,  1.0031e+00,
           1.6347e+00, -5.8754e-02,  4.9604e-01],
         [-3.8610e-01, -1.0702e+00,  3.6634e-03, -6.7629e-01,  1.0031e+00,
           1.6347e+00, -5.8754e-02,  4.9604e-01],
         [-3.8610e-01, -1.0702e+00,  3.6634e-03, -6.7629e-01,  1.0031e+00,
           1.6347e+00, -5.8754e-02,  4.9604e-01],


In [19]:
embedding(batch).shape #присвоили каждому числу, обозначающему слова, случайный вектор

torch.Size([8, 15, 8])

In [20]:
train_loader = DataLoader(
    list(zip(X_train_tokens, y_train)), 
    batch_size = 32
)

val_loader = DataLoader(
    list(zip(X_test_tokens, y_test)), 
    batch_size = 32
)

In [21]:
for tokens, target in train_loader:
    print(tokens, target)
    break

tensor([[27390,  5702,  3330, 17045, 26679,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0],
        [ 7887, 26830, 22616, 22497, 18891, 12444, 17324, 21546,     0,     0,
             0,     0,     0,     0,     0],
        [11694,  4147, 18548, 21025,  4147,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0],
        [ 3905, 21925,  3844,  1269,  2743, 24424, 18601,  5880, 26885, 13297,
         17308,     0,     0,     0,     0],
        [28037, 17132, 23456,  7558, 10740, 22847,   905, 20362, 15033, 11748,
         10740, 12096,     0,     0,     0],
        [21958, 27071,  1505, 17838, 27325,  8381,     0,     0,     0,     0,
             0,     0,     0,     0,     0],
        [21842, 20901, 26830, 17838, 19749,  1301, 22393,     0,     0,     0,
             0,     0,     0,     0,     0],
        [30805, 24437, 25830, 29457, 15918, 12448,  2596,  4293, 23640,     0,
             0,     0,     0,     0,     0],
        

In [22]:
model = nn.Sequential(
    nn.Embedding(len(vocabulary)+1, 8),
    nn.ReLU(),
    nn.Flatten(),
    nn.Linear(MAX_LENGTH * 8, 8),
    nn.ReLU(),
    nn.Linear(8,2)
)
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters())

In [23]:
model(batch)

tensor([[-0.4503,  0.0764],
        [-0.3945,  0.0449],
        [-0.4783,  0.2591],
        [-0.4680,  0.0527],
        [-0.4788,  0.3257],
        [-0.2919,  0.2556],
        [-0.4309, -0.0528],
        [-0.4518,  0.2438]], grad_fn=<AddmmBackward0>)

In [24]:
losses = []
accuracies = []
val_losses = []
val_accuracies = []

for epoch in range(10):
    total_loss = 0 
    total_accuracy = 0
    for (tokens, target) in tqdm(train_loader, total = len(train_loader)):
        optimizer.zero_grad()
        predict = model(tokens)
        loss = criterion(predict, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        total_accuracy += (predict.argmax(dim=1) == target).to(torch.float).mean()
    losses.append(total_loss/len(train_loader))
    accuracies.append(total_accuracy/len(train_loader))
    
    val_total_loss = 0
    val_total_accuracy = 0
    for (tokens, target) in tqdm(val_loader, total = len(val_loader)):
        with torch.no_grad():
            predict = model(tokens)
            loss = criterion(predict, target)
        val_total_loss += loss.item()
        val_total_accuracy += (predict.argmax(dim=1) == target).to(torch.float).mean()
    val_losses.append(val_total_loss/len(val_loader)) 
    val_accuracies.append(val_total_accuracy/len(val_loader))
    
    print(f"epoch {epoch}, train: {accuracies[-1]}, test: {val_accuracies[-1]}")

  0%|          | 0/668 [00:00<?, ?it/s]

  0%|          | 0/167 [00:00<?, ?it/s]

epoch 0, train: 0.5881118178367615, test: 0.6269835233688354


  0%|          | 0/668 [00:00<?, ?it/s]

  0%|          | 0/167 [00:00<?, ?it/s]

epoch 1, train: 0.667404294013977, test: 0.6926646828651428


  0%|          | 0/668 [00:00<?, ?it/s]

  0%|          | 0/167 [00:00<?, ?it/s]

epoch 2, train: 0.7318508625030518, test: 0.7287799715995789


  0%|          | 0/668 [00:00<?, ?it/s]

  0%|          | 0/167 [00:00<?, ?it/s]

epoch 3, train: 0.7769948840141296, test: 0.7517839670181274


  0%|          | 0/668 [00:00<?, ?it/s]

  0%|          | 0/167 [00:00<?, ?it/s]

epoch 4, train: 0.8072991371154785, test: 0.768812358379364


  0%|          | 0/668 [00:00<?, ?it/s]

  0%|          | 0/167 [00:00<?, ?it/s]

epoch 5, train: 0.8310641050338745, test: 0.7800648212432861


  0%|          | 0/668 [00:00<?, ?it/s]

  0%|          | 0/167 [00:00<?, ?it/s]

epoch 6, train: 0.8515543937683105, test: 0.7912923693656921


  0%|          | 0/668 [00:00<?, ?it/s]

  0%|          | 0/167 [00:00<?, ?it/s]

epoch 7, train: 0.8682553172111511, test: 0.7970932722091675


  0%|          | 0/668 [00:00<?, ?it/s]

  0%|          | 0/167 [00:00<?, ?it/s]

epoch 8, train: 0.8822429776191711, test: 0.8006486892700195


  0%|          | 0/668 [00:00<?, ?it/s]

  0%|          | 0/167 [00:00<?, ?it/s]

epoch 9, train: 0.8940501809120178, test: 0.8062624335289001


In [25]:
len(losses)

10