In [2]:
import torch
import torch.nn as nn
import torchtext.data as ttd
from torchtext.vocab import GloVe
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [12]:
data = pd.read_csv("../Data/spam.csv", encoding='ISO-8859-1')
data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)

In [13]:
data.columns = ['labels','data']

In [15]:
data.head()

Unnamed: 0,labels,data
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [17]:
data['b_labels'] = data['labels'].map({'ham':0, 'spam':1})

In [18]:
data

Unnamed: 0,labels,data,b_labels
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will Ì_ b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [19]:
df = data[['data','b_labels']]

In [20]:
df.to_csv("spam.csv", index=False)

In [21]:
TEXT = ttd.Field(sequential=True, batch_first=True, lower=True, tokenize='spacy', pad_first=True)

In [22]:
LABEL = ttd.Field(sequential=False, use_vocab=False, is_target=True)

In [23]:
dataset = ttd.TabularDataset(
    path="spam.csv",
    format="csv",
    skip_header=True,
    fields=[('data',TEXT),('label',LABEL)]
)

In [31]:
train_dataset, test_dataset = dataset.split() # default is .7

In [32]:
TEXT.build_vocab(train_dataset,)

In [33]:
vocab = TEXT.vocab

In [34]:
vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x00000248C40B0B50>>,
            {'<unk>': 0,
             '<pad>': 1,
             '.': 2,
             'i': 3,
             'to': 4,
             'you': 5,
             ',': 6,
             '?': 7,
             'a': 8,
             '!': 9,
             'the': 10,
             '...': 11,
             'u': 12,
             'and': 13,
             'is': 14,
             'in': 15,
             'me': 16,
             'my': 17,
             'it': 18,
             'for': 19,
             'do': 20,
             '..': 21,
             'your': 22,
             'have': 23,
             'of': 24,
             'that': 25,
             'call': 26,
             'on': 27,
             'are': 28,
             'now': 29,
             "'s": 30,
             '2': 31,
             '&': 32,
             'but': 33,
             'so': 34,
             'not': 35,
             'can': 36,
             'we': 37,
            

In [35]:
vocab.itos

['<unk>',
 '<pad>',
 '.',
 'i',
 'to',
 'you',
 ',',
 '?',
 'a',
 '!',
 'the',
 '...',
 'u',
 'and',
 'is',
 'in',
 'me',
 'my',
 'it',
 'for',
 'do',
 '..',
 'your',
 'have',
 'of',
 'that',
 'call',
 'on',
 'are',
 'now',
 "'s",
 '2',
 '&',
 'but',
 'so',
 'not',
 'can',
 'we',
 ':',
 'or',
 'if',
 'at',
 'will',
 'get',
 "'m",
 'ur',
 'with',
 ' ',
 'be',
 ';',
 'no',
 "n't",
 'just',
 'this',
 'nt',
 '*',
 'how',
 'up',
 '-',
 'what',
 'when',
 'ok',
 ')',
 '4',
 'go',
 'from',
 'all',
 'know',
 'free',
 'out',
 'was',
 'then',
 'like',
 'good',
 'come',
 'got',
 '"',
 '/',
 'he',
 'there',
 'lt;#&gt',
 'am',
 'only',
 'time',
 'day',
 'send',
 "'ll",
 'its',
 'love',
 'want',
 'did',
 'as',
 'text',
 'by',
 'home',
 'one',
 'she',
 'today',
 'txt',
 'about',
 'see',
 'sorry',
 'going',
 'need',
 'r',
 'still',
 'stop',
 'n',
 'lor',
 'think',
 'mobile',
 'our',
 'da',
 'back',
 'tell',
 'reply',
 'who',
 'her',
 'they',
 'any',
 'later',
 'take',
 'd',
 'new',
 'ca',
 '_',
 'an',


In [36]:
train_iter, test_iter = ttd.Iterator.splits(
    (train_dataset, test_dataset),
    sort_key=lambda x : len(x.data),
    batch_sizes=(32, 256)
)

In [37]:
for inputs, targets in train_iter:
    print(inputs.shape)
    print(targets.shape)

torch.Size([32, 50])
torch.Size([32])
torch.Size([32, 55])
torch.Size([32])
torch.Size([32, 52])
torch.Size([32])
torch.Size([32, 37])
torch.Size([32])
torch.Size([32, 76])
torch.Size([32])
torch.Size([32, 49])
torch.Size([32])
torch.Size([32, 36])
torch.Size([32])
torch.Size([32, 46])
torch.Size([32])
torch.Size([32, 107])
torch.Size([32])
torch.Size([32, 49])
torch.Size([32])
torch.Size([32, 44])
torch.Size([32])
torch.Size([32, 32])
torch.Size([32])
torch.Size([32, 39])
torch.Size([32])
torch.Size([32, 82])
torch.Size([32])
torch.Size([32, 54])
torch.Size([32])
torch.Size([32, 39])
torch.Size([32])
torch.Size([32, 95])
torch.Size([32])
torch.Size([32, 56])
torch.Size([32])
torch.Size([32, 44])
torch.Size([32])
torch.Size([32, 59])
torch.Size([32])
torch.Size([32, 76])
torch.Size([32])
torch.Size([32, 38])
torch.Size([32])
torch.Size([32, 78])
torch.Size([32])
torch.Size([32, 36])
torch.Size([32])
torch.Size([32, 43])
torch.Size([32])
torch.Size([32, 45])
torch.Size([32])
torch.Size(

In [None]:
# Define the model
class LSTM(nn.Module):
    def __init__