In [1]:
import pandas as pd
import spacy
from nltk.tokenize import NLTKWordTokenizer
from nltk.corpus import stopwords

In [2]:
nltk_tknzr = NLTKWordTokenizer()
stop_words = set(stopwords.words('english'))

In [3]:
train_data = pd.read_csv('Predictive_Data/train_file.csv')
test_data = pd.read_csv('Predictive_Data/test_file.csv')

In [4]:
train_data.head()

Unnamed: 0,ID,UsageClass,CheckoutType,CheckoutYear,CheckoutMonth,Checkouts,Title,Creator,Subjects,Publisher,PublicationYear,MaterialType
0,1,Physical,Horizon,2005,4,1,Tidal wave,,"Tsunamis, Tsunamis Juvenile literature",,,BOOK
1,2,Physical,Horizon,2005,4,1,London holiday / Richard Peck.,"Peck, Richard, 1934-",,"Viking,",1998.,BOOK
2,3,Physical,Horizon,2005,4,3,Cinco de Mayo : celebrating Hispanic pride / C...,"Gnojewski, Carol",Cinco de Mayo Mexican holiday History Juvenile...,"Enslow Publishers,",c2002.,BOOK
3,4,Physical,Horizon,2005,4,1,Annapolis,,"War stories, Historical fiction, Domestic fict...",,,BOOK
4,5,Physical,Horizon,2005,4,1,As a man thinketh,,Thought and thinking,,,BOOK


In [5]:
train_data['text'] = train_data['Title'] + train_data['Subjects']
test_data['text'] = test_data['Title'] + test_data['Subjects']

In [6]:
train_data = train_data[['ID','text','MaterialType']]

In [7]:
test_data = test_data[['ID','text']]

In [8]:
train_data.MaterialType.value_counts()

BOOK         21707
SOUNDDISC     4149
VIDEOCASS     2751
VIDEODISC     1420
SOUNDCASS     1020
MIXED          347
MUSIC          165
CR              94
Name: MaterialType, dtype: int64

In [9]:
train_data.text

0         Tidal waveTsunamis, Tsunamis Juvenile literature
1                                                      NaN
2        Cinco de Mayo : celebrating Hispanic pride / C...
3        AnnapolisWar stories, Historical fiction, Dome...
4                    As a man thinkethThought and thinking
                               ...                        
31648    California campingCalifornia Guidebooks, Camp ...
31649    silent world of Nicholas QuinnMorse Inspector ...
31650    big LebowskiVideo recordings for the hearing i...
31651    Fables. [3], Storybook love / Bill Willingham,...
31652                                                  NaN
Name: text, Length: 31653, dtype: object

In [10]:
train_data=train_data.fillna("Not Available")
test_data=test_data.fillna("Not Available")

In [11]:
def clean_text(df,column_name):
    df[column_name] = df[column_name].apply(lambda x: x.lower())
    df[column_name] = df[column_name].str.replace("[^a-zA-Z0-9]", " ")
    df[column_name] = df[column_name].apply(lambda x: nltk_tknzr.tokenize(x))
    df[column_name] = df[column_name].apply(lambda x: [word for word in x if word not in stop_words])
    df[column_name] = df[column_name].apply(lambda x:' '.join(x))
    df[column_name] = df[column_name].apply(lambda x: x.lower())
    return df

In [12]:
train_df = clean_text(train_data,'text')
test_df = clean_text(test_data,'text')

In [13]:
train_df.head()

Unnamed: 0,ID,text,MaterialType
0,1,tidal wavetsunamis tsunamis juvenile literature,BOOK
1,2,available,BOOK
2,3,cinco de mayo celebrating hispanic pride carol...,BOOK
3,4,annapoliswar stories historical fiction domest...,BOOK
4,5,man thinkeththought thinking,BOOK


In [14]:
train_df.to_csv('train.csv',index=False)

In [15]:
test_df.to_csv('test.csv',index=False)

In [85]:
train_df.MaterialType.value_counts()

1    21707
2     4149
3     2751
8     1420
4     1020
5      347
6      165
7       94
Name: MaterialType, dtype: int64

In [23]:
mapping = {'BOOK':1,'SOUNDDISC':2,'VIDEOCASS':3,'VIDEODISC':8,'SOUNDCASS':4, 'MIXED':5, 'MUSIC':6, 'CR':7}

In [24]:
train_df.MaterialType = train_df.MaterialType.apply(lambda x: mapping[x])

In [198]:
import pandas as pd
import spacy
import torch.optim as optim
import torch.nn as nn
import random
from torchtext import data , vocab
import torch
import torchtext
from tqdm import tqdm
from torchtext.datasets import text_classification

In [199]:
TEXT = data.Field(tokenize="spacy",
                  sequential=True,
                  batch_first=True,
                  include_lengths=True,
                  lower=True,
                  stop_words=set(stopwords.words('english')))
LABEL = data.LabelField(batch_first=True)



In [200]:
fields = [(None, None),('text',TEXT),('MaterialType', LABEL)]

In [201]:
training_data = data.TabularDataset(path='train.csv',
                                    format='csv',
                                    fields=fields,
                                    skip_header=True)

# print preprocessed text
print(vars(training_data.examples[0]))



{'text': ['tidal', 'wavetsunamis', 'tsunamis', 'juvenile', 'literature'], 'MaterialType': 'BOOK'}


In [202]:
import random
train_data, valid_data = training_data.split(split_ratio=0.9,stratified=True,strata_field='MaterialType',random_state = random.seed(24))

In [203]:
# create vocabulary from glove cache
vec = torchtext.vocab.GloVe(name='6B', dim=300,cache='/Users/subir/Downloads/glove/')

In [204]:
# initialize glove embeddings
TEXT.build_vocab(train_data, min_freq=3, vectors=vec,unk_init = torch.Tensor.normal_)
LABEL.build_vocab(train_data)

In [205]:
# No. of unique tokens in text
print("Size of TEXT vocabulary:", len(TEXT.vocab))

# No. of unique tokens in label
print("Size of LABEL vocabulary:", len(LABEL.vocab))

# Commonly used words
print(TEXT.vocab.freqs.most_common(10))

Size of TEXT vocabulary: 11771
Size of LABEL vocabulary: 8
[('fiction', 25785), ('juvenile', 9143), ('music', 5175), ('literature', 5036), ('states', 4451), ('united', 4232), ('films', 3964), ('history', 3334), ('drama', 2453), ('recordings', 2132)]


In [206]:
# check whether cuda is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# set batch size
BATCH_SIZE = 32

# Load an iterator
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    device=device)



In [207]:
import torch.nn as nn


class classifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 n_layers, bidirectional, dropout,pad_idx):

        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.RNN(embedding_dim,
                            hidden_dim,
                            num_layers=n_layers,
                            bidirectional=bidirectional,
                            dropout=dropout,
                            batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.act = nn.Sigmoid()

    def forward(self, text, text_lengths):
        # text = [batch size,sent_length]
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]
        # packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded,
                                                            text_lengths,
                                                            batch_first=True)
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        # hidden = [batch size, num layers * num directions,hid dim]
        # cell = [batch size, num layers * num directions,hid dim]
        # concat the final forward and backward hidden state
        hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        # hidden = [batch size, hid dim * num directions]
        dense_outputs = self.fc(hidden)
        # Final activation function
        outputs = self.act(dense_outputs)
        return outputs

In [208]:
# define hyperparameters
size_of_vocab = len(TEXT.vocab)
embedding_dim = 300
num_hidden_nodes = 32
num_output_nodes = len(LABEL.vocab)
num_layers = 2
bidirection = True
dropout = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
# instantiate the model
model = classifier(size_of_vocab,
                   embedding_dim,
                   num_hidden_nodes,
                   num_output_nodes,
                   num_layers,
                   bidirectional=True,
                   dropout=dropout,
                  pad_idx=PAD_IDX)

In [209]:
#architecture
print(model)

#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

#Initialize the pretrained embedding
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

print(pretrained_embeddings.shape)

classifier(
  (embedding): Embedding(11771, 300, padding_idx=1)
  (lstm): LSTM(300, 32, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=64, out_features=8, bias=True)
  (act): Sigmoid()
)
The model has 3,642,412 trainable parameters
torch.Size([11771, 300])


In [211]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(embedding_dim)
model.embedding.weight.data[PAD_IDX] = torch.zeros(embedding_dim)

In [212]:
import torch.optim as optim

# define optimizer and loss
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()
scheduler = optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)


# define metric
def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    max_preds = preds.argmax(dim = 1, keepdim = True) # get the index of the max probability
    correct = max_preds.squeeze(1).eq(y)
    return correct.sum() / torch.FloatTensor([y.shape[0]])


# push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

In [213]:
def train(model, iterator, optimizer, criterion, scheduler):

    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # set the model in training phase
    model.train()

    for batch in iterator:

        # resets the gradients after every batch
        optimizer.zero_grad()

        # retrieve text and no. of words
        text, text_lengths = batch.text

        # convert to 1D tensor
        predictions = model(text, text_lengths).squeeze()

        # compute the loss
        loss = criterion(predictions, batch.MaterialType)

        # compute the binary accuracy
        acc = categorical_accuracy(predictions, batch.MaterialType)

        # backpropage the loss and compute the gradients
        loss.backward()

        # update the weights
        optimizer.step()

        #loss and accuracy
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    scheduler.step()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [214]:
def evaluate(model, iterator, criterion):

    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()

    # deactivates autograd
    with torch.no_grad():

        for batch in iterator:

            # retrieve text and no. of words
            text, text_lengths = batch.text

            # convert to 1d tensor
            predictions = model(text, text_lengths).squeeze()

            # compute loss and accuracy
            loss = criterion(predictions, batch.MaterialType)
            acc = categorical_accuracy(predictions, batch.MaterialType)

            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [215]:
N_EPOCHS = 50
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    # train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion,
                                  scheduler)

    # evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    # save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    print('Epoch: {}\n'.format(epoch))
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')




Epoch: 0

	Train Loss: 1.478 | Train Acc: 78.61%
	 Val. Loss: 1.402 |  Val. Acc: 85.23%
Epoch: 1

	Train Loss: 1.391 | Train Acc: 86.02%
	 Val. Loss: 1.399 |  Val. Acc: 85.14%
Epoch: 2

	Train Loss: 1.385 | Train Acc: 86.45%
	 Val. Loss: 1.397 |  Val. Acc: 85.55%
Epoch: 3

	Train Loss: 1.382 | Train Acc: 86.71%
	 Val. Loss: 1.394 |  Val. Acc: 85.64%
Epoch: 4

	Train Loss: 1.380 | Train Acc: 87.00%
	 Val. Loss: 1.394 |  Val. Acc: 85.71%
Epoch: 5

	Train Loss: 1.378 | Train Acc: 87.21%
	 Val. Loss: 1.393 |  Val. Acc: 85.68%
Epoch: 6

	Train Loss: 1.375 | Train Acc: 87.42%
	 Val. Loss: 1.394 |  Val. Acc: 85.45%
Epoch: 7

	Train Loss: 1.373 | Train Acc: 87.53%
	 Val. Loss: 1.394 |  Val. Acc: 85.74%
Epoch: 8

	Train Loss: 1.371 | Train Acc: 87.56%
	 Val. Loss: 1.395 |  Val. Acc: 85.61%
Epoch: 9

	Train Loss: 1.369 | Train Acc: 87.81%
	 Val. Loss: 1.398 |  Val. Acc: 85.68%
Epoch: 10

	Train Loss: 1.368 | Train Acc: 87.89%
	 Val. Loss: 1.397 |  Val. Acc: 85.52%
Epoch: 11

	Train Loss: 1.367 |

In [216]:
model.load_state_dict(torch.load('saved_weights.pt'))

test_loss, test_acc = evaluate(model, valid_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 1.393 | Test Acc: 85.68%


In [179]:

# inference
import spacy
nlp = spacy.load('en')


def predict_class(model, sentence):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    length = [len(indexed)]  # compute no. of words
    tensor = torch.LongTensor(indexed).to(device)
  
    tensor = tensor.unsqueeze(1).T  # reshape in form of batch,no. of words
    length_tensor = torch.LongTensor(length) 
    preds = model(tensor, length_tensor)
    max_preds = preds.argmax(dim = 1)
    return max_preds.item()

In [180]:
prediction = []
ids = []
for row in tqdm(test_df.iterrows()):
    ids.append(row[1]['ID'])
    prediction.append(predict_class(model, row[1]['text']))

21102it [01:31, 231.78it/s]


In [181]:
submission = pd.DataFrame(data={'ID':ids,'MaterialType':prediction})

In [182]:
submission.head()

Unnamed: 0,ID,MaterialType
0,31654,0
1,31655,2
2,31656,1
3,31657,0
4,31658,2


In [183]:
submission.MaterialType=submission.MaterialType.apply(lambda x: LABEL.vocab.itos[x])

In [184]:
submission.MaterialType.value_counts()

BOOK         15958
SOUNDDISC     2753
VIDEOCASS     2391
Name: MaterialType, dtype: int64

In [185]:
submission.to_csv('sub.csv',index=False)