In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import re
import spacy
import jovian
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import string
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import mean_squared_error
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


# Check GPU

In [None]:
# Pick the compute device: CUDA when a GPU is visible, otherwise fall back to CPU.
cuda = torch.cuda.is_available()
device = torch.device("cuda" if cuda else "cpu")
# Enable the cuDNN autotuner.
torch.backends.cudnn.benchmark = True
if cuda:
    # get_device_name() raises when no CUDA device exists, so only query it here.
    print("Using {}: {}".format(device, torch.cuda.get_device_name(0)))
else:
    print("Using {}".format(device))

# 1. Global Variables

In [None]:
# Locations of the train / validation split CSV files.
TRAIN_CSV = 'input/split/train.csv'
VALID_CSV = 'input/split/valid.csv'

# Mini-batch size shared by the training and validation DataLoaders.
BATCH_SIZE = 512

# Encoder mapping category names to integer class ids, fitted on the
# 'category' column of the full training CSV (fit() returns the encoder itself).
LABEL_ENCODER = LabelEncoder().fit(pd.read_csv("input/train.csv")['category'])


# 2. Preprocess data

## 2.1 Read data

In [None]:
def remov_duplicates(input):
    """Drop repeated words from a space-separated string.

    The text is split on single spaces; every distinct piece is kept once, in
    order of first appearance, and the survivors are re-joined with single
    spaces. Comparison is exact (case-sensitive).
    """
    pieces = input.split(" ")
    # dict keys are unique and remember insertion order, so this keeps the
    # first occurrence of each piece.
    first_seen = dict.fromkeys(pieces)
    return " ".join(first_seen)

# tokenization
# Shared spaCy pipeline; tokenize() below only uses its `.tokenizer`.
# NOTE(review): the 'en' shortcut name is presumably tied to older spaCy
# releases — confirm it resolves with the installed spaCy version.
tok = spacy.load('en')
# Patterns are compiled once at definition time instead of on every call.
# Runs of non-ASCII characters.
_NON_ASCII_RE = re.compile(r"[^\x00-\x7F]+")
# Any single ASCII punctuation character, digit, CR, tab or newline.
_PUNCT_DIGIT_RE = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')


def tokenize(text):
    """Split ``text`` into lowercase tokens with the shared spaCy tokenizer.

    Non-ASCII runs, punctuation, digits and CR/tab/newline characters are each
    replaced by a space before tokenizing.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    list of str
        Token texts as produced by ``tok.tokenizer`` on the cleaned string.
    """
    ascii_only = _NON_ASCII_RE.sub(" ", text)
    nopunct = _PUNCT_DIGIT_RE.sub(" ", ascii_only.lower())
    return [token.text for token in tok.tokenizer(nopunct)]

def encode_sentence(text, vocab2index, N=15):
    """Convert ``text`` into a fixed-length vector of vocabulary ids.

    The text is tokenized with ``tokenize``; tokens missing from
    ``vocab2index`` map to the id stored under "UNK". The id sequence is
    truncated to ``N`` entries or right-padded with zeros up to ``N``.

    Returns
    -------
    (numpy.ndarray, int)
        The length-``N`` integer id array and the number of real (non-padding)
        ids it contains.
    """
    ids = np.array([vocab2index.get(token, vocab2index["UNK"]) for token in tokenize(text)])
    n_kept = min(N, len(ids))
    padded = np.zeros(N, dtype=int)
    padded[:n_kept] = ids[:n_kept]
    return padded, n_kept

def preprocess_dataset(path):
    """Load a CSV and turn each row's text into a fixed-length id sequence.

    Parameters
    ----------
    path : str
        CSV file containing at least the columns 'id', 'gender', 'baseColour',
        'usage', 'noisyTextDescription' and 'category'.

    Returns
    -------
    reviews : pandas.DataFrame
        Columns 'id', 'description', 'category', 'description_length' and
        'encoded'. Each 'encoded' cell is the ``(ids, length)`` pair returned
        by ``encode_sentence``.
    words : list of str
        Vocabulary in index order: "" at index 0 (the value used for padding),
        "UNK" at index 1, then every token seen at least twice.
    """
    # read csv
    reviews = pd.read_csv(path)

    # combine the text-bearing columns into one description string
    reviews['description'] = (
        reviews['gender'] + ' '
        + reviews['baseColour'] + ' '
        + reviews['usage'] + ' '
        + reviews['noisyTextDescription']
    )
    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of the original (avoids SettingWithCopyWarning).
    reviews = reviews[['id', 'description', 'category']].copy()

    # remove duplicate words
    reviews['description'] = reviews['description'].apply(remov_duplicates)
    reviews['description_length'] = reviews['description'].apply(lambda x: len(x.split()))

    # count number of occurrences of each token
    counts = Counter()
    for description in reviews['description']:
        counts.update(tokenize(description))

    # delete infrequent words (seen fewer than 2 times)
    print("num_words before:",len(counts.keys()))
    for word in list(counts):
        if counts[word] < 2:
            del counts[word]
    print("num_words after:",len(counts.keys()))

    # create vocabulary; ids follow the order in which tokens were first counted
    vocab2index = {"":0, "UNK":1}
    words = ["", "UNK"]
    for word in counts:
        vocab2index[word] = len(words)
        words.append(word)

    # Keep the (ids, length) tuple as-is: wrapping it in np.array() builds a
    # ragged array, which newer NumPy versions reject with a ValueError.
    # Indexing with [0] / [1] works the same on the tuple.
    reviews['encoded'] = reviews['description'].apply(lambda x: encode_sentence(x, vocab2index))

    return reviews, words
    

In [None]:

# Build the encoded frame and vocabulary from the full training CSV.
reviews, words = preprocess_dataset("input/train.csv")

# Features are the (ids, length) entries; targets are the raw category names.
X = list(reviews['encoded'])
y = list(reviews['category'])

# Fixed 80/20 split (random_state pins the shuffle).
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=11
)

# Replace category names with the integer ids of the pre-fitted encoder.
y_train = LABEL_ENCODER.transform(y_train)
y_valid = LABEL_ENCODER.transform(y_valid)
y_valid




In [None]:
class ReviewsDataset(Dataset):
    """Map-style dataset over pre-encoded samples.

    ``X`` holds one ``(token_ids, length)`` entry per sample (``token_ids`` a
    NumPy array) and ``Y`` the matching labels.
    """

    def __init__(self, X, Y):
        self.X, self.y = X, Y

    def __len__(self):
        # One label per sample, so the label count is the dataset size.
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(int32 id tensor, label, length)`` for sample ``idx``."""
        sample = self.X[idx]
        ids = torch.from_numpy(sample[0].astype(np.int32))
        return ids, self.y[idx], sample[1]

In [None]:
# Wrap the train / validation splits in Dataset objects so a DataLoader can batch them.
train_ds = ReviewsDataset(X_train, y_train)
valid_ds = ReviewsDataset(X_valid, y_valid)

In [None]:
def train_model(model, epochs=10, lr=0.001):
    """Train ``model`` with Adam and checkpoint the epoch with the lowest validation loss.

    Batches are read from the notebook-level ``train_dl``; after every epoch the
    model is scored on the notebook-level ``val_dl`` via ``validation_metrics``.
    Whenever the validation loss improves, the model's ``state_dict`` is written
    to './output/rnn-model.pt'.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(x, lengths)`` and expected to return class logits.
    epochs : int
        Number of passes over ``train_dl``.
    lr : float
        Adam learning rate.
    """
    import os  # local import: only needed to create the checkpoint directory

    checkpoint_path = './output/rnn-model.pt'
    # torch.save() does not create missing directories.
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    # Feed batches on whatever device the model's weights live on; for a CPU
    # model the .to() calls below are no-ops.
    model_device = next(model.parameters()).device

    best_valid_loss = float('inf')
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in train_dl:
            x = x.long().to(model_device)
            y = y.long().to(model_device)
            # `l` (sequence lengths) is passed through untouched.
            optimizer.zero_grad()
            y_pred = model(x, l)
            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            optimizer.step()
            # Weight the batch-mean loss by batch size so the epoch average is per sample.
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc, val_rmse = validation_metrics(model, val_dl)

        # save best model
        if val_loss < best_valid_loss:
            best_valid_loss = val_loss
            torch.save(model.state_dict(), checkpoint_path)

        # Report on epochs 1, 6, 11, ... (0-based).
        if i % 5 == 1:
            print("train loss %.3f, val loss %.3f, val accuracy %.2f%%, and val rmse %.3f" % (sum_loss/total, val_loss, val_acc, val_rmse))

def validation_metrics(model, valid_dl):
    """Evaluate ``model`` on every batch of ``valid_dl``.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(x, lengths)`` and expected to return class logits.
    valid_dl : iterable
        Yields ``(x, y, lengths)`` batches.

    Returns
    -------
    (float, torch.Tensor, float)
        Per-sample mean cross-entropy loss, accuracy in percent (0-dim tensor),
        and the sample-weighted mean of the per-batch RMSE between predicted
        and true class ids.
    """
    model.eval()
    # Run on the device holding the model's weights (CPU if it has none).
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    # No gradients are needed for evaluation; skipping them avoids building
    # autograd graphs for every validation batch.
    with torch.no_grad():
        for x, y, l in valid_dl:
            x = x.long().to(model_device)
            y = y.long().to(model_device)
            y_hat = model(x, l)
            loss = F.cross_entropy(y_hat, y)
            pred = torch.max(y_hat, 1)[1]
            correct += (pred == y).float().sum()
            batch_size = y.shape[0]
            total += batch_size
            sum_loss += loss.item() * batch_size
            # RMSE over class ids, computed in float64 directly on the tensors
            # so it also works when the batch lives on a GPU.
            batch_rmse = (pred - y).double().pow(2).mean().sqrt().item()
            sum_rmse += batch_rmse * batch_size
    return sum_loss/total, correct/total * 100, sum_rmse/total

In [None]:
# Vocabulary length (including the "" and "UNK" entries); passed to the model below.
vocab_size = len(words)
# train_model() reads `train_dl` and `val_dl` as notebook-level globals.
# Only the training loader shuffles its samples.
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=BATCH_SIZE)

# Model

In [None]:
class LSTM_variable_input(torch.nn.Module):
    """Embedding -> dropout -> LSTM -> linear classifier over padded id sequences.

    Parameters
    ----------
    vocab_size : int
        Number of rows in the embedding table; index 0 is the padding index.
    embedding_dim : int
        Size of each token embedding.
    hidden_dim : int
        LSTM hidden-state size.
    num_classes : int, optional
        Number of output logits (default 27, the previously hard-coded value).
    dropout : float, optional
        Dropout probability applied to the embeddings (default 0.3).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes=27, dropout=0.3):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(dropout)
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, num_classes)

    def forward(self, x, s):
        """Return class logits for a batch.

        ``x`` is a (batch, seq_len) tensor of token ids (batch-first, as the
        LSTM is configured) and ``s`` holds the true length of each sequence,
        as a tensor or a list of ints.
        """
        x = self.embeddings(x)
        x = self.dropout(x)
        # pack_padded_sequence needs CPU lengths that are all > 0. Clamping to 1
        # lets an all-padding row pass through as a single padding step
        # instead of raising.
        lengths = torch.as_tensor(s, dtype=torch.int64).cpu().clamp(min=1)
        x_pack = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        _, (ht, _) = self.lstm(x_pack)
        # ht[-1] is the final hidden state of the last LSTM layer.
        out = self.linear(ht[-1])
        return out

# Train

In [None]:
# Instantiate the classifier with embedding_dim=50 and hidden_dim=50.
model = LSTM_variable_input(vocab_size, 50, 50)

In [None]:
# Run training for 100 epochs; the best checkpoint is written to ./output/rnn-model.pt by train_model().
train_model(model, epochs=100, lr=0.001)