# Importing torch and other libraries

In [5]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline

device = ("cuda" if torch.cuda.is_available() else "cpu")

# Load data

In [6]:
def load_data(data_path):
    def sparse_to_dense(sparse_r_d, vocab_size):
        """
        Convert a sparse representation of a document by tf-idf to dense form
        by mapping its word to a new vector of vocabulary size.
        Word without any appearance in the doc get the value 0.0
        """
        r_d = [0.0 for _ in range(vocab_size)]
        indices_tf_idfs = sparse_r_d.split()
        for index_tf_idf in indices_tf_idfs:
            index = int(index_tf_idf.split(":")[0])
            tfidf = float(index_tf_idf.split(":")[1])
            r_d[index] = tfidf
        return np.array(r_d)
    with open(data_path) as f:
        d_lines = f.read().splitlines()
    with open("../datasets/20news-bydate/words_idfs.txt") as f:
        vocab_size = len(f.read().splitlines())

    X = []
    Y = []
    # label_count = defaultdict(int)
    for data_id, d in enumerate(d_lines):
        features = d.split("<fff>")
        label, doc_id = int(features[0]), int(features[1])
        # label_count[label] += 1
        r_d = sparse_to_dense(
            sparse_r_d=features[2], vocab_size=vocab_size)
        X.append(r_d)
        one_hot_label = np.zeros((20), dtype=int)
        one_hot_label[label] = 1
        Y.append(one_hot_label)
    return np.array(X, dtype=np.float32), np.array(Y, dtype=np.float32)

In [8]:
X_train, Y_train = load_data('../datasets/20news-bydate/train_tf_idf.txt')
X_test, Y_test = load_data('../datasets/20news-bydate/test_tf_idf.txt')

In [9]:
X_train = torch.from_numpy(X_train)
X_test = torch.from_numpy(X_test)
Y_train = torch.from_numpy(Y_train.squeeze())
Y_test = torch.from_numpy(Y_test.squeeze())

In [5]:
X_train = X_train.to(torch.float32)

# Build the model

In [11]:
class MLP(nn.Module):
    def __init__(self, input_num, hidden_num, output_num):
        super().__init__()
        self.input_num = input_num
        self.hidden_num = hidden_num
        self.output_num = output_num
        self.fc1 = nn.Linear(input_num, hidden_num) # hidden layer
        self.fc2 = nn.Linear(hidden_num, output_num) # output layer
        self.sigmoid = nn.Sigmoid() # sigmoid activation function
        self.relu = nn.ReLU() # relu activation function
        self.softmax = nn.Softmax(dim = 1) # softmax activation function
    
    def forward(self, x):
        x = self.relu(self.fc1(x)) 
        x = self.softmax(self.fc2(x))
        return x
    
    def predict(self, x):
        self.eval()
        with torch.no_grad():
            output = self(x)
            predicted_labels = torch.argmax(output, dim=1)
            one_hot = torch.zeros_like(output)
            one_hot.scatter_(1, predicted_labels.unsqueeze(1), 1)
            return one_hot.tolist()


In [12]:
with open("../datasets/20news-bydate/words_idfs.txt") as f:
        vocab_size = len(f.read().splitlines())

input_num = vocab_size
hidden_num = 50
output_num = 20 # The output should be the same as the number of classes

model = MLP(input_num, hidden_num, output_num)
model.to(device)
print(model)
     

MLP(
  (fc1): Linear(in_features=14230, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=20, bias=True)
  (sigmoid): Sigmoid()
  (relu): ReLU()
  (softmax): Softmax(dim=1)
)


In [13]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.01)

In [17]:
num_epochs = 100 # num of epochs

for epoch in range(num_epochs):
    # forward propagation
    Y_pred = model(X_train)
    loss = criterion(Y_pred, Y_train)
    
    # back propagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if epoch % 10 == 0:
        print('Epoch [{}/{}], Loss: {:.5f}'.format(epoch, num_epochs, loss.item()))
print('\nTraining Complete')

Epoch [0/100], Loss: 2.08655
Epoch [10/100], Loss: 2.08644
Epoch [20/100], Loss: 2.08632
Epoch [30/100], Loss: 2.08629
Epoch [40/100], Loss: 2.08623
Epoch [50/100], Loss: 2.08619
Epoch [60/100], Loss: 2.08617
Epoch [70/100], Loss: 2.08611
Epoch [80/100], Loss: 2.08610
Epoch [90/100], Loss: 2.08610

Training Complete


In [18]:
model_prediction = model.predict(X_test)

In [19]:
print("Accuracy Score on test data ==>> {}%".format(accuracy_score(model_prediction, Y_test) * 100))

Accuracy Score on test data ==>> 80.40361125862985%


# Minibatch GD

In [20]:
from torch.utils.data import TensorDataset, DataLoader

dataset = TensorDataset(X_train, Y_train)

# Create a DataLoader from the TensorDataset
batch_size = 16 # the batch size to use for training
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Train the model
num_epochs = 10# the number of training epochs
loss_lst = []
for epoch in range(num_epochs):
    for input_batch, target_batch in dataloader:
        # Zero the gradients
        optimizer.zero_grad()
        
        # Compute the model output
        output = model(input_batch)
        
        # Compute the loss
        loss = criterion(output, target_batch)
        if epoch == 0:
            loss_lst.append(loss.item())
        # Backpropagate the loss
        loss.backward()
        
        # Update the model parameters
        optimizer.step()
    
    print('Epoch [{}/{}], Loss: {:.5f}'.format(epoch, num_epochs, loss.item()))

Epoch [0/10], Loss: 2.07815
Epoch [1/10], Loss: 2.07815
Epoch [2/10], Loss: 2.07816
Epoch [3/10], Loss: 2.07815
Epoch [4/10], Loss: 2.07815
Epoch [5/10], Loss: 2.07815
Epoch [6/10], Loss: 2.07816
Epoch [7/10], Loss: 2.07815
Epoch [8/10], Loss: 2.07815
Epoch [9/10], Loss: 2.07816


In [21]:
model_prediction = model.predict(X_test)

In [22]:
print("Accuracy Score on test data ==>> {}%".format(accuracy_score(model_prediction, Y_test) * 100))

Accuracy Score on test data ==>> 80.28412108337758%
