In [3]:
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os
import random
import time
import math
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from matplotlib import pyplot as plt
import unicodedata
import string
import unicodedata
import string

In [6]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        # Put the declaration of RNN network here
        self.hidden_size = hidden_size
        
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        # Put the computation for forward pass here
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)

        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

In [22]:
X = scipy.sparse.load_npz('../new_data/X_sparse.npz')
Y = np.genfromtxt('../new_data/Y.csv', delimiter = ',')
Y = Y[1:]

In [28]:
from collections import defaultdict

all_categories = []
category_lines = defaultdict(lambda: list())
cnt = 0
for category in Y:
    all_categories.append(category)
    category_lines[category].append(X[cnt])
    cnt += 1

In [29]:
category_lines

defaultdict(<function __main__.<lambda>()>,
            {1.0: [<1x5000 sparse matrix of type '<class 'numpy.int64'>'
              	with 3 stored elements in Compressed Sparse Row format>,
              <1x5000 sparse matrix of type '<class 'numpy.int64'>'
              	with 5 stored elements in Compressed Sparse Row format>,
              <1x5000 sparse matrix of type '<class 'numpy.int64'>'
              	with 8 stored elements in Compressed Sparse Row format>,
              <1x5000 sparse matrix of type '<class 'numpy.int64'>'
              	with 7 stored elements in Compressed Sparse Row format>,
              <1x5000 sparse matrix of type '<class 'numpy.int64'>'
              	with 4 stored elements in Compressed Sparse Row format>,
              <1x5000 sparse matrix of type '<class 'numpy.int64'>'
              	with 9 stored elements in Compressed Sparse Row format>,
              <1x5000 sparse matrix of type '<class 'numpy.int64'>'
              	with 10 stored elements in C

In [35]:
def letterToIndex(letter):
    return all_letters.find(letter)

def letterToTensor(letter):
    tensor = torch.zeros(1, n_letters)
    tensor[0][letterToIndex(letter)] = 1
    return tensor

def lineToTensor(line):
    tensor = torch.zeros(line.shape[0], 1, n_letters)
    for li, letter in enumerate(line):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

def categoryFromOutput(output):
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
    return all_categories[category_i], category_i

def randomChoice(l):
    return l[random.randint(0, len(l) - 1)]

def randomTrainingExample():
    category = randomChoice(all_categories)
    line = randomChoice(category_lines[category])
    category_tensor = torch.tensor([all_categories.index(category)], dtype=torch.long)
    line_tensor = lineToTensor(line)
    return category, line, category_tensor, line_tensor

def train_iteration_CharRNN(rnn, learning_rate, category_tensor, line_tensor):
    criterion = nn.NLLLoss()
    hidden = rnn.initHidden()
    rnn.zero_grad()
    
    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)
    
    loss = criterion(output, category_tensor)
    loss.backward()
    
    for p in rnn.parameters():
        p.data.add_(-learning_rate, p.grad.data)

    return output, loss.item()

def train_charRNN(rnn, n_iters, learning_rate, losses):
    print_every = 1000
    current_loss = 0
    
    def timeSince(since):
        now = time.time()
        s = now - since
        m = math.floor(s / 60)
        s -= m * 60
        return '%dm %ds' % (m, s)

    start = time.time()
    #for iter in range(1, n_iters + 1):
    category, line, category_tensor, line_tensor = randomTrainingExample()
    output, loss = train_iteration_CharRNN(learning_rate, category_tensor, line_tensor)
    current_loss += loss
        
    #    if iter % print_every == 0:
    guess, guess_i = categoryFromOutput(output)
    #correct = '✓' if guess == category else '✗ (%s)' % category
    #print('%d %d%% (%s) %.4f %s / %s %s' % (
    #iter, iter / n_iters * 100, timeSince(start), loss, line, guess, correct))
    #print('Average loss: %.4f' % (current_loss/print_every))
    #current_loss = 0
    average_loss = current_loss/print_every
    losses.append(average_loss)

    torch.save(rnn, 'char-rnn-classification.pt')

    
def predict(rnn, input_line, n_predictions = 8):
    hidden = rnn.initHidden()
    
    line_tensor = lineToTensor(input_line)
    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)
        
    topv, topi = output.topk(n_predictions, 1, True)
    softmax = nn.Softmax(dim=1)
    top_prob = softmax(topv)*100
    predictions = []
    
    for i in range(n_predictions):
        value = topv[0][i].item()
        prob = top_prob[0][i]
        category_index = topi[0][i].item()
        print('%s Probability: (%.2f), Score: (%.2f)' % (all_categories[category_index], prob, value))
        predictions.append([value, all_categories[category_index]])
    return predictions

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
import scipy.sparse

In [32]:
X = scipy.sparse.load_npz('../new_data/X_sparse.npz')
Y = np.genfromtxt('../new_data/Y.csv', delimiter = ',')
Y = Y[1:]
X_comp, X_test, Y_comp, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
Xtr, Xva, Ytr, Yva = train_test_split(X_comp, Y_comp, test_size=0.2)
Xtr, Ytr = shuffle(Xtr, Ytr)

In [33]:
hidden_sizes=[32, 64, 100, 128, 256, 512, 1024]
learning_rates=[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]
n_iters=[5000, 10000, 15000, 20000, 30000, 50000, 100000]

In [36]:
print("hidden_size training started")
input_size = 5000
output_size = 2

hidden_size_losses = []
#optimal_hidden_size = 0
for n_hidden in hidden_sizes:
    rnn = RNN(input_size, n_hidden, output_size)
    train_charRNN(rnn, 15000, 0.005, hidden_size_losses)

hidden_size training started


NameError: name 'n_letters' is not defined

In [None]:
plt.rcParams['figure.figsize'] = (15.0, 8.0)
plt.semilogy(hidden_sizes, hidden_size_losses, label='average_loss', linewidth=2, c='r')
plt.xticks(sizes)
plt.legend()
plt.show()

In [None]:
optimal_hidden_size = ...

In [None]:
print("learning_rate training started")
input_size = 5000
output_size = 2

learning_size_losses = []
for l_rate in learning_rates:
    rnn = RNN(input_size, optimal_hidden_size, output_size)
    train_charRNN(rnn, 15000, l_rate, learning_size_)

In [None]:
plt.rcParams['figure.figsize'] = (15.0, 8.0)
plt.semilogy(learning_rates, learning_size_losses, label='average_loss', linewidth=2, c='b')
plt.xticks(sizes)
plt.legend()
plt.show()

In [None]:
optimal_learning_rate = ...

In [None]:
print("n_iters training started")
input_size = 5000
output_size = 2

n_iters_losses = []
#optimal_hidden_size = 0
for n_iter in n_iters:
    rnn = RNN(input_size, optimal_hidden_size, output_size)
    train_charRNN(rnn, n_iter, optimal_learning_rate, n_iters_losses)

In [None]:
plt.rcParams['figure.figsize'] = (15.0, 8.0)
plt.semilogy(n_iters, n_iters_losses, label='average_loss', linewidth=2, c='g')
plt.xticks(sizes)
plt.legend()
plt.show()

In [None]:
optimal_n_iter = ...

In [None]:
optimal_rnn = RNN(input_size, optimal_hidden_size, output_size)
losses = []
train_charRNN(optimal_rnn, optimal_n_iter, optimal_learning_rate, losses)
optimal_average_loss = losses[0]
