In [60]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import re
import unicodedata
import string

In [61]:
train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv")

In [201]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim= 1)
        
    def forward(self, input_tensor, hidden_tensor):
        combined = torch.cat((input_tensor, hidden_tensor), 1)
#         print(combined)
        hidden  = self.i2h(combined)
        output = self.softmax(self.i2o(combined))
        return output, hidden
    
    def init_hidden(self):
        return torch.zeros(1, self.hidden_size)

In [192]:
train_df[train_df["target"] == 1]["text"].values[10]

'three people died from the heat wave so far'

In [193]:
count_vectorizer = feature_extraction.text.CountVectorizer()

## let's get counts for the first 5 tweets in the data
example_train_vectors = count_vectorizer.fit_transform(train_df["text"][0:len(train_df)])

print(example_train_vectors[0].todense().shape)
print(example_train_vectors[0].todense())

(1, 22328)
[[0 0 0 ... 0 0 0]]


In [194]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds are the reason of this earthquake ma...,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,all residents asked to shelter in place are be...,1
3,6,,,13000 people receive wildfires evacuation orde...,1
4,7,,,just got sent this photo from ruby alaska as s...,1


In [195]:
del train_df['keyword']
del train_df['location']

In [215]:
ALL_LETTERS = string.ascii_letters + " .,;'"
N_LETTERS = len(ALL_LETTERS)

def text_cleaner(text):
    text = text.lower() # convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text) # remove punctuation and special characters
    text = re.sub(r'\s+', ' ', text) # remove extra whitespace
    text = re.sub(r'https?://\S+', '', text) # remove URLs
    text = re.sub(r"#", "", text)
    return text

# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in ALL_LETTERS
    )

# Find letter index from all_letters, e.g. "a" = 0
def letter_to_index(letter):
    return ALL_LETTERS.find(letter)

# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
def letter_to_tensor(letter):
    tensor = torch.zeros(1, N_LETTERS)
    tensor[0][letter_to_index(letter)] = 1
    return tensor

# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def line_to_tensor(line):
    tensor = torch.zeros(len(line), 1, N_LETTERS)
    for i, letter in enumerate(line):
        tensor[i][0][letter_to_index(letter)] = 1
    return tensor

def random_training_exmple(data_frame):
    example = data_frame.sample()
    return line_to_tensor(example["text"].to_numpy()[0]), torch.tensor(example["target"].to_numpy())

if __name__ == '__main__':
    print(ALL_LETTERS)
    print(unicode_to_ascii('Ślusàrski'))
    
    example_text, target = random_training_exmple(train_df)
    print(example_text)
    
    print(letter_to_tensor('J')) # [1, 57]
    print(target[0])
    print(example_text.size()) # [5, 1, 57]

abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .,;'
Slusarski
tensor([[[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]]])
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.]])
tensor(0)
torch.Size([85, 1, 57])


In [216]:
train_df['text'] = train_df['text'].apply(text_cleaner)

n_hidden = 128
rnn = RNN(N_LETTERS, n_hidden, 2)

input_tensor = letter_to_tensor('A')
hidden_tensor = rnn.init_hidden()

# implemeting one step
output, next_hidden = rnn(input_tensor, hidden_tensor)
print(output.size())
print(next_hidden.size())

input_tensor2 = line_to_tensor('Albert')
output, next_hidden = rnn(input_tensor2[0], hidden_tensor)
print(output.size())
print(next_hidden.size())

# Learning loop
criterion = nn.NLLLoss()
learning_rate = 0.005
optimizer = torch.optim.SGD(rnn.parameters(), lr = learning_rate)

torch.Size([1, 2])
torch.Size([1, 128])
torch.Size([1, 2])
torch.Size([1, 128])


In [217]:
def category_from_output(output):
    category_index = torch.argmax(output).item()
    return category_index


def train(line_tensor, category_tensor):
    hidden = rnn.init_hidden()
    
    # goes eover every charater in the name and keep feeding it to the nn
    # that and the hidden state fom before
    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)
    loss = criterion(output, category_tensor)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    return output, loss.item()



def predict(input_line):
    print("\n> {input_line}")
    with torch.no_grad():
        line_tensor = line_to_tensor(input_line)
        hidden = rnn.init_hidden()
        # goes eover every charater in the name and keep feeding it to the nn
        # that and the hidden state fom before
        for i in range(line_tensor.size()[0]):
            output, hidden = rnn(line_tensor[i], hidden)
        guess = category_from_output(output)
        print(guess)

In [218]:
current_loss = 0
all_losses = []

plot_steps, print_steps = 1000, 100
n_iters = 50000
for i in range(n_iters):
    line_tensor, target  = random_training_exmple(train_df)
    output, loss = train(line_tensor, torch.tensor([target]))
#     print(output) if str(loss) == "nan" else True
    current_loss += loss
    if (i+1) % plot_steps == 0:
        all_losses.append(current_loss/ plot_steps)
        current_loss = 0
        
    if (i+1) % print_steps == 0:
        guess = category_from_output(output)
        correct = "Correct" if guess == target else f"Wrong"
        print(f"{i+1} {(i+1)/n_iters*100} {loss:.4f} / {guess} {correct}")

100 0.2 0.7204 / 0 Wrong
200 0.4 0.8420 / 0 Wrong
300 0.6 0.4954 / 0 Correct
400 0.8 0.5400 / 0 Correct
500 1.0 0.9513 / 0 Wrong
600 1.2 0.4849 / 0 Correct
700 1.4000000000000001 0.4613 / 0 Correct
800 1.6 0.4642 / 0 Correct
900 1.7999999999999998 0.5311 / 0 Correct
1000 2.0 0.5161 / 0 Correct
1100 2.1999999999999997 0.4678 / 0 Correct
1200 2.4 0.8252 / 0 Wrong
1300 2.6 0.5446 / 0 Correct
1400 2.8000000000000003 0.7425 / 0 Wrong
1500 3.0 0.5983 / 0 Correct
1600 3.2 0.7407 / 0 Wrong
1700 3.4000000000000004 0.5500 / 0 Correct
1800 3.5999999999999996 0.6093 / 0 Correct
1900 3.8 0.5919 / 0 Correct
2000 4.0 0.7695 / 0 Wrong
2100 4.2 0.7002 / 1 Wrong
2200 4.3999999999999995 0.5418 / 0 Correct
2300 4.6 0.4516 / 0 Correct
2400 4.8 0.7145 / 0 Wrong
2500 5.0 0.6552 / 0 Correct
2600 5.2 0.7060 / 0 Wrong
2700 5.4 0.7274 / 0 Wrong
2800 5.6000000000000005 0.9259 / 0 Wrong
2900 5.800000000000001 0.4927 / 0 Correct
3000 6.0 0.6709 / 0 Correct
3100 6.2 0.4801 / 0 Correct
3200 6.4 0.7899 / 0 Wrong
3300 

25600 51.2 0.4146 / 0 Correct
25700 51.4 0.8925 / 0 Wrong
25800 51.6 0.6954 / 1 Wrong
25900 51.800000000000004 0.2398 / 0 Correct
26000 52.0 0.3705 / 0 Correct
26100 52.2 0.6860 / 0 Correct
26200 52.400000000000006 0.6609 / 0 Correct
26300 52.6 0.5961 / 0 Correct
26400 52.800000000000004 0.3655 / 0 Correct
26500 53.0 0.6121 / 1 Correct
26600 53.2 1.1388 / 0 Wrong
26700 53.400000000000006 0.4146 / 1 Correct
26800 53.6 0.3428 / 0 Correct
26900 53.800000000000004 0.5049 / 1 Correct
27000 54.0 0.9690 / 0 Wrong
27100 54.2 0.9552 / 0 Wrong
27200 54.400000000000006 0.4940 / 0 Correct
27300 54.6 0.2701 / 0 Correct
27400 54.800000000000004 0.9564 / 0 Wrong
27500 55.00000000000001 0.5470 / 1 Correct
27600 55.2 0.7299 / 0 Wrong
27700 55.400000000000006 0.3359 / 0 Correct
27800 55.60000000000001 1.1436 / 0 Wrong
27900 55.800000000000004 1.1112 / 0 Wrong
28000 56.00000000000001 0.2250 / 0 Correct
28100 56.2 1.0292 / 1 Wrong
28200 56.39999999999999 0.4411 / 0 Correct
28300 56.599999999999994 1.4926 

In [None]:
plt.figure()
plt.plot(all_losses)
plt.show()
while True:
    sentence = input("Input:")
    if sentence =="Quit":
        break
    predict(sentence)