In [1]:
import torch.nn as nn
import torch
from sklearn.model_selection import train_test_split


In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv('data/train.csv')

X_train, X_test, y_train, y_test = train_test_split(df.text,df.target,test_size=0.3,random_state=42, shuffle=False)
X_train.shape[0], X_test.shape[0], y_train.shape[0], y_test.shape[0]


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


(5329, 2284, 5329, 2284)

In [3]:
# one_hot_encoded_df= pd.read_csv('data/one_hot_encoded_df.csv')

In [4]:
import numpy as np
import pandas as pd

# Assuming df is your DataFrame containing the tweets
# Create the vocabulary

vocabulary = np.array([])
for i in range(len(df.index)):
    words = df['text'].iloc[i].split()  
    vocabulary = np.append(vocabulary, np.unique(words))  
    vocabulary = np.unique(vocabulary)

# Create a dictionary to map each word to an index
word_to_index = {word: idx for idx, word in enumerate(vocabulary)}

# Function to one-hot encode a tweet
def one_hot_encode(tweet, vocab_size):
    one_hot_vector = np.zeros(vocab_size)
    for word in tweet.split():
        if word in word_to_index:
            index = word_to_index[word]
            one_hot_vector[index] = 1
    return one_hot_vector

# Apply the one-hot encoding to each tweet
one_hot_encoded_tweets = np.array([one_hot_encode(tweet, len(vocabulary)) for tweet in df['text']])

# Convert to DataFrame for better readability (optional)
one_hot_encoded_df = pd.DataFrame(one_hot_encoded_tweets, columns=vocabulary)
print(one_hot_encoded_df.columns)

Index(['!', '!!', '!!!', '!!!!', '!!!!!', '!!!!!!!!!!!#MetroFmTalk', '!The',
       '#', '##book', '##fukushima',
       ...
       'å£9!', 'å¤}', 'å¨', 'å©Daniel', 'å¬'Only', 'åÇ', 'åÈ',
       'åÈMGN-AFRICAå¨', 'åÊ', 'åÊFedEx'],
      dtype='object', length=31924)


In [5]:
n_letters = one_hot_encoded_df.shape[1]
n_letters

31924

In [6]:
def letterToIndex(letter):
    return one_hot_encoded_df.columns.get_loc(letter)

def lineToTensor(line):
    line = line.split()
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

def categoryFromOutput(output):
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
    return all_categories[category_i], category_i

In [7]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size, hidden_size)
        self.h2h = nn.Linear(hidden_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        # all_categories.index
        hidden = F.tanh(self.i2h(input) + self.h2h(hidden))
        output = self.h2o(hidden)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

In [8]:
import torch.nn as nn
import torch.nn.functional as F

criterion = nn.NLLLoss()
all_categories = [1,0]

rnn = torch.load('rnn1.pth')

n_correct = 0
for index, row in y_test.items():
    line_tensor = lineToTensor(X_test.loc[index])
    hidden = rnn.initHidden()
    for i in range(line_tensor.size()[0]):
        category = torch.tensor([all_categories.index(y_test.iloc[i])], dtype=torch.long)
        output, hidden = rnn(line_tensor[i], hidden)
    guess, guess_i = categoryFromOutput(output)
    if guess == category:
        n_correct +=1

In [9]:
accuracy = n_correct/y_test.shape[0]
accuracy

0.5030647985989493

In [10]:
all_categories = [1,0]

In [11]:
learning_rate = 0.005 # If you set this too high, it might explode. If too low, it might not learn

criterion = nn.NLLLoss()

def train(category_tensor, line_tensor):
    hidden = rnn.initHidden()
    rnn.zero_grad()

    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)

    loss = criterion(output, category_tensor)
    loss.backward()

    # Add parameters' gradients to their values, multiplied by learning rate
    for p in rnn.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)

    return output, loss.item()



In [12]:
import random

def randomChoice(l):
    return l[random.randint(0, len(l) - 1)]

def randomTrainingExample():
    category = randomChoice(all_categories)
    line = randomChoice(df[df.target==category].index)
    category_tensor = torch.tensor([all_categories.index(category)], dtype=torch.long)
    line_tensor = lineToTensor(df.text.iloc[line])
    return category, line, category_tensor, line_tensor

randomChoice(all_categories)

randomChoice(df[df.target==1].index)

randomTrainingExample()

(1,
 2848,
 tensor([0]),
 tensor([[[0., 0., 0.,  ..., 0., 0., 0.]],
 
         [[0., 0., 0.,  ..., 0., 0., 0.]],
 
         [[0., 0., 0.,  ..., 0., 0., 0.]],
 
         ...,
 
         [[0., 0., 0.,  ..., 0., 0., 0.]],
 
         [[0., 0., 0.,  ..., 0., 0., 0.]],
 
         [[0., 0., 0.,  ..., 0., 0., 0.]]]))