In [261]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split, GridSearchCV

In [262]:
train_neg = pd.read_fwf('../Data/train_neg.txt', header=None, names=['tweet'])
train_pos = pd.read_fwf('../Data/train_pos.txt', header=None, names=['tweet'])
test = pd.read_csv('../Data/test_data.txt', sep='\n', header=None, names=['tweet'])
test['tweet-id'] = test.tweet.apply(lambda x: x.split(',')[0])
test['tweet'] = test.tweet.apply(lambda x: ' '.join(x.split(',')[1:]))
test = test.set_index('tweet-id')

In [263]:
preprocessor = PreProcessing()
# Clean the tweets
train_pos_clean = train_pos.copy()
train_neg_clean = train_neg.copy()
test_clean = test.copy()
train_pos_clean['tweet'] = train_pos_clean.tweet.apply(lambda x: preprocessor.clean(x))
train_neg_clean['tweet'] = train_neg_clean.tweet.apply(lambda x: preprocessor.clean(x))
test_clean['tweet'] = test_clean.tweet.apply(lambda x: preprocessor.clean(x))

In [264]:
train_pos_clean['sentiment'] = 1
train_neg_clean['sentiment'] = 0

In [265]:
def word_to_index(tweets):
    word_indexer = {}
    tweets = tweets['tweet']
    for tweet in tweets:
        tweet = tweet.split(' ')
        for word in tweet:
            if word not in word_indexer:
                word_indexer[word] = len(word_indexer)
    
    vocab_size = len(word_indexer)
    return word_indexer, vocab_size

In [266]:
def make_bow_vector(tweet, word_indexer):
    vec = torch.zeros(len(word_indexer))
    #tweet = tweet['tweet']
    tweet = tweet.split(' ')
    for word in tweet:
        vec[word_indexer[word]] += 1
    return vec.view(1, -1)

def make_target(label, label_indexer):
    return torch.LongTensor([label])

In [267]:
class BagOfWordsClassifier(nn.Module):
    def __init__(self, num_labels, vocab_size):
        
        super(BagOfWordsClassifier, self).__init__()

        # Define the parameters that you will need.  In this case, we need A and b,
        # the parameters of the affine mapping.
        # Torch defines nn.Linear(), which provides the affine map.
        # Make sure you understand why the input dimension is vocab_size
        # and the output is num_labels!
        self.linear = nn.Linear(vocab_size, num_labels)

    def forward(self, bow_vec):
        # Pass the input through the linear layer,
        # then pass that through log_softmax.
        # Many non-linearities and other functions are in torch.nn.functional
        return F.log_softmax(self.linear(bow_vec), dim=1)

In [268]:
num_labels = 2
full_data = pd.concat([train_pos_clean[['tweet', 'sentiment']], train_neg_clean[['tweet', 'sentiment']]], ignore_index=True)
word_indexer, vocab_size = word_to_index(full_data)


In [269]:
model = BagOfWordsClassifier(2, vocab_size)

# the model knows its parameters.  The first output below is A, the second is b.
# Whenever you assign a component to a class variable in the __init__ function
# of a module, which was done with the line
# self.linear = nn.Linear(...)
# Then through some Python magic from the PyTorch devs, your module
# (in this case, BoWClassifier) will store knowledge of the nn.Linear's parameters
for param in model.parameters():
    print(param)

# To run the model, pass in a BoW vector
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    sample = full_data.iloc[0]
    bow_vector = make_bow_vector(sample, word_indexer)
    log_probs = model(bow_vector)
    print(log_probs)

Parameter containing:
tensor([[ 2.5242e-03,  4.4924e-04, -1.2654e-04,  ...,  3.1640e-03,
          3.1565e-03, -5.5773e-04],
        [ 2.4722e-03, -2.5690e-03, -3.3875e-03,  ..., -1.2283e-03,
          7.4937e-05,  2.7759e-03]], requires_grad=True)
Parameter containing:
tensor([-0.0032,  0.0020], requires_grad=True)


AttributeError: 'Series' object has no attribute 'split'

In [270]:
label_indexer = {0: "Negative", 1: "Positive"}

In [271]:
X = full_data['tweet']
y = full_data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
train_data = dict(zip(X_train, y_train))
test_data = dict(zip(X_test, y_test))

In [272]:
with torch.no_grad():
    pr = 0
    for instance in test_data:
        label = test_data[instance]
        bow_vec = make_bow_vector(instance, word_indexer)
        if pr < 10:
            print(bow_vec.shape)
            print(bow_vec)
        log_probs = model(bow_vec)
        pr = pr+1
        #print(log_probs)

torch.Size([1, 83422])
tensor([[0., 1., 0.,  ..., 0., 0., 0.]])
torch.Size([1, 83422])
tensor([[0., 1., 0.,  ..., 0., 0., 0.]])
torch.Size([1, 83422])
tensor([[0., 0., 0.,  ..., 0., 0., 0.]])
torch.Size([1, 83422])
tensor([[1., 0., 0.,  ..., 0., 0., 0.]])
torch.Size([1, 83422])
tensor([[1., 2., 0.,  ..., 0., 0., 0.]])
torch.Size([1, 83422])
tensor([[1., 0., 0.,  ..., 0., 0., 0.]])
torch.Size([1, 83422])
tensor([[1., 1., 0.,  ..., 0., 0., 0.]])
torch.Size([1, 83422])
tensor([[0., 0., 0.,  ..., 0., 0., 0.]])
torch.Size([1, 83422])
tensor([[1., 0., 0.,  ..., 0., 0., 0.]])
torch.Size([1, 83422])
tensor([[2., 0., 0.,  ..., 0., 0., 0.]])


In [274]:
print("\nPositive?")
print(next(model.parameters())[:, word_indexer["won"]])
print(next(model.parameters())[:, word_indexer["happy"]])
print(next(model.parameters())[:, word_indexer["hired"]])

print("\nNegative?")
print(next(model.parameters())[:, word_indexer["fired"]])
print(next(model.parameters())[:, word_indexer["sad"]])
print(next(model.parameters())[:, word_indexer["lost"]])

print("\nNeutral?")
print(next(model.parameters())[:, word_indexer["zuckerberg"]])
print(next(model.parameters())[:, word_indexer["house"]])
print(next(model.parameters())[:, word_indexer["blue"]])


Positive?
tensor([-0.0017, -0.0022], grad_fn=<SelectBackward>)
tensor([0.0013, 0.0030], grad_fn=<SelectBackward>)
tensor([-0.0031,  0.0002], grad_fn=<SelectBackward>)

Negative?
tensor([ 0.0023, -0.0011], grad_fn=<SelectBackward>)
tensor([ 0.0010, -0.0003], grad_fn=<SelectBackward>)
tensor([-0.0023,  0.0025], grad_fn=<SelectBackward>)

Neutral?
tensor([ 0.0013, -0.0008], grad_fn=<SelectBackward>)
tensor([-0.0003, -0.0006], grad_fn=<SelectBackward>)
tensor([ 0.0033, -0.0029], grad_fn=<SelectBackward>)


In [244]:

loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Usually you want to pass over the training data several times.
# 100 is much bigger than on a real data set, but real datasets have more than
# two instances.  Usually, somewhere between 5 and 30 epochs is reasonable.
for epoch in range(2):
    print(epoch)
    for instance in train_data:
        label = train_data[instance]
        # Step 1. Remember that PyTorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Make our BOW vector and also we must wrap the target in a
        # Tensor as an integer. For example, if the target is SPANISH, then
        # we wrap the integer 0. The loss function then knows that the 0th
        # element of the log probabilities is the log probability
        # corresponding to SPANISH
        bow_vec = make_bow_vector(instance, word_indexer)
        target = make_target(label, label_indexer)

        # Step 3. Run our forward pass.
        log_probs = model(bow_vec)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()

with torch.no_grad():
    for instance in test_data:
        label = test_data[instance]
        bow_vec = make_bow_vector(instance, word_indexer)
        log_probs = model(bow_vec)
        #print(log_probs)
        
print(next(model.parameters())[:, word_indexer["cancer"]])

tensor([ 0.2455, -0.2432], grad_fn=<SelectBackward>)
0
1
2
tensor([ 0.2053, -0.2030], grad_fn=<SelectBackward>)
