# Twitter sentiment analysis model

The purpose of this notebook is to develop a sentiment analysis model which will be used by my reputation monitoring application to assist companies in mearninging their reputation quantitatively through the average sentiment expressed by twitter uses in regards to their brand.

## Notebook setup

In [0]:
# Mount google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
## Set the working directory
import os
if os.getcwd() != "/content/drive/My Drive/Data Science/Twitter nlp app":
  os.chdir("drive/My Drive/Data Science/Twitter nlp app")
print(os.getcwd())

/content/drive/My Drive/Data Science/Twitter nlp app


## Import libraries and data

In [0]:
# Libraries
import numpy as np
import pandas as pd
from string import punctuation
from collections import Counter
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F

In [0]:
# Read in data
data = pd.read_csv("Data/training.1600000.processed.noemoticon.csv", encoding='latin-1', header=None, names=["sentiment","id","date","query","username","tweet"])

## Data exploration

In [0]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
sentiment    1600000 non-null int64
id           1600000 non-null int64
date         1600000 non-null object
query        1600000 non-null object
username     1600000 non-null object
tweet        1600000 non-null object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [0]:
data.head()

Unnamed: 0,sentiment,id,date,query,username,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


## Data cleaning

In [0]:
# Remove unnecessary columns
data_dropped = data.drop(columns=["id","date","query","username"])

In [0]:
data_dropped.head()

Unnamed: 0,sentiment,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [0]:
# Remove links and usernames
link_pat = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*) ?"
user_pat = r"@[a-zA-Z0-9_]{1,20} ?"

data_dropped["tweet"] = data_dropped["tweet"].str.replace(link_pat, "").str.replace(user_pat, "")

In [0]:
data_dropped["tweet"].head()

0    - Awww, that's a bummer.  You shoulda got Davi...
1    is upset that he can't update his Facebook by ...
2    I dived many times for the ball. Managed to sa...
3      my whole body feels itchy and like its on fire 
4    no, it's not behaving at all. i'm mad. why am ...
Name: tweet, dtype: object

In [0]:
# Covert to lower case
data2 = data_dropped.copy()

data2["tweet"] = data2["tweet"].str.lower()

data2["tweet"].head()

0    - awww, that's a bummer.  you shoulda got davi...
1    is upset that he can't update his facebook by ...
2    i dived many times for the ball. managed to sa...
3      my whole body feels itchy and like its on fire 
4    no, it's not behaving at all. i'm mad. why am ...
Name: tweet, dtype: object

In [0]:
# Remove punctuation
data3 = data2.copy()

data3["tweet"] = data3["tweet"].str.replace('[{}]'.format(string.punctuation), '')

data3["tweet"].head()

0     awww thats a bummer  you shoulda got david ca...
1    is upset that he cant update his facebook by t...
2    i dived many times for the ball managed to sav...
3      my whole body feels itchy and like its on fire 
4    no its not behaving at all im mad why am i her...
Name: tweet, dtype: object

In [0]:
# Remove multiple spaces
data4 = data3.copy()

data4["tweet"] = data4["tweet"].str.replace(r" {2,10}", ' ')

data4["tweet"].head()

0     awww thats a bummer you shoulda got david car...
1    is upset that he cant update his facebook by t...
2    i dived many times for the ball managed to sav...
3      my whole body feels itchy and like its on fire 
4    no its not behaving at all im mad why am i her...
Name: tweet, dtype: object

In [0]:
# Strip leading and trainling spaces
data5 = data4.copy()

data5["tweet"] = data5["tweet"].str.strip()

data5["tweet"].head()

0    awww thats a bummer you shoulda got david carr...
1    is upset that he cant update his facebook by t...
2    i dived many times for the ball managed to sav...
3       my whole body feels itchy and like its on fire
4    no its not behaving at all im mad why am i her...
Name: tweet, dtype: object

In [0]:
# Convert each tweet into a list
data6 = data5.copy()

data6["tweet"] = data6["tweet"].str.split()

data6["tweet"].head()

0    [awww, thats, a, bummer, you, shoulda, got, da...
1    [is, upset, that, he, cant, update, his, faceb...
2    [i, dived, many, times, for, the, ball, manage...
3    [my, whole, body, feels, itchy, and, like, its...
4    [no, its, not, behaving, at, all, im, mad, why...
Name: tweet, dtype: object

## Build word_to_int and int_to_word

In [0]:
# Determine vocab
def create_vocab(data):
  vocab = []

  for tweet in data6["tweet"]:
    for word in tweet:
      vocab.append(word)
  
  return list(set(vocab))

vocab = create_vocab(data6)

len(vocab)

453852

In [0]:
# Map each word in vocab to an int
def create_word_to_int_vice_versa(vocab):
  
  word_to_int = {}
  int_to_word = {}
  
  for i, word in enumerate(vocab):
    # Skip 0 as this will be used of for the 'empty' word
    word_to_int[word] = i+1
    int_to_word[i+1] = word
    
  return word_to_int, int_to_word

word_to_int, int_to_word = create_word_to_int_vice_versa(vocab)

w2i = {'word_to_int': word_to_int,
       'int_to_word': int_to_word}

torch.save(w2i, 'w2i.pth')

In [0]:
# Test
print(word_to_int["happy"])
print(int_to_word[390000])

445499
hairhow


## Create matrix representation of dataset

In [0]:
# Determine if some tweets are too long and should be dropped

# Compile Series of tweet lengths
tweet_lens = data6["tweet"].str.len()

# Run value_counts 
tweet_lens.value_counts().sort_index()

0      3124
1     15085
2     39558
3     59779
4     79083
5     86193
6     90471
7     90309
8     88848
9     86238
10    82934
11    78348
12    74291
13    69999
14    65860
15    61851
16    58637
17    55056
18    52994
19    51957
20    48707
21    47548
22    45307
23    42346
24    37713
25    31010
26    23179
27    15397
28     9331
29     4948
30     2353
31      994
32      370
33      121
34       42
35       10
36        4
37        2
39        2
40        1
Name: tweet, dtype: int64

In [0]:
# It looks like we can safely drop any tweets over 30 words

# Determine indexes to drop
tweet_indices_to_drop = data6["tweet"].loc[tweet_lens > 30].index

# Drop these from both train and targets
data7 = data6.drop(labels=tweet_indices_to_drop)

# Check that it worked as expected
data7["tweet"].str.len().max()

30

In [0]:
# Convert tweets to ints and pad
def convert_to_ints_and_pad(data, mapping, normal_len=30):
  
  word_tweets = data
  int_tweets = np.zeros((data.shape[0], 30))
  
  for i, tweet in enumerate(data):
    for n, word in enumerate(tweet):
      int_tweets[i][n] = mapping[word]
  
  return np.array(int_tweets)
  
int_tweets = convert_to_ints_and_pad(data7["tweet"], word_to_int)

In [0]:
 int_tweets.shape

(1598454, 30)

# Convert targets into numpy array with values 0 or 1

In [0]:
# Determine the different values given in the sentiment column
data7["sentiment"].value_counts()

4    799611
0    798843
Name: sentiment, dtype: int64

In [1]:
# As per the dataset documentation, a 4 represents positive and a 0, negative
# Change this to pos = 1 and neg = 0
targets = data7["sentiment"].replace({4:1})

targets[:10]

NameError: ignored

In [0]:
# Convert targets to numpy for further processing
targets = targets.to_numpy()

In [0]:
# Define function to split dataset into train and val
def train_val_split(features, targets, split_frac=0.95):
  """
  Split the data into training and validation sets
  
  Parameters:
  - data -- data to split
  - split_frac -- fraction to be train set
  
  Returns:
  - split_dict -- dictionary containing two tuples: (train_x, train_y) and (val_x, val_y)
  """
  
  assert features.shape[0] == targets.shape[0]
  
  split_idx = int(len(targets)*split_frac)
  train_x, val_x = features[:split_idx], features[split_idx:]
  train_y, val_y = targets[:split_idx], targets[split_idx:]
  
  return {"train": (train_x, train_y), "val": (val_x, val_y)}

  ## print out the shapes of resulting feature data
  print("\t\t\tData Shapes:")
  print("Train set: \t\t{}".format(train_x.shape), 
        "\nValidation set: \t{}".format(val_x.shape))
  
split_dict = train_val_split(int_tweets, targets)

In [0]:
# Test that it worked
train_x, train_y = split_dict["train"]

print(train_x.shape)
print(train_y.shape)

(1518531, 30)
(1518531,)


# Dataloaders and batching

In [0]:
# Set batch size
batch_size = 50

In [0]:
train_x, train_y = split_dict["train"]
val_x, val_y = split_dict["val"]

In [0]:
# Make sure there are no unfull batches or the model will throw an error
number_val_batches = val_y.shape[0]//batch_size
number_train_batches = train_y.shape[0]//batch_size

val_x, val_y = val_x[0 : batch_size * number_val_batches], val_y[0 : batch_size * number_val_batches]
train_x, train_y = train_x[0 : batch_size * number_train_batches], train_y[0 : batch_size * number_train_batches]

print(val_x.shape)
print(val_y.shape)

(79900, 30)
(79900,)


In [0]:
# Create Tensor datasets
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))


# Shuffle training data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)

In [0]:
# Obtain one batch of training data and print to test
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()

print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([50, 30])
Sample input: 
 tensor([[372370., 135750., 336772.,  ...,      0.,      0.,      0.],
        [116314., 219362., 122714.,  ...,      0.,      0.,      0.],
        [ 18064., 200867., 232944.,  ...,      0.,      0.,      0.],
        ...,
        [270050., 274154., 154339.,  ...,      0.,      0.,      0.],
        [ 26583., 236297.,  26583.,  ...,      0.,      0.,      0.],
        [348487., 321412., 288731.,  ...,      0.,      0.,      0.]],
       dtype=torch.float64)

Sample label size:  torch.Size([50])
Sample label: 
 tensor([1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1,
        1, 0])


# Model

In [0]:
# First checking if GPU is available
train_on_gpu=torch.cuda.is_available()

if(train_on_gpu):
    print('Training on GPU.')
else:
    print('No GPU available, training on CPU.')

Training on GPU.


In [0]:
class SentimentRNN(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super(SentimentRNN, self).__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # Embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        # Dropout layer
        self.dropout = nn.Dropout(0.3)
        
        # Linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()
        

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size(0)

        # Embeddings and lstm_out
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
    
        # Stack up lstm outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        # Dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        # Sigmoid function
        sig_out = self.sig(out)
        
        # Reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1] # get last batch of labels
        
        # Return last sigmoid output and hidden state
        return sig_out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden

In [0]:
# Instantiate the model w/ hyperparams
vocab_size = len(word_to_int)+1 # +1 for the 0 padding + word tokens
output_size = 1
embedding_dim = 400
hidden_dim = 256
n_layers = 2

net = SentimentRNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)

print(net)

SentimentRNN(
  (embedding): Embedding(453853, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


# Training

In [0]:
# Learning rate and oss and optimization functions
lr=0.0005

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

In [0]:
def train(net, train_loader, valid_loader, epochs=1, print_every=100, checkpoint_every = 1000, train_on_gpu=True, clip=5):
  
  # Initilise counter
  counter = 0

  # Move model to GPU, if available
  if(train_on_gpu):
      net.cuda()

  net.train()
  #Ttrain for some number of epochs
  for e in range(epochs):
      # initialize hidden state
      h = net.init_hidden(batch_size)

      # Batch loop
      for inputs, labels in train_loader:
          counter += 1

          if(train_on_gpu):
              inputs, labels = inputs.cuda(), labels.cuda()

          # Creating new variables for the hidden state, otherwise
          # we'd backprop through the entire training history
          h = tuple([each.data for each in h])

          # Zero accumulated gradients
          net.zero_grad()

          # Get the output from the model
          output, h = net(inputs, h)

          # Calculate the loss and perform backprop
          loss = criterion(output.squeeze(), labels.float())
          loss.backward()
          # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
          nn.utils.clip_grad_norm_(net.parameters(), clip)
          optimizer.step()
          
          # Checkpoint model every 'checkpoint_every' steps
          if counter % checkpoint_every == 0:
            checkpoint = {'vocab_size': len(word_to_int)+1,
                'output_size': 1,
                'embedding_dim': 400,
                'hidden_dim': 256,
                'n_layers': 2,
                'state_dict': net.state_dict()}

            torch.save(checkpoint, 'checkpoint.pth')
          

          # Loss stats
          if counter % print_every == 0:
              # Get validation loss
              val_h = net.init_hidden(batch_size)
              val_losses = []
              net.eval()
              for inputs, labels in valid_loader:

                  # Creating new variables for the hidden state, otherwise
                  # we'd backprop through the entire training history
                  val_h = tuple([each.data for each in val_h])

                  if(train_on_gpu):
                      inputs, labels = inputs.cuda(), labels.cuda()

                  output, val_h = net(inputs, val_h)
                  val_loss = criterion(output.squeeze(), labels.float())

                  val_losses.append(val_loss.item())

              net.train()
              print("Epoch: {}/{}".format(e+1, epochs),
                    "Step: {}".format(counter),
                    "Loss: {:.6f}".format(loss.item()),
                    "Val Loss: {:.6f}".format(np.mean(val_losses)))
  
  return net
  
train(net, train_loader, valid_loader, epochs=1, print_every=100, train_on_gpu=train_on_gpu, clip=5)

Epoch: 1/1 Step: 100 Loss: 0.379927 Val Loss: 0.468119
Epoch: 1/1 Step: 200 Loss: 0.479346 Val Loss: 0.477924
Epoch: 1/1 Step: 300 Loss: 0.384448 Val Loss: 0.462696
Epoch: 1/1 Step: 400 Loss: 0.419432 Val Loss: 0.468909
Epoch: 1/1 Step: 500 Loss: 0.350077 Val Loss: 0.485590
Epoch: 1/1 Step: 600 Loss: 0.489647 Val Loss: 0.475267
Epoch: 1/1 Step: 700 Loss: 0.486520 Val Loss: 0.488223
Epoch: 1/1 Step: 800 Loss: 0.464204 Val Loss: 0.474295
Epoch: 1/1 Step: 900 Loss: 0.479766 Val Loss: 0.472518
Epoch: 1/1 Step: 1000 Loss: 0.517594 Val Loss: 0.484450
Epoch: 1/1 Step: 1100 Loss: 0.323740 Val Loss: 0.481244
Epoch: 1/1 Step: 1200 Loss: 0.292436 Val Loss: 0.475642
Epoch: 1/1 Step: 1300 Loss: 0.362053 Val Loss: 0.486634
Epoch: 1/1 Step: 1400 Loss: 0.464083 Val Loss: 0.476514
Epoch: 1/1 Step: 1500 Loss: 0.367959 Val Loss: 0.497679
Epoch: 1/1 Step: 1600 Loss: 0.555834 Val Loss: 0.495426
Epoch: 1/1 Step: 1700 Loss: 0.340682 Val Loss: 0.511055
Epoch: 1/1 Step: 1800 Loss: 0.562142 Val Loss: 0.487623
E

KeyboardInterrupt: ignored

## Testing

In [0]:
# Create function to load chackpoints
def load_checkpoint(filepath):
    checkpoint = torch.load(filepath)
    
    vocab_size = checkpoint['vocab_size']
    output_size = checkpoint['output_size']
    embedding_dim = checkpoint['embedding_dim']
    hidden_dim = checkpoint['hidden_dim']
    n_layers = checkpoint['n_layers']
    
    model = SentimentRNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
    model.load_state_dict(checkpoint['state_dict'])
    
    print("--Checkpoint loaded--")
    
    return model

In [0]:
# Get test data loss and accuracy

loaded_net = load_checkpoint('checkpoint.pth').cuda()

test_loader = valid_loader

test_losses = [] # track loss
num_correct = 0

# Init hidden state
h = loaded_net.init_hidden(batch_size)

loaded_net.eval()
# Iterate over test data
for inputs, labels in test_loader:

    # Creating new variables for the hidden state, otherwise
    # we'd backprop through the entire training history
    h = tuple([each.data for each in h])

    if(train_on_gpu):
        inputs, labels = inputs.cuda(), labels.cuda()
    
    # Get predicted outputs
    output, h = loaded_net(inputs, h)
    
    # Calculate loss
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    
    # Convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())  # rounds to the nearest integer
    
    # Compare predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)


loaded_net.train()

# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

In [0]:
test_review_neg = "If I have to work here for one more day I'm going to shoot myself"

In [0]:
def tokenize_review(test_review):
    test_review = test_review.lower() # lowercase
    # Get rid of punctuation
    test_text = ''.join([c for c in test_review if c not in punctuation])

    #Ssplitting by spaces
    test_words = test_text.split()

    # Tokens
    test_ints = []
    test_ints.append([word_to_int[word] for word in test_words])

    return test_ints[0]

# Test code and generate tokenized review
test_ints = tokenize_review(test_review_neg)
print(test_ints)

In [0]:
# Test sequence padding
def pad_features(test_ints, seq_length):
  features = [0 for i in range(seq_length)]
  
  for i in range(len(test_ints)):
    features[i] = test_ints[i]
  
  return np.array([features])

seq_length=30 
features = pad_features(test_ints, seq_length)

print(features)

In [0]:
# Test conversion to tensor and pass into model
feature_tensor = torch.from_numpy(features)
print(feature_tensor.size())

In [0]:
def predict(net, test_review, sequence_length=200):
    
    net.eval()
    
    # Tokenize review
    test_ints = tokenize_review(test_review)
    
    # Pad tokenized sequence
    seq_length=sequence_length
    features = pad_features(test_ints, seq_length)
    
    # Convert to tensor to pass into model
    feature_tensor = torch.from_numpy(features)
    
    batch_size = feature_tensor.size(0)
    
    # Initialize hidden state
    h = net.init_hidden(batch_size)
    
    if(train_on_gpu):
        feature_tensor = feature_tensor.cuda()
    
    # Get the output from the model
    output, h = net(feature_tensor, h)
    
    # Convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze()) 
    # Printing output value, before rounding
    print('Prediction value, pre-rounding: {:.6f}'.format(output.item()))
    
    # Print custom response
    if(pred.item()==1):
        print("Positive tweet detected!")
    else:
        print("Negative tweet detected.")
        

In [0]:
# Call function
seq_length = 30 

predict(loaded_net, test_review_neg, seq_length)