In [1]:
import torch
import matplotlib.pyplot as plt
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab
from collections import Counter, defaultdict
import nltk
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchtext.vocab import GloVe

from nltk.corpus import twitter_samples

In [2]:
nltk.download("twitter_samples")

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/i540927/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [3]:
nltk.download("stopwords")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/i540927/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

import re
import string

In [7]:
#Import the english stop words list from NLTK
stopwords_english = stopwords.words('english') 

('Stop words\n')
print(stopwords_english)

print('\nPunctuation\n')
print(string.punctuation)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [8]:
def process_tweet(tweet:str):
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)

    # remove hyperlinks
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)

    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)    
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)

    # tokenize tweets
    tweet_tokens = tokenizer.tokenize(tweet)
    tweets_clean = []

    for word in tweet_tokens: # Go through every word in your tokens list
        if (word not in stopwords_english and  # remove stopwords
            word not in string.punctuation):  # remove punctuation
            tweets_clean.append(word)
    stemmer = PorterStemmer() 

    # Create an empty list to store the stems
    tweets_stem = [] 

    for word in tweets_clean:
        stem_word = stemmer.stem(word)  # stemming word
        tweets_stem.append(stem_word)  # append to the list

    return tweets_stem

In [9]:
class TweetDataset(torch.utils.data.Dataset):
    def __init__(self, positive_tweets, negative_tweets,vectors=None,tokenize=lambda x:x.split(),pad_token="<pad>",unk_token="<unk>"):
        self.tweets = []
      
        specials=["<pad>","<unk>"]

        def yield_tokens(data):
            for tweet in data:
                tokens = tokenize(tweet)
                yield tokens
            
        self.vocab = torchtext.vocab.build_vocab_from_iterator(
            yield_tokens(negative_tweets+positive_tweets),
            special_first=True,
            specials=specials)
        self.pad_idx = self.vocab["<pad>"]
        self.vocab.set_default_index(self.vocab["<unk>"])
        

        for p_tweet in positive_tweets:
            tokens = tokenize(p_tweet)
            self.tweets.append((1, self.vocab.forward(tokens)))
        for n_tweet in negative_tweets:
            tokens = tokenize(n_tweet)
            self.tweets.append((0, self.vocab.forward(tokens)))
       
       
    def __getitem__(self, idx):
        return self.tweets[idx]
    def __len__(self):
        return len(self.tweets)

    @staticmethod
    def collate_fn(batch):
            # batch in that case is List of batches that contain the elements of the iterator
        text_list = []
        target_list=[]
        len_list=[]
        
        for (label,x) in batch:
            x = torch.tensor(x, dtype=torch.int64)
            text_list.append(x)  
            target_list.append(label)
            len_list.append(len(x))

        len_list = torch.tensor(len_list,dtype=torch.int64)
        sorted_lens, sorted_idx = torch.sort(len_list,descending=True)
        target_list = torch.tensor(target_list, dtype=torch.int64)[sorted_idx]
        text_list = pad_sequence(text_list, batch_first=True)[sorted_idx]
        
        return text_list,target_list, sorted_lens
        

In [10]:
positive_tweets = twitter_samples.strings("positive_tweets.json")
negative_tweets = twitter_samples.strings("negative_tweets.json")

In [11]:

tweet_dataset = TweetDataset(positive_tweets,negative_tweets,tokenize=process_tweet)


n_train = int(0.8*len(tweet_dataset))
n_test = len(tweet_dataset)- n_train

train_dataset, test_dataset = torch.utils.data.random_split(tweet_dataset,[n_train,n_test])

train_loader = DataLoader(train_dataset, batch_size=64, collate_fn=TweetDataset.collate_fn,shuffle=True)
test_loader = DataLoader(test_dataset ,batch_size=64,collate_fn=TweetDataset.collate_fn)



In [38]:
# TODO add comments to explain what is happening
class TweetClassification(torch.nn.Module):
    def __init__(self,vocab_size, 
                        input_size, 
                        hidden_size,
                        output_size,
                        bidirectional=True,
                        padding_idx=0):
        super().__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.pad_idx = padding_idx
        self.dropout = torch.nn.Dropout()
        self.embeddings = torch.nn.Embedding(vocab_size,input_size,padding_idx=padding_idx)
        self.lstm = torch.nn.LSTM(input_size=input_size,
                                    hidden_size=hidden_size, 
                                    bidirectional=bidirectional,
                                    batch_first=True)
        if bidirectional:
            self.fc = torch.nn.Linear(in_features = 2*hidden_size,out_features=hidden_size)
        else:
            self.fc = torch.nn.Linear(in_features = hidden_size,out_features=hidden_size)
        self.fc2 = torch.nn.Linear(in_features=hidden_size,out_features=output_size)

    def forward(self, seq, lens):
        # batch first sequence seq (batch_size, seq_length)
        x = self.dropout(self.embeddings(seq))
        # After embedding layer the dim is bach_size, seq_length, embedding_dim
        # enforce_sorted is a problem because it changes the ordering, then the logits and the target ordering is not the same anymore?
        x_packed = torch.nn.utils.rnn.pack_padded_sequence(x, lens, batch_first=True)
        x,(h_n,c_n) = self.lstm(x_packed)
        x,x_lens = torch.nn.utils.rnn.pad_packed_sequence(x,batch_first=True)
     
        #x = torch.cat([h_n[0,:,:],h_n[1,:,:]],dim=1)
        
        x = x[range(x.shape[0]),x_lens-1,:]
        x = self.dropout(self.fc(x))
     
        return torch.sigmoid(self.fc2(x))


        

In [39]:
def train_model(model, dataloaders, criterion, optimizer, device, num_epochs=25):
    model.to(device)
    for epoch in range(num_epochs):
        print(f"Epoch {epoch}/{num_epochs}")
        print("-"*10)

        for phase in ["train","test"]:
            if phase == "train":
                model.train()
            else:
                model.eval()
            running_loss = 0
            running_corrects = 0

            for texts,targets,lens in dataloaders[phase]:
                optimizer.zero_grad()
                with torch.set_grad_enabled(phase=="train"):
                    x = texts.to(device)
                    y = targets.to(device)

                    outputs = model(x,lens).squeeze()
                    loss = criterion(outputs,y.float())
                
                    if phase== "train":
                        loss.backward()
                        optimizer.step()

                running_loss += loss.item() * x.shape[0]
                running_corrects+= torch.sum((torch.round(torch.sigmoid(outputs)) ==  y)).item()
            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_accuracy = running_corrects / len(dataloaders[phase].dataset)
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_accuracy))
    print()
        
input_size= 200
hidden_size = 256
vocab_size = len(tweet_dataset.vocab)
output_size = 1
bidirectional = True

crit = torch.nn.BCELoss()
net = TweetClassification(vocab_size,
                            input_size,
                            hidden_size,
                            output_size,
                            padding_idx=tweet_dataset.pad_idx,
                            bidirectional=bidirectional)
optim = torch.optim.Adam(net.parameters())

EPOCHS=30
dataloaders ={"train":train_loader,"test":test_loader}
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
train_model(net,dataloaders,crit,optim,device,num_epochs=EPOCHS)

Epoch 0/30
----------
train Loss: 0.0692 Acc: 0.5675
test Loss: 0.0225 Acc: 0.5065
Epoch 1/30
----------
train Loss: 0.0143 Acc: 0.5867
test Loss: 0.0196 Acc: 0.8285
Epoch 2/30
----------
train Loss: 0.0074 Acc: 0.7274
test Loss: 0.0185 Acc: 0.8555
Epoch 3/30
----------
train Loss: 0.0056 Acc: 0.7462
test Loss: 0.0168 Acc: 0.8500
Epoch 4/30
----------
train Loss: 0.0061 Acc: 0.6707
test Loss: 0.0232 Acc: 0.8120
Epoch 5/30
----------
train Loss: 0.0026 Acc: 0.7147
test Loss: 0.0241 Acc: 0.8435
Epoch 6/30
----------
train Loss: 0.0014 Acc: 0.7561
test Loss: 0.0217 Acc: 0.9400
Epoch 7/30
----------
train Loss: 0.0020 Acc: 0.8317
test Loss: 0.0255 Acc: 0.8940
Epoch 8/30
----------
train Loss: 0.0022 Acc: 0.8461
test Loss: 0.0208 Acc: 0.9725
Epoch 9/30
----------
train Loss: 0.0021 Acc: 0.8950
test Loss: 0.0646 Acc: 0.9755
Epoch 10/30
----------
train Loss: 0.0014 Acc: 0.8191
test Loss: 0.0658 Acc: 0.9535
Epoch 11/30
----------
train Loss: 0.0255 Acc: 0.9049
test Loss: 0.0618 Acc: 0.9905
Ep

In [43]:
i = 1010
print(negative_tweets[i])
x=tweet_dataset.vocab.forward(process_tweet(negative_tweets[i]))
x = torch.tensor(x).unsqueeze(0)
lens = torch.tensor([len(x)],dtype=torch.int64)
net.eval()
net(x,lens)

Oh my god :(


tensor([[0.0604]], grad_fn=<SigmoidBackward>)