In [3]:
from google.colab import files

In [6]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [7]:
!kaggle datasets download -d ferno2/training1600000processednoemoticoncsv

Downloading training1600000processednoemoticoncsv.zip to /content
 93% 75.0M/80.9M [00:00<00:00, 161MB/s]
100% 80.9M/80.9M [00:00<00:00, 149MB/s]


In [8]:
!unzip training1600000processednoemoticoncsv.zip

Archive:  training1600000processednoemoticoncsv.zip
  inflating: training.1600000.processed.noemoticon.csv  


In [5]:
import torch

if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

device

device(type='cuda')

In [6]:
import pandas as pd

df = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin-1', header=None)

In [74]:
# df.head()
# df.shape
# df.info()


In [8]:
df['sentiment_category'] = df[0].astype('category')
df['sentiment_category']

0          0
1          0
2          0
3          0
4          0
          ..
1599995    4
1599996    4
1599997    4
1599998    4
1599999    4
Name: sentiment_category, Length: 1600000, dtype: category
Categories (2, int64): [0, 4]

In [9]:
df['sentiment'] = df['sentiment_category'].cat.codes
df['sentiment']
# print(f"categ {df['sentiment'].unique()}")
# print(f"categ {df['sentiment_category'].unique()}")

0          0
1          0
2          0
3          0
4          0
          ..
1599995    1
1599996    1
1599997    1
1599998    1
1599999    1
Name: sentiment, Length: 1600000, dtype: int8

In [10]:
df.sample(20000).to_csv('train.csv', index=None, header=None)

In [None]:
!pip install -U torchtext==0.10.0

In [55]:
import torchtext
from torchtext.legacy import data

In [56]:
LABEL = data.LabelField()
TWEET = data.Field(tokenize='spacy', tokenizer_language = 'en_core_web_sm', lower = True)

In [57]:
fields = [('score',None), ('id',None), ('date',None), ('query',None), ('name',None), 
          ('tweet', TWEET),('category',None),('label',LABEL)]
fields

[('score', None),
 ('id', None),
 ('date', None),
 ('query', None),
 ('name', None),
 ('tweet', <torchtext.legacy.data.field.Field at 0x7f5b9b5a0e10>),
 ('category', None),
 ('label', <torchtext.legacy.data.field.LabelField at 0x7f5b9b5a0090>)]

In [58]:
twitterDataset = data.TabularDataset(path='train.csv', format="CSV", fields=fields, skip_header=False)

In [59]:
(train, validation, test) = twitterDataset.split(split_ratio = [0.8,0.1,0.1], stratified = True, strata_field = 'label')

In [60]:
print(f"len train {len(train)}\nlen test {len(test)}\nlen valid {len(validation)}")

len train 16000
len test 2000
len valid 2000


In [61]:
vocab_size = 30000
TWEET.build_vocab(train, max_size=vocab_size)
LABEL.build_vocab(train)

In [40]:
TWEET.vocab.freqs.most_common(12)

[('i', 10110),
 ('!', 8667),
 ('.', 8002),
 (' ', 5919),
 ('to', 5576),
 ('the', 5181),
 (',', 4920),
 ('a', 3846),
 ('my', 3114),
 ('you', 3039),
 ('it', 3026),
 ('and', 2945)]

In [62]:
train_dataloader, valid_dataloader, test_dataloader = data.BucketIterator.splits(
    (train, validation, test),
    batch_size = 16,
    device = device,
    sort_key = lambda x: len(x.tweet),
    sort_within_batch = False)

In [63]:
import torch.nn as nn

In [64]:
class MyLSTMModel(nn.Module):
    def __init__(self, hidden_size, embedding_dim, vocab_size):
        super(MyLSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.LSTM(input_size=embedding_dim,
        hidden_size=hidden_size, num_layers=1)
        self.predictor = nn.Linear(hidden_size, 2)
        pass
    
    def forward(self, seq):
        output, (hidden,_) = self.encoder(self.embedding(seq))
        preds = self.predictor(hidden.squeeze(0))
        return preds
        pass

In [65]:
model = MyLSTMModel(hidden_size = 100,embedding_dim = 300, vocab_size = vocab_size)
model.to(device)

MyLSTMModel(
  (embedding): Embedding(30000, 300)
  (encoder): LSTM(300, 100)
  (predictor): Linear(in_features=100, out_features=2, bias=True)
)

In [66]:
criterion = nn.CrossEntropyLoss()

In [67]:
import torch.optim as optim

In [68]:
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [69]:
epochs = 20
def train(epochs, model, optimizer, criterion, train_dataloader, valid_dataloader):
    for epoch in range(1, epochs + 1):
     
        #set training and valid loss to zero
        training_loss = 0.0
        valid_loss = 0.0
        
        #set model for train
        model.train()
        
        for batch_idx, batch in enumerate(train_dataloader):
            
            # get the batch; batch is a list of [tweet, label]
            tweet, label = batch
            
            #optimizer set it to zero_grad(), means clear the gradients  
            optimizer.zero_grad()
            
            #Forward Pass
            predict = model(tweet)
            
            # Find the Loss
            loss = criterion(predict, label)
            
            # Calculate gradients 
            loss.backward()
            
            # Update Weights
            optimizer.step()
            
            # Calculate Loss
            training_loss += loss.data.item() * tweet.size(0)
            
        training_loss /= len(train_dataloader)
 
        #set model for evalution
        model.eval()
        for batch_idx,batch in enumerate(valid_dataloader):
            # get the batch; batch is a list of [tweet, label]
            tweet, label = batch
            
            predict = model(tweet)
            loss = criterion(predict, label)
            valid_loss += loss.data.item() * tweet.size(0)
 
        valid_loss /= len(valid_dataloader)
        print('Epoch: {}, Training Loss: {:.2f}, Validation Loss: {:.2f}'.format(epoch, training_loss, valid_loss))

In [70]:
train(epochs, model, optimizer, criterion, train_dataloader, valid_dataloader)

Epoch: 1, Training Loss: 19.83, Validation Loss: 10.14
Epoch: 2, Training Loss: 14.26, Validation Loss: 9.34
Epoch: 3, Training Loss: 9.61, Validation Loss: 12.00
Epoch: 4, Training Loss: 5.68, Validation Loss: 14.02
Epoch: 5, Training Loss: 3.36, Validation Loss: 17.86
Epoch: 6, Training Loss: 2.00, Validation Loss: 17.59
Epoch: 7, Training Loss: 1.36, Validation Loss: 18.33
Epoch: 8, Training Loss: 0.89, Validation Loss: 20.91
Epoch: 9, Training Loss: 0.90, Validation Loss: 24.54
Epoch: 10, Training Loss: 0.49, Validation Loss: 24.65
Epoch: 11, Training Loss: 0.40, Validation Loss: 26.11
Epoch: 12, Training Loss: 0.50, Validation Loss: 22.67
Epoch: 13, Training Loss: 0.59, Validation Loss: 27.29
Epoch: 14, Training Loss: 0.37, Validation Loss: 27.93
Epoch: 15, Training Loss: 0.26, Validation Loss: 28.03
Epoch: 16, Training Loss: 0.31, Validation Loss: 27.11
Epoch: 17, Training Loss: 0.26, Validation Loss: 26.89
Epoch: 18, Training Loss: 0.24, Validation Loss: 27.76
Epoch: 19, Trainin

In [75]:
def classifyTweet(tweet):
    categories = {0: "Negative", 1:"Positive"}
    processed = TWEET.process([TWEET.preprocess(tweet)])
    processed = processed.to(device)
    
    model.eval()
    prediction = model(processed)
    print("Prediction: ",  prediction)
    pred_cat = categories[prediction.argmax().item()] 
    return pred_cat

In [95]:
classifyTweet("Working out is bad!")

Prediction:  tensor([[ 0.6446, -0.4574]], device='cuda:0', grad_fn=<AddmmBackward>)


'Negative'