# LSTM Model for Social Media Analysis

In this part, we use LSTM neural network to analyze Twitter data. In the final practical test, the effect is not bad.

# Import Basic Library

In [92]:
import torch
import pandas as pd
import torchtext
from torchtext.legacy import data
import torch.nn as nn
import torch.optim as optim

# Cleaning Data

### 1. Delete non-utf8 encoding block and save the csv file

In [None]:
with open('/content/drive/MyDrive/Colab Notebooks/BigData/ Sentiment140_dataset.csv', 'rb') as csv_in:
    with open('newdataset.csv', "w", encoding="utf-8") as csv_temp:
        for line in csv_in:
            if not line:
                break
            else:
                line = line.decode("utf-8", "ignore")
                csv_temp.write(str(line).rstrip() + '\n')

### 2. Find Nan value

In [93]:
df=pd.read_csv(r'/content/drive/MyDrive/Colab Notebooks/BigData/newdataset.csv')
print(df.isnull().any())

0                                                                                                                      False
1467810369                                                                                                             False
Mon Apr 06 22:19:45 PDT 2009                                                                                           False
NO_QUERY                                                                                                               False
_TheSpecialOne_                                                                                                        False
@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D    False
dtype: bool


### 3. Delete Nan and save csv file

In [None]:
df.dropna()
df.to_csv('Sentiment140DataSet_clean.csv')

# Load Dataset

In [99]:
input_file = '/content/drive/MyDrive/Colab Notebooks/BigData/Sentiment140DataSet_clean.csv'
df = pd.read_csv(input_file, header = None)

# View Dataset

In [94]:
print(df)

         0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY  \
0        0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
1        0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
2        0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
3        0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4        0  1467811372  Mon Apr 06 22:20:00 PDT 2009  NO_QUERY   
...     ..         ...                           ...       ...   
1599994  4  2193601966  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   
1599995  4  2193601969  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   
1599996  4  2193601991  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   
1599997  4  2193602064  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   
1599998  4  2193602129  Tue Jun 16 08:40:50 PDT 2009  NO_QUERY   

         _TheSpecialOne_  \
0          scotthamilton   
1               mattycus   
2                ElleCTF   
3                 Karoli   
4               joy_wolf   
...                  ...   
1599994  Am

In [95]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599999 entries, 0 to 1599998
Data columns (total 6 columns):
 #   Column                                                                                                               Non-Null Count    Dtype 
---  ------                                                                                                               --------------    ----- 
 0   0                                                                                                                    1599999 non-null  int64 
 1   1467810369                                                                                                           1599999 non-null  int64 
 2   Mon Apr 06 22:19:45 PDT 2009                                                                                         1599999 non-null  object
 3   NO_QUERY                                                                                                             1599999 non-null  object
 4   _

# Create the Category and Index

In [100]:
df["sentiment_category"] = df[0].astype('category')
df["sentiment_category"]

df["sentiment"] = df["sentiment_category"].cat.codes
print(df["sentiment"])

0               -1
1                0
2                1
3                2
4                3
            ...   
1599995    1599994
1599996    1599995
1599997    1599996
1599998    1599997
1599999    1599998
Name: sentiment, Length: 1600000, dtype: int32


# Pre-process the Dataset

### 1. Create labels and fields for the dataset

In [None]:
LABEL = data.LabelField()
TWEET = data.Field(tokenize='spacy', tokenizer_language = 'en_core_web_sm', lower = True)

In [98]:
fields = [('score',None), ('id',None), ('date',None), ('query',None), ('name',None),  ('tweet', TWEET),('category',None),('label',LABEL)]

print(fields)

[('score', None), ('id', None), ('date', None), ('query', None), ('name', None), ('tweet', <torchtext.legacy.data.field.Field object at 0x7f4b7c8be310>), ('category', None), ('label', <torchtext.legacy.data.field.LabelField object at 0x7f4b7c8be460>)]


### 2. Create Dataset and Split the Dataset

In [None]:
twitterDataset = data.dataset.TabularDataset(
        path = "/content/drive/MyDrive/Colab Notebooks/BigData/Sentiment140DataSet_clean.csv", 
        format = "CSV", 
        fields = fields,
        skip_header = False)

In [None]:
(train, validation, test) = twitterDataset.split(
                            split_ratio = [0.8,0.1,0.1]
                            )

print("Length of the Training Set: " ,len(train))
print("Length of the Validation Set: " ,len(validation))
print("Length of the Testing Set: " ,len(test))

Length of the Training Set:  1280000
Length of the Validation Set:  160000
Length of the Testing Set:  160000


# To Build a Vocabulary and Display

In [None]:
vocab_size = 300000
TWEET.build_vocab(train, max_size = vocab_size)


print("Length of the vocabulary: " ,len(TWEET.vocab))

Length of the vocabulary:  300002


# Create DataLoader

In [None]:
train_dataloader, valid_dataloader, test_dataloader = data.BucketIterator.splits(
    (train, validation, test),
    batch_size = 32,
    sort_key = lambda x: len(x.tweet),
    sort_within_batch = False)

# Create LSTM Model

In [None]:
class Sentiment_LSTMModel(nn.Module):
    def __init__(self, hidden_size, embedding_dim, vocab_size):
        super(Sentiment_LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.LSTM(input_size=embedding_dim,
        hidden_size=hidden_size, num_layers=1)
        self.predictor = nn.Linear(hidden_size, 2)
        pass
    
    def forward(self, seq):
        output, (hidden,_) = self.encoder(self.embedding(seq))
        preds = self.predictor(hidden.squeeze(0))
        return preds
        pass

In [None]:
model = Sentiment_LSTMModel(hidden_size = 100,embedding_dim = 300, vocab_size = vocab_size)

# Define Loss Function and Optimizer

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-2)

# Preparing Training

In [None]:
epochs = 15
def training_function(epochs, model, optimizer, criterion, train_dataloader, valid_dataloader):
    for epoch in range(1, epochs + 1):
     
        #set training and valid loss to zero
        training_loss = 0.0
        valid_loss = 0.0
        
        #set model for train
        model.train()
        
        for batch_idx, batch in enumerate(train_dataloader):
            
            tweet, label = batch
            
            optimizer.zero_grad()
            
            predict = model(tweet)
            
            loss = criterion(predict, label)
            
            loss.backward()
            
            optimizer.step()
            
            training_loss += loss.data.item() * tweet.size(0)
            
        training_loss /= len(train_dataloader)
 
        #set model for evalution
        model.eval()
        for batch_idx,batch in enumerate(valid_dataloader):

            tweet, label = batch
            
            predict = model(tweet)
            loss = criterion(predict, label)
            valid_loss += loss.data.item() * tweet.size(0)
 
        valid_loss /= len(valid_dataloader)
        print('Epoch: {} - loss: {:.2f} - val_ loss: {:.2f}'.format(epoch, training_loss, valid_loss))


# Train

In [None]:
training_function(epochs, model, optimizer, criterion, train_dataloader, valid_dataloader)

Epoch 1 - loss: 2.6431 - val_loss: 2.6653
Epoch 2 - loss: 2.3759 - val_loss: 3.9411
Epoch 3 - loss: 2.0834 - val_loss: 7.2338
Epoch 4 - loss: 1.8380 - val_loss: 9.4135
Epoch 5 - loss: 1.6002 - val_loss: 10.0389
Epoch 6 - loss: 1.3725 - val_loss: 11.0042
Epoch 7 - loss: 1.1924 - val_loss: 10.2766
Epoch 8 - loss: 1.0529 - val_loss: 9.2593
Epoch 9 - loss: 0.9137 - val_loss: 9.9668
Epoch 10 - loss: 0.7928 - val_loss: 9.4821
Epoch 11 - loss: 0.6885 - val_loss: 8.7342
Epoch 12 - loss: 0.6094 - val_loss: 8.5325
Epoch 13 - loss: 0.5345 - val_loss: 7.9924
Epoch 14 - loss: 0.4800 - val_loss: 7.8522
Epoch 15 - loss: 0.4357 - val_loss: 7.1004


# Test the Model

I manually randomly sampled a tweet content data from the dataset.

In [None]:
def sentimentRecognition(tweet_context):
    categories = {0: "Negative", 1:"Positive"}
    processed = TWEET.process([TWEET.preprocess(tweet_context)])
    
    model.eval()
    prediction = model(processed)
    print("Prediction: ",  prediction)
    pred_result = categories[prediction.argmax().item()] 
    return pred_result

In [None]:
test_tweet = "@SkylineStudio Oh, yes, I'm pretty blessed. AND I love good food so we make a good match!"
sentimentRecognition(test_tweet)

Prediction:  tensor([[ 0.2838, -0.2630]], grad_fn=<AddmmBackward>)
'Positive'
