In [3]:
import pandas as pd
import gensim
import nltk
import pandas as pd
import string
import numpy as np
import re

In [5]:
df = pd.read_json("dataset/train_original_tweets_top1000full.json")

lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')

In [6]:
def cleanup_text(text):
    # remove &quot and &amp
    text = re.sub(r'&quot;(.*?)&quot;', "\g<1>", text)
    text = re.sub(r'&amp;', "", text)

    # replace emoticon
    text = re.sub(r'(^| )(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)', "\g<1>TOKEMOTICON", text)

    text = text.lower()
    text = text.replace("tokemoticon", "TOKEMOTICON")

    # replace url
    text = re.sub(r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?',
                "TOKURL", text)

    # replace mention
    text = re.sub(r'@[\w]+', "TOKMENTION", text)

    # replace hashtag
    text = re.sub(r'#[\w]+', "TOKHASHTAG", text)

    # replace dollar
    text = re.sub(r'\$\d+', "TOKDOLLAR", text)

    # remove punctuation
    text = re.sub('[^a-zA-Z0-9]', ' ', text)

    # remove special word
    text = re.sub('TOKDDOLLAR', ' ', text)

    # TODO: test if they are useful in prediction
    text = re.sub('TOKMENTION', ' ', text)
    text = re.sub('TOKEMOTICON', ' ', text)
    text = re.sub('TOKHASHTAG', ' ', text)
    text = re.sub('TOKURL', ' ', text)

    # remove multiple spaces
    text = re.sub(r' +', ' ', text)

    # remove newline
    text = re.sub(r'\n', ' ', text)

    return text

In [7]:
def process(text, lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
    clean_text = cleanup_text(text)
    tokens = nltk.word_tokenize(clean_text)
    lemma_tokens = []
    for token in tokens:
        try:
            lemma_tokens.append(str(lemmatizer.lemmatize(token)))
        except:
            pass
    return lemma_tokens

In [8]:
def process_all(df, lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
    df["text"] = list(map(lambda x: process(x, lemmatizer), df["text"].values))
    df["len"] = df["text"].apply(len)
    df["label"] = df["retweet_count"] > df["retweet_median"]
    df["label"] = df["label"].astype('int')
    df = df[['label', 'screen_name', 'text', 'len']]
    return df

In [9]:
def load_model(file_name):
    # GloVe Model
    model = gensim.models.KeyedVectors.load_word2vec_format(file_name, binary=False) 
    return model

glove_model = load_model("../glove.twitter.27B/glove.twitter.27B.25d.txt")

In [10]:
processed_df = process_all(df, lemmatizer)

In [11]:
processed_df.head

<bound method NDFrame.head of         label      screen_name  \
0           1        katyperry   
1           1        katyperry   
10          0        katyperry   
100         1        katyperry   
1000        1     TheEllenShow   
10000       0             espn   
100000      0  harbhajan_singh   
100001      0  harbhajan_singh   
100002      0  harbhajan_singh   
100003      1  harbhajan_singh   
100004      0  harbhajan_singh   
100005      0  harbhajan_singh   
100006      0  harbhajan_singh   
100007      0  harbhajan_singh   
100008      0  harbhajan_singh   
100009      0  harbhajan_singh   
10001       0             espn   
100010      0  harbhajan_singh   
100011      0  harbhajan_singh   
100012      0  harbhajan_singh   
100013      0  harbhajan_singh   
100014      0  harbhajan_singh   
100015      0  harbhajan_singh   
100016      1  harbhajan_singh   
100017      0  harbhajan_singh   
100018      0  harbhajan_singh   
100019      0  harbhajan_singh   
10002       1     

In [15]:
processed_df['len'].median()

13.0

In [16]:
def creat_feature(text, model, dim=25, max_len=40):
    '''
    Create the sentence matrix as the input of LSTM cell
    We padding the first dim of the matrix to max_len
    '''
    # You can try concatenation, simple summation, pointwise multiplication, convolution etc. 
    feature = np.zeros((max_len, dim))
    for i in range(len(text)):
        try:
            feature[i, :] = model[text[i]]
        except:
            feature[i, :] = model[',']
    for i in range(len(text), max_len):
        feature[i, :] = model[',']
    feature = np.squeeze(feature)
    return feature

In [17]:
import torch
import torch.nn as nn
from torch.autograd import Variable

class LSTMClassifier(nn.Module):
    def __init__(self, label_size, embedding_dim, 
                 lstm_hidden_size, lstm_num_layers, dropout):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(
            batch_first=True,
            input_size=embedding_dim,
            hidden_size=lstm_hidden_size,
            num_layers=lstm_num_layers,
            dropout=dropout,
            bidirectional=False)
        self.dense = nn.Linear(
            in_features=lstm_hidden_size,
            out_features=label_size)
        self.softmax = nn.LogSoftmax(dim=1)
    
    def forward(self, embedding):
        # embedding (N, W, H)
        out_lstm, _ = self.lstm(embedding)
        out_lstm = out_lstm.permute(1, 0, 2)  # (W, N, C*H)
        out_linear = self.dense(out_lstm[-1])
        output = self.softmax(out_linear)
        return output

In [18]:
train_frac = 0.7

data = processed_df['text'].apply(lambda x: creat_feature(x, glove_model, dim=25, max_len=70)).values
label = processed_df['label'].values

data_size = len(processed_df)
train_data = data[:(int)(train_frac * data_size)]
train_label = label[:(int)(train_frac * data_size)]
val_data = data[(int)(train_frac * data_size):]
val_label = label[(int)(train_frac * data_size):]

In [19]:
use_cuda = False

In [None]:
batch_size=100
embedding_dim=25
lstm_hidden_size=32
lstm_num_layers=3
dropout=0.2

label_size = 2

loss_fn = nn.CrossEntropyLoss()
model = LSTMClassifier(label_size, embedding_dim,
                       lstm_hidden_size, lstm_num_layers, dropout)
model = model.cuda() if use_cuda else model
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

torch.manual_seed(2018)
for epoch in range(10):
    print("epoch: " + str(epoch))
    for batch_index in range(0, len(train_data), batch_size):
        model.zero_grad()
        batch_data = train_data[batch_index: batch_index + batch_size]
        batch_label = train_label[batch_index: batch_index + batch_size]
        batch_data = np.stack(batch_data, axis=0)
   
        train_x = torch.from_numpy(batch_data).float()
        train_y = torch.from_numpy(batch_label)

        if use_cuda:
            train_x = train_x.cuda()
            train_y = train_y.cuda()
        
        train_x = Variable(train_x)
        train_y = Variable(train_y)
        
        y_pred = model(train_x)
        
        loss = loss_fn(y_pred, train_y)
        loss.backward()
        optimizer.step()
        
        if use_cuda:
            print("batch_range:["+str(batch_index)+","+str(batch_index+batch_size)+"),loss:"+str(loss.cpu().numpy()[0]))
        else:
            print("batch_range:["+str(batch_index)+","+str(batch_index+batch_size)+"),loss:"+str(loss.data.numpy()[0]))
   

In [None]:
for batch_index in range(0, len(val_data), batch_size):
    batch_data = val_data[batch_index: batch_index + batch_size]
    batch_label = val_label[batch_index: batch_index + batch_size]
    batch_data = np.stack(batch_data, axis=0)

    val_x = torch.from_numpy(batch_data).float()

    if use_cuda:
        val_x = val_x.cuda()
    val_x = Variable(val_x)

    y_pred = model(val_x)
    _, label_pred = torch.max(y_pred, 1)
    print("accuracy:%f" % (sum(label_pred.data.numpy() == batch_label)/float(len(batch_label))))