<a href="https://colab.research.google.com/github/CarloCHEN/Tweet-Sentiment-Extraction_Kaggle/blob/master/Phrase_extraction_of_the_test_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# import the nltk tokenizer 
import os
from nltk.tokenize import sent_tokenize

In [2]:
# install the BERT pretrained model if not done yet
pip install pytorch_pretrained_bert

Collecting pytorch_pretrained_bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |██▋                             | 10kB 27.3MB/s eta 0:00:01[K     |█████▎                          | 20kB 6.8MB/s eta 0:00:01[K     |████████                        | 30kB 7.7MB/s eta 0:00:01[K     |██████████▋                     | 40kB 8.2MB/s eta 0:00:01[K     |█████████████▎                  | 51kB 7.2MB/s eta 0:00:01[K     |███████████████▉                | 61kB 8.2MB/s eta 0:00:01[K     |██████████████████▌             | 71kB 8.9MB/s eta 0:00:01[K     |█████████████████████▏          | 81kB 9.9MB/s eta 0:00:01[K     |███████████████████████▉        | 92kB 9.0MB/s eta 0:00:01[K     |██████████████████████████▌     | 102kB 9.3MB/s eta 0:00:01[K     |█████████████████████████████▏  | 112kB 9.3MB/s eta 0:00:01[K     |██████████████████████

In [3]:
# import libraries and utils
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn.utils.rnn import pad_sequence
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam

Using TensorFlow backend.


In [0]:
# Code to read csv file into Google Colaboratory:
# Install (or upgrade, -U) PyDrive quietly (-q) through a shell escape.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
# Give PyDrive the application-default credentials.
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the PyDrive client that the dataset-download cell calls
# (drive.CreateFile) to fetch test.csv.
drive = GoogleDrive(gauth)

In [0]:
# load the dataset from the link
link = 'https://drive.google.com/open?id=1O3hcurUk6i_21vEdlJht0KKCiCayL7Dh'
# Everything after the first '=' is the Drive file id. It is stored as
# `file_id` so the builtin id() is not shadowed for the rest of the notebook,
# and maxsplit=1 keeps the split from failing on a link with a second '='.
file_id = link.split('=', 1)[1]
# Download the file through the PyDrive client into the local working directory.
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('test.csv')
# Read the downloaded CSV into the `test` DataFrame used by the later cells.
test = pd.read_csv('test.csv')

In [0]:
# Run on CUDA when it is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Number of CUDA devices visible to torch (0 on a CPU-only runtime).
n_gpu = torch.cuda.device_count()

In [24]:
# Load the tokenizer that belongs to the bert-base-uncased model;
# do_lower_case=True is passed so input text is lower-cased.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

100%|██████████| 231508/231508 [00:00<00:00, 900294.13B/s]


In [0]:
# initialize models
# one model per sentiment
# number of labels is 2 ("0" for absent, "1" for present)
def _new_phrase_tagger():
    """Return a fresh bert-base-uncased token classifier (2 labels) placed on `device`."""
    tagger = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=2)
    # .to(device) instead of .cuda(): follows the device selected earlier in
    # the notebook, so this cell also runs on a CPU-only runtime.
    return tagger.to(device)


model_positive = _new_phrase_tagger()
model_negative = _new_phrase_tagger()
model_neutral = _new_phrase_tagger()

In [71]:
# mount the drive
# Imported under an alias: the name `drive` is already bound to the PyDrive
# client created in the authentication cell, and rebinding it to this module
# would break the dataset-download cell on any re-run.
from google.colab import drive as colab_drive
colab_drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [72]:
# re-load the saved model weights from the mounted drive
# map_location=device remaps the stored tensors onto the device selected
# earlier, so checkpoints saved from a GPU also load on a CPU-only runtime.
# NOTE(review): torch.load unpickles the file -- only load checkpoints you trust.
model_save_name = 'BERT_for_negative_extraction.pt'
path = f"/content/gdrive/My Drive/{model_save_name}"
model_negative.load_state_dict(torch.load(path, map_location=device))

model_save_name = 'BERT_for_positive_extraction.pt'
path = f"/content/gdrive/My Drive/{model_save_name}"
model_positive.load_state_dict(torch.load(path, map_location=device))

model_save_name = 'BERT_for_neutral_extraction.pt'
path = f"/content/gdrive/My Drive/{model_save_name}"
model_neutral.load_state_dict(torch.load(path, map_location=device))

<All keys matched successfully>

In [0]:
# define the keywordextract method
def keywordextract(model, sentence):
    """Return the tokens of `sentence` that `model` labels as part of the phrase.

    The sentence is tokenized with the notebook-level `tokenizer`, run through
    `model` as a single un-padded sequence on `device`, and every token whose
    highest-scoring label is 1 is kept.

    Args:
        model: token-classification model; called as
            ``model(input_ids, token_type_ids=None, attention_mask=mask)`` and
            expected to return logits indexed as (batch, token, label).
        sentence: text to extract the phrase from.

    Returns:
        The selected tokens, exactly as the tokenizer spells them, each
        followed by a single space (so a non-empty result ends with a space).
        An empty string when no token is selected or the sentence yields no
        tokens.
    """
    tokens = tokenizer.tokenize(sentence)
    if not tokens:
        # Nothing to classify; an empty id list would also produce a float
        # tensor of shape (1, 0) that is not valid model input.
        return ""

    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = torch.tensor([token_ids]).to(device)
    # The sequence is not padded, so every position is a real token: all ones.
    attention_mask = torch.ones_like(input_ids)

    model.eval()
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = model(input_ids, token_type_ids=None,
                       attention_mask=attention_mask)
    # Highest-scoring label per token of the single sequence in the batch.
    labels = np.argmax(logits.cpu().numpy(), axis=2)[0]

    # Map the ids back to token strings once rather than once per selected token.
    id_tokens = tokenizer.convert_ids_to_tokens(token_ids)
    selected = [tok for tok, label in zip(id_tokens, labels) if label == 1]
    # Same output format as before: every token is followed by one space.
    return "".join(tok + " " for tok in selected)

In [0]:
# extract the key phrase that reflects the sentiment from the original tweet text
# use a different model for each sentiment
# write the extracted phrases to a new column
models_by_sentiment = {
    "positive": model_positive,
    "negative": model_negative,
    "neutral": model_neutral,
}
# Build the whole column and assign it once. Element-wise chained assignment
# (test[col][i] = ...) can write to a temporary copy and assumes the index
# labels are exactly 0..n-1. Rows whose sentiment is none of the three keys
# keep the empty string, as before.
test['extracted_phrase'] = [
    keywordextract(models_by_sentiment[sentiment], text)
    if sentiment in models_by_sentiment else ""
    for sentiment, text in zip(test['sentiment'], test['text'])
]

In [0]:
# Build the submission frame: keep only the tweet id and the extracted phrase.
submission_columns = ['textID', 'extracted_phrase']
submission = test.loc[:, submission_columns]

In [0]:
# save submission csv file to google drive
# index=False keeps the DataFrame's row index out of the file, so the CSV
# contains only the textID and extracted_phrase columns.
# NOTE(review): if this file is uploaded to a competition, confirm the phrase
# column name matches the required submission header.
submission.to_csv('/content/gdrive/My Drive/submission.csv', index=False)