In [3]:
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [4]:
import requests
request = requests.get("https://drive.google.com/uc?export=download&id=1wHt8PsMLsfX5yNSqrt2fSTcb8LEiclcf")
with open("data.zip", "wb") as file:
    file.write(request.content)

# Unzip data
import zipfile
with zipfile.ZipFile('data.zip') as zip:
    zip.extractall('data')

In [5]:
 # Load data and set labels
data_complaint = pd.read_csv('data/complaint1700.csv')
data_complaint['label'] = 0
data_non_complaint = pd.read_csv('data/noncomplaint1700.csv')
data_non_complaint['label'] = 1

# Concatenate complaining and non-complaining data
data = pd.concat([data_complaint, data_non_complaint], axis=0).reset_index(drop=True)

# Drop 'airline' column
data.drop(['airline'], inplace=True, axis=1)

# Display 5 random samples
data.sample(5)

Unnamed: 0,id,tweet,label
1777,8522,@Jlee0097 @ScottsdaleToDo @willbasset @dolphin...,1
704,149129,Soooo I spent $179 to upgrade and be comfy on ...,0
1507,123235,Feeling slightly frustrated @AmericanAir Fligh...,0
349,155838,Hey @united. You better get some training for ...,0
1548,53116,Hey @JetBlue I'm at Laguardia waiting to board...,0


In [6]:
from sklearn.model_selection import train_test_split

X = data.tweet.values
y = data.label.values

X_train, X_val, y_train, y_val =\
    train_test_split(X, y, test_size=0.1, random_state=2020)

In [7]:
X_train[0]

"@DeltaAssist What's the best way to get compensation for a musical instrument that was broken during one of your flights?"

In [8]:
y_train

array([1, 0, 1, ..., 1, 0, 0])

In [9]:
df = pd.read_csv("Data/Twitter_Data.csv")
df.columns = ["text","label"]
df.head()

Unnamed: 0,text,label
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [10]:
# Load test data
test_data = pd.read_csv('data/test_data.csv')

# Keep important columns
test_data = test_data[['id', 'tweet']]

# Display 5 samples from the test data
test_data.sample(5)

Unnamed: 0,id,tweet
196,7528,#fail @SouthwestAir restricting triple points ...
1281,50183,@americanair flight AA 5385 should have left a...
3971,151419,@BridgetPhetasy I had to fly @united in March ...
3806,144567,I usually like @SouthwestAir but today's fligh...
950,37789,Worst. MT@BostonDotCom: JetBlue doesn't love u...


In [11]:
import torch

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


### Data pre-processing

In [12]:
def text_preprocessing(text):
    """
    - Remove entity mentions (eg. '@united')
    - Correct errors (eg. '&amp;' to '&')
    @param    text (str): a string to be processed.
    @return   text (Str): the processed string.
    """
    # Remove '@name'
    text = re.sub(r'(@.*?)[\s]', ' ', text)

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Remove trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [13]:
# Print sentence 0
print('Original: ', X[0])
print('Processed: ', text_preprocessing(X[0]))

Original:  @united I'm having issues. Yesterday I rebooked for 24 hours after I was supposed to fly, now I can't log on &amp; check in. Can you help?
Processed:  I'm having issues. Yesterday I rebooked for 24 hours after I was supposed to fly, now I can't log on & check in. Can you help?


### Tokenizer

In [35]:
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Create a function to tokenize a set of texts
def preprocessing_for_bert(data):
    """Perform required preprocessing steps for pretrained BERT.
    @param    data (np.array): Array of texts to be processed.
    @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return   attention_masks (torch.Tensor): Tensor of indices specifying which
                  tokens should be attended to by the model.
    """
    # Create empty lists to store outputs
    input_ids = []
    attention_masks = []

    # For every sentence...
    for sent in data:
        # `encode_plus` will:
        #    (1) Tokenize the sentence
        #    (2) Add the `[CLS]` and `[SEP]` token to the start and end
        #    (3) Truncate/Pad sentence to max length
        #    (4) Map tokens to their IDs
        #    (5) Create attention mask
        #    (6) Return a dictionary of outputs
        encoded_sent = tokenizer.encode_plus(
            text=sent,  # Preprocess sentence
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=MAX_LEN,                  # Max length to truncate/pad
            pad_to_max_length=True,         # Pad sentence to max length
            #return_tensors='pt',           # Return PyTorch tensor
            return_attention_mask=True,      # Return attention mask
            truncation = True
            )
        #print(encoded_sent)
        # Add the outputs to the lists
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks

In [40]:
preprocessing_for_bert(tweet)[0].squeeze()



tensor([[ 101, 1056,  102,  ...,    0,    0,    0],
        [ 101, 1044,  102,  ...,    0,    0,    0],
        [ 101, 1041,  102,  ...,    0,    0,    0],
        ...,
        [ 101, 1044,  102,  ...,    0,    0,    0],
        [ 101, 1045,  102,  ...,    0,    0,    0],
        [ 101, 1055,  102,  ...,    0,    0,    0]])

In [38]:
preprocessing_for_bert(X[0])[0].squeeze()



tensor([[ 101, 1030,  102,  ...,    0,    0,    0],
        [ 101, 1057,  102,  ...,    0,    0,    0],
        [ 101, 1050,  102,  ...,    0,    0,    0],
        ...,
        [ 101, 1048,  102,  ...,    0,    0,    0],
        [ 101, 1052,  102,  ...,    0,    0,    0],
        [ 101, 1029,  102,  ...,    0,    0,    0]])

In [21]:
tweets = df["text"].to_numpy()[:1000]
tweet = tweets[123]
print(tweet)

preprocessing_for_bert(tweet)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


the first term prime minister won promise but the second term will have won performance not even narendra modi exception this


(tensor([[ 101, 1056,  102,  ...,    0,    0,    0],
         [ 101, 1044,  102,  ...,    0,    0,    0],
         [ 101, 1041,  102,  ...,    0,    0,    0],
         ...,
         [ 101, 1044,  102,  ...,    0,    0,    0],
         [ 101, 1045,  102,  ...,    0,    0,    0],
         [ 101, 1055,  102,  ...,    0,    0,    0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]))

In [None]:
# Concatenate train data and test data
all_tweets = np.concatenate([data.tweet.values, test_data.tweet.values])

# Encode our concatenated data
encoded_tweets = [tokenizer.encode(sent, add_special_tokens=True) for sent in all_tweets]

# Find the maximum length
max_len = max([len(sent) for sent in encoded_tweets])
print('Max length: ', max_len)

Max length:  68


In [22]:
len(X[0])

138

In [31]:
text_preprocessing(X[0])

"I'm having issues. Yesterday I rebooked for 24 hours after I was supposed to fly, now I can't log on & check in. Can you help?"

In [32]:
text_preprocessing(tweet)

'the first term prime minister won promise but the second term will have won performance not even narendra modi exception this'

In [26]:
# Specify `MAX_LEN`
MAX_LEN = 64

# Print sentence 0 and its encoded token ids
token_ids = list(preprocessing_for_bert([X[0]])[0].squeeze().numpy())
print('Original: ', X[0])
print('Token IDs: ', token_ids)


# Run function `preprocessing_for_bert` on the train set and the validation set
print('Tokenizing data...')
train_inputs, train_masks = preprocessing_for_bert(X_train)
val_inputs, val_masks = preprocessing_for_bert(X_val)



Original:  @united I'm having issues. Yesterday I rebooked for 24 hours after I was supposed to fly, now I can't log on &amp; check in. Can you help?
Token IDs:  [101, 1045, 1005, 1049, 2383, 3314, 1012, 7483, 1045, 2128, 8654, 2098, 2005, 2484, 2847, 2044, 1045, 2001, 4011, 2000, 4875, 1010, 2085, 1045, 2064, 1005, 1056, 8833, 2006, 1004, 4638, 1999, 1012, 2064, 2017, 2393, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Tokenizing data...


101

In [24]:
tweets = df["text"].to_numpy()[:1000]
print(tweets[999])
print(X[5])

token = preprocessing_for_bert(tweets[0])[0].squeeze().numpy()


modis opposition trying defame him they not want succeed indiaabusing mister prime minister worst formbut spite all modi going win
@AmericanAir #AmericanAirlines  Flight 1179 Miami to DC. Gate closes at least 19 minutes before the flight. Agents show up and lie to me.
