In [1]:
import pandas as pd
import os
from glob import glob
import numpy as np

In [2]:
import torch

# Pick the compute device once: the first CUDA GPU when PyTorch can see one,
# otherwise the CPU. Later cells are expected to move tensors/models to `device`.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda:0")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 3060


In [3]:
# Read the corpora.
def _read_without_unnamed(csv_path):
    """Load one corpus CSV and drop its 'Unnamed: 0' column."""
    return pd.read_csv(csv_path).drop(columns="Unnamed: 0")


# Corpora that all carry an 'Unnamed: 0' column which is not wanted downstream.
ambitious = _read_without_unnamed('../corpora/ambitious_goals.csv')
confidence = _read_without_unnamed('../corpora/confidence_in_goals.csv')
contrast = _read_without_unnamed('../corpora/contrast.csv')
lists = _read_without_unnamed('../corpora/lists_repetitions.csv')
similie = _read_without_unnamed('../corpora/metaphor_similie.csv')
rhetoricalq = _read_without_unnamed('../corpora/rhetorical_question.csv')
sentiment = _read_without_unnamed('../corpora/sentiment_of_the_collective.csv')
story = _read_without_unnamed('../corpora/story_anecdote.csv')

# The VUA metaphor corpus uses different column names: keep only the text and
# the label, renamed so the text column is 'Results' like the other corpora.
vua_metaphor = (
    pd.read_csv('../corpora/vua_metaphor.csv')[['sentence', 'label']]
    .rename(columns={'label': 'Metaphor', 'sentence': 'Results'})
)

# The moral-convictions corpus only needs its text column renamed to 'Results'.
moral_conv = (
    pd.read_csv('../corpora/moral_convictions.csv')
    .rename(columns={'sentence': 'Results'})
)

In [4]:
# Stack every corpus into a single frame. A row only has a value in the label
# column(s) of the corpus it came from; every other cell is NaN after the
# concat and is filled with 0.
dfs = [
    ambitious, confidence, contrast, lists, similie,
    rhetoricalq, sentiment, story, vua_metaphor, moral_conv,
]
data = (
    pd.concat(dfs, ignore_index=True)
    .fillna(0)
    # Strip double quotes from the text, then give the column its final name.
    .assign(Results=lambda frame: frame['Results'].apply(lambda text: text.replace('"', '')))
    .rename(columns={'Results': 'sentence'})
)
data

Unnamed: 0,sentence,ambitious_goals,confidence_in_goals,contrast,repetition,similie,rhetorical_question,sentiment_of_the_collective,story_anecdote,Metaphor,Moral_conviction
0,"Within the next three years, we are determined...",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"By 2030, our company's revenue will surpass $1...",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"In the next five years, we will establish part...",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"By 2025, we will successfully reduce our energ...",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Within the next decade, we intend to open 100 ...",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
84763,We cannot allow our personal biases to lead us...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
84764,It is important to hold ourselves and our inst...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
84765,The long-term consequences of our actions can ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
84766,We must ensure that our policies do not transg...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [5]:
from sklearn.model_selection import train_test_split

# Label matrix for the full data set: every column except the sentence text.
y = data.drop(columns='sentence').to_numpy()

# Hold out 20% of the rows as the test set; the seed is fixed so the split is
# reproducible.
train, test = train_test_split(
    data,
    test_size=0.2,
    random_state=1337,
)

In [None]:
# Separate the training frame into the raw text and the label matrix
# (all remaining columns).
sentences = train['sentence'].to_numpy()
labels = train.drop(columns='sentence').to_numpy()

In [None]:
# Distinct values that occur anywhere in the label matrix.
np.unique(labels.ravel())

In [None]:
from transformers import BertTokenizer

# Load the tokenizer for the 'bert-base-uncased' checkpoint, asking it to
# lower-case its input.
print("Loading BERT tokenizer...")
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased",
    do_lower_case=True,
)

In [None]:
# Tokenize all of the sentences and map the tokens to their word IDs.
# For each sentence, `encode` will:
#   (1) Tokenize the sentence.
#   (2) Prepend the `[CLS]` token to the start.
#   (3) Append the `[SEP]` token to the end.
#   (4) Map tokens to their IDs.
# Truncation (`max_length`) and tensor conversion (`return_tensors`) are left
# off on purpose: padding is handled in a separate step afterwards.
input_ids = [
    tokenizer.encode(sent, add_special_tokens=True)  # add '[CLS]' and '[SEP]'
    for sent in sentences
]

# Show the first sentence next to its encoded form.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

In [None]:
# Longest encoded sentence, counted in token IDs.
print('Max sentence length: ', max(len(ids) for ids in input_ids))

In [None]:
# Length of every raw training sentence. `sentences` holds the untokenized
# text, so these are character counts, not token counts.
sentences_len = list(map(len, sentences))
sentences_len[:5]

In [None]:
# We'll use the `pad_sequences` utility function to do this.
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Set the maximum sequence length to the longest encoded sentence, so every
# sequence is padded up to it and none is truncated.
MAX_LEN = max(len(sen) for sen in input_ids)
print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)
print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

# Pad our input tokens with value 0.
# "post" indicates that we want to pad and truncate at the end of the sequence,
# as opposed to the beginning.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long",
                          value=0, truncating="post", padding="post")

# Convert the label matrix to an integer tensor.
labels = torch.tensor(labels, dtype=torch.long)

# '\n' (not '\D', which is an invalid escape and printed a literal backslash)
# keeps this message consistent with the progress prints above.
print('\nDone.')

In [None]:
# Create attention masks: one mask per padded sequence, same length as the
# sequence.
#   - If a token ID is 0, then it's padding, set the mask to 0.
#   - If a token ID is > 0, then it's a real token, set the mask to 1.
attention_masks = [
    [int(token_id > 0) for token_id in padded_ids]
    for padded_ids in input_ids
]

In [None]:
# Use train_test_split to split our data into train and validation sets for
# training.
from sklearn.model_selection import train_test_split

# Use 90% for training and 10% for validation.
# A single call splits the token IDs, the attention masks and the labels with
# the same shuffled indices, so the three stay row-aligned by construction
# instead of relying on two separate calls that repeat the same random_state.
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(
    input_ids, attention_masks, labels,
    random_state=2018, test_size=0.1)
