In [1]:
import pandas as pd
import os
from glob import glob
import torch
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import tensorflow as tf
from transformers import BertTokenizer
import warnings
warnings.filterwarnings("ignore")
# Pick the compute device once for the whole notebook: the first CUDA GPU
# when PyTorch can see one, otherwise the CPU.
if not torch.cuda.is_available():
    # CPU fallback.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Use GPU 0 and report how many GPUs are visible and which one was chosen.
    device = torch.device("cuda:0")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))


2024-03-20 12:41:07.295601: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-20 12:41:07.295664: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-20 12:41:07.296887: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-20 12:41:07.304040: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


There are 2 GPU(s) available.
We will use the GPU: NVIDIA A100-PCIE-40GB


In [2]:
# Folder holding the annotated speech workbooks; every .xlsx inside it is
# collected and processed in lexicographic order.
path = '../test_speeches/'
files = glob(path + '*.xlsx')
files.sort()
print(files)


['../test_speeches/Coding_Differneces_103.xlsx', '../test_speeches/Coding_Differneces_11.xlsx', '../test_speeches/Coding_Differneces_110.xlsx', '../test_speeches/Coding_Differneces_118.xlsx', '../test_speeches/Coding_Differneces_12.xlsx', '../test_speeches/Coding_Differneces_13.xlsx', '../test_speeches/Coding_Differneces_134.xlsx', '../test_speeches/Coding_Differneces_140.xlsx', '../test_speeches/Coding_Differneces_150.xlsx', '../test_speeches/Coding_Differneces_4.xlsx', '../test_speeches/Coding_Differneces_5.xlsx', '../test_speeches/Speech_10.xlsx', '../test_speeches/Speech_1_Obama_.xlsx', '../test_speeches/Speech_2_Maggie_Thatcher.xlsx', '../test_speeches/Speech_3_Testspeech.xlsx', '../test_speeches/Speech_4.xlsx', '../test_speeches/Speech_5.xlsx', '../test_speeches/Speech_6.xlsx', '../test_speeches/Speech_7.xlsx', '../test_speeches/Speech_8.xlsx', '../test_speeches/Speech_9.xlsx']


In [3]:
# load models
# One fine-tuned secondary classifier per rhetorical tactic. The loaded objects
# are used directly as models by make_prediction (`.eval()`, `.to(device)`).
#
# SECURITY NOTE: torch.load unpickles the file, which can execute arbitrary
# code. Only load checkpoints that come from a trusted source.
def _load_checkpoint(checkpoint_path):
    """Load one saved model onto the CPU.

    map_location='cpu' lets checkpoints saved from a GPU be restored on a
    CPU-only machine (the notebook supports a CPU fallback) and avoids keeping
    all nine models on the GPU at once; make_prediction moves a model to
    `device` only while it is being used.
    """
    return torch.load(checkpoint_path, map_location='cpu')


Metaphor = _load_checkpoint('Metaphor_similie_secondary_with_corpus.pt')
Rhetorical_questions = _load_checkpoint('Rhetorical question_secondary_with_corpus.pt')
Stories_anecdotes = _load_checkpoint('Story_Anecdote_secondary_with_corpus.pt')
Contrasts = _load_checkpoint('Contrast_secondary_with_corpus.pt')
Lists = _load_checkpoint('Lists_Repetitions_secondary_with_corpus.pt')
Moral_conviction = _load_checkpoint('Moral_conviction_secondary_label_with_corpus_20epoch.pt')
Sentiment_of_the_collective = _load_checkpoint('Sentiment of the Collective_secondary_with_corpus.pt')
ambitious_goals = _load_checkpoint('Ambitious goals_secondary_with_corpus.pt')
Confidence_in_goals = _load_checkpoint('Confidence in goals_secondary_with_corpus.pt')

# Order matters: the prediction loop pairs this list positionally with the
# tactic columns read from the spreadsheet header.
secondary_models = [Metaphor,
                    Rhetorical_questions,
                    Stories_anecdotes,
                    Contrasts,
                    Lists,
                    Moral_conviction,
                    Sentiment_of_the_collective,
                    ambitious_goals,
                    Confidence_in_goals
                   ]

In [4]:

# Instantiate the uncased BERT tokenizer (input text is lower-cased) that
# preprocess_data uses to turn sentences into token ids.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

Loading BERT tokenizer...


In [5]:
def preprocess_data(test_data, max_len=126, batch_size=32):
    """Tokenize a sheet of sentences and wrap them in a sequential DataLoader.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Must contain a ``sentence`` column with the text to encode. A row gets
        label 1 when at least one cell in that row equals 1, otherwise 0.
        The frame is NOT modified (no ``label`` column is written back), so
        the helper column cannot leak into exported results or influence the
        labels computed on a later call.
    max_len : int, default 126
        Length every token-id sequence is padded / truncated to (at the end).
    batch_size : int, default 32
        Number of sentences per DataLoader batch.

    Returns
    -------
    tuple
        ``(prediction_inputs, prediction_masks, prediction_labels,
        prediction_data, prediction_sampler, prediction_dataloader)`` where
        the first three are tensors (int64 ids, float mask, int64 labels),
        ``prediction_data`` is the TensorDataset built from them and the
        dataloader iterates it in original row order.
    """
    # Row-wise "any cell == 1", vectorised instead of an iloc loop.
    row_has_positive = test_data.eq(1).any(axis=1)
    prediction_labels = torch.tensor(row_has_positive.to_numpy(dtype="int64"),
                                     dtype=torch.long)

    # `encode` tokenizes the sentence, adds the `[CLS]` / `[SEP]` special
    # tokens and maps every token to its vocabulary id.
    sentences = test_data.sentence.values
    input_ids = [tokenizer.encode(sent, add_special_tokens=True)
                 for sent in sentences]

    # Pad (with 0) or truncate every sequence at the end to `max_len`.
    # NOTE(review): post-truncation cuts the trailing `[SEP]` off sentences
    # longer than `max_len`; kept as-is so inputs match the existing pipeline.
    input_ids = pad_sequences(input_ids, maxlen=max_len,
                              dtype="long", truncating="post", padding="post")

    # Attention mask: 1.0 for real tokens (id > 0), 0.0 for padding.
    prediction_inputs = torch.tensor(input_ids)
    prediction_masks = torch.tensor(input_ids > 0, dtype=torch.float)

    # Sequential sampling keeps predictions aligned with the input rows.
    prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler,
                                       batch_size=batch_size)
    return prediction_inputs, prediction_masks, prediction_labels, prediction_data, prediction_sampler, prediction_dataloader

In [6]:
def make_prediction(model, test_data):
    """Run `model` over every sentence in `test_data` and return hard labels.

    Parameters
    ----------
    model : torch.nn.Module
        Classifier called as ``model(input_ids, token_type_ids=None,
        attention_mask=mask)``; the first element of its output is taken as
        the logits (one row per sentence, one column per class).
    test_data : pandas.DataFrame
        Sheet passed straight to ``preprocess_data``.

    Returns
    -------
    numpy.ndarray
        1-D integer array with the arg-max class index per sentence, in the
        same order as the rows of `test_data`. Empty when there are no rows.

    Notes
    -----
    The model is moved to the global `device` for inference and always moved
    back to the CPU afterwards, even if the forward pass raises, so that only
    one model occupies the accelerator at a time.
    """
    # Only the dataloader is needed here; the other returned items are ignored.
    *_, prediction_dataloader = preprocess_data(test_data)

    # Evaluation mode, on the selected device.
    model.eval()
    model.to(device)

    batch_logits = []
    try:
        for batch in prediction_dataloader:
            # Labels travel with the batch but are not needed for inference.
            b_input_ids, b_input_mask, _ = batch
            b_input_ids = b_input_ids.to(device)
            b_input_mask = b_input_mask.to(device)

            # No gradients are required for prediction.
            with torch.no_grad():
                outputs = model(b_input_ids, token_type_ids=None,
                                attention_mask=b_input_mask)

            batch_logits.append(outputs[0].detach().cpu().numpy())
    finally:
        # Free the accelerator for the next model.
        model.cpu()

    print('DONE.')

    if not batch_logits:
        # No sentences: nothing to arg-max over.
        return np.array([], dtype=np.int64)

    # Softmax is monotonic, so the arg-max of the raw logits is the predicted
    # class; no need to compute probabilities first.
    all_logits = np.concatenate(batch_logits, axis=0)
    flat_predictions = np.argmax(all_logits, axis=1).flatten()
    return flat_predictions

In [8]:
# One-step tagging of every speech workbook:
#   1. read the sheet (the first 6 rows are skipped, the 'Unnamed: 0' column
#      is dropped),
#   2. run each secondary model over all sentences,
#   3. overwrite the matching tactic column with the model's 0/1 predictions,
#   4. write the result to the predictions folder as <input name>_AI.xlsx.
if not files:
    raise FileNotFoundError('No .xlsx files found in ' + path)

# Tactic names come from the header of the first workbook; column 0 is the
# sentence text, the remaining columns are the tactics.
cols = pd.read_excel(files[0], skiprows=6).drop(columns=['Unnamed: 0']).columns

print(cols)
tactics = cols[1:]

# Models and tactic columns are paired by position, so there must be a column
# for every model.
if len(tactics) < len(secondary_models):
    raise ValueError('Found %d tactic columns but %d models'
                     % (len(tactics), len(secondary_models)))

output_dir = '../test_speeches/predictions/onestep/'
# Create the target folder up front so to_excel cannot fail on a missing path.
os.makedirs(output_dir, exist_ok=True)

for file in files:
    # Empty cells become '' so no NaN reaches the tokenizer.
    data = (pd.read_excel(file, skiprows=6)
              .drop(columns=['Unnamed: 0'])
              .fillna('')
              .rename(columns={'Sentence': 'sentence'}))

    for tactic, pred_model in zip(tactics, secondary_models):
        print(tactic)
        prediction = make_prediction(pred_model, data)
        data[tactic] = prediction

    # fillna returns a copy, so final_df is independent of `data`.
    final_df = data.fillna(0)

    # Derive the output name from the file name itself rather than from a
    # hard-coded slice that depends on the length of the input path.
    speech_name = os.path.splitext(os.path.basename(file))[0]
    output_file = output_dir + speech_name + '_AI.xlsx'
    final_df.to_excel(output_file)
    print('File written to: ' + output_file)

Index(['Sentence', 'Metaphor/Simile', 'Rhetorical questions',
       'Stories / anecdotes', 'Contrasts', 'Lists / Repetition ',
       'Moral conviction', 'Sentiment of the collective',
       'Ambitious goals / Setting high expectations', 'Confidence in goals'],
      dtype='object')
Metaphor/Simile
DONE.
[1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1]
Rhetorical questions
DONE.
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 