In [None]:
# GloVe: 
# This is another popular word embedding technique that uses a 
# co-occurrence matrix to create word vectors. 
# It combines elements of both CountVectorizer and Word2Vec, 
# and has been shown to be effective in a variety of NLP tasks

In [1]:
import pandas as pd
import numpy as np
import ast

In [2]:
df = pd.read_csv("Processed_sarcasm.csv")
df.drop('Unnamed: 0', axis = 1)

Unnamed: 0,id,utterance,speaker,context,context_speakers,show,sarcasm
0,1_60,"['privilege', 'watch', 'mind', 'work']",sheldon,"[['never', 'would', 'identify', 'fingerprint',...","['leonard', 'sheldon']",0,1
1,1_70,"['dont', 'think', 'ill', 'able', 'stop', 'think']",penny,"[['one', 'favorite', 'place', 'kick', 'back', ...","['howard', 'penny', 'howard', 'howard', 'howar...",0,1
2,1_80,"['since', 'bee', 'season', 'epinephrine']",sheldon,"[['go', 'pad', 'thai', 'peanut'], ['peanut', '...","['leonard', 'howard', 'leonard']",0,0
3,1_90,"['lois', 'lane', 'fall', 'accelerate', 'initia...",sheldon,"[['marathon', 'many', 'superman', 'movie'], ['...","['penny', 'sheldon', 'penny', 'sheldon', 'shel...",0,0
4,1_105,"['im', 'infer', 'couch', 'evidence', 'suggests...",sheldon,"[['great', 'caesar', 'ghost', 'look', 'place']...","['sheldon', 'leonard', 'sheldon', 'sheldon', '...",0,1
...,...,...,...,...,...,...,...
685,2_169,"['he', 'right', 'part', 'suggest', 'boss', 'go...",chandler,"[['gonna'], ['pas', 'tape', 'along']]","['chandler', 'rachel']",1,1
686,2_235,"['oh', 'yeah', 'caretaker', 'old', 'brother', ...",chandler,"[['helo', 'anybody', 'order', 'celebrity'], ['...","['joey', 'person', 'chandler', 'person']",1,0
687,2_34,"['greeting', 'go', 'downhill', 'around']",chandler,"[['hey'], ['son', 'bitch']]","['chandler', 'joey']",1,1
688,2_608,"['right', 'say', 'nice', 'virtually', 'lick']",chandler,"[['go', 'school'], ['hey', 'there', 'missy', '...","['chandler', 'ross', 'chandler', 'ross']",1,1


In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,utterance,speaker,context,context_speakers,show,sarcasm
0,0,1_60,"['privilege', 'watch', 'mind', 'work']",sheldon,"[['never', 'would', 'identify', 'fingerprint',...","['leonard', 'sheldon']",0,1
1,1,1_70,"['dont', 'think', 'ill', 'able', 'stop', 'think']",penny,"[['one', 'favorite', 'place', 'kick', 'back', ...","['howard', 'penny', 'howard', 'howard', 'howar...",0,1
2,2,1_80,"['since', 'bee', 'season', 'epinephrine']",sheldon,"[['go', 'pad', 'thai', 'peanut'], ['peanut', '...","['leonard', 'howard', 'leonard']",0,0
3,3,1_90,"['lois', 'lane', 'fall', 'accelerate', 'initia...",sheldon,"[['marathon', 'many', 'superman', 'movie'], ['...","['penny', 'sheldon', 'penny', 'sheldon', 'shel...",0,0
4,4,1_105,"['im', 'infer', 'couch', 'evidence', 'suggests...",sheldon,"[['great', 'caesar', 'ghost', 'look', 'place']...","['sheldon', 'leonard', 'sheldon', 'sheldon', '...",0,1


In [4]:
def change_to_string(text):
    # str to list
    tokens = ast.literal_eval(text)
    
    temp = []
    if type(tokens[0]) == list:
        for lst in tokens:
            sentence = ' '.join([word for word in lst])
            temp.append(sentence)
            
        return ' '.join([sen for sen in temp])
        
    else:
        sentence = ' '.join([word for word in tokens])
        return sentence

In [5]:
df['context'] = df['context'].apply(change_to_string)
df['utterance'] = df['utterance'].apply(change_to_string)

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,utterance,speaker,context,context_speakers,show,sarcasm
0,0,1_60,privilege watch mind work,sheldon,never would identify fingerprint string theory...,"['leonard', 'sheldon']",0,1
1,1,1_70,dont think ill able stop think,penny,one favorite place kick back quest great house...,"['howard', 'penny', 'howard', 'howard', 'howar...",0,1
2,2,1_80,since bee season epinephrine,sheldon,go pad thai peanut peanut oil im sure everyone...,"['leonard', 'howard', 'leonard']",0,0
3,3,1_90,lois lane fall accelerate initial rate foot pe...,sheldon,marathon many superman movie youre kid right l...,"['penny', 'sheldon', 'penny', 'sheldon', 'shel...",0,0
4,4,1_105,im infer couch evidence suggests coffee table ...,sheldon,great caesar ghost look place penny little mes...,"['sheldon', 'leonard', 'sheldon', 'sheldon', '...",0,1


In [7]:
input_df = pd.DataFrame()
input_df['context_utter'] = df['context'] + df['utterance']
input_df['sarcasm'] = df['sarcasm']

In [8]:
input_df.head()

Unnamed: 0,context_utter,sarcasm
0,never would identify fingerprint string theory...,1
1,one favorite place kick back quest great house...,1
2,go pad thai peanut peanut oil im sure everyone...,0
3,marathon many superman movie youre kid right l...,0
4,great caesar ghost look place penny little mes...,1


# BERT MODEL

In [9]:
get_length = lambda x: len(x.split())

# apply the lambda function to the 'text' column and get the maximum length
max_length = input_df['context_utter'].apply(get_length).max()

print("Max length context: ", max_length)

Max length context:  66


In [12]:
# preprocessing for bert
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

def get_tokenized_bert_input(text):
    
    bert_input = tokenizer(text, padding='max_length', max_length = max_length, 
                       truncation = True, return_tensors="pt")
    
    return bert_input

In [13]:
input_df['context_utter'] = input_df['context_utter'].apply(get_tokenized_bert_input)

In [20]:
input_df.head()

Unnamed: 0,context_utter,sarcasm
0,"[input_ids, token_type_ids, attention_mask]",1
1,"[input_ids, token_type_ids, attention_mask]",1
2,"[input_ids, token_type_ids, attention_mask]",0
3,"[input_ids, token_type_ids, attention_mask]",0
4,"[input_ids, token_type_ids, attention_mask]",1


In [29]:
print(input_df.iloc[0,0]['input_ids'])
print(input_df.iloc[0,0]['token_type_ids'])
print(input_df.iloc[0,0]['attention_mask'])

tensor([[  101,  1309,  1156,  6183,  3602, 10988,  5101,  2749, 11026,  1992,
         21260, 13382,  1184,  1116,  2197,  1643,  2047, 15655, 27487,  2824,
          1713,  1250,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


In [34]:
example_text = tokenizer.decode(input_df.iloc[0,0].input_ids[0])
example_text

'[CLS] never would identify fingerprint string theory aftermath big bang apology whats planprivilege watch mind work [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [38]:
import torch

# Assume you have a pandas dataframe 'input_df' with a column named 'input_tensors'
input_tensors_col = input_df['context_utter']

input_ids_list = []
token_type_ids_list = []
attention_mask_list = []


# Iterate over the column and append each tensor to its corresponding list
for input_tensors in input_tensors_col:
    input_ids_list.append(input_tensors['input_ids'])
    token_type_ids_list.append(input_tensors['token_type_ids'])
    attention_mask_list.append(input_tensors['attention_mask'])

print(input_ids_list)
# Convert each list to a tensor
input_ids_tensor = torch.tensor(input_ids_list)
token_type_ids_tensor = torch.tensor(token_type_ids_list)
attention_mask_tensor = torch.tensor(attention_mask_list)

# Stack the tensors together to create the final tensor
input_tensors = torch.stack([input_ids_tensor, token_type_ids_tensor, attention_mask_tensor], dim=1)

[tensor([[  101,  1309,  1156,  6183,  3602, 10988,  5101,  2749, 11026,  1992,
         21260, 13382,  1184,  1116,  2197,  1643,  2047, 15655, 27487,  2824,
          1713,  1250,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]]), tensor([[  101,  1141,  5095,  1282,  5642,  1171, 12485,  1632,  1402, 23280,
          4348, 13778,  8147,   178,  2707,  1290,  1634,  1995,  1271,  6324,
          4050,  1342,  1959,  1180,  7311,  2654,  1301, 12485,  1839,  2199,
          1128,  2339,  1341,  3842,  1204,  1341,  5178,  1682,  1831,  1341,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     

TypeError: only integer tensors of a single element can be converted to an index