In [1]:
# Load Dependencies
from transformers import AutoTokenizer
import numpy as np
import json
import re
from difflib import SequenceMatcher
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
from copy import copy

In [2]:
# Load the tokenizer and list every vocabulary token, ordered by token id
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")
# Invert the token -> id mapping so that tokens can be looked up by id.
id_vocab = {token_id: token for token, token_id in tokenizer.vocab.items()}
# Indexing by position relies on the ids being contiguous from 0;
# a gap in the ids would raise KeyError here.
all_vocab_str = [id_vocab[token_id] for token_id in range(len(id_vocab))]



In [3]:
# # Load the polarity and subjectivity of tokens from spacy text blob
# all_vocab_polarity = []
# all_vocab_subjectivity = []
# nlp = spacy.load('en_core_web_sm', disable=['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner'])
# nlp.add_pipe('spacytextblob')
# for i in range(len(all_vocab_str)):
#     text = all_vocab_str[i]
#     doc = nlp(text)
#     polarity = 0
#     subjectivity = 0
#     for t in doc:
#         polarity+=t._.blob.polarity
#         subjectivity+=t._.blob.subjectivity
#     all_vocab_polarity.append(polarity)
#     all_vocab_subjectivity.append(subjectivity)
# all_vocab_polarity = np.array(all_vocab_polarity)
# all_vocab_subjectivity = np.array(all_vocab_subjectivity)

In [4]:
# Create batch requests for the OpenAI chat API to get the subjectivity and polarity of tokens
# Prefix placed in front of the token list in every user message.
temp_str = """ the texts: """
def create_batch_request_for_openai(tokens_list):
    """Build the user message for one batch request.

    Args:
        tokens_list: Iterable of token strings to put in the message.

    Returns:
        ``temp_str`` followed by the tokens joined with ``' | '``. An empty
        ``tokens_list`` returns ``temp_str`` alone instead of raising
        IndexError.
    """
    return temp_str + ' | '.join(tokens_list)
# Split the vocabulary into chunks of `interval` tokens. Each chunk becomes one
# request, keyed by its (start, end) id range so that the responses can be
# mapped back to token ids later.
lines = {}
interval = 50
for i in range(0, len(all_vocab_str), interval):
    upper_bound = min(i+interval, len(all_vocab_str))
    lines[(i, upper_bound)] = create_batch_request_for_openai(all_vocab_str[i:upper_bound])

# The prompt only varies in the token count, so it is defined once outside the loop.
system_prompt = """As sentiment classifier, write polarity and subjectivity for the provided texts. For example for the given text 'good' write 'good: polarity=0.7, subjectivity=0.6' and for the text 'bad' write 'bad: polarity=-0.7, subjectivity=0.67' and for the given text 'good | bad' write 'good: polarity=0.7, subjectivity=0.6 | bad: polarity=-0.7, subjectivity=0.67'. I will give you exactly {t_count} texts and you just write the polarity and sensitivity without any other words for exact same {t_count} texts."""

jsonl_lines = []
for k, v in lines.items():
    # Take the number of texts from the id range instead of len(v.split('|')):
    # a token that itself contains '|' would inflate the split-based count and
    # the prompt would then announce more texts than were actually sent.
    t_count = k[1] - k[0]
    jsonl_lines.append({
        "custom_id": f"{k}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o-mini",
            "messages": [
                {"role": "system", "content": system_prompt.format(t_count=t_count)},
                {"role": "user", "content": v},
            ],
            "max_tokens": 1000,
        },
    })

In [5]:
# Write the batch requests to a '.jsonl' file, one JSON object per line
with open(r'DataManipulation\tokens_polarity.jsonl', 'w', encoding="utf-8") as f:
    f.writelines(json.dumps(request)+'\n' for request in jsonl_lines)

In [6]:
# Read the responses to the batch requests (one JSON object per line)
# The encoding is given explicitly: the request file is written as UTF-8, and
# without it the decoding would fall back to the platform's locale encoding.
with open(r'DataManipulation\token_polarity_mini_2_batch_674650e577508190b2e501b187ce6ed7_output.jsonl', 'r', encoding="utf-8") as f:
    line_stream = f.readlines()
# Skip blank lines (e.g. a trailing newline at the end of the file), which
# json.loads would reject.
all_lines = [json.loads(line) for line in line_stream if line.strip()]

In [7]:
# Extract the polarity and subjectivity from the responses
def get_string_ints(string_sample):
    """Return every run of decimal digits found in ``string_sample`` as ints.

    Signs are not part of the match, so ``'-3'`` contributes ``3``.
    """
    return [int(digits) for digits in re.findall(r'\d+', string_sample)]

def get_string_floats(string_sample):
    """Return every (optionally signed) decimal number in ``string_sample``.

    Args:
        string_sample: Text to scan, e.g. ``'polarity=-0.7'``.

    Returns:
        List of floats in order of appearance; empty when no number is found.
    """
    # At most ONE decimal point per number (`\.?`). The previous `\.*` also
    # matched text such as '1..5', which float() cannot parse (ValueError).
    all_floats = re.findall(r"[-+]?(?:\d*\.?\d+)", string_sample)
    return [float(f) for f in all_floats]

def similar(a, b):
    """Return the difflib similarity ratio (0.0 to 1.0) between ``a`` and ``b``."""
    return SequenceMatcher(None, a, b).ratio()

def map_content_to_tokens(content, tokens):
    """Pair each '<text>: <values>' entry of a response with its closest token.

    Args:
        content: List of response entries, each expected to look like
            ``'<text>: polarity=..., subjectivity=...'``.
        tokens: Sequence of the token strings that were sent in the request.
            It is not modified.

    Returns:
        A ``(t_c_dict, content_dict)`` tuple. ``t_c_dict`` maps a token to the
        entry key that was matched to it. ``content_dict`` maps every entry
        key to the text that follows the entry's last colon.
    """
    content_dict = dict()
    t_c_dict = dict()
    content_keys = []
    for c in content:
        # Split on the LAST colon only, so a text that itself contains ':'
        # (including the bare ':' token) keeps its full key instead of being
        # collapsed to ':' or rejected.
        c = c.rsplit(':', 1)
        if len(c) == 1:
            # No colon at all: the entry has no values to parse.
            print(f"failed: {c}")
            continue
        content_dict[c[0]] = c[1]
        content_keys.append(c[0])
    # Work on a private list so matched tokens can be removed safely.
    tokens_ = list(tokens)
    for c in content_keys:
        if not tokens_:
            # The response holds more entries than tokens were sent; the
            # surplus entries cannot be assigned to anything.
            break
        # Greedy match: max() returns the first token with the highest ratio.
        most_similar_token = max(tokens_, key=lambda t: similar(c, t))
        t_c_dict[most_similar_token] = c
        # Each token can be claimed by one entry only.
        tokens_.remove(most_similar_token)
    return t_c_dict, content_dict

# Parse every batch response and collect [polarity, subjectivity] per token.
all_vocab_sentiments = dict()
k = 0  # number of tokens that fell back to the neutral [0, 0] default
for batch in all_lines:
    # custom_id is the stringified (start, end) id range of the request.
    ids_range = get_string_ints(batch['custom_id'])
    content = batch['response']['body']['choices'][0]['message']['content']
    # Drop blank entries by filtering. Deleting by index in ascending order
    # shifts the later indices, which removes the wrong entries or raises
    # IndexError as soon as a response holds more than one blank entry.
    content = [entry for entry in content.split('|') if entry.strip() != '']
    t_c_dict, content_dict = map_content_to_tokens(content, all_vocab_str[ids_range[0]: ids_range[1]])

    for i in range(ids_range[0], ids_range[1]):
        token = all_vocab_str[i]

        if token not in t_c_dict:
            # No response entry was matched to this token.
            k += 1
            all_vocab_sentiments[token] = [0, 0]
            continue

        # Sorting puts 'polarity=...' before 'subjectivity=...' regardless of
        # the order in which the model wrote the two fields.
        content_info = sorted(t.strip() for t in content_dict[t_c_dict[token]].split(','))
        polarity = get_string_floats(content_info[0]) if len(content_info) > 0 else []
        subjectivity = get_string_floats(content_info[1]) if len(content_info) > 1 else []
        if not polarity or not subjectivity:
            # Malformed values: use the same neutral fallback as unmatched
            # tokens instead of crashing here or producing a ragged row later.
            k += 1
            all_vocab_sentiments[token] = [0, 0]
            continue
        # Keep exactly one number per field so every token yields a 2-value row.
        all_vocab_sentiments[token] = [polarity[:1], subjectivity[:1]]
    

failed: [' ']
failed: ['ATFORM ']


In [8]:
# Stack each token's [polarity, subjectivity] pair into one array whose rows
# follow the same order as the tokens in all_vocab_str
vocab_sentiments = np.vstack([
    np.array(all_vocab_sentiments[token]).squeeze()
    for token in all_vocab_str
])
vocab_sentiments.shape

(128001, 2)

In [9]:
# Persist the GPT-derived sentiment matrix as a plain (non-pickled) .npy file
np.save(
    r'Data\ReducedEmbeddings\polarity_debertav3_tokens_gpt_mini_emb.npy',
    vocab_sentiments,
    allow_pickle=False,
)