In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('../')
import os
import re 
import logging
import json
from pprint import pprint as pp
import numpy as np
import pandas as pd
from collections import defaultdict
from scipy.sparse import csr_matrix
from typing import List
from tqdm import tqdm
from argparse import ArgumentParser
import datetime
from dateutil import tz
import torch
import torch.optim as optim
import torch.multiprocessing as mp
import torch
import pickle
import math
import warnings
from sentence_transformers import SentenceTransformer
from model.decoderMLP import decoderMLP, decoderAttention, movieTransformer
from helper.sampler import NegSampler, negsamp_vectorized_bsearch_preverif
from model.MF import MatrixFactorization, MatrixFactorizationLLM
from trainer.training_utils import *
from helper.eval_metrics import *
from helper.dataloader import *
from StyleTransfer.scorer import *
from StyleTransfer.editor import RobertaEditor
from StyleTransfer.config import get_args
import random
import json
from pprint import pprint as pp


import debugpy


def get_args():

    parser = argparse.ArgumentParser(description="model parameters")
    parser.add_argument('--seed', type=int, default=42, help='Seed for random number generator')
    
    parser.add_argument('--output_dir', type=str, default="output/", help='Output directory path to store checkpoints.')
    parser.add_argument('--class_name',default='../EleutherAI/gpt-neo-1.3B',type=str)
    parser.add_argument('--topk', default=20, type=int,help="top-k words in masked out word prediction")
    parser.add_argument("--fluency_weight", type=int, default=1, help='fluency')
    parser.add_argument("--sem_weight",type=int, default=1, help='semantic similarity')
    parser.add_argument("--style_weight", type=int, default=8, help='style')
    parser.add_argument("--max_steps", type=int, default=5)
    parser.add_argument("--bs",type=int,default=4,help="batch size")
    parser.add_argument('--keyword_pos', default=True, type=bool)
    parser.add_argument("--early_stop",default=True, type=bool)
    parser.add_argument("--data_name", default='ml-100k', type=str)
    parser.add_argument("--embedding_module", default='t5', type=str)
    parser.add_argument('--debugger', action='store_true')

    args, _ = parser.parse_known_args([]) 
    args.device = torch.device('cuda' if torch.cuda.is_available() else ('mps' if torch.backends.mps.is_available() else 'cpu') )
    args.device = 'cpu'
    return args


args = get_args()

if args.debugger: 
    debugpy.listen(5678)
    print("Waiting for debugger attach")
    debugpy.wait_for_client()
    
def find_string_differences(str1, str2):
    # Find the indices where the strings differ
    str1 = str1.split()
    str2 = str2.split()
    
    diff_indices = [i for i, (c1, c2) in enumerate(zip(str1, str2)) if c1 != c2]

    # Print 5 indices before and after each difference
    for index in diff_indices:
        start_index = max(0, index - 30)
        end_index = min(len(str1), index + 30)


        print(f"Difference at index {index}:")

        print(f"String 2: {str2[start_index:end_index]}")
        print(f"String 1: {str1[start_index:end_index]}")
        print()
        

# Split the text into words
def do_not_edit(text):
    text = text[0] if not isinstance(text,str) else text
    
    words = text.split()

    # Initialize a list to store the indices of words with ':'
    indices_of_words_with_colon = []

    # Iterate through the words to find the indices of words with ':'
    for index, word in enumerate(words):
        if ':' in word:
            # Add the index to the list
            indices_of_words_with_colon.append(index)

    # Print the list of indices of words with ':'
    return indices_of_words_with_colon
        
def get_preds(summaries ,USER_INDEX): 
    args.embedding_module = 't5'
    topk= args.topk
    embs = get_genrewise_embeddings(summaries,args, model= transformer_model )

    genre_list = get_genres()
    embs_tens = model.user_embeddings.prepare_input(embs,genre_list).to(args.device)


    rating_pred = model.predict(embs_tens.unsqueeze(0)).cpu().detach().numpy()
    
    rating_pred[train_matrix[USER_INDEX].toarray() > 0] = 0


    # reference: https://stackoverflow.com/a/23734295, https://stackoverflow.com/a/20104162
    ind = np.argpartition(rating_pred, -topk)
    ind = ind[:, -topk:]
    arr_ind = rating_pred[np.arange(len(rating_pred))[:, None], ind]
    arr_ind_argsort = np.argsort(arr_ind)[np.arange(len(rating_pred)), ::-1]
    

    ranked_items = ind[np.arange(len(rating_pred))[:, None], arr_ind_argsort]
    recall_val = recall_at_k_one(actual_list_val[3], ranked_items[0].tolist(), 20)


    reversed_movie_title_to_id = {v: k for k, v in movie_title_to_id.items()}
    movie_titles_ranked = [f'{index} : {reversed_movie_title_to_id[i+1]} {id_genre_map[i+1]}' for index,i in enumerate(ranked_items[0][:20])]



    return torch.tensor(ranked_items).to(args.device),torch.tensor(rating_pred).to(args.device)

def make_string_dict(data):
    
    genre_summary_dict = {}
    data = data[0].lower().replace('\n',' ').replace('-','').replace('summary:','summary')

    # Use regular expression to find genre and summary information
    matches = re.finditer(r'(\w+): (.+?)(?=\w+:|$)', data)

    # Iterate through the matches and extract genre and summary information
    for match in matches:
        genre, summary = match.group(1), match.group(2)
        genre_summary_dict[genre] = summary.strip()

    # Convert the dictionary to JSON
    return genre_summary_dict

def make_dict_string(data):
    data_out = []
    for d in data:
        user_str = ''
        for k,v in d.items():
            user_str += f" {k}: {v}"
        data_out.append(user_str)
        
    return data_out

            
    
    
# %%

def style_ranker(text,movie_id =57 ,user_id=None,original_index =16,model= None ):
    preds_new,scores = get_preds(make_string_dict(text),user_id)
    if movie_id in preds_new: 
        return scores[:,movie_id],preds_new[0]
    else:
        return torch.tensor([0]).to(args.device),torch.tensor([0]).to(args.device)
    
    
# %%
lr= 0.00001
epochs = 400
num_heads = 6
cosine = False
num_layers = 3
output_emb = 64
embedding_dim = 768
saved_path = f'../saved_model/ml-100k/attn_best_model_{lr}_{epochs}_{num_heads}_{cosine}_{num_layers}.pth'

model_path = saved_path + '_best_model.pth'
embedder_path = saved_path + '_embedder.pth'
item_embeddings_path = saved_path + '_item_embeddings.pth'
user_embeddings_path = saved_path + '_user_embeddings.pth'
model_rankings_path = saved_path + '_rankings_matrix.npy'
id_genre_map = map_id_to_genre('../data/ml-100k/movies.dat')

# 1. Data Loading & Preprocessing
train_data = load_dataset("../data_preprocessed/ml-100k/data_split/train_set_leave_one.json")
valid_data = load_dataset("../data_preprocessed/ml-100k/data_split/valid_set_leave_one.json")
test_data = load_dataset("../data_preprocessed/ml-100k/data_split/test_set_leave_one.json")
movie_title_to_id = map_title_to_id("../data/ml-100k/movies.dat")

train_data = convert_titles_to_ids(train_data, movie_title_to_id)
valid_data = convert_titles_to_ids(valid_data, movie_title_to_id)
test_data = convert_titles_to_ids(test_data, movie_title_to_id)

train_matrix, actual_list_val, actual_list_test = create_train_matrix_and_actual_lists(train_data, valid_data,
                                                                                        test_data, movie_title_to_id)
train_matrix = csr_matrix(train_matrix)  # Convert train_matrix to a CSR matrix

num_users, num_items = train_matrix.shape
args.output_emb = 64
user_embedder = decoderAttention(embedding_dim,num_heads,num_layers,output_emb, 0  ,bias = True)
model = MatrixFactorizationLLM(num_users, user_embedder,num_items, args).to(args.device)
rankings_true = np.load(model_rankings_path)

model.load_state_dict(torch.load(model_path,map_location=torch.device('cuda')))
user_embedder.load_state_dict(torch.load(user_embeddings_path,map_location=torch.device('cuda')))
model.user_embeddings = user_embedder
model.eval()

transformer_model = SentenceTransformer('sentence-transformers/sentence-t5-large').to(args.device) 

with open('../saved_user_summary/ml-100k/user_summary_gpt3.5_in1_title0_full.json','r') as j:
    data = json.load(j)
    data = {int(key): value for key, value in data.items()}
  



last_items = rankings_true[:,19]
data =[v for k,v in data.items()]


rankings = np.load('./rankings.npy' )

data = make_dict_string(data)

args.max_len = 514
# print(f"{args.max_len=}")
# exit(0)

os.environ["TOKENIZERS_PARALLELISM"] = "false"


  from .autonotebook import tqdm as notebook_tqdm
2023-11-16 11:25:02,441	INFO util.py:159 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
  torch.nn.init.xavier_uniform(m.weight)


In [2]:
editor = RobertaEditor(args).to(device)
sahc = SteepHC(args, editor).to(device)   


Editor built


In [3]:


of_dir = 'results/' + args.output_dir

if not os.path.exists(of_dir):
    os.makedirs(of_dir)


bsz = args.bs


tzone = tz.gettz('')
timestamp = datetime.datetime.now().astimezone(tzone).strftime('%Y-%m-%d_%H:%M:%S')

output_file =f'{timestamp}_{dst}_seed={str(args.seed)}_{str(args.style_weight)}.txt'

log_txt_path=os.path.join(of_dir, output_file.split('.txt')[0] + '.log')



for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)
logging.basicConfig(format='',
                    filename=log_txt_path,
                    filemode='w',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)

word_pairs ={"ca n't": "can not", "wo n't": "will not"}
logging.info(args)

def print_es():
    print("Early Stopping!")
    logging.info("Early Stopping!")
    
num_batches = len(data)//bsz
result_d = {}


In [21]:

with open(of_dir + output_file, 'w', encoding='utf8') , torch.no_grad():
    for i in range(len(data)):
        batch_data = data[i]


        for k, v in word_pairs.items():
            batch_data = batch_data.strip().lower().replace(k, v)
        
        ref_oris = ref_olds = batch_data
        state_vec, _ = editor.state_vec([ref_olds])

        break_flag = False
        max_score=-np.inf
        step_max_score_list=[-np.inf]

        max_len=len(batch_data.split())
        select_sent = None
        movie_id = rankings[i,19]
        
        
        for step in range(args.max_steps):
            indices_of_words_with_colon = do_not_edit(ref_olds)
            sampled_indices = random.sample(range(max_len), 1)
            input_tuples = [[[ref_olds],[ops],[positions],bsz,max_len]
                                                    for positions in sampled_indices if positions not in indices_of_words_with_colon for ops in [0,1,2]]


            ref_news = [editor.edit(*inp)for inp in tqdm(input_tuples,desc = "Making Edits")]

            
            for idx in ( pbar := tqdm(range(len(ref_news)),desc = 'going through styles')):

                ref_new_batch_data=ref_news[idx]
               

                # Calculating the acceptance probability


                ref_old_score, ref_new_score, new_style_labels,_,pos_new,pos_old \
                    = sahc.acceptance_prob(ref_new_batch_data, ref_olds, ref_oris, state_vec,style_ranker,movie_id = movie_id,user_id = i)

                ref_hat = ref_new_batch_data

               
                new_style_label=new_style_labels
                
                # Updating the maximum score and selected sentence
           
                if ref_new_score>max_score and ref_new_score>ref_old_score:
                    select_sent = ref_hat
                    max_score=ref_new_score
                    print(f"New Score {ref_new_score=}")
                    
                pbar.set_description(f'score = {ref_new_score}')

                if args.early_stop == True and new_style_label == 1:
                        print(f"{new_style_label=}")
                        select_sent = ref_hat
                        print_es()
                        break_flag = True
                        break
            # Checking if the current score is larger than previous max score
       
            if max_score>=step_max_score_list[step]: 
                print("hill climbing!")
                logging.info("hill climbing!")
                if select_sent is None: 
                    random_draw = random.sample(range(len(input_tuples)),1)
                    print('randomly drawing')
                    select_sent = ref_news[random_draw[0]]

                    # find_string_differences(select_sent[0],ref_oris[0])

            



                ref_olds = select_sent

                pp(pos_new)
                pp(pos_old)
                result_d[i] = {'old' : ref_oris, 'new':select_sent,'movie' : movie_id,'steps':step}

                    
                step_max_score_list.append(max_score)
            else:
                print("don't climb, stop!")
                logging.info("don't climb, stop!")
                break_flag=True
            if break_flag:
                steps = step
                break
        if break_flag:
            select_sent = select_sent


        logging.info('climb {} steps, the selected sentence is: {}'.format(step+1,select_sent))
        print('climb {} steps, the selected sentence is: {}'.format(step+1,select_sent))
        print(f'The original sentence is: {ref_oris} ')
        break 
    
with open('./result_dict.pkl','wb+') as f:
    pickle.dump(result_d,f)









[A[A[A[A[A[A[A[A

masked_inputs=["action: summary: action-packed movies from the 90s with a mix of cyberpunk, post-apocalyptic, and martial arts themes. these films feature intense action sequences and are directed by renowned filmmakers. drama: summary: a collection of drama films ranging from disaster survival, epic western, historical biographical, romantic, and psychological thriller. these films explore themes of love, loss, personal struggles, and the human condition. romance: summary: a collection of romantic movies from various genres including drama, comedy, musical, and fantasy. these films explore themes of love and relationships, featuring a mix of well-known actors and diverse storylines. scifi: summary: a collection of sci-fi films ranging from space operas to post-apocalyptic adventures, with elements of cyberpunk and dystopia. the movies explore themes of technology, survival, and thrilling action, featuring memorable characters and imaginative settings. thriller: summary: a collection o









[A[A[A[A[A[A[A[A

masked_inputs=["action: summary: action-packed movies from the 90s with a mix of cyberpunk, post-apocalyptic, and martial arts themes. these films feature intense action sequences and are directed by renowned filmmakers. drama: summary: a collection of drama films ranging from disaster survival, epic western, historical biographical, romantic, and psychological thriller. these films explore themes of love, loss, personal struggles, and the human condition. romance: summary: a collection of romantic movies from various genres including drama, comedy, musical, and fantasy. these films explore themes of love and relationships, featuring a mix of well-known actors and diverse storylines. scifi: summary: a collection of sci-fi films ranging from space operas to post-apocalyptic adventures, with elements of cyberpunk and dystopia. the movies explore themes of technology, survival, and thrilling action, featuring memorable characters and imaginative settings. thriller: summary: a collection o









Making Edits: 100%|██████████| 3/3 [00:04<00:00,  1.62s/it]


masked_inputs=["action: summary: action-packed movies from the 90s with a mix of cyberpunk, post-apocalyptic, and martial arts themes. these films feature intense action sequences and are directed by renowned filmmakers. drama: summary: a collection of drama films ranging from disaster survival, epic western, historical biographical, romantic, and psychological thriller. these films explore themes of love, loss, personal struggles, and the human condition. romance: summary: a collection of romantic movies from various genres including drama, comedy, musical, and fantasy. these films explore themes of love and relationships, featuring a mix of well-known actors and diverse storylines. scifi: summary: a collection of sci-fi films ranging from space operas to post-apocalyptic adventures, with elements of cyberpunk and dystopia. the movies explore themes of technology, survival, and thrilling action, featuring memorable characters and imaginative settings. thriller: summary: a collection o









  total_scores = torch.tensor(fluency_scores) * torch.tensor(sim_scores) * torch.tensor(style_scores)
  total_scores = torch.tensor(style_scores)
going through styles:   0%|          | 0/3 [01:07<?, ?it/s]


KeyboardInterrupt: 

In [11]:
def find_different_word_indices(s1, s2):
    # Split the strings into lists of words
    words_s1 = s1.split()
    words_s2 = s2.split()

    # Find the minimum length of the two lists
    min_len = min(len(words_s1), len(words_s2))

    # Initialize a list to store the indices of differences
    different_indices = []

    # Iterate through each word and compare
    i = 0
    k=0
    while i < len(words_s1):
        print(f"{words_s1[i]=}")
        print(f"{ words_s2[k]=}",k)
        while  i < len(words_s1) and words_s1[i] != words_s2[k]:
            different_indices.append(i)
            i += 1
        i+=1
        k+=1

    return different_indices

# Example usage:
s1 = "I love llamas and coffee in the mornings"
s2 = "I love coffee in the mornings"

result = find_different_word_indices(s1, s2)
print(result)


words_s1[i]='I'
 words_s2[k]='I' 0
words_s1[i]='love'
 words_s2[k]='love' 1
words_s1[i]='llamas'
 words_s2[k]='coffee' 2
words_s1[i]='in'
 words_s2[k]='in' 3
words_s1[i]='the'
 words_s2[k]='the' 4
words_s1[i]='mornings'
 words_s2[k]='mornings' 5
[2, 3]


I have two strings for example (but not limited to ) s1 = "I love llamas and coffee in the mornings" and s2 = "I love coffee in the nights" 

Given two strings I want to find the indeces at which they are different in s1 for this it would be [2,3,7]. Write such an alorithm in python please