In [None]:
import pandas as pd
import omdb
import time

In [None]:
# Load the project configuration and register its 'omdb_api_key' entry as the
# default 'apikey' value for the omdb client used in the cells below.
from utils.read_config import read_config_class
config_value = read_config_class().config
omdb.set_default('apikey', config_value['omdb_api_key'])

In [None]:
# omdb.get(title='Revolution', year=1985)

In [None]:
# omdb.imdbid('tt5621080')

In [None]:
# Load the tab-separated movie database into the working dataframe `df`.
df = pd.read_csv('Inspired/data/movie_database.tsv', sep='\t')

## 1. Fix movie year (2020 and later)

In [None]:
# Work on a copy so the `year` column of `df` itself is left untouched here.
# errors='coerce' turns values that cannot be parsed as numbers into NaN.
df_copy = df.copy()
df_copy['year'] = pd.to_numeric(df_copy['year'], errors='coerce')

# Rows whose numeric year is 2020 or later; NaN years fail the comparison and are excluded.
recent_movies = df_copy[df_copy['year'] >= 2020]

In [None]:
# Re-check the release year of every 2020+ movie against OMDb and correct `df`.
# `recent_movies` is a filtered subset, so we walk its index labels rather than
# the positions 0..N-1: positions would address the first N rows of `df`,
# which are not the recent movies.
num_processed_movies = 0
for idx in recent_movies.index:
    row = df.loc[idx]
    if pd.notna(row['imdb_id']):
        movie_data = omdb.imdbid(row['imdb_id'])
        if movie_data and 'year' in movie_data:
            omdb_year = movie_data['year']
            # Compare as strings because the two sources may use different types.
            if str(row['year']) != str(omdb_year):
                print(f"{num_processed_movies + 1}: Updating {row['title']}: {row['year']} -> {omdb_year}")
                df.at[idx, 'year'] = omdb_year
                num_processed_movies += 1
        # Short pause after each API request.
        time.sleep(0.1)

print(f"\nProcessed {num_processed_movies} movies")

In [None]:
# df

## 2. Fill missing imdb_id values by title and year search

In [None]:
# Look up an IMDb ID (by title and year) for every row that does not have one yet.
missing_mask = df['imdb_id'].isna()
missing_imdb_count = missing_mask.sum()
filled_count = 0

for idx, row in df.loc[missing_mask].iterrows():
    title, year = row['title'], row['year']
    print(f"Searching IMDb ID for {title} ({year})")
    search_results = omdb.search(title, year=year)
    # Only the first search hit is considered.
    imdb_id = search_results[0].get('imdb_id') if search_results else None
    if imdb_id:
        print(f"Found IMDb ID: {imdb_id}")
        df.at[idx, 'imdb_id'] = imdb_id
        filled_count += 1
    # Pause after every request, whether or not it produced a match.
    time.sleep(0.1)

print(f"\nFilled {filled_count} out of {missing_imdb_count} missing IMDb IDs")

## 3. Drop rows where imdb_id is missing

In [None]:
# Drop rows where imdb_id is still missing.
# The number of dropped rows is measured on the dataframe itself; the count
# taken before the fill step would overstate it by the rows that were filled.
rows_before_imdb_drop = len(df)
print(f"Original dataframe shape: {df.shape}")
df = df.dropna(subset=['imdb_id'])
rows_dropped_missing_imdb = rows_before_imdb_drop - len(df)
print(f"Dropped {rows_dropped_missing_imdb} rows with missing IMDb IDs")
print(f"New dataframe shape: {df.shape}")

In [None]:
# df_new = df.copy()
# df_new

In [None]:
# df_queen = df_new[df_new['imdb_id']=='tt10263718']


In [None]:
# df = df_new.copy()

In [None]:
# df

## 4. Fill missing fields using the IMDb ID

In [None]:
# Fetch multiple movie fields from OMDb API using IMDb ID
def fetch_movie_data(imdb_id):
    """Look up one movie on OMDb and return the fields used by this notebook.

    Two requests are made: one with ``fullplot=True`` for ``long_plot`` and one
    with ``fullplot=False`` for ``short_plot``. A single full-plot request
    cannot supply both, because it returns only the long text.

    Args:
        imdb_id: IMDb identifier passed to ``omdb.imdbid``.

    Returns:
        dict: keys ``title``, ``year``, ``movie_runtime``, ``country``,
        ``director``, ``actors``, ``genre``, ``writer``, ``long_plot`` and
        ``short_plot``. Every value is ``None`` when the main lookup fails;
        only ``short_plot`` is ``None`` when just the short-plot lookup fails.
        Errors are printed, never raised.
    """
    fields = ['title', 'year', 'movie_runtime', 'country', 'director',
              'actors', 'genre', 'writer', 'long_plot', 'short_plot']

    try:
        full_record = omdb.imdbid(imdb_id, fullplot=True)
        movie = {
            'title': full_record.get('title'),
            'year': full_record.get('year'),
            'movie_runtime': full_record.get('runtime'),
            'country': full_record.get('country'),
            'director': full_record.get('director'),
            'actors': full_record.get('actors'),
            'genre': full_record.get('genre'),
            'writer': full_record.get('writer'),
            'long_plot': full_record.get('plot'),
            'short_plot': None,
        }
    except Exception as e:
        print(f"Error fetching data: {str(e)}")
        return {field: None for field in fields}

    # The short plot is fetched separately so that a failure here does not
    # throw away the fields already retrieved above.
    try:
        movie['short_plot'] = omdb.imdbid(imdb_id, fullplot=False).get('plot')
    except Exception as e:
        print(f"Error fetching short plot: {str(e)}")

    return movie

In [None]:
def update_missing_fields():
    """Fill empty metadata cells of the global ``df`` from OMDb.

    For each row with at least one missing value among the tracked fields,
    ``fetch_movie_data`` is called once with the row's ``imdb_id``. Only cells
    that are currently missing are written, and only when the fetched value is
    neither ``None`` nor the string ``'N/A'``. ``df`` is modified in place;
    nothing is returned.
    """
    fields = ['movie_runtime', 'country', 'director', 'actors', 'genre', 'writer', 'long_plot', 'short_plot']

    rows_with_gaps = df[fields].isna().any(axis=1)
    missing_count = rows_with_gaps.sum()
    print(f"Found {missing_count} movies with missing fields")

    for idx, row in df[rows_with_gaps].iterrows():
        imdb_id = row['imdb_id']
        print(f"Processing movie {idx}: {row['title']} (IMDbID: {imdb_id})")

        fetched = fetch_movie_data(imdb_id)

        # Visit only the fields that are empty in this row.
        for field in (name for name in fields if pd.isna(row[name])):
            candidate = fetched[field]
            if candidate is None or candidate == 'N/A':
                continue
            df.at[idx, field] = candidate

        # Short pause after each lookup.
        time.sleep(0.1)

# Run the fill-in pass over the global `df` (one fetch_movie_data call per row with gaps).
update_missing_fields()

In [None]:
# df = pd.read_csv('movie_database_filled_final.tsv', sep='\t')
# df.shape

In [None]:
# Remove every row that still lacks one of the required fields
# (short_plot is not in this list, so it may remain empty).
fields_to_check = ['country', 'genre', 'movie_runtime', 'writer', 'director', 'actors', 'long_plot']
original_shape = len(df)
df = df.dropna(subset=fields_to_check)
rows_dropped = original_shape - len(df)
print(f"Dropped {rows_dropped} rows with missing values in required fields")
print(f"New dataframe shape: {df.shape}")

In [None]:
# df

## 5. Drop duplicate movies

In [None]:
# Find rows with duplicated imdb_id values (every copy is kept: keep=False).
# sort_values returns a new frame, so the filtered slice of `df` is never
# mutated in place. kind='stable' keeps rows with the same imdb_id in their
# original order, so the "first" duplicate here is the same row that
# df.drop_duplicates(keep='first') keeps.
duplicated_rows = (
    df[df['imdb_id'].duplicated(keep=False)]
    .sort_values('imdb_id', kind='stable')
)
# duplicated_rows

In [None]:
# One representative row (the first in `duplicated_rows`) per duplicated imdb_id, ordered by imdb_id.
remaining_duplicates = (
    duplicated_rows
    .drop_duplicates(subset='imdb_id', keep='first')
    .sort_values('imdb_id')
)
# remaining_duplicates

In [None]:
# Update fields for each remaining unique movie
# Only usable values are written: a failed lookup returns a dict of None
# values and OMDb reports unknown fields as 'N/A'; neither should replace
# data that is already present.
update_count = 0
for position, (idx, row) in enumerate(remaining_duplicates.iterrows(), start=1):
    imdb_id = row['imdb_id']
    print(f"Processing {position}/{len(remaining_duplicates)}: {row['title']} (IMDbID: {imdb_id})")

    omdb_data = fetch_movie_data(imdb_id)
    usable_values = {
        field: value
        for field, value in omdb_data.items()
        if value is not None and value != 'N/A'
    }
    for field, value in usable_values.items():
        remaining_duplicates.at[idx, field] = value
    if usable_values:
        update_count += 1

    # Add delay to respect API rate limits
    time.sleep(0.1)

print(f"\nUpdated {update_count} movies")

remaining_duplicates
# # Save the updated dataset
# output_file = 'movie_database_updated_fields.tsv'
# df_no_duplicates.to_csv(output_file, sep='\t', index=False)
# print(f"Saved updated dataset to {output_file}")

In [None]:
# Keep only the first row of `df` for each imdb_id.
df_temp = df.drop_duplicates(subset='imdb_id', keep='first')

In [None]:
# Update df_temp with the refreshed values from remaining_duplicates, matched on imdb_id.
# DataFrame.update aligns rows by index label, so both frames are temporarily
# indexed by imdb_id; the original column order is restored afterwards.
column_order = list(df_temp.columns)
df_temp = df_temp.set_index('imdb_id')
df_temp.update(remaining_duplicates.set_index('imdb_id'))
df_temp = df_temp.reset_index()[column_order]

# Save the merged dataframe to a new file
df_temp.to_csv('movie_database_final.tsv', sep='\t', index=False)
print(f"Saved updated dataset with {len(df_temp)} rows to movie_database_final.tsv")

## Test Everything


In [None]:
import json
import os
from tqdm import tqdm


# Chroma database
import chromadb

# Llamaindex framework
from llama_index.core import Document
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore

from utils.read_config import read_config

# Build the text that is embedded for one movie
def get_document_string(df_current_movie):
    """Render one movie record as the multi-line text sent to the embedder.

    Args:
        df_current_movie: mapping that provides the keys ``title``, ``year``,
            ``country``, ``genre``, ``runtime``, ``writer``, ``director``,
            ``actors`` and ``plot``; a missing key raises ``KeyError``.

    Returns:
        str: a header line followed by one ``- Label: value.`` line per field,
        ending with a newline.
    """
    labelled_keys = (
        ("Title", "title"),
        ("Release Year", "year"),
        ("Country", "country"),
        ("Genre", "genre"),
        ("Duration", "runtime"),
        ("Writer", "writer"),
        ("Director", "director"),
        ("Cast", "actors"),
        ("Description", "plot"),
    )
    lines = ["Embedding this movie with the information:"]
    for label, key in labelled_keys:
        lines.append(f"- {label}: {df_current_movie[key]}.")
    return "\n".join(lines) + "\n"


# Build the list of documents to embed
def get_document_list(movie_data_path: os.PathLike):
    """Read a JSON-lines movie file and turn each usable record into a Document.

    Args:
        movie_data_path: path to a text file holding one JSON object per line.
            It is read as UTF-8 and blank lines are ignored.

    Returns:
        list: one ``Document`` per record that has more than two keys. Its
        text comes from ``get_document_string`` and its ``doc_id`` is the
        record's ``movieId`` converted to ``str``.
    """
    movie_data = []
    # The context manager closes the file even if a line fails to parse.
    with open(movie_data_path, "r", encoding="utf-8") as movie_file:
        for line in movie_file:
            if line.strip():
                movie_data.append(json.loads(line))

    print(len(movie_data))

    documents = []
    for df_current_movie in tqdm(movie_data):
        # Records with at most two keys are skipped.
        if len(df_current_movie) > 2:
            embedding_string = get_document_string(df_current_movie)
            documents.append(Document(text=embedding_string,
                                      doc_id=str(df_current_movie['movieId'])))
    return documents


# Embed the movie documents and store them in Chroma
def create_embedding_db(db_path, collection_name, embedding_model, api_key, movie_data_path):
    """Build a vector index over the movie documents, backed by a Chroma collection.

    Args:
        db_path: path handed to ``chromadb.PersistentClient``.
        collection_name: collection fetched or created on that client.
        embedding_model: model name given to ``GeminiEmbedding``.
        api_key: API key given to ``GeminiEmbedding``.
        movie_data_path: file passed to ``get_document_list``.

    Returns:
        The object returned by ``VectorStoreIndex.from_documents``.
    """
    docs = get_document_list(movie_data_path=movie_data_path)
    embedder = GeminiEmbedding(api_key=api_key, model_name=embedding_model)

    collection = chromadb.PersistentClient(path=db_path).get_or_create_collection(collection_name)
    store = ChromaVectorStore(chroma_collection=collection)
    context = StorageContext.from_defaults(vector_store=store)

    return VectorStoreIndex.from_documents(docs,
                                           storage_context=context,
                                           embed_model=embedder)




In [None]:
# Read the settings for the embedding run. This rebinds `config_value`,
# which an earlier cell set from read_config_class().config.
config_value = read_config()

# Individual settings pulled out of the configuration mapping.
chromadb_path = config_value['insp_chroma_db_path'] 
collection_name = config_value['insp_collection_name'] 
google_api_key = config_value['google_api_key']
embedding_model_name = config_value['model_embedding']
movie_data_path = config_value['processed_insp_movie_data_path'] 

# The commented lines below are earlier manual steps kept for reference.

# # Create embedding model
# gemini_embedding_model = GeminiEmbedding(api_key= google_api_key, model_name= embedding_model_name)

# # Load meta data from disk
# movie_data = []
# for line in open(config_value['file_path_output_movie'], "r"):
#     movie_data.append(json.loads(line))

# Create document
# documents = get_document_list(movie_data_path=movie_data_path)

# Build the index; the returned index object is the cell's displayed value.
create_embedding_db(db_path=chromadb_path, 
                    collection_name=collection_name, 
                    embedding_model=embedding_model_name,
                    movie_data_path=movie_data_path,
                    api_key=google_api_key)







In [None]:
# %pip install -r requirements.txt

In [None]:
# Load the preprocessed INSPIRED movie database.
# The context manager closes the handle once the JSON has been parsed.
with open('dataset/preprocessed_data/INSPIRED/movie_data/movie_database_no_missing.json', 'r', encoding='utf-8') as file:
    df_movie = json.load(file)

df_movie



In [None]:
import pandas as pd

# Load one tab-separated result file; the column names are supplied explicitly through `names`.
data = pd.read_csv("output/INSPIRED/output_100_inspMovie_origin_insp_preprocess.tsv", delimiter="\t", names=["recall", "id", "target", "summary", "top_movie", "candidate"])
data



In [None]:
# Scratch check: split() without arguments splits on runs of whitespace, so
# re-joining with one space collapses repeated spaces and trims both ends.
s = "   Hello        World"
" ".join(s.split())



In [None]:
import pandas as pd

# Forward slashes are accepted on Windows as well as POSIX systems, and they
# avoid backslash sequences in the literal being parsed as string escapes.
data = pd.read_csv('dataset/INSPIRED/movie_data/movie_database.tsv',
                   delimiter='\t')
data

In [None]:
import json
from data_preprocessing.dialog_merge import insp_dialog_merge

# Portable forward-slash path and explicit UTF-8, matching the other cell
# that reads this same file.
with open('dataset/preprocessed_data/INSPIRED/dialog_data/dialog_train_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Show element [1] of what insp_dialog_merge returns for the first dialog.
print(insp_dialog_merge(data[0])[1])



In [None]:
# Portable forward-slash path; the JSON file is read as UTF-8.
with open("dataset/preprocessed_data/INSPIRED/dialog_data/dialog_train_data_original_preprocessed.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)

# Print the first entry, one element per line.
print('\n'.join(train_data[0]))



In [None]:
import re
import json
import os
import pandas as pd
from typing import List, Dict, Any, Union


def replace_movie_titles(original_text: str, movie_names: List[str], item_mask: str = "[MOVIE_TITLE]") -> str:
    """
    Replace movie titles in text with a mask.

    A trailing parenthesised suffix such as ``" (2019)"`` is removed from each
    name before matching. Matching is case-insensitive and literal (the title
    is regex-escaped). Names that are not strings, or that are empty once the
    suffix is removed, are skipped: an empty pattern would match between every
    character and scatter the mask through the whole text.

    Args:
        original_text (str): The text to process
        movie_names (list): A list of movie names to mask in the text
        item_mask (str, optional): The placeholder to use for masked movie titles. Defaults to "[MOVIE_TITLE]".

    Returns:
        str: The text with movie titles replaced by the mask
    """
    mask_text = original_text

    for movie_name in movie_names:
        if not isinstance(movie_name, str):
            continue

        if movie_name.endswith(")") and "(" in movie_name:
            # Cut at the LAST "(" so parentheses inside the title itself survive.
            movie_name_no_brackets = movie_name[:movie_name.rfind("(")].rstrip()
        else:
            movie_name_no_brackets = movie_name

        if not movie_name_no_brackets.strip():
            continue

        pattern = re.compile(re.escape(movie_name_no_brackets), re.IGNORECASE)
        # A callable replacement inserts the mask verbatim; a plain string
        # would be parsed as a template (backslashes, group references).
        mask_text = pattern.sub(lambda _match: item_mask, mask_text)

    return mask_text


def process_inspired_data(raw_data: List[Dict[str, Any]]) -> Dict[int, Dict[str, Any]]:
    """
    Process raw Inspired dataset data into a structured format.

    The ``movies`` field of each row may be a ``;``-separated string, a
    list/tuple of names, or missing (``None``/NaN). It is checked by type, so a
    list value is never passed through ``pd.isna``, which returns an array
    whose truth value is ambiguous. A missing (``None``/NaN) ``text`` is
    treated as an empty string and other non-string values are converted with
    ``str``, so masking always receives text.

    Args:
        raw_data (list): Raw data entries from the Inspired dataset

    Returns:
        dict: Maps the row position to a dict with ``original_text``,
        ``masked_text``, ``movie_names``, ``dialogue_id``, ``turn_id``,
        ``speaker`` and ``entities``.
    """
    processed_data = {}

    for idx, row in enumerate(raw_data):
        text = row.get("text", "")
        # `text != text` is true only for NaN.
        if text is None or (isinstance(text, float) and text != text):
            text = ""
        elif not isinstance(text, str):
            text = str(text)

        movies = row.get("movies")
        if isinstance(movies, str):
            movie_names = [name.strip() for name in movies.split(";") if name.strip()]
        elif isinstance(movies, (list, tuple)):
            movie_names = list(movies)
        else:
            # None, NaN, or any other scalar: no movies to mask.
            movie_names = []

        # Replace movie titles with mask
        masked_text = replace_movie_titles(text, movie_names)

        processed_data[idx] = {
            "original_text": text,
            "masked_text": masked_text,
            "movie_names": movie_names,
            # Input keys "dialog_id" and "entity" are exposed under different names.
            "dialogue_id": row.get("dialog_id", ""),
            "turn_id": row.get("turn_id", ""),
            "speaker": row.get("speaker", ""),
            "entities": row.get("entity", []),
        }

    return processed_data


def load_inspired_tsv(file_path: str) -> List[Dict[str, Any]]:
    """
    Read a tab-separated Inspired file into a list of row dictionaries.

    Args:
        file_path (str): Path to the TSV file

    Returns:
        list: One dictionary per row, keyed by column name
    """
    return pd.read_csv(file_path, sep="\t").to_dict(orient="records")


def extract_conversation(processed_data: Dict[int, Dict[str, Any]], dialogue_id: str) -> List[Dict[str, Any]]:
    """
    Collect every utterance that belongs to one dialogue.

    Args:
        processed_data (dict): Processed data dictionary
        dialogue_id (str): The dialogue ID to extract

    Returns:
        list: Matching utterances, sorted by ``turn_id`` when the first match
        carries that key; otherwise in the order they appear in ``processed_data``
    """
    conversation = [
        item for item in processed_data.values()
        if item["dialogue_id"] == dialogue_id
    ]

    # Ordering is applied only when the first match has a turn_id.
    if conversation and "turn_id" in conversation[0]:
        conversation.sort(key=lambda entry: entry.get("turn_id", 0))

    return conversation

In [None]:
def merge_dialog(dialog: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Merge consecutive utterances from the same speaker into a single utterance.

    Texts (and ``masked_text`` when both sides have it) are joined with one
    space; ``movie_names``, ``entities`` and any other list-valued fields are
    concatenated. Non-list extra fields keep the value from the first
    utterance of the run. Every list stored in the result is a copy, so the
    input utterances are never modified. ``None`` in ``movie_names`` or
    ``entities`` is treated as an empty list.

    Args:
        dialog (list): A list of utterance dictionaries representing a conversation

    Returns:
        list: A list of merged utterance dictionaries

    Example:
        Input: [
            {"speaker": "RECOMMENDER", "text": "Hello", "movie_names": []},
            {"speaker": "RECOMMENDER", "text": "How are you?", "movie_names": []},
            {"speaker": "SEEKER", "text": "I'm good", "movie_names": ["Avengers"]}
        ]
        Output: [
            {"speaker": "RECOMMENDER", "text": "Hello How are you?", "movie_names": [], "entities": []},
            {"speaker": "SEEKER", "text": "I'm good", "movie_names": ["Avengers"], "entities": []}
        ]
    """
    core_keys = ("text", "speaker", "movie_names", "entities")
    merged_dialog = []
    last_speaker = None

    for utterance in dialog:
        speaker = utterance.get("speaker", "")
        text = utterance.get("text", "")
        movie_names = utterance.get("movie_names") or []
        entities = utterance.get("entities") or []

        if speaker == last_speaker and merged_dialog:
            current = merged_dialog[-1]
            current["text"] += f" {text}"

            # masked_text is merged only when both sides carry it.
            if "masked_text" in utterance and "masked_text" in current:
                current["masked_text"] += f" {utterance['masked_text']}"

            current["movie_names"].extend(movie_names)
            current["entities"].extend(entities)

            # Other list-valued fields are concatenated as well.
            for key, value in utterance.items():
                if key in core_keys or key == "masked_text":
                    continue
                if not isinstance(value, list):
                    continue
                existing = current.get(key)
                if isinstance(existing, list):
                    existing.extend(value)
                elif existing is None:
                    # Copy so later extends do not reach back into the input.
                    current[key] = value.copy()
        else:
            new_utterance = {
                "text": text,
                "speaker": speaker,
                "movie_names": list(movie_names),
                "entities": list(entities),
            }

            for key, value in utterance.items():
                if key not in core_keys:
                    new_utterance[key] = value.copy() if isinstance(value, list) else value

            merged_dialog.append(new_utterance)

        last_speaker = speaker

    return merged_dialog


def build_conversation_history(dialog: List[Dict[str, Any]], include_current: bool = False) -> List[Dict[str, Any]]:
    """
    Attach the preceding conversation to every utterance of a dialog.

    Each returned item is a shallow copy of the input utterance with two added
    keys: ``history`` (a list of the earlier utterances, plus the current one
    when ``include_current`` is true) and ``history_text`` (those utterances
    rendered as ``"Recommender: ..."`` / ``"User: ..."`` segments separated by
    single spaces, with surrounding whitespace stripped).

    Args:
        dialog (list): A list of utterance dictionaries (can be already merged)
        include_current (bool): Whether to include the current utterance in its own history

    Returns:
        list: A list of dictionaries with original utterance plus conversation history
    """
    augmented_dialog = []
    seen = []

    for utterance in dialog:
        enriched = utterance.copy()
        enriched["history"] = (seen + [utterance]) if include_current else seen.copy()

        # Any speaker other than "RECOMMENDER" is labelled as the user.
        enriched["history_text"] = " ".join(
            ("Recommender: " if past["speaker"] == "RECOMMENDER" else "User: ") + f"{past['text']}"
            for past in enriched["history"]
        ).strip()

        seen.append(utterance)
        augmented_dialog.append(enriched)

    return augmented_dialog

In [None]:
def main():
    """Load the INSPIRED training TSV, mask movie titles, and save the result.

    Steps: check that the TSV exists, load it with ``load_inspired_tsv``,
    process it with ``process_inspired_data``, print one sample record and the
    size of its conversation, then write everything to
    ``processed_inspired_data.json``. Returns early with a message, instead of
    raising ``IndexError``, when the file is missing or holds no records.
    """
    # Define the path to the TSV file - update with your actual path
    tsv_path = "dataset/INSPIRED/dialog_data/train.tsv"

    if not os.path.exists(tsv_path):
        print(f"File not found: {tsv_path}")
        print("Please provide the correct path to the TSV file.")
        return

    print(f"Loading data from: {tsv_path}")

    raw_data = load_inspired_tsv(tsv_path)
    print(f"Loaded {len(raw_data)} records from the TSV file.")
    if not raw_data:
        print("No records to process.")
        return
    print(raw_data[0])

    print("Processing data...")
    processed_data = process_inspired_data(raw_data)
    print(f"Processed {len(processed_data)} records.")
    if not processed_data:
        print("No processed records to show or save.")
        return

    # Show the first processed record as a sample.
    sample_idx = next(iter(processed_data))
    sample_item = processed_data[sample_idx]
    print("\nSample processed item:")
    print(f"Original text: {sample_item['original_text']}")
    print(f"Masked text: {sample_item['masked_text']}")
    print(f"Movies: {sample_item['movie_names']}")

    # Extract the conversation that the sample record belongs to.
    dialogue_id = sample_item["dialogue_id"]
    conversation = extract_conversation(processed_data, dialogue_id)
    print(f"\nExtracted conversation with ID {dialogue_id}:")
    print(f"Number of turns before merging: {len(conversation)}")

    # # Demonstrate dialog merging
    # merged_conversation = merge_dialog(conversation)
    # print(f"Number of turns after merging: {len(merged_conversation)}")

    # # Demonstrate building conversation history
    # history_added = build_conversation_history(merged_conversation)

    # # Print sample of the merged conversation with history
    # if len(history_added) > 1:
    #     print("\nSample of merged conversation with history:")
    #     print(f"Speaker: {history_added[1]['speaker']}")
    #     print(f"Text: {history_added[1]['text']}")
    #     print(f"History text: {history_added[1]['history_text']}")

    # Save processed data as JSON (integer keys are written as strings).
    output_file = "processed_inspired_data.json"
    with open(output_file, "w") as f:
        json.dump({"original": processed_data}, f, indent=2)
    print(f"\nProcessed data saved to {output_file}")


if __name__ == "__main__":
    main()



In [None]:
from data_preprocessing.dialog_data_transform import redial_dialog_data_transform

# Read the ReDial training file line by line, parsing each line as one JSON object.
# NOTE(review): `data` is not used again in this cell - confirm it is still needed.
data = []
with open('dataset/ReDial/dialog_data/train_data.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        data.append(json.loads(line.strip()))

# Run the project's transform on the same file and display the element at index 1 of its result.
trans = redial_dialog_data_transform("dataset/ReDial/dialog_data/train_data.jsonl")
trans[1]



In [None]:
import re


def remove_target_from_conversation(conversation, target_item):
    """
    Replace every mention of the target item (movie) in a conversation with "[REDACTED]".

    Matching is case-insensitive and limited to whole mentions: the title must
    not be directly preceded or followed by a word character. Lookarounds are
    used instead of ``\\b`` so that titles beginning or ending with punctuation
    (for example ``"Who?"``) are still matched. An empty ``target_item``
    leaves the conversation unchanged.

    Args:
        conversation (list): List of utterance strings.
        target_item (str): Title of the recommended movie (item).

    Returns:
        list: A new list of utterances with the target item redacted.
    """
    if not target_item:
        return list(conversation)

    # Compiled once; the pattern is the same for every sentence.
    pattern = re.compile(rf"(?<!\w){re.escape(target_item)}(?!\w)", flags=re.IGNORECASE)
    return [pattern.sub("[REDACTED]", sentence) for sentence in conversation]


# Example: use the target item 'Knives Out'
target_item = "Knives Out"

# # Cuộc hội thoại ban đầu
# conversation = [
#     "RECOMMENDER: Hi There! <opinion_inquiry>What types of movies do you like to watch?",
#     "SEEKER: Hello! I'm more of an [MOVIE_GENRE_0] movie or a good [MOVIE_GENRE_1] and [MOVIE_GENRE_2] movie. [SEP] genre: action, romance, mystery;",
#     "RECOMMENDER: <self_modeling>I just saw the trailer for [MOVIE_TITLE_0] when I went to see Joker and it looked like a good mix of [MOVIE_GENRE_0] and [SEP]movie: Knives Out (2019);genre: action;",
#     "SEEKER: I seen that one too as I seen [MOVIE_TITLE_1] about a month ago. I thought about asking my fiance about going and seeing it. [SEP] movie: Joker (2019);",
#     "RECOMMENDER: <personal_opinion>It looks like a good movie for people who like many different movies. <personal_opinion>It also has a great cast! <personal_opinion>I was surprised to see [MOVIE_P_ACTOR_0] the trailer! [SEP]people_name: Chris Evans;",
#     "SEEKER: Maybe with [MOVIE_P_ACTOR_0] it it'll be easier to convince my fiance to see it. Do you know who else is in the cast? [SEP] people_name: Chris Evans;",
#     "RECOMMENDER: <credibility>[MOVIE_P_ACTOR_1] [MOVIE_P_ACTOR_2] also in the cast. <encouragement>[MOVIE_P_ACTOR_1] a lot of 007 so definitely a good hearthrob role to convince the misses lol! [SEP]people_name: Daniel Craig, Jamie Lee Curtis, Daniel Craig;",
#     "SEEKER: [MOVIE_TITLE_2] But he loves the bond movies so that should be a good incentive for him to go see it. Do you have any other recommendations? [SEP] movie: I Am Michael (2015);",
#     "RECOMMENDER: <encouragement>The new [MOVIE_TITLE_3] comes out in less than a month, if you are into the franchise. [SEP]movie: Star Wars (1977);",
#     "SEEKER: He is, I think he told me we're getting it when it comes out to add to our movie collection.",
#     "RECOMMENDER: <encouragement>Well that is another great [MOVIE_GENRE_0] movie. <encouragement>I also recommend the [MOVIE_TITLE_4] series [SEP]movie: John Wick (2014);genre: action;",
#     "SEEKER: I haven't seen any of that series. Could you tell me what the general plot is>",
#     "RECOMMENDER: <credibility>[MOVIE_PLOT] <credibility>[MOVIE_PLOT] <credibility>[MOVIE_PLOT] <personal_opinion>I have yet to watch the 3rd one but the [MOVIE_GENRE_0] scenes were really cool! [SEP]genre: action;people_name: John Wick;",
#     "SEEKER: Oh I'd definitely would cry at the dogs death.",
#     "RECOMMENDER: <similarity>It is really sad! <personal_opinion>the dog was a last gift from his dying wife which makes it so much worse",
#     "SEEKER: I couldn't even finish [MOVIE_TITLE_5] because of the dog dying. Anything with animal death makes me ball like a baby. [SEP] movie: I Am Legend (2007);",
#     "RECOMMENDER: <similarity>[MOVIE_TITLE_6] & Me had me crying for a good half hour so I completely understand that! [SEP]movie: Marley (2012);",
#     "SEEKER: I avoided that movie because someone told me he passed away. My fiance took me to see jurrasic world as our first date and I cried at the dinosuars dying.",
#     "RECOMMENDER: <similarity>I would definitely avoid that movie if animal deaths make you said. <no_strategy>Oh that is so cute though!",
#     "SEEKER: Yeah, he had to calm me down for about an hour and bought me ice cream to apologize for it.",
#     "RECOMMENDER: <no_strategy>Aww that is so sweet. <rephrase_preference>[MOVIE_TITLE_0] that you dont want to see animals die, and you are looking for an [MOVIE_TITLE_7]/Mystery I think [MOVIE_TITLE_8] would be a good movie choice. <preference_confirmation>Do you agree? [SEP]movie: Knives Out (2019), Given (2017), Action (1980);",
#     "SEEKER: I do agree with that. When it comes out i'll bring it up for date night. Thank you!!",
# ]

# # Loại bỏ target item 'Knives Out'
# cleaned_conversation = remove_target_from_conversation(conversation, target_item)

# # In ra kết quả đã được lọc
# for sentence in cleaned_conversation:
#     print(sentence)

In [None]:
import json
from data_preprocessing.dialog_merge import insp_dialog_merge

# Load the preprocessed INSPIRED training dialogs.
with open("dataset/preprocessed_data/INSPIRED/dialog_data/dialog_train_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Keep element [1] of what insp_dialog_merge returns for the first dialog;
# `merge` is passed to remove_target_from_conversation in the next cell.
merge = insp_dialog_merge(data[0])[1]
print(merge)



In [None]:
# Redact the target title from the merged dialog and print the result one sentence per line.
clean = remove_target_from_conversation(merge, target_item=target_item)
for sentence in clean:
    print(sentence)



In [5]:
import pandas as pd

# Forward slashes keep the path portable and avoid backslash sequences in the
# literal being parsed as string escapes.
data = pd.read_csv(
    "output/INSPIRED/output_100_inspMovie_remove_target_recall@10.tsv", delimiter="\t", names=["recall", "id", "target", "summary", "top_movie", "candidate"]
)
data

Unnamed: 0,recall,id,target,summary,top_movie,candidate
0,1.0,20191127-210600_875_live.pkl,Knives Out,"The seeker enjoys action, romance, and mystery...",Casino Royale|Skyfall|Spectre|GoldenEye|Kingsm...,[Wanted|Jurassic World: Fallen Kingdom|Spectre...
1,1.0,20191130-130606_969_live.pkl,Terminator: Dark Fate,The seeker enjoys futuristic sci-fi movies and...,Back to the Future|Back to the Future Part II|...,[Terminator 2: Judgment Day|T2 3-D: Battle Acr...
2,0.0,20191201-150808_843_live.pkl,A Christmas Story 2,"The seeker generally likes comedies, especiall...",Movie 43|Orgazmo|Dirty Movie|Serial Mom|Spaceb...,[Favorite Deadly Sins|Looking for Comedy in th...
3,0.0,20191203-173914_685_live.pkl,Ready or Not,The seeker loves thrillers and suspenseful mov...,Get Out|Us|It|It Follows|Sinister|Split|The Co...,[Get Out|It|Hide and Seek|Ma|Us|It Follows|Sea...
4,0.0,20191201-133417_454_live.pkl,The Grinch,"The seeker enjoys Christmas movies, especially...",Miracle on 34th Street|One Magic Christmas|A R...,[A Royal Christmas|A Prince for Christmas|Secr...
...,...,...,...,...,...,...
269,0.0,20191130-062928_598_live.pkl,Kingsman: The Secret Service,"The seeker didn't like the movie ""A Day in the...",John Wick|Knives Out|Mission: Impossible|Kiss ...,[A Beautiful Day in the Neighborhood|Knight an...
270,1.0,20191204-061314_897_live.pkl,Isn't It Romantic,The seeker likes action/adventure movies and e...,Isn't It Romantic|No Strings Attached|Valentin...,[Charlie's Angels|Charlie's Angels|Isn't It Ro...
271,1.0,20191128-150946_640_live.pkl,Christmas Land,"The seeker is looking for a Christmas movie, s...",A Christmas Prince|A Royal Christmas|My Christ...,[A Prince for Christmas|A Royal Christmas|A Pe...
272,1.0,20200221-145630_327_live.pkl,You've Got Mail,"The seeker enjoys romantic comedies, historica...",You've Got Mail|Sleepless in Seattle|Something...,[Sleepless in Seattle|You've Got Mail|What Wom...
