In [None]:
import os
import json
import warnings
import pandas as pd

from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from funct.data_processing import filter_semantic_similarity_2

warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Zero-padded, three-digit event numbers "001" through "018".
event_no_list_all = [f"{number:03d}" for number in range(1, 19)]

In [None]:
# folder that is scanned for the per-event CSV files
folder = "./output"

# load the embedding model; device="cuda" means a CUDA-capable GPU is required
embedding_model = SentenceTransformer("BAAI/bge-m3", device="cuda")

# load the lookup (indexed by event number further down) from
# json_files/date_to_request.json. JSON text is UTF-8, so the encoding is set
# explicitly instead of relying on the platform's default.
with open('./json_files/date_to_request.json', encoding="utf-8") as f:
    date_to_request = json.load(f)

In [None]:
def load_and_select_data(event_no, folder, date_to_request, quantile=0.4):
    """Load one event's scored CSV files, filter them and keep the top share.

    Every file in ``folder`` whose name starts with ``event_no`` is matched to
    a request: the second ``_``-separated token of the file name is compared
    with the third ``_``-separated token of each ``date_to_request[event_no]``
    entry (after replacing ``/`` with ``_``); the last ``_``-separated token of
    the first matching entry becomes the file's ``request_id``. Files without
    a match, or without both a ``cross-score`` and a ``text`` column, are
    skipped.

    The retained files are concatenated, passed through
    ``filter_semantic_similarity_2`` (imported helper; presumably drops
    near-duplicate texts -- confirm against its implementation) using the
    module-level ``embedding_model``, sorted by ``cross-score`` descending and
    truncated to the first ``int(len * quantile)`` rows.

    Args:
        event_no: Event number prefix used to select files (e.g. ``"001"``).
        folder: Directory containing the CSV files.
        date_to_request: Mapping from event number to an iterable of strings
            from which request ids are derived.
        quantile: Fraction of the filtered rows to keep.

    Returns:
        A DataFrame with the selected rows (including a ``request_id``
        column), or an empty DataFrame when no usable file was found.
    """
    required_columns = {'cross-score', 'text'}
    frames = []

    # os.listdir() gives no ordering guarantee; sorting makes the row order that
    # reaches the similarity filter (which may be order-sensitive) reproducible.
    for file in sorted(os.listdir(folder)):
        if not file.startswith(event_no):
            continue

        request_id = next(
            (days.split("_")[-1] for days in date_to_request[event_no]
                if days.replace("/", "_").split("_")[2] == file.split("_")[1]
            ), None
        )

        if not request_id:
            continue

        data = pd.read_csv(os.path.join(folder, file))
        data['request_id'] = request_id

        if required_columns.issubset(data.columns):
            frames.append(
                data.sort_values(by='cross-score', ascending=False).reset_index(drop=True)
            )

    if not frames:
        return pd.DataFrame()

    # Concatenate once: avoids quadratic copying and does not depend on how
    # pandas treats an empty seed frame during concatenation.
    filtered_data = pd.concat(frames, ignore_index=True)

    # NOTE(review): the column check above requires 'text', but the filter is
    # pointed at 'search_text' -- confirm the CSVs always carry both columns.
    filtered_data = filter_semantic_similarity_2(
        filtered_data,
        embedding_model=embedding_model,
        text_column="search_text",
        threshold=0.95,
    )
    filtered_data = filtered_data.sort_values(by='cross-score', ascending=False).reset_index(drop=True)
    top_len = int(len(filtered_data) * quantile)
    return filtered_data.head(top_len)

def create_submission(filtered_data, event_no, char_limit=500, top_facts_limit=7):
    """Build one submission row per (request_id, query_id) group.

    Within each group the ``top_facts_limit`` highest ``cross-score`` facts are
    considered in descending score order. Facts are accepted one by one for as
    long as the text that would actually be emitted -- the facts joined with
    ``". "`` and with ``".."`` collapsed to ``"."`` -- stays within
    ``char_limit`` characters; the first fact that would overflow ends the
    selection, so the chosen facts are always a prefix of the ranked list and
    ``factText`` never exceeds ``char_limit``.

    Args:
        filtered_data: DataFrame with ``request_id``, ``query_id``, ``text``,
            ``cross-score``, ``unix_timestamp`` and ``doc_id`` columns.
        event_no: Event number; currently unused, kept for the callers.
        char_limit: Maximum length of a row's ``factText``.
        top_facts_limit: Maximum number of facts considered per group.

    Returns:
        A list of dicts sorted by ``importance`` (mean cross-score of the
        selected facts) in descending order. A group in which not even the
        first fact fits still yields a row, with an empty ``factText``,
        ``None`` timestamp, importance ``0`` and no sources.
    """
    rows = []

    for (request_id, query_id), group in filtered_data.groupby(['request_id', 'query_id']):
        top_facts = group.sort_values(by='cross-score', ascending=False).head(top_facts_limit)

        fact_texts = []
        joined_text = ""

        for fact_text in top_facts["text"]:
            # Measure the exact string that would be written out; counting one
            # separator character per fact under-estimates the ". " separator
            # whenever a fact does not already end with a period.
            candidate = ". ".join(fact_texts + [fact_text]).replace("..", ".")
            if len(candidate) > char_limit:
                break
            fact_texts.append(fact_text)
            joined_text = candidate

        # Rows of `top_facts` that correspond to the accepted texts
        selected = top_facts.iloc[:len(fact_texts)]
        has_facts = not selected.empty

        rows.append({
            'requestID': request_id,
            'factText': joined_text,
            'unixTimestamp': selected['unix_timestamp'].iloc[0] if has_facts else None,
            'importance': selected['cross-score'].mean() if has_facts else 0,
            'sources': selected['doc_id'].tolist() if has_facts else [],
            'streamID': None,
            'informationNeeds': [query_id]
        })

    # most important rows first
    return sorted(rows, key=lambda row: row['importance'], reverse=True)

def save_submission(output_data, json_path, gz_path):
    """Write ``output_data`` as JSON Lines and as a gzip-compressed copy.

    Each row is serialised with ``json.dumps`` onto its own line of
    ``json_path``. That file is then read back with ``pd.read_json`` and
    exported again to ``gz_path`` as gzip-compressed, record-oriented JSON
    Lines.

    Args:
        output_data: DataFrame holding one submission row per line to write.
        json_path: Destination of the plain JSON Lines file.
        gz_path: Destination of the gzip-compressed file.
    """
    n_rows = output_data.shape[0]

    with open(json_path, "w") as out_file:
        progress = tqdm(output_data.iterrows(), total=n_rows, desc="Writing to file")
        for _, record in progress:
            out_file.write(json.dumps(dict(record)) + "\n")

    # The compressed copy is derived from the file just written, not from
    # `output_data` directly.
    reloaded = pd.read_json(json_path, lines=True)
    reloaded.to_json(gz_path, orient="records", lines=True, compression="gzip")

def process_submission(event_list, quantile=0.5, char_limit=500, top_facts_limit=7,
                       json_path="./json_files/submission_test.json",
                       gz_path="./json_files/Thesis_Retriver.gz"):
    """Run selection and submission building for every event, then save the result.

    For each event number the data is loaded with ``load_and_select_data``
    (reading the module-level ``folder`` and ``date_to_request``); events that
    yield no data are skipped. The rows produced by ``create_submission`` are
    collected across events and written with ``save_submission``.

    Rows are ordered by importance within each event, because
    ``create_submission`` sorts its own output; events themselves stay in
    ``event_list`` order and no global re-sort is applied.

    Args:
        event_list: Iterable of event numbers to process.
        quantile: Fraction of rows kept per event (forwarded to the loader).
        char_limit: Maximum characters per fact text.
        top_facts_limit: Maximum facts considered per request/query group.
        json_path: Where the JSON Lines file is written.
        gz_path: Where the gzip-compressed file is written.

    Returns:
        None. Prints a one-line status message.
    """
    rows = []
    for event_no in tqdm(event_list, desc="Processing events"):
        data = load_and_select_data(event_no, folder, date_to_request, quantile=quantile)
        if data.empty:
            continue
        rows.extend(create_submission(data, event_no, char_limit=char_limit, top_facts_limit=top_facts_limit))

    if not rows:
        print("No valid data to save.")
        return

    save_submission(pd.DataFrame(rows), json_path, gz_path)
    print("Processing complete! Output saved.")

In [None]:
# Run the whole pipeline over every listed event.
process_submission(
    event_no_list_all,
    quantile=0.5,
    char_limit=500,
    top_facts_limit=7,
)