In [11]:
import pandas as pd

def load_parquet_with_filter(parquet_path, columns_to_keep):
    """
    Load a Parquet file, filter rows by model_prob between 0 and 1, and keep only specified columns in order.

    Args:
        parquet_path (str): Path to the Parquet file.
        columns_to_keep (list of str): List of column names to keep in the given order.

    Returns:
        pd.DataFrame: Filtered DataFrame with specified columns.
    """
    df = pd.read_parquet(parquet_path)
    # Filter rows where match_prob is between 0 and 1
    df_filtered = df[(df['match_prob'] >= 0.0) & (df['match_prob'] <= 0.50)]
    # Filter only the columns requested, if they exist in the DataFrame
    cols_to_use = [col for col in columns_to_keep if col in df_filtered.columns]
    return df_filtered[cols_to_use]

# Example usage: load the prototype predictions file, keeping only the listed
# columns, then preview the first rows of the result.
df = load_parquet_with_filter(
    '/Users/borismartinez/Documents/GitHub/engage/predicted_best_matches_prototype.parquet',
    [
        'registration_form_id',
        'first_name_att',
        'first_name_vf',
        'last_name_att',
        'last_name_vf',
        'match_prob',
    ],
)
print(df.head())


   registration_form_id first_name_att         first_name_vf  \
0            6001285689          pedro               michael   
2            6001287712           rora                  rosa   
5         1010000005010   rosana david        rosaura daniel   
7         1010000008304          mareo                 marco   
8         1010000010099        micaela  gloria aurora agusti   

         last_name_att last_name_vf  match_prob  
0  hernandez rodriguez        hodge    0.000162  
2                nunez        nunez    0.004387  
5                 dome         dome    0.004387  
7               vargas       vargas    0.004387  
8        quentes elias      quevedo    0.000112  
