In [16]:
import os
import pandas as pd
from multiprocessing import Pool, cpu_count
from functools import partial

def process_file(filename, ground_truth_folder, algorithm_folder):
    """Count how many of the first 10 ground-truth IDs the algorithm returned.

    Reads the CSV named ``filename`` from both folders and counts the distinct
    ``ID`` values that occur in the first 10 rows of the ground-truth file and
    anywhere in the algorithm file.

    Args:
        filename: Name of the CSV file to look up in both folders.
        ground_truth_folder: Directory containing the ground-truth CSV files.
        algorithm_folder: Directory containing the algorithm's CSV files.

    Returns:
        ``{'query': filename, 'match': <int>}`` on success, or ``None`` when
        the algorithm file does not exist or either file could not be
        processed (in which case the error is printed).
    """
    ground_truth_file = os.path.join(ground_truth_folder, filename)
    algorithm_file = os.path.join(algorithm_folder, filename)

    # No result file for this query: nothing to score.
    if not os.path.exists(algorithm_file):
        return None

    try:
        ground_truth_df = pd.read_csv(ground_truth_file)
        algorithm_df = pd.read_csv(algorithm_file)

        # Building a set already collapses repeated IDs, so a separate
        # full-row drop_duplicates() pass over the algorithm frame is not
        # needed to keep duplicates from inflating the count.
        algorithm_ids = set(algorithm_df['ID'])
        ground_truth_ids = set(ground_truth_df['ID'].head(10))
    except Exception as e:
        # Deliberately broad: this runs inside a worker process and one
        # unreadable or malformed file must not abort the whole batch.
        print(f"Error processing {filename}: {str(e)}")
        return None

    match_count = len(algorithm_ids & ground_truth_ids)
    return {'query': filename, 'match': match_count}

def process_folder(ground_truth_folder, algorithm_folder, output_file):
    """Score every ground-truth file against one algorithm folder.

    Runs ``process_file`` in parallel over each entry of
    ``ground_truth_folder``, collects the non-``None`` results and writes them
    to ``output_file`` as CSV.

    Args:
        ground_truth_folder: Directory whose entries are the queries to score.
        algorithm_folder: Directory holding the algorithm's result files.
        output_file: Path of the CSV to write; its parent directory is created
            if needed.

    Returns:
        A DataFrame with columns ``query`` and ``match``, one row per file
        that was processed successfully. It is empty (but still has both
        columns) when no file produced a result.
    """
    # Sorted so the row order of the output does not depend on the
    # filesystem's arbitrary listing order.
    ground_truth_files = sorted(os.listdir(ground_truth_folder))

    # dirname() is '' when output_file is a bare filename; makedirs('') would
    # raise, so only create a directory when there is one to create.
    output_directory = os.path.dirname(output_file)
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    # Bind the folders so the pool only has to pass the filename.
    process_file_partial = partial(process_file,
                                   ground_truth_folder=ground_truth_folder,
                                   algorithm_folder=algorithm_folder)

    # One worker per available CPU core.
    with Pool(processes=cpu_count()) as pool:
        results = pool.map(process_file_partial, ground_truth_files)

    # Drop files that were skipped or failed.
    results = [r for r in results if r is not None]

    # Explicit columns keep 'query'/'match' present even with zero results,
    # so callers can index them unconditionally.
    results_df = pd.DataFrame(results, columns=['query', 'match'])
    results_df.to_csv(output_file, index=False)

    return results_df

def main():
    """Score each algorithm result folder and write a summary of the averages.

    For every suffix in ``folder_suffixes`` the folder
    ``RESULTFOLDER/<suffix>`` is scored against ``GROUND_TRUTH`` via
    ``process_folder``, the per-query results go to
    ``FOLDERTOHOLDINTERMEDIATEVALUES/Res<suffix>.csv``, and the mean match
    count is recorded. All means are then written to ``SummaryResults.csv``
    and the means divided by 10 are printed as a comma-separated line.
    """
    ground_truth_folder = 'GROUND_TRUTH'
    algorithm_base_folder = 'RESULTFOLDER'
    output_base_folder = 'FOLDERTOHOLDINTERMEDIATEVALUES'

    # Folder suffixes to iterate over. The original notebook also listed
    # 2400, 2700, 3000, 3200, 3500, 3700 and 4000 in a commented-out variant.
    folder_suffixes = [20, 40, 60, 100, 200, 300, 400, 500, 600, 800, 1000,
                       1200, 1500, 1700, 1900, 2100]

    all_results = []
    result_strings = []  # Formatted per-folder fractions for the final print
    for folder_suffix in folder_suffixes:
        algorithm_folder = os.path.join(algorithm_base_folder, str(folder_suffix))
        output_file = os.path.join(output_base_folder, f'Res{folder_suffix}.csv')

        results_df = process_folder(ground_truth_folder, algorithm_folder, output_file)

        # A folder with no usable results yields an empty frame that may have
        # no 'match' column at all; report NaN instead of raising KeyError.
        if results_df.empty:
            average_match = float('nan')
        else:
            average_match = results_df['match'].mean()

        all_results.append({'folder': folder_suffix, 'average_match': average_match})
        # Matches are counted against 10 ground-truth rows, so /10 turns the
        # mean count into a fraction.
        result_strings.append(f"{average_match/10:.4f}")

    # Save all average matches in a single summary file
    summary_df = pd.DataFrame(all_results)
    summary_file = os.path.join(output_base_folder, 'SummaryResults.csv')
    summary_df.to_csv(summary_file, index=False)
    print(f"Summary of averages has been written to {summary_file}")
    print(", ".join(result_strings))
# Run only when executed as a script. The guard also matters for the
# multiprocessing Pool used above: under start methods that re-import the main
# module in each worker, unguarded top-level code would run again there.
if __name__ == '__main__':
    main()


Average match for folder 20: 0.9688666666666667
Average match for folder 40: 0.9804666666666666
Average match for folder 60: 0.9827333333333333
Average match for folder 100: 0.9848000000000001
Average match for folder 200: 0.9869333333333333
Average match for folder 300: 0.9869999999999999
Average match for folder 400: 0.9872
Average match for folder 500: 0.9874666666666666
Average match for folder 600: 0.9878666666666666
Average match for folder 800: 0.9878
Average match for folder 1000: 0.9877333333333332
Average match for folder 1200: 0.9878
Average match for folder 1500: 0.9880666666666666
Average match for folder 1700: 0.9889333333333333
Average match for folder 1900: 0.9889333333333333
Average match for folder 2100: 0.9896666666666667
Summary of averages has been written to /data3/Adeel/Data4/InstantResultCheck/SummaryResults.csv
0.9689, 0.9805, 0.9827, 0.9848, 0.9869, 0.9870, 0.9872, 0.9875, 0.9879, 0.9878, 0.9877, 0.9878, 0.9881, 0.9889, 0.9889, 0.9897


# Remove leading and trailing ';' characters from every string in the
# 'predicate' column. NOTE(review): `df` is not defined in this part of the
# file; presumably it comes from another notebook cell; confirm.
df['predicate'] = df['predicate'].str.strip(';')
# Bare expression: shows the frame when it is the last line of a notebook
# cell; it has no effect when run as a plain script.
df