In [28]:
import os
import pandas as pd
from multiprocessing import Pool, cpu_count
from functools import partial

def process_file(filename, ground_truth_folder, algorithm_folder):
    ground_truth_file = os.path.join(ground_truth_folder, filename)
    algorithm_file = os.path.join(algorithm_folder, filename)
    
    if not os.path.exists(algorithm_file):
        # If the algorithm file doesn't exist, return None
        return None
    
    try:
        # Read the CSV files
        ground_truth_df = pd.read_csv(ground_truth_file)
        algorithm_df = pd.read_csv(algorithm_file)
        
        # Remove duplicates from the algorithm dataframe
        algorithm_df = algorithm_df.drop_duplicates()
        
        # Get the first 10 records of the ground truth file
        ground_truth_first_10 = ground_truth_df.head(10)
        
        # Use set intersection for faster matching
        algorithm_ids = set(algorithm_df['ID'])
        ground_truth_ids = set(ground_truth_first_10['ID'])
        match_count = len(algorithm_ids.intersection(ground_truth_ids))
        
        return {'query': filename, 'match': match_count}
    except Exception as e:
        # Handle any errors that occur while processing
        print(f"Error processing {filename}: {str(e)}")
        return None

def process_folder(ground_truth_folder, algorithm_folder, output_file):
    # List all files in the ground truth folder
    ground_truth_files = os.listdir(ground_truth_folder)
    
    # Check if the output directory exists, and create it if not
    output_directory = os.path.dirname(output_file)
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    
    # Create a partial function with the folders already specified
    process_file_partial = partial(process_file, 
                                   ground_truth_folder=ground_truth_folder,
                                   algorithm_folder=algorithm_folder)
    
    # Use all available CPU cores
    num_processes = cpu_count()
    
    # Create a pool of worker processes
    with Pool(processes=num_processes) as pool:
        # Process files in parallel
        results = pool.map(process_file_partial, ground_truth_files)
    
    # Filter out None results and create DataFrame
    results = [r for r in results if r is not None]
    results_df = pd.DataFrame(results)
    
    # Save results to the output CSV file
    results_df.to_csv(output_file, index=False)
    #print(f"Results have been written to {output_file}")
    
    return results_df

def main():
    ground_truth_folder = '/data4/hnsw/TripClick/GroundTruth'
    algorithm_base_folder = '/data4/hnsw/yt8m/Res_Gener/'
    output_base_folder = '/data4/hnsw/paper/InstantResultCheck'
    # 40, 60, 100, 200, 300, 400, 500, 600, 800, 1000, 1200, 1500,
    #                  1700, 1900, 2100, 2400, 2700, 3000, 3200, 3500, 3700, 4000
    # Folder suffixes to iterate over
    folder_suffixes = [60]
    
    all_results = []
    
    for folder_suffix in folder_suffixes:
        algorithm_folder = os.path.join(algorithm_base_folder, str(folder_suffix))
        output_file = os.path.join(output_base_folder, f'Res{folder_suffix}.csv')
        
        # Process the folder and get the results
        results_df = process_folder(ground_truth_folder, algorithm_folder, output_file)
        
        # Calculate the average match for the current folder
        average_match = results_df['match'].mean()
        print(f"Average match for folder {folder_suffix}: {average_match}")
        
        # Append to overall results
        all_results.append({'folder': folder_suffix, 'average_match': average_match})
    
    # Save all average matches in a single summary file
    summary_df = pd.DataFrame(all_results)
    summary_file = os.path.join(output_base_folder, 'SummaryResults.csv')
    summary_df.to_csv(summary_file, index=False)
    print(f"Summary of averages has been written to {summary_file}")

if __name__ == '__main__':
    main()


Average match for folder 60: 9.884
Summary of averages has been written to /data4/hnsw/paper/InstantResultCheck/SummaryResults.csv


In [7]:
import pandas as pd
df = pd.read_csv('/data4/hnsw/paper/meta_data.csv')  # Adjust delimiter as needed

# Display the DataFrame
df

Unnamed: 0,Meta
0,11
1,2
2,1
3,12
4,5
...,...
2029992,12
2029993,8
2029994,3
2029995,8


In [6]:
# Directory to save files (optional, create in the same folder otherwise)
output_directory = "/data3/""/Disk_optimization/0.txt"
import os
os.makedirs(output_directory, exist_ok=True)

# Iterate through each record in the DataFrame
for index, value in enumerate(df['Meta']):  # Use only the "Meta" column
    file_name = os.path.join(output_directory, f"{index}.txt")  # File name
    with open(file_name, "w") as f:
        f.write(str(value))  # Convert value to string before writing

print("Files created successfully!")

Files created successfully!


df['predicate'] = df['predicate'].str.strip(';')
df

In [31]:
df 


Unnamed: 0,ID,distance
0,38440,73.9805
1,79487,78.3234
2,571234,78.3933
3,206055,83.4641
4,504900,84.2268
5,224335,86.1127
6,555626,88.0368
7,36437,89.2751
8,94534,89.2751
9,502983,90.0738


In [18]:
df[' predicate'] = df[' predicate'].str.strip(';')
df[' predicate'] = df[' predicate'].fillna('Nothing')
df.to_csv('/data3/""/Corelation/Result/merged_file.csv', index=False)

In [21]:
filtered_df = df.loc[(df[' predicate'] != 'Nothing')]
filtered_df

Unnamed: 0,ID,distance,predicate
2057,2057,136.518780,Allergies and Immunology;Anesthesiology;Cardio...
2164,2164,123.625273,Dermatology
3098,3098,120.049076,Ophthalmology;Pediatrics;Women's Health
6104,6104,132.364670,Endocrinology;Gastroenterology;OB-Gyn;Psychiat...
11518,11518,129.085755,Oncology;Urology
...,...,...,...
807966,807966,118.372848,Cardiology;Dermatology;Emergency Medicine;Endo...
809626,809626,122.183173,Pediatrics
810414,810414,106.900848,Infectious Disease;Orthopedic;Rheumatology;Sur...
810531,810531,139.787614,Allergies and Immunology;Anesthesiology;Cardio...


In [14]:
import pandas as pd
import os
from glob import glob

# Folder containing CSV files
folder_path = '/data3/""/Corelation_Graph'  # Update this path

# Get all CSV files in the folder
csv_files = glob(os.path.join(folder_path, '*.csv'))

# Initialize an empty DataFrame
combined_df = pd.DataFrame()

# Read and concatenate all CSV files
for file in csv_files:
    temp_df = pd.read_csv(file)
    combined_df = pd.concat([combined_df, temp_df])

# Group by ID and merge predicates
# def merge_predicates(predicates):
#     # Split each predicate string by ';', flatten, and remove duplicates
#     unique_predicates = sorted(set(';'.join(predicates).split(';')))
#     return ';'.join(unique_predicates)

# result_df = combined_df.groupby('ID').agg({
#     'distance': 'mean',  # Or another aggregation method
#     'predicate': merge_predicates
# }).reset_index()

# # Save the final DataFrame to CSV
# result_df.to_csv('/data3/""/Corelation_Graph/Result/merged_file.csv', index=False)


  interactivity=interactivity, compiler=compiler, result=result)


SpecificationError: Column(s) ['predicate'] do not exist

In [21]:
import pandas as pd
import os
from glob import glob
from concurrent.futures import ProcessPoolExecutor

# Folder containing CSV files
folder_path = '/data3/""/Corelation_Graph'  # Update this path

# Get all CSV files in the folder
csv_files = glob(os.path.join(folder_path, '*.csv'))

# Function to read a CSV file
def read_csv(file):
    return pd.read_csv(file, low_memory=False)

# Initialize an empty DataFrame
combined_df = pd.DataFrame()

# Use ProcessPoolExecutor to read CSV files in parallel
with ProcessPoolExecutor() as executor:
    # Map the read_csv function to the list of files
    results = executor.map(read_csv, csv_files)
    
    # Concatenate the results into a single DataFrame
    combined_df = pd.concat(results, ignore_index=True)

# Group by 'ID' and apply aggregation
# def merge_predicates(predicates):
#     # Split each predicate string by ';', flatten, and remove duplicates
#     unique_predicates = sorted(set(';'.join(predicates).split(';')))
#     return ';'.join(unique_predicates)

# # Perform the aggregation on the DataFrame
# result_df = combined_df.groupby('ID').agg({
#     'distance': 'mean',  # Or another aggregation method
#     'predicate': merge_predicates
# }).reset_index()

# # Save the final DataFrame to CSV
#

In [22]:
# Strip leading/trailing spaces


combined_df[' predicate'] = combined_df[' predicate'].fillna('')


In [23]:
def merge_predicates(predicates):
    # Split each predicate string by ';', flatten, and remove duplicates
    unique_predicates = sorted(set(';'.join(predicates).split(';')))
    return ';'.join(unique_predicates)

# Perform the aggregation on the DataFrame
result_df = combined_df.groupby('ID').agg({
    'distance': 'mean',  # Or another aggregation method
    ' predicate': merge_predicates
}).reset_index()

In [24]:
result_df[' predicate'] =result_df[' predicate'].fillna('Nothing')

In [25]:
result_df.to_csv('/data3/""/Corelation/Result/merged_file.csv', index=False)

In [33]:
import os
import pandas as pd
from multiprocessing import Pool, cpu_count
from functools import partial

def process_file(filename, ground_truth_folder, algorithm_folder):
    ground_truth_file = os.path.join(ground_truth_folder, filename)
    algorithm_file = os.path.join(algorithm_folder, filename)
    
    if not os.path.exists(algorithm_file):
        # If the algorithm file doesn't exist, return None
        return None
    
    try:
        # Read the CSV files
        ground_truth_df = pd.read_csv(ground_truth_file)
        algorithm_df = pd.read_csv(algorithm_file)
        
        # Remove duplicates from the algorithm dataframe
        algorithm_df = algorithm_df.drop_duplicates()
        
        # Get the first 10 records of the ground truth file
        ground_truth_first_10 = ground_truth_df.head(10)
        
        # Use set intersection for faster matching
        algorithm_ids = set(algorithm_df['ID'])
        ground_truth_ids = set(ground_truth_first_10['ID'])
        match_count = len(algorithm_ids.intersection(ground_truth_ids))
        
        return {'query': filename, 'match': match_count}
    except Exception as e:
        # Handle any errors that occur while processing
        print(f"Error processing {filename}: {str(e)}")
        return None

def process_folder(ground_truth_folder, algorithm_folder, output_file):
    # List all files in the ground truth folder
    ground_truth_files = os.listdir(ground_truth_folder)
    
    # Check if the output directory exists, and create it if not
    output_directory = os.path.dirname(output_file)
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    
    # Create a partial function with the folders already specified
    process_file_partial = partial(process_file, 
                                   ground_truth_folder=ground_truth_folder,
                                   algorithm_folder=algorithm_folder)
    
    # Use all available CPU cores
    num_processes = cpu_count()
    
    # Create a pool of worker processes
    with Pool(processes=num_processes) as pool:
        # Process files in parallel
        results = pool.map(process_file_partial, ground_truth_files)
    
    # Filter out None results and create DataFrame
    results = [r for r in results if r is not None]
    results_df = pd.DataFrame(results)
    
    # Save results to the output CSV file
    results_df.to_csv(output_file, index=False)
    #print(f"Results have been written to {output_file}")
    
    return results_df

def main():
    ground_truth_folder = '/data3/""/Corelation/GroundTruth'
    algorithm_base_folder = '/data4/hnsw/yt8m/Res_Gener/'
    output_base_folder = '/data4/hnsw/paper/InstantResultCheck'
    # 40, 60, 100, 200, 300, 400, 500, 600, 800, 1000, 1200, 1500,
    #                  1700, 1900, 2100, 2400, 2700, 3000, 3200, 3500, 3700, 4000
    # Folder suffixes to iterate over
    folder_suffixes = [ 40, 60, 100, 200, 300, 400, 500, 600, 800, 1000, 1200, 1500, 1700, 1900, 2100, 2400, 2700, 3000]
    
    all_results = []
    
    for folder_suffix in folder_suffixes:
        algorithm_folder = os.path.join(algorithm_base_folder, str(folder_suffix))
        output_file = os.path.join(output_base_folder, f'Res{folder_suffix}.csv')
        
        # Process the folder and get the results
        results_df = process_folder(ground_truth_folder, algorithm_folder, output_file)
        
        # Calculate the average match for the current folder
        average_match = results_df['match'].mean()
        print(f"Average match for folder {folder_suffix}: {average_match}")
        
        # Append to overall results
        all_results.append({'folder': folder_suffix, 'average_match': average_match})
    
    # Save all average matches in a single summary file
    summary_df = pd.DataFrame(all_results)
    summary_file = os.path.join(output_base_folder, 'SummaryResults.csv')
    summary_df.to_csv(summary_file, index=False)
    print(f"Summary of averages has been written to {summary_file}")

if __name__ == '__main__':
    main()


KeyError: 'match'

In [7]:
import pandas as pd
import os

# Folder containing the CSV files
folder_path = '/data3/""/Corelation/GroundTruth'  # Update this with the correct path

total_rows = 0
file_count = 1000

for i in range(file_count):
    file_path = os.path.join(folder_path, f'Q{i}.csv')
    df = pd.read_csv(file_path)
    total_rows += len(df)

average_rows = total_rows / file_count
print(f"Average number of records across all files: {average_rows}")


Average number of records across all files: 20211.415


In [5]:
import pandas as pd

# # Read the Parquet file into a DataFrame
# df = pd.read_parquet('/data4/hnsw/yt8m/yt_data.parquet', engine='pyarrow')

# # Display the first few rows
# df = pd.read_parquet('/data4/hnsw/yt8m/yt_data.parquet', engine='pyarrow')

df = pd.read_csv('/data4/hnsw/yt8m/queries_views_range.csv', sep=";")
df

Unnamed: 0,id,video,audio,start_range,end_range
0,deSR,"0.0616164207,-1.00776923,0.726923585,1.4439954...","0.770636,-0.360448927,-1.39757311,0.121328838,...",1349,2608
1,9E9h,"0.410531461,0.747263491,-0.825599253,-0.709939...","-0.0263835788,-1.0880307,0.777590275,-0.682958...",4464,38450
2,P6Cq,"-0.111586325,0.744629443,-0.799393654,-0.00393...","-0.454693735,-1.1063509,0.378908336,-1.0537979...",11310,46750
3,YYX3,"0.013093546,0.270139307,-0.858854175,0.1951588...","-0.660370529,-0.67108947,-0.120762661,0.868989...",11407,32454
4,qeE7,"-0.484161347,0.386270016,1.73121119,-0.1305927...","1.2543484,-0.031926062,0.421093553,-0.20060579...",20879,20977
...,...,...,...,...,...
1495,NHsY,"-0.0219669119,0.0346507356,-0.343290448,1.5682...","-1.65493262,1.7869792,-0.953952193,1.98489583,...",1288,2336
1496,hffa,"-1.93749464,-0.483324766,-1.37655354,0.4070281...","0.0180608667,-1.12264502,-0.393180966,-0.07987...",7821,28455
1497,wDfq,"0.690078318,0.77443558,-0.22730732,-0.05580405...","-0.106087282,-0.155150458,-0.361773551,0.80441...",3584,4426
1498,CxCu,"0.362154245,-1.2555629,-1.17699146,1.42959118,...","1.24919903,-0.553111851,-0.9114452,-0.00920430...",9041,56063


In [14]:

df_meta_data= df['genre']
df_meta_data.to_csv("/data4/hnsw/yt8m/genre_meta_data.csv", index=False)


In [28]:

df_sampled=pd.read_csv("/data4/hnsw/yt8m/Queries_Genre.csv", sep=";")
df_sampled 

Unnamed: 0,id,video,audio,genre,publication_date,views,likes
0,deSR,"0.0616164207,-1.00776923,0.726923585,1.4439954...","0.770636,-0.360448927,-1.39757311,0.121328838,...",Nonprofits & Activism,2010-03-15T19:56:02-07:00,2608,8
1,9E9h,"0.410531461,0.747263491,-0.825599253,-0.709939...","-0.0263835788,-1.0880307,0.777590275,-0.682958...",Howto & Style,2012-07-11T03:25:20-07:00,38450,0
2,P6Cq,"-0.111586325,0.744629443,-0.799393654,-0.00393...","-0.454693735,-1.1063509,0.378908336,-1.0537979...",Autos & Vehicles,2015-02-07T21:09:24-08:00,46750,52
3,YYX3,"0.013093546,0.270139307,-0.858854175,0.1951588...","-0.660370529,-0.67108947,-0.120762661,0.868989...",Howto & Style,2012-11-09T13:55:48-08:00,11407,26
4,qeE7,"-0.484161347,0.386270016,1.73121119,-0.1305927...","1.2543484,-0.031926062,0.421093553,-0.20060579...",Music,2013-08-02T13:46:50-07:00,20879,267
...,...,...,...,...,...,...,...
1495,NHsY,"-0.0219669119,0.0346507356,-0.343290448,1.5682...","-1.65493262,1.7869792,-0.953952193,1.98489583,...",Sports,2014-03-12T05:00:03-07:00,2336,3
1496,hffa,"-1.93749464,-0.483324766,-1.37655354,0.4070281...","0.0180608667,-1.12264502,-0.393180966,-0.07987...",Gaming,2013-09-20T20:01:08-07:00,28455,141
1497,wDfq,"0.690078318,0.77443558,-0.22730732,-0.05580405...","-0.106087282,-0.155150458,-0.361773551,0.80441...",People & Blogs,2014-07-19T11:19:51-07:00,4426,20
1498,CxCu,"0.362154245,-1.2555629,-1.17699146,1.42959118,...","1.24919903,-0.553111851,-0.9114452,-0.00920430...",Science & Technology,2007-12-17T22:03:49-08:00,9041,9


In [30]:
import numpy as np 
df_sampled['shuffled_likes'] = np.random.permutation(df_sampled['likes'].values)

# Now, create start and end ranges based on views
df_sampled['start_range'] = np.minimum(df_sampled['likes'], df_sampled['shuffled_likes'])
df_sampled['end_range'] = np.maximum(df_sampled['likes'], df_sampled['shuffled_likes'])
df_sampled

Unnamed: 0,id,video,audio,genre,publication_date,views,likes,shuffled_likes,start_range,end_range
0,deSR,"0.0616164207,-1.00776923,0.726923585,1.4439954...","0.770636,-0.360448927,-1.39757311,0.121328838,...",Nonprofits & Activism,2010-03-15T19:56:02-07:00,2608,8,12,8,12
1,9E9h,"0.410531461,0.747263491,-0.825599253,-0.709939...","-0.0263835788,-1.0880307,0.777590275,-0.682958...",Howto & Style,2012-07-11T03:25:20-07:00,38450,0,16,0,16
2,P6Cq,"-0.111586325,0.744629443,-0.799393654,-0.00393...","-0.454693735,-1.1063509,0.378908336,-1.0537979...",Autos & Vehicles,2015-02-07T21:09:24-08:00,46750,52,225,52,225
3,YYX3,"0.013093546,0.270139307,-0.858854175,0.1951588...","-0.660370529,-0.67108947,-0.120762661,0.868989...",Howto & Style,2012-11-09T13:55:48-08:00,11407,26,13,13,26
4,qeE7,"-0.484161347,0.386270016,1.73121119,-0.1305927...","1.2543484,-0.031926062,0.421093553,-0.20060579...",Music,2013-08-02T13:46:50-07:00,20879,267,0,0,267
...,...,...,...,...,...,...,...,...,...,...
1495,NHsY,"-0.0219669119,0.0346507356,-0.343290448,1.5682...","-1.65493262,1.7869792,-0.953952193,1.98489583,...",Sports,2014-03-12T05:00:03-07:00,2336,3,11,3,11
1496,hffa,"-1.93749464,-0.483324766,-1.37655354,0.4070281...","0.0180608667,-1.12264502,-0.393180966,-0.07987...",Gaming,2013-09-20T20:01:08-07:00,28455,141,28,28,141
1497,wDfq,"0.690078318,0.77443558,-0.22730732,-0.05580405...","-0.106087282,-0.155150458,-0.361773551,0.80441...",People & Blogs,2014-07-19T11:19:51-07:00,4426,20,19,19,20
1498,CxCu,"0.362154245,-1.2555629,-1.17699146,1.42959118,...","1.24919903,-0.553111851,-0.9114452,-0.00920430...",Science & Technology,2007-12-17T22:03:49-08:00,9041,9,13,9,13


In [None]:
df_sampled = df_sampled.drop(columns=['views', 'genre', 'likes', 'publication_date','shuffled_likes'])


KeyError: "['views' 'genre' 'likes' 'publication_date' 'shuffled_likes'] not found in axis"

In [33]:
df_sampled

Unnamed: 0,id,video,audio,start_range,end_range
0,deSR,"0.0616164207,-1.00776923,0.726923585,1.4439954...","0.770636,-0.360448927,-1.39757311,0.121328838,...",8,12
1,9E9h,"0.410531461,0.747263491,-0.825599253,-0.709939...","-0.0263835788,-1.0880307,0.777590275,-0.682958...",0,16
2,P6Cq,"-0.111586325,0.744629443,-0.799393654,-0.00393...","-0.454693735,-1.1063509,0.378908336,-1.0537979...",52,225
3,YYX3,"0.013093546,0.270139307,-0.858854175,0.1951588...","-0.660370529,-0.67108947,-0.120762661,0.868989...",13,26
4,qeE7,"-0.484161347,0.386270016,1.73121119,-0.1305927...","1.2543484,-0.031926062,0.421093553,-0.20060579...",0,267
...,...,...,...,...,...
1495,NHsY,"-0.0219669119,0.0346507356,-0.343290448,1.5682...","-1.65493262,1.7869792,-0.953952193,1.98489583,...",3,11
1496,hffa,"-1.93749464,-0.483324766,-1.37655354,0.4070281...","0.0180608667,-1.12264502,-0.393180966,-0.07987...",28,141
1497,wDfq,"0.690078318,0.77443558,-0.22730732,-0.05580405...","-0.106087282,-0.155150458,-0.361773551,0.80441...",19,20
1498,CxCu,"0.362154245,-1.2555629,-1.17699146,1.42959118,...","1.24919903,-0.553111851,-0.9114452,-0.00920430...",9,13


In [None]:
df_sampled.to_csv("/data4/hnsw/yt8m/GroundTruthGenre/Q0.csv")


In [3]:
import pandas as pd
df=pd.read_csv("/data4/hnsw/paper/paper_queries.csv", sep=";")
df

Unnamed: 0,embedding,rand_int
0,"0.18394100666046143,-0.0625929981470108,-0.108...",11
1,"0.20332999527454376,-0.09437700361013412,-0.10...",2
2,"0.11066299676895142,-0.044043999165296555,-0.0...",1
3,"0.24615100026130676,-0.10814200341701508,-0.00...",12
4,"0.32651498913764954,-0.16267600655555725,-0.10...",5
...,...,...
9995,"0.17453999817371368,-0.16027599573135376,-0.05...",7
9996,"0.16341200470924377,-0.13118599355220795,-0.01...",2
9997,"0.1755719929933548,-0.21993699669837952,-0.085...",4
9998,"0.17396600544452667,-0.06954299658536911,-0.08...",8


Unnamed: 0,id,video,audio,start_range,end_range
0,deSR,"0.0616164207,-1.00776923,0.726923585,1.4439954...","0.770636,-0.360448927,-1.39757311,0.121328838,...",1349,2608
1,9E9h,"0.410531461,0.747263491,-0.825599253,-0.709939...","-0.0263835788,-1.0880307,0.777590275,-0.682958...",4464,38450
2,P6Cq,"-0.111586325,0.744629443,-0.799393654,-0.00393...","-0.454693735,-1.1063509,0.378908336,-1.0537979...",11310,46750
3,YYX3,"0.013093546,0.270139307,-0.858854175,0.1951588...","-0.660370529,-0.67108947,-0.120762661,0.868989...",11407,32454
4,qeE7,"-0.484161347,0.386270016,1.73121119,-0.1305927...","1.2543484,-0.031926062,0.421093553,-0.20060579...",20879,20977
...,...,...,...,...,...
1495,NHsY,"-0.0219669119,0.0346507356,-0.343290448,1.5682...","-1.65493262,1.7869792,-0.953952193,1.98489583,...",1288,2336
1496,hffa,"-1.93749464,-0.483324766,-1.37655354,0.4070281...","0.0180608667,-1.12264502,-0.393180966,-0.07987...",7821,28455
1497,wDfq,"0.690078318,0.77443558,-0.22730732,-0.05580405...","-0.106087282,-0.155150458,-0.361773551,0.80441...",3584,4426
1498,CxCu,"0.362154245,-1.2555629,-1.17699146,1.42959118,...","1.24919903,-0.553111851,-0.9114452,-0.00920430...",9041,56063


In [18]:
import pandas as pd
import os

folder_path = '/data4/hnsw/yt8m/GroundTruthGenre'  # Replace with the path to your folder
total_records = 0
file_count = 0

for file in os.listdir(folder_path):
    if file.endswith('.csv'):
        file_path = os.path.join(folder_path, file)
        df = pd.read_csv(file_path)
        total_records += len(df)
        file_count += 1

average_records = total_records / file_count if file_count > 0 else 0
print(f"Average records per CSV: {average_records}")


Average records per CSV: 449231.9066666667
