In [3]:
import os
import csv

def count_csv_lines(start_directory="."):
    """
    Walk the tree rooted at start_directory and tally the rows of every .csv file found.

    Rows are counted as records produced by ``csv.reader``, so a quoted field
    that spans several physical lines is counted once. A file that cannot be
    read is reported on stdout and left out of the result.

    Args:
        start_directory (str): Directory where the walk begins. Defaults to the current directory.

    Returns:
        dict: Maps each CSV file path to the number of rows read from it.
    """
    counts_by_path = {}

    for folder, _subfolders, names in os.walk(start_directory):
        for name in names:
            # Only files whose name ends in a lowercase ".csv" are considered.
            if not name.endswith(".csv"):
                continue

            csv_path = os.path.join(folder, name)
            try:
                with open(csv_path, "r", newline='', encoding="utf-8") as handle:
                    n_rows = 0
                    for _row in csv.reader(handle):
                        n_rows += 1
                    # Recorded only after the whole file was parsed without error.
                    counts_by_path[csv_path] = n_rows
            except Exception as e:
                print(f"Error reading {csv_path}: {e}")

    return counts_by_path

# Example usage
if __name__ == "__main__":
    # An empty answer (just pressing Enter) falls back to the current directory.
    start_dir = input("Enter the directory to start searching (default is current directory): ") or "."
    counts = count_csv_lines(start_dir)

    if not counts:
        print("No CSV files found.")
    for path, n_lines in counts.items():
        print(f"{path}: {n_lines} lines")

    # Totals are reported even when nothing was found (both are then 0).
    total_lines = sum(counts.values())
    total_files = len(counts)

    print(f"total number of lines: {total_lines} lines")
    print(f"total number of files: {total_files} files")


.\A-recipes\air-fryer-recipes.csv: 72 lines
.\A-recipes\allrecipes-allstars-recipes.csv: 72 lines
.\A-recipes\angel-food-cake-recipes.csv: 19 lines
.\A-recipes\antipasto-recipes.csv: 38 lines
.\A-recipes\appetizers-and-snacks-recipes.csv: 77 lines
.\A-recipes\apple-pie-recipes.csv: 72 lines
.\A-recipes\applesauce-recipes.csv: 35 lines
.\A-recipes\artichoke-dip-recipes.csv: 55 lines
.\B-recipes\bagels-recipes.csv: 27 lines
.\B-recipes\baked-beans-recipes.csv: 68 lines
.\B-recipes\banana-bread-recipes.csv: 68 lines
.\B-recipes\bar-cookies-recipes.csv: 68 lines
.\B-recipes\beef-recipes.csv: 68 lines
.\B-recipes\beef-stroganoff-recipes.csv: 51 lines
.\B-recipes\biscotti-recipes.csv: 56 lines
.\B-recipes\biscuits-recipes.csv: 68 lines
.\B-recipes\blintz-recipes.csv: 13 lines
.\B-recipes\blondies-recipes.csv: 59 lines
.\B-recipes\bloody-marys-recipes.csv: 23 lines
.\B-recipes\blueberry-pie-recipes.csv: 65 lines
.\B-recipes\borscht-recipes.csv: 24 lines
.\B-recipes\bread-recipes.csv: 72 lines

In [3]:
# Function to load already processed files from the log
def load_merged_files(log_file):
    """
    Read the merge log and return its entries.

    Args:
        log_file (str): Path of the log file, one entry per line.

    Returns:
        set: The distinct lines of the log, or an empty set when the log does not exist.
    """
    if not os.path.exists(log_file):
        return set()

    with open(log_file, "r") as log:
        entries = log.read().splitlines()
    return set(entries)

In [4]:
# Function to save newly merged files to the log
def save_merged_files(log_file, file_list):
    """
    Append entries to the merge log, one per line.

    Args:
        log_file (str): Path of the log file; it is opened in append mode.
        file_list (iterable): Entries to record, each written followed by a newline.
    """
    with open(log_file, "a") as log:
        log.writelines(f"{entry}\n" for entry in file_list)

In [29]:
import os
import pandas as pd
import string

# Merge every not-yet-logged CSV from the A-recipes ... Z-recipes directories
# (resolved relative to the current working directory) into one dataset, and
# record the merged files in a log so later runs only pick up new files.

# Path for the directory
main_directory = '/'

# Txt to track already merged files
merged_files_log = "merged_files_log.txt"

# Merged dataset written by this cell
output_file = "AllRecipesDataset.csv"

# Load the list of already merged files
already_merged_files = load_merged_files(merged_files_log)

# Dynamically generate directory names A-recipes to Z-recipes
directories = [f"{letter}-recipes/" for letter in string.ascii_uppercase]

# DataFrames read from CSV files that are not in the log yet
dataframes = []

# Track new files that are merged in this session
newly_merged_files = []

for directory in directories:
    if os.path.exists(directory): #check if the directory exists
        # os.listdir returns entries in arbitrary order; sorting keeps the row
        # order of the merged dataset reproducible between runs.
        for file in sorted(os.listdir(directory)):
            if file.endswith(".csv"):
                file_path = os.path.join(directory, file)
                if file_path not in already_merged_files:
                    df = pd.read_csv(file_path)
                    dataframes.append(df)
                    newly_merged_files.append(file_path)

if dataframes:
    # On an incremental run only the new files were read above. Writing just
    # those would overwrite the dataset and drop every previously merged row
    # while the log still lists those files as merged, so carry the existing
    # dataset forward first.
    if already_merged_files and os.path.exists(output_file):
        dataframes.insert(0, pd.read_csv(output_file))

    # Concatenate all Dataframes into a single DataFrame
    merged_df = pd.concat(dataframes, ignore_index=True)

    # Save the merged DataFrame to a new CSV file
    merged_df.to_csv(output_file, index=False)

    # Update the log only after the dataset was written successfully
    save_merged_files(merged_files_log, newly_merged_files)

    print(f"Merged CSV saved as {output_file}")
    print(f"Log updated with {len(newly_merged_files)} newly merged files.")
else:
    print("No new CSV files to merge. All files are already merged.")
                

Merged CSV saved as AllRecipesDataset.csv
Log updated with 373 newly merged files.
