In [5]:
import os
from datetime import datetime

def _extract_start_date(filename):
    """
    Return the first YYYY-MM-DD date found among the '_'-separated tokens of a file name.

    Args:
    - filename (str): Bare file name (no directory component).

    Returns:
    - datetime or None: The parsed date of the first token that matches
      '%Y-%m-%d', or None when no token is a valid date.
    """
    # Strip the extension before tokenising: otherwise a date that is the last
    # token (e.g. 'Los_Angeles_2020-01-01.parquet') would carry the '.parquet'
    # suffix, fail to parse, and the file would be dropped.
    stem = os.path.splitext(filename)[0]
    for token in stem.split('_'):
        try:
            return datetime.strptime(token, '%Y-%m-%d')
        except ValueError:
            continue  # Not a date; try the next token
    return None


def create_sorted_parquet_file_list(directory, output_file):
    """
    Create a sorted list of .parquet files in the specified directory and save it in a text file.

    Only regular files whose name ends with '.parquet' are considered. Each file
    is keyed by the first '_'-separated token of its name (extension removed)
    that parses as a YYYY-MM-DD date. Files are written one name per line,
    ordered by that date and then by file name so the result is deterministic.
    Files without a parseable date are left out of the list and reported on
    stdout.

    Any exception raised while listing, parsing or writing is caught and
    printed rather than propagated (best-effort behaviour).

    Args:
    - directory (str): Directory path to list files from.
    - output_file (str): Path to the output text file.

    Returns:
    - None
    """
    try:
        # Keep regular files only (no sub-directories) with a .parquet suffix
        parquet_files = [
            name for name in os.listdir(directory)
            if name.endswith('.parquet') and os.path.isfile(os.path.join(directory, name))
        ]

        # Pair every file with its starting date; remember the ones without a date
        dated_files = []
        skipped_files = []
        for name in parquet_files:
            start_date = _extract_start_date(name)
            if start_date is None:
                skipped_files.append(name)
            else:
                dated_files.append((name, start_date))

        # Sort by date, then by name: os.listdir() order is arbitrary, so the
        # name tie-breaker keeps the output stable for files sharing a date.
        dated_files.sort(key=lambda item: (item[1], item[0]))

        # Write the sorted list of .parquet files to the output text file
        with open(output_file, 'w') as out:
            for name, _ in dated_files:
                out.write(name + '\n')

        print(f"Sorted list of .parquet files saved to {output_file}")
        if skipped_files:
            # Surface dropped files instead of losing them silently
            print(
                f"Skipped {len(skipped_files)} .parquet file(s) without a "
                f"YYYY-MM-DD date in the name: {sorted(skipped_files)}"
            )

    except Exception as e:
        # Deliberately best-effort: report the problem instead of raising
        print(f"An error occurred: {e}")

# Folder whose .parquet files should be listed
directory = '/home/ec2-user/SageMaker/Aleksei/Weather_Analysis/Working_analysis/LA/'

# Text file that receives the date-ordered file names
output_file = '/home/ec2-user/SageMaker/Aleksei/Weather_Analysis/Working_analysis/LA/123.txt'

# Build the listing and write it to disk
create_sorted_parquet_file_list(directory=directory, output_file=output_file)




Sorted list of .parquet files saved to /home/ec2-user/SageMaker/Aleksei/Weather_Analysis/Working_analysis/LA/123.txt


In [2]:
!pip install pandas




In [3]:
!pip install pyarrow




In [4]:
!pip install fastparquet


Collecting fastparquet
  Downloading fastparquet-2024.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.8.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Downloading fastparquet-2024.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m68.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cramjam-2.8.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m83.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: cramjam, fastparquet
Successfully installed cramjam-2.8.3 fastparquet-2024.5.0
