In [6]:
from pathlib import Path
import pandas as pd
import os

def _read_month_csvs(month_dir):
    """Concatenate every ``*.csv`` file directly inside ``month_dir``.

    Returns a single DataFrame, or ``None`` when the folder is missing or
    holds no CSV files (``pd.concat`` raises on an empty list, so the caller
    needs an explicit "nothing here" signal instead).
    """
    if not month_dir.is_dir():
        return None
    # Sorted so rows are always stacked in the same file order.
    frames = [pd.read_csv(csv_file) for csv_file in sorted(month_dir.glob('*.csv'))]
    if not frames:
        return None
    return pd.concat(frames, ignore_index=True)


def preprocess_and_merge_data(ghi_path, pr_path):
    """Merge per-month GHI and PR CSV files into one date-sorted DataFrame.

    Every sub-directory of ``ghi_path`` is treated as one month. For each
    month, all CSV files in ``ghi_path/<month>`` and ``pr_path/<month>`` are
    stacked, then outer-merged on the ``'Date'`` column. The monthly results
    are concatenated, ``'Date'`` is converted to datetime, and the rows are
    sorted by ascending date.

    Parameters
    ----------
    ghi_path : str or pathlib.Path
        Folder containing one sub-folder per month of GHI CSV files.
    pr_path : str or pathlib.Path
        Folder containing the same month sub-folders for the PR CSV files.

    Returns
    -------
    pandas.DataFrame
        Merged rows sorted by ``'Date'`` with a fresh 0..n-1 index. An empty
        DataFrame is returned when no CSV data is found at all.

    Raises
    ------
    FileNotFoundError
        If ``ghi_path`` does not exist.
    KeyError
        If CSV data was loaded but has no ``'Date'`` column.

    Notes
    -----
    Only months that exist under ``ghi_path`` are visited. A month with data
    on just one side is kept as-is; the other side's columns come out as NaN
    after the final concatenation, which matches the outer-merge intent.
    """
    ghi_root = Path(ghi_path)
    pr_root = Path(pr_path)

    # Plain files (e.g. '.DS_Store') are not months, so skip them. Sorting
    # makes the processing order independent of the filesystem listing order.
    month_names = sorted(entry.name for entry in ghi_root.iterdir() if entry.is_dir())

    monthly_frames = []
    for month in month_names:
        ghi_month_data = _read_month_csvs(ghi_root / month)
        pr_month_data = _read_month_csvs(pr_root / month)

        if ghi_month_data is None and pr_month_data is None:
            continue  # nothing to contribute for this month
        if ghi_month_data is None:
            monthly_frames.append(pr_month_data)
        elif pr_month_data is None:
            monthly_frames.append(ghi_month_data)
        else:
            monthly_frames.append(
                pd.merge(ghi_month_data, pr_month_data, on='Date', how='outer')
            )

    if not monthly_frames:
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop.
    all_data = pd.concat(monthly_frames, ignore_index=True)

    all_data['Date'] = pd.to_datetime(all_data['Date'])  # Ensure the 'Date' column is in datetime format
    # Stable sort keeps rows that share a date in their concatenation order.
    all_data = all_data.sort_values(by='Date', ascending=True, kind='mergesort').reset_index(drop=True)

    return all_data


# 'Data' is expected to sit inside the current working directory
base_path = Path('./Data')  # Adjust the './' if the notebook runs from another directory

# Construct the paths to the GHI and PR folders (one sub-folder per month in each)
ghi_path = base_path / 'GHI'
pr_path = base_path / 'PR'

# Build the merged, date-sorted table from both folders
all_data = preprocess_and_merge_data(ghi_path, pr_path)

output_file_path = './data/merged_data.csv'  # Specify your desired output file path
# NOTE(review): inputs are read from 'Data' but the output goes to 'data'.
# On a case-sensitive filesystem these are two different folders - confirm intent.
# to_csv does not create missing folders, so make sure the target exists first.
Path(output_file_path).parent.mkdir(parents=True, exist_ok=True)
all_data.to_csv(output_file_path, index=False)  # 'index=False' to prevent writing row indices

print("Data saved to CSV file. You can download it from:", output_file_path)


Data saved to CSV file. You can download it from: ./data/merged_data.csv


In [9]:
import os
# Show the kernel's working directory; the relative './Data' and './data'
# paths used earlier are resolved against it.
working_dir = os.getcwd()
print(working_dir)

/Users/justiny/Projects/solarassessment
