In [1]:
import pandas as pd
import os
import datetime as dt



def Combine_All_Years(database_path, datasets = None, stations = None):
    '''This is a function that combines all yearly Formatted, Filtered or Cleaned .csv files for 
    each station into one single .csv.
    
    :param str database_path: the path to the database
    :param list datasets: a list of datasets to combine ('Formatted', 'Filtered' or 'Cleaned'). Leave as None to collect all stations.
    :param list datasets: a list of stations to combine. Leave as None to collect all stations.
    '''
    os.chdir(database_path)
    if not bool(datasets):
        datasets = [f for f in os.listdir() if f.endswith('_Data') and not f.startswith('Raw')]

    for dataset in datasets:
        try:
            os.chdir(dataset)
        except:
            raise ValueError(f'Unrecognised dataset path: {dataset}')
            
        if not bool(stations):
            stations = [f for f in os.listdir() if f.endswith('_Hourly') or f.endswith('_Daily')and not f.startswith('.')]

        for station in stations:
            try:
                os.chdir(station)
            except:
                raise ValueError(f'Unrecognised station path: {station}')
            print('Combining '+ dataset+': '+station+'             ', end='\r')
            files = [f for f in os.listdir() if f.endswith('.csv')]

            out = pd.DataFrame()

            for f in files:
                out = pd.concat([out,pd.read_csv(f)])

            out = out[~out.Time.duplicated(keep='first')] 

            os.chdir('..')

            out.to_csv(station+'.csv', index=False)

        os.chdir('..')

In [2]:
def update_metadata(database_path):
    '''Updates an existing metadata .csv file to include most current data availability for each station.
    
    :param str database_path: the database directory, containing a Metadata.csv file with sation names and information.
    '''
    os.chdir(database_path)
    metadata = pd.read_csv('Metadata.csv')
    
    os.chdir('Formatted_Data')
    
    stations = metadata.Station_Name
    metadata.index = metadata.Station_Name
    for station in stations:
        try:
            data = pd.read_csv(station+'.csv')
        except:
            print(f'{station} not in database.')
            continue
        if station.endswith('Hourly'):
            data.Time = pd.to_datetime(data.Time, format='%Y-%m-%d %H:%M:%S')
        elif station.endswith('Daily'):
            try:
                data.Time = pd.to_datetime(data.Time, format='%Y-%m-%d %H:%M' %p)
            except:
                try:
                    data.Time = pd.to_datetime(data.Time, format='%Y-%m-%d %H:%M')
                except:
                    try:
                        data.Time = pd.to_datetime(data.Time, format='%Y-%m-%d')
                    except:
                        raise ValueError(f"{station}'s datetime format is not recognised")
        print(f'Updated station: {station}')
        tmin = data.Time.min()
        tmax = data.Time.max()
        metadata.loc[station, 'Time_Available'] = f'{tmin}, {tmax}'
        metadata.loc[station, 'Data_Available'] = str(data.columns.to_list())
    metadata.to_csv(database_path+'/Metadata.csv',index=False)
    os.chdir(database_path)
    return metadata

In [3]:
Combine_All_Years(os.getcwd(), datasets = ['Formatted_Data','Filtered_Data'])
update_metadata(os.getcwd())

Combining Filtered_Data: ArthursEWS_Daily                            

ValueError: Unrecognised station path: ArthursEWS_Hourly

In [35]:
out = pd.DataFrame()
for f in files:
    out = pd.concat([out,pd.read_csv('Formatted_Data/Chilton_Hourly/'+f)])

In [10]:
os.chdir('..')

In [63]:
os.path.expanduser()

TypeError: expanduser() missing 1 required positional argument: 'path'

In [56]:
[f for f in os.listdir() if f.endswith('_Hourly') or f.endswith('_Daily')]

['.ipynb_checkpoints_Daily',
 'ArthursAWS_ScreenObs_Hourly',
 'ArthursCombined_Daily',
 'ArthursEWS_Hourly',
 'ArthursRain_Daily',
 'ArthursTemps_Daily',
 'BealeyRain_Daily',
 'BrokenRiverCombined_Daily',
 'BrokenRiverRain_Daily',
 'BrokenRiverTemp_Daily',
 'CampStreamCombined_Daily',
 'CampStreamRain_Daily',
 'CarringtonRain_Daily',
 'Cass_Daily',
 'Cass_Hourly',
 'CastleHillRain_Daily',
 'Chilton_Daily',
 'Chilton_Hourly',
 'ClifloArthursRain_Daily',
 'CragieburnForestCombined_Daily',
 'CragieburnForestTemps_Daily',
 'CragieburnStnRain_Daily',
 'EskRain_Daily',
 'FlockhillRain_Daily',
 'GrasmereRain_Daily',
 'MtWhiteRain_Daily',
 'OldCassRain_Daily']