In [1]:
# IMPORTS
import os
import pandas as pd
import numpy as np
pd.set_option('display.float_format', '{:.10f}'.format)

In [5]:
# Load files
ids = pd.read_excel('C:/Users/alexb/OneDrive/DP3/DP3_IDS.xlsx')
log = pd.read_excel('C:/Users/alexb/OneDrive/DP3/VAS_Log.xlsx')
log.rename(columns = {'Monitor ID':'ID', 'Location/participant':'PPN', 'Downoad data':'download', 'Deployment date': 'deploy', 'Return date':'return'}, inplace = True)
ids.rename(columns = {'Study_ID':'PPN'}, inplace = True)
log = pd.merge(log, ids, on = 'PPN', how = 'inner')
log

Unnamed: 0,ID,PPN,deploy,return,download,group,subgroup,set #
0,c6b87f,DP3-0005,2021-07-07 00:00:00,2021-08-24,yes,FGR,FGR <5,1
1,b67138,DP3-0006,2021-07-21 00:00:00,2021-09-22,,Control,Control,1
2,2441da,DP3-0006,2021-03-18 00:00:00,2021-05-13,yes,Control,Control,1
3,1748fa,DP3-0008,2021-03-23 00:00:00,2021-05-18,yes,Control,Control,2
4,4b81c6,DP3-0009,2021-07-22 00:00:00,2021-09-16,yes,HDP,gHTN,2
...,...,...,...,...,...,...,...,...
118,488810,DP3-0416,2024-04-14 00:00:00,2024-05-27,yes,Control,Control,3
119,e89a79,DP3-0416,2023-12-01 00:00:00,2024-02-02,yes,Control,Control,3
120,8eceb4,DP3-0419,2023-12-10 00:00:00,2024-04-02,yes,HDP,LO severe SIPE,3
121,4124c1,DP3-0420,2024-04-01 00:00:00,2024-06-04,yes,FGR,FGR <3,3


In [6]:
#Clean & convert to lower
log_lower = log.map(lambda x: x.lower() if isinstance(x, str) else x)
log = log_lower
log_yes = log[log['download'].str.contains('yes', case=False, na=False)]
log_yes['PPN'] = log_yes['PPN'] + '-' + log_yes['group']
log = log_yes.copy() # Only yes downloads
print(sum(pd.to_datetime(log['deploy']).isna()))
log['deploy'] = pd.to_datetime(log['deploy'])
log['deploy'] = log['deploy'].astype('int64') // 10**9 #Convert time to unix epoch
print(sum(pd.to_datetime(log['return']).isna()))
log['return'] = pd.to_datetime(log['return'])
log['return'] = log['return'].astype('int64') // 10**9 #Convert time to unix epoch


# Pass to dict structure for key:value mapping
log_dict_ids = log.groupby('ID')['PPN'].apply(list).to_dict() # Monitor : PPN
log_dict_times = log.set_index('PPN')[['deploy', 'return']].apply(tuple, axis = 1).to_dict() # PPN : (deploy, return)
log_dict_times

0
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  log_yes['PPN'] = log_yes['PPN'] + '-' + log_yes['group']


{'dp3-0005-fgr': (1625616000, 1629763200),
 'dp3-0006-control': (1616025600, 1620864000),
 'dp3-0008-control': (1616457600, 1621296000),
 'dp3-0009-hdp': (1617235200, 1621987200),
 'dp3-0018-control': (1619654400, 1624838400),
 'dp3-0024-sptb': (1624838400, 1628812800),
 'dp3-0027-control': (1628467200, 1632182400),
 'dp3-0028-control': (1629072000, 1633392000),
 'dp3-0036-hdp': (1629331200, 1633651200),
 'dp3-0062-control': (1645747200, 1649289600),
 'dp3-0070-control': (1644969600, 1650326400),
 'dp3-0101-sptb': (1642118400, 1652400000),
 'dp3-0105-control': (1652659200, 1658448000),
 'dp3-0111-control': (1654473600, 1660089600),
 'dp3-0113-control': (1654732800, 1660867200),
 'dp3-0120-control': (1655942400, 1661990400),
 'dp3-0129-control': (1657065600, 1662595200),
 'dp3-0142-control': (1649635200, 1654041600),
 'dp3-0152-control': (1652227200, 1657065600),
 'dp3-0156-hdp': (1651536000, 1656374400),
 'dp3-0158-hdp': (1653350400, 1658275200),
 'dp3-0161-fgr': (1654128000, 166380480

In [7]:
# Data Scrubbing
# Loop through each file in the directory, get the keys associated with that 

path = 'C:/Users/alexb/OneDrive/DP3/magee-pm-data-2024-09/magee-pm-data-2024-09'

for filename in os.listdir(path):
    # Check if the path is a file
    file_path = os.path.join(path, filename)
    if os.path.isfile(file_path):
        #Check ID through filename
        print(f'Processing file: {repr(filename[0:6])}')


        # Get monitor data from current file
        monitor_data = pd.read_csv(file_path)
        # set time to index for index based slicing
        monitor_data = monitor_data.set_index('unix_epoch')
        # Select only neeed readout variables
        monitor_data = monitor_data[['temperature', 'humidity', 'pressure', 'voc']]
        

        #Get associated ppns 
        airviz_id = filename[0:6]
        ppn_list = log_dict_ids.get(airviz_id) 
        print(ppn_list)
        #Init ppn frame
        ppn_save_data = None
        #Get UTC times for each ppn
        if ppn_list != None:
            for ppn in ppn_list:
                start_time, end_time = log_dict_times[ppn]
                print(start_time, end_time)

                ppn_save_data = monitor_data.loc[float(start_time):float(end_time)]
                save_name = '-'.join([ppn , airviz_id, '.csv'])
                ppn_save_data.to_csv(''.join(['C:/Users/alexb/OneDrive/DP3/PPN - Monitor_ID - Data/', save_name]), index_label='unix_epoch' )
        


Processing file: '06e5ac'
['dp3-0152-control']
1652227200 1657065600
Processing file: '1748fa'
['dp3-0008-control', 'dp3-0036-hdp', 'dp3-0142-control', 'dp3-0353-control']
1616457600 1621296000
1629331200 1633651200
1649635200 1654041600
1684886400 1690416000
Processing file: '2441da'
['dp3-0006-control', 'dp3-0330-sptb']
1616025600 1620864000
1681776000 1686700800
Processing file: '268af8'
['dp3-0113-control', 'dp3-0388-control']
1654732800 1660867200
1692316800 1697068800
Processing file: '291d69'
['dp3-0156-hdp', 'dp3-0169-control']
1651536000 1656374400
1654560000 1660003200
Processing file: '2c564e'
['dp3-0024-sptb', 'dp3-0062-control', 'dp3-0352-fgr+hdp']
1624838400 1628812800
1645747200 1649289600
1685404800 1690243200
Processing file: '2f1cc3'
['dp3-0197-hdp', 'dp3-0361-fgr', 'dp3-0409-hdp']
1673222400 1675036800
1687392000 1692057600
1700092800 1705881600
Processing file: '2ffebf'
['dp3-0161-fgr']
1654128000 1663804800
Processing file: '32880_'
None
Processing file: '3366b4'
[

In [9]:
# Load in ppns where multiple monitors were given
path = 'C:/Users/alexb/OneDrive/DP3/PPN - Monitor_ID - Data'

#Init dictionary
file_groups = {}

segments = {}
#Since files are loaded by ppn sorting, we do not need to worry about holding more than one dataset at a time
# PPNs have a max of 2 monitors in sequence so a simple hold and check will work
for filename in os.listdir(path):
    if filename.endswith(".csv"):
        # Extract file name ID 
        file_parts = filename.split('-')
        


        file_id = '-'.join((file_parts[0], file_parts[1], file_parts[2]))
        
        monitor_id = file_parts[3]
        
        key = file_id
        
        

        # Load the CSV into a DataFrame
        df = pd.read_csv(os.path.join(path, filename))
        
        # Check if the file ID exists in the dictionary, if not, initialize a list
        if key in file_groups:
            # Concatenate if the ID exists
            file_groups[key].append(df)
            segments[key].append(monitor_id)
        else:
            # Create a new list for this file ID
            file_groups[key] = [df]     
            segments[key] = [monitor_id]

c6b87f
2441da
1748fa
4b81c6
c9fb6e
8e5d53
2c564e
d14457
bbae56
1748fa
2c564e
b18aab
8e5d53
58e9b4
268af8
71a73b
1748fa
e89a79
06e5ac
291d69
4b33be
b18aab
2ffebf
9b1f9c
9f6673
291d69
d14457
e89a79
bbae56
dd6ced
8eceb4
2f1cc3
3366b4
3ca269
dd6ced
b18aab
9b1f9c
dd6ced
e89a79
b18aab
d14457
3366b4
4124c1
c30454
cd5668
2441da
3366b4
4ad5b3
e1845e
2c564e
1748fa
9f6673
2f1cc3
bbae56
4124c1
4ad5b3
8eceb4
bbae56
268af8
47e871
d14457
cd5668
c30454
2f1cc3
d14457
e89a79
8eceb4
4124c1
b18aab


In [10]:
# Now, concatenate all DataFrames that share the same file ID
for key, df_list in file_groups.items():
    # Concatenate the DataFrames for each file_id
    concatenated_df = pd.concat(df_list, ignore_index=True)
    concatenated_df.sort_values(by = 'unix_epoch', ascending=True, inplace = True)
    filename_out = f'{key}_{segments[key][0:]}.csv'
    print(filename_out)

    # Save the concatenated DataFrame (optional)
    concatenated_df.to_csv(os.path.join(path, filename_out), index=False)

    print(f"Concatenated files for ID: {key}") 

dp3-0005-fgr_['c6b87f'].csv
Concatenated files for ID: dp3-0005-fgr
dp3-0006-control_['2441da'].csv
Concatenated files for ID: dp3-0006-control
dp3-0008-control_['1748fa'].csv
Concatenated files for ID: dp3-0008-control
dp3-0009-hdp_['4b81c6', 'c9fb6e'].csv
Concatenated files for ID: dp3-0009-hdp
dp3-0018-control_['8e5d53'].csv
Concatenated files for ID: dp3-0018-control
dp3-0024-sptb_['2c564e'].csv
Concatenated files for ID: dp3-0024-sptb
dp3-0027-control_['d14457'].csv
Concatenated files for ID: dp3-0027-control
dp3-0028-control_['bbae56'].csv
Concatenated files for ID: dp3-0028-control
dp3-0036-hdp_['1748fa'].csv
Concatenated files for ID: dp3-0036-hdp
dp3-0062-control_['2c564e'].csv
Concatenated files for ID: dp3-0062-control
dp3-0101-sptb_['b18aab'].csv
Concatenated files for ID: dp3-0101-sptb
dp3-0105-control_['8e5d53'].csv
Concatenated files for ID: dp3-0105-control
dp3-0111-control_['58e9b4'].csv
Concatenated files for ID: dp3-0111-control
dp3-0113-control_['268af8'].csv
Concat

  concatenated_df = pd.concat(df_list, ignore_index=True)


Concatenated files for ID: dp3-0158-hdp
dp3-0161-fgr_['2ffebf'].csv
Concatenated files for ID: dp3-0161-fgr
dp3-0166-control_['9b1f9c', '9f6673'].csv
Concatenated files for ID: dp3-0166-control
dp3-0169-control_['291d69', 'd14457'].csv
Concatenated files for ID: dp3-0169-control
dp3-0175-control_['e89a79'].csv
Concatenated files for ID: dp3-0175-control
dp3-0177-control_['bbae56'].csv
Concatenated files for ID: dp3-0177-control
dp3-0179-control_['dd6ced'].csv
Concatenated files for ID: dp3-0179-control
dp3-0181-control_['8eceb4'].csv
Concatenated files for ID: dp3-0181-control
dp3-0197-hdp_['2f1cc3'].csv
Concatenated files for ID: dp3-0197-hdp
dp3-0204-control_['3366b4'].csv
Concatenated files for ID: dp3-0204-control
dp3-0214-sptb_['3ca269'].csv
Concatenated files for ID: dp3-0214-sptb
dp3-0217-control_['dd6ced'].csv
Concatenated files for ID: dp3-0217-control
dp3-0247-fgr_['b18aab'].csv
Concatenated files for ID: dp3-0247-fgr
dp3-0256-control_['9b1f9c'].csv
Concatenated files for ID: