In [5]:
import os
import pandas as pd
import ts_lag_features_generator as lag_gen
import aqi_calculator.aqi_calculator as aqc
import warnings
import time
warnings.filterwarnings('ignore')

In [6]:
# Path template containing a `$pollutant_id$` placeholder.
# NOTE(review): not referenced by any cell visible in this notebook - confirm it is still needed.
CSV_PATH = '../datasources/aq_min/$pollutant_id$/'
# Directory holding one `<pollutant_id>.csv` per pollutant (read by read_dataframe_for_pollutant).
CSV_EDA_PATH = '../datasources/eda/'
# Output directory used by save_calc.
CSV_AQI_ENRICH_PATH = '../datasources/aqi_enrich/'
# Pollutant ids that are actually processed; every id must be a key of POL_NAMES and POL_MEASURES.
POL_CODES = [7, 6001, 5, 10]
#POL_CODES = [7]
# Pollutant id -> short name; the name is used as the suffix of generated column names.
POL_NAMES = {7: "O3", 6001: "PM2.5", 5: "PM10", 10: "CO", 1: "SO2", 8: "NO2"}
# Pollutant id -> unit string that is passed to aqc.calc_aqi_for_day_pd.
POL_MEASURES = {7: "µg/m3", 6001: "µg/m3", 5: "µg/m3", 10: "mg/m3", 1: "µg/m3", 8: "µg/m3"}

In [7]:
# Bounds of the daily index built in calc_aqi_and_mean_concentration_and_merge (both ends inclusive).
DATE_FROM = '2013-01-01'
DATE_END = '2022-10-29'
# Daily aggregations computed for every pollutant's concentration (one column per pollutant and method).
CONCENTRATION_AGGREGATES = ['mean', 'median', 'min', 'max']
# Aggregations for which get_all_lag_data generates concentration lag features.
CONCENTRATION_AGGREGATES_FOR_LAGS = ['mean', 'median']

In [8]:
def save_calc(df: pd.DataFrame, file_name: str):
    """Write `df` as a CSV file named `file_name` inside CSV_AQI_ENRICH_PATH.

    The frame's index is written as well (pandas `to_csv` default).
    """
    df.to_csv(os.path.join(CSV_AQI_ENRICH_PATH, file_name))

In [9]:
def timeit(show_args):
    """Decorator factory that prints the wall-clock duration of each call.

    Args:
        show_args: when truthy, the printed line also contains the positional
            and keyword arguments of the call.

    Returns:
        A decorator. The decorated function returns exactly what the original
        returns; exceptions propagate unchanged (no timing line is printed
        for a call that raises).
    """
    import functools  # local import keeps this cell independent of the import cell

    def timeit_func(func):
        # functools.wraps keeps __name__/__doc__ of the wrapped function, so
        # decorated functions stay recognisable in tracebacks and help().
        @functools.wraps(func)
        def timeit_wrapper(*args, **kwargs):
            start_time = time.perf_counter()
            result = func(*args, **kwargs)
            total_time = time.perf_counter() - start_time
            if show_args:
                print(f'Function {func.__name__}{args} {kwargs} Took {total_time:.4f} seconds')
            else:
                print(f'Function {func.__name__} Took {total_time:.4f} seconds')
            return result
        return timeit_wrapper
    return timeit_func

In [None]:
#Later
def calc_mode(df, window=7):
    """Rolling mode of a (possibly non-numeric) sequence.

    The values are factorized to integer codes, the most frequent code is
    taken over a trailing window, and the codes are mapped back to the
    original values.

    Args:
        df: 1-D sequence accepted by `pd.factorize` (e.g. a Series of labels).
        window: size of the trailing window; shorter windows are used at the
            start because `min_periods=1`.

    Returns:
        pd.Series with a fresh RangeIndex (the index of the input is not
        carried over) holding the rolling mode. On ties the value that
        appeared first in the input wins (smallest factorize code).
    """
    # Factorize: rolling windows only operate on numeric data.
    codes, uniques = pd.factorize(df)
    codes = pd.Series(codes)
    uniques = pd.Series(uniques)
    # Series.mode returns the modes sorted ascending, so iloc[0] is the smallest code.
    rolling_codes = codes.rolling(window=window, min_periods=1).apply(lambda x: x.mode().iloc[0])
    # min_periods=1 guarantees there are no NaNs, so the float result can be cast back to int codes.
    # Unfactorize; code -1 (missing value in the input) has no entry and maps to NaN.
    return rolling_codes.astype('int64').map(uniques)

## Расчет по дням: индексы качества воздуха, статистики концентраций, объединение данных

In [10]:
def read_dataframe_for_pollutant(pollutant_id: int):
    """Load `<pollutant_id>.csv` from CSV_EDA_PATH, indexed by `DatetimeEnd`.

    The `Unnamed: 0` column is removed (presumably a row index written by an
    earlier export - TODO confirm); a KeyError is raised if it is absent.
    """
    csv_file = os.path.join(CSV_EDA_PATH, f'{pollutant_id}.csv')
    raw = pd.read_csv(csv_file, parse_dates=True, index_col='DatetimeEnd')
    return raw.drop(columns=['Unnamed: 0'])

def merge_column_by_index(pollutant_id: int, df_gen: pd.DataFrame, df_to_merge: pd.DataFrame, source_column: str, new_column=None) -> pd.DataFrame:
    """Attach one column of `df_to_merge` to `df_gen`, matching rows by index.

    The merge is pandas' default inner join, so index values missing from
    either side are dropped. The attached column is renamed to
    `<new_column>_<pollutant name>`; `new_column` defaults to `source_column`.
    """
    prefix = source_column if new_column is None else new_column
    merged = pd.merge(df_gen, df_to_merge[source_column], left_index=True, right_index=True)
    return merged.rename(columns={source_column: f'{prefix}_{POL_NAMES[pollutant_id]}'})

def calc_aqi_per_pollutant_and_merge_pollutants(g: pd.DataFrame) -> pd.DataFrame:
    """Add one `AQI_<pollutant>` column per entry of POL_CODES to `g`.

    For every pollutant the source frame is read, `aqc.calc_aqi_for_day_pd`
    is applied to it, the timezone is stripped from the result's index and its
    `AQI` column is merged into `g` by index.
    """
    merged = g
    for code in POL_CODES:
        daily_aqi = aqc.calc_aqi_for_day_pd(code, read_dataframe_for_pollutant(code), POL_MEASURES[code])
        merged = merge_column_by_index(code, merged, daily_aqi.tz_localize(None), 'AQI')
    return merged

def calc_mean_concentration_and_merge_pollutants(g: pd.DataFrame) -> pd.DataFrame:
    """Add `C_<method>_<pollutant>` columns with 24-hour concentration aggregates.

    Every method of CONCENTRATION_AGGREGATES is applied to the `Concentration`
    column of every pollutant in POL_CODES, grouped into 24-hour bins; the
    result is merged into `g` by index after the timezone is stripped.
    """
    merged = g
    for code in POL_CODES:
        source = read_dataframe_for_pollutant(code)
        for method in CONCENTRATION_AGGREGATES:
            per_day = source['Concentration'].groupby(pd.Grouper(freq="24H")).agg(method)
            per_day = per_day.tz_localize(None).to_frame()
            # Callables are labelled by their function name, strings by themselves.
            label = method if type(method) is str else method.__name__
            merged = merge_column_by_index(code, merged, per_day, 'Concentration', f'C_{label}')
    return merged

def calc_aqi_and_mean_concentration_and_merge(date_from, date_end) -> pd.DataFrame:
    """Build the daily frame with per-pollutant AQI, overall AQI and concentration statistics.

    Args:
        date_from: first day of the daily index (inclusive).
        date_end: last day of the daily index (inclusive).

    Returns:
        DataFrame indexed by `DatetimeEnd` with, in this order: the
        `AQI_<pollutant>` columns, `Pollutant` (name of the pollutant with the
        highest AQI that day), `AQI` (that highest value) and the
        `C_<method>_<pollutant>` columns. Days missing from any merged source
        are dropped because the merges are inner joins.
    """
    day_index = pd.date_range(start=date_from, end=date_end, freq='D', inclusive="both", name='DatetimeEnd')
    g = calc_aqi_per_pollutant_and_merge_pollutants(pd.DataFrame(index=day_index))

    # Reduce over the AQI columns only. Reducing over the whole frame after the
    # string column 'Pollutant' has been added relies on pandas silently
    # dropping non-numeric columns, which raises a TypeError on pandas >= 2.0.
    aqi_columns = list(g.columns)
    g['Pollutant'] = g[aqi_columns].idxmax(axis=1).apply(lambda x: x[x.index('_') + 1:])
    g['AQI'] = g[aqi_columns].max(axis=1)

    g = calc_mean_concentration_and_merge_pollutants(g)
    return g

In [11]:
def add_date_info(df: pd.DataFrame) -> pd.DataFrame:
    """Add calendar features derived from the frame's DatetimeIndex.

    The columns are added to `df` in place and the same object is returned.

    Columns:
        weekday: 0 (Monday) .. 6 (Sunday).
        day, month, year: calendar components of the index.
        season: 0 = Dec-Feb, 1 = Mar-May, 2 = Jun-Aug, 3 = Sep-Nov.
        is_weekend: 1 for Saturday/Sunday, else 0.
    """
    df["weekday"] = df.index.weekday
    df["day"] = df.index.day
    df['month'] = df.index.month
    df['year'] = df.index.year
    # `month % 12` maps December to 0 so it lands in the same bucket as
    # January/February; a plain `month < 3` test would leave December with
    # the Sep-Nov bucket and make the buckets unequal (2 vs. 4 months).
    df['season'] = ((df.index.month % 12) // 3).astype('int64')
    df['is_weekend'] = df['weekday'].isin([5, 6]).astype('int64')
    #df['holiday'] = [df.index.isin(pd.to_datetime(holidays).date)][0]*1
    return df

In [12]:
# Build the daily AQI / concentration frame for the configured date range,
# then add the calendar feature columns (add_date_info modifies and returns the same frame).
g = calc_aqi_and_mean_concentration_and_merge(DATE_FROM, DATE_END)
g = add_date_info(g)


Unnamed: 0_level_0,AQI_O3,AQI_PM2.5,AQI_PM10,AQI_CO,Pollutant,AQI,C_mean_O3,C_median_O3,C_min_O3,C_max_O3,...,C_mean_CO,C_median_CO,C_min_CO,C_max_CO,weekday,day,month,year,season,is_weekend
DatetimeEnd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-01,26,64,30,4,PM2.5,64,48.331818,51.1,23.5,59.9,...,0.32287,0.304,0.26,0.424,1,1,1,2013,0,0
2013-01-02,26,31,21,4,PM2.5,31,33.9125,31.9,16.5,54.2,...,0.349217,0.377,0.253,0.45,2,2,1,2013,0,0
2013-01-03,16,47,23,4,PM2.5,47,25.217391,25.1,15.2,37.1,...,0.332292,0.3305,0.283,0.398,3,3,1,2013,0,0
2013-01-04,15,54,26,3,PM2.5,54,22.108333,20.65,10.4,44.3,...,0.313042,0.3095,0.276,0.373,4,4,1,2013,0,0
2013-01-05,17,56,22,4,PM2.5,56,19.282609,22.8,2.5,38.8,...,0.371261,0.374,0.266,0.514,5,5,1,2013,0,1


In [19]:
# Display the merged daily frame (the notebook shows a truncated view of it).
g

Unnamed: 0_level_0,AQI_O3,AQI_PM2.5,AQI_PM10,AQI_CO,Pollutant,AQI,C_mean_O3,C_median_O3,C_min_O3,C_max_O3,...,C_mean_CO,C_median_CO,C_min_CO,C_max_CO,weekday,day,month,year,season,is_weekend
DatetimeEnd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-01,26,64,30,4,PM2.5,64,48.331818,51.10,23.5,59.9,...,0.322870,0.3040,0.260,0.424,1,1,1,2013,0,0
2013-01-02,26,31,21,4,PM2.5,31,33.912500,31.90,16.5,54.2,...,0.349217,0.3770,0.253,0.450,2,2,1,2013,0,0
2013-01-03,16,47,23,4,PM2.5,47,25.217391,25.10,15.2,37.1,...,0.332292,0.3305,0.283,0.398,3,3,1,2013,0,0
2013-01-04,15,54,26,3,PM2.5,54,22.108333,20.65,10.4,44.3,...,0.313042,0.3095,0.276,0.373,4,4,1,2013,0,0
2013-01-05,17,56,22,4,PM2.5,56,19.282609,22.80,2.5,38.8,...,0.371261,0.3740,0.266,0.514,5,5,1,2013,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-10-24,28,17,12,4,O3,28,50.305000,55.70,22.9,68.4,...,0.288217,0.2690,0.208,0.416,0,24,10,2022,3,0
2022-10-25,22,24,15,5,PM2.5,24,30.691667,25.60,13.1,53.2,...,0.375583,0.3850,0.215,0.579,1,25,10,2022,3,0
2022-10-26,18,26,15,4,PM2.5,26,25.591304,22.40,11.4,41.9,...,0.354304,0.3670,0.233,0.452,2,26,10,2022,3,0
2022-10-27,16,33,17,5,PM2.5,33,18.720833,17.90,9.5,30.3,...,0.388792,0.3635,0.252,0.681,3,27,10,2022,3,0


***
#### Добавление лагов
***

In [13]:
def get_all_concentration_and_aqi_columns(df):
    """Return every column whose name ends with a processed pollutant name, plus 'AQI' and 'Pollutant'.

    'AQI' and 'Pollutant' are appended unconditionally, whether or not `df` has them.
    """
    suffixes = tuple(POL_NAMES[code] for code in POL_CODES)
    per_pollutant = [name for name in df.columns.values if name.endswith(suffixes)]
    return per_pollutant + ['AQI', 'Pollutant']

def get_aqi_columns(df):
    """Return the per-pollutant AQI columns (`AQI...<pollutant name>`) followed by 'AQI'.

    'AQI' is appended unconditionally, whether or not `df` has it.
    """
    suffixes = tuple(POL_NAMES[code] for code in POL_CODES)
    per_pollutant = [
        name for name in df.columns.values
        if name.startswith('AQI') and name.endswith(suffixes)
    ]
    return per_pollutant + ['AQI']

def get_concentration_columns_by_method(df, method):
    """Return the columns named `C_<method>...<pollutant name>` for the processed pollutants."""
    prefix = f'C_{method}'
    suffixes = tuple(POL_NAMES[code] for code in POL_CODES)
    return [
        name for name in df.columns.values
        if name.startswith(prefix) and name.endswith(suffixes)
    ]

In [23]:
@timeit(show_args=False)
def get_lag_data_shift(df: pd.DataFrame, lags=None) -> pd.DataFrame:
    """Return a copy of `df` extended with plain shifted copies of the target columns.

    For every column returned by get_all_concentration_and_aqi_columns and
    every lag `n`, a column `<column>_lag<n>` holding `df[column].shift(n)`
    (a shift by `n` rows) is appended. `df` itself is not modified.

    Args:
        df: source frame.
        lags: row offsets to generate; defaults to
            [7, 8, 9, 10, 11, 12, 13, 14, 21, 28].

    Returns:
        New DataFrame: the original columns followed by the lag columns.
        Lag columns that already exist in `df` are replaced by the freshly
        computed ones (and therefore move to the end).
    """
    method_name = 'get_lag_data_shift'
    print('-------------------------------------------')
    print(f'{method_name} started')
    if lags is None:
        lags = [7, 8, 9, 10, 11, 12, 13, 14, 21, 28]

    target_cols = get_all_concentration_and_aqi_columns(df)
    shifted = [
        df[column].shift(lag).rename(f'{column}_lag{lag}')
        for column in target_cols
        for lag in lags
    ]
    # Drop same-named columns first so a re-run replaces them instead of duplicating them.
    lag_names = {series.name for series in shifted}
    base = df.drop(columns=[name for name in df.columns if name in lag_names])
    # A single concat avoids the frame fragmentation caused by inserting
    # a couple of hundred columns one at a time.
    df_c = pd.concat([base] + shifted, axis=1)

    print(f'{method_name} finished')
    return df_c

@timeit(show_args=False)
def get_lag_data_aqi(df: pd.DataFrame) -> pd.DataFrame:
    """Generate windowed and exponentially weighted lag features for the AQI columns.

    The work is delegated to `lag_gen.generate_lagged_features`; this function
    only fixes the configuration (lags, windows per filter, aggregations, EWM
    parameters) and re-indexes the result by `DatetimeEnd`.

    Args:
        df: daily frame containing the columns returned by get_aqi_columns as
            well as the `weekday` and `month` filter columns. The caller's
            frame is not modified by this function itself.

    Returns:
        The frame produced by the generator, indexed by `DatetimeEnd`.
    """
    method_name = 'get_lag_data_aqi'
    print('-------------------------------------------')
    print(f'{method_name} started')

    target_cols = get_aqi_columns(df)
    id_cols = []
    date_col = 'DatetimeEnd'
    filters = ['NoFilter', 'weekday', 'month']

    windows = {
        'NoFilter': ['3D', '5D', '7D', '14D', '28D'],
        'weekday':  ['28D', '56D'],
        'month':    ['90D']
    }
    lags = [7, 10, 14, 21, 28]
    agg_methods = ['mean', 'median', lag_gen.percentile(10), lag_gen.percentile(90)]
    ewm_params = {
        'NoFilter': [7, 14, 21, 28],
        'weekday': [28, 56],
        'month': [90],
    }
    # Constant column backing the 'NoFilter' pseudo-filter. `assign` works on a
    # copy, so the DataFrame passed in by the caller is left untouched.
    df = df.assign(NoFilter=1)

    # NOTE(review): rough estimate only - len(windows) is the number of filter
    # groups (dict keys), not the number of window sizes, and the EWM features
    # are not counted; verify against the generator's real output.
    total = len(target_cols) * len(lags) * len(windows) * len(agg_methods) * len(filters)
    print(f'New columns count: {total}')

    df_lagged_features = lag_gen.generate_lagged_features(
        df,
        target_cols=target_cols,
        id_cols=id_cols,
        date_col=date_col,
        lags=lags,
        windows=windows,
        preagg_methods=['mean'],
        agg_methods=agg_methods,
        dynamic_filters=filters,
        ewm_params=ewm_params,
    )
    # The generator's result carries the date as a regular column; restore it as the index.
    df_lagged_features = df_lagged_features.set_index(date_col)

    print(f'{method_name} finished')
    return df_lagged_features

@timeit(show_args=False)
def get_lag_data_concentration(df: pd.DataFrame, method) -> pd.DataFrame:
    """Generate windowed and exponentially weighted lag features for concentration columns.

    Only the `C_<method>_<pollutant>` columns are used as targets, and
    `method` is also the single aggregation applied inside each window. The
    work is delegated to `lag_gen.generate_lagged_features`.

    Args:
        df: daily frame containing the targeted concentration columns as well
            as the `weekday` and `month` filter columns. The caller's frame is
            not modified by this function itself.
        method: aggregation name (e.g. 'mean', 'median') selecting the target
            columns and used as the window aggregation.

    Returns:
        The frame produced by the generator, indexed by `DatetimeEnd`.
    """
    method_name = 'get_lag_data_concentration'
    print('-------------------------------------------')
    print(f'{method_name} started for {method}')

    target_cols = get_concentration_columns_by_method(df, method)
    id_cols = []
    date_col = 'DatetimeEnd'
    filters = ['NoFilter', 'weekday', 'month']

    windows = {
        'NoFilter': ['3D', '5D', '7D', '14D', '28D'],
        'weekday':  ['28D', '42D'],
        'month':    ['7D', '14D', '28D']
    }

    lags = [7, 10, 14, 21, 28]
    agg_methods = [method]
    ewm_params = {
        'NoFilter': [7, 14, 21, 28],
        'weekday': [7, 14, 21, 28],
        'month': [7, 14, 21, 28],
    }
    # Constant column backing the 'NoFilter' pseudo-filter. `assign` works on a
    # copy, so the DataFrame passed in by the caller is left untouched.
    df = df.assign(NoFilter=1)

    # NOTE(review): rough estimate only - len(windows) is the number of filter
    # groups (dict keys), not the number of window sizes, and the EWM features
    # are not counted; verify against the generator's real output.
    total = len(target_cols) * len(lags) * len(windows) * len(agg_methods) * len(filters)
    print(f'New columns count: {total}')

    df_lagged_features = lag_gen.generate_lagged_features(
        df,
        target_cols=target_cols,
        id_cols=id_cols,
        date_col=date_col,
        lags=lags,
        windows=windows,
        preagg_methods=['mean'],
        agg_methods=agg_methods,
        dynamic_filters=filters,
        ewm_params=ewm_params,
    )
    # The generator's result carries the date as a regular column; restore it as the index.
    df_lagged_features = df_lagged_features.set_index(date_col)

    print(f'{method_name} finished')
    return df_lagged_features

def get_all_lag_data(df: pd.DataFrame) -> pd.DataFrame:
    """Run the full lag pipeline on `df` and return the enriched frame.

    Order: plain shifted columns, AQI lag features, then concentration lag
    features for every method in CONCENTRATION_AGGREGATES_FOR_LAGS; each step
    receives the output of the previous one.
    """
    enriched = get_lag_data_aqi(get_lag_data_shift(df))
    for method in CONCENTRATION_AGGREGATES_FOR_LAGS:
        enriched = get_lag_data_concentration(enriched, method)
    return enriched

In [15]:
# Generate all lag features for the daily frame. This is the slow step of the
# notebook: the recorded run shows roughly 950 s for the AQI features alone.
g1 = get_all_lag_data(g)
# Persisting the result is currently disabled.
#save_calc(g1, 'aqi_enrich.csv')
g1

-------------------------------------------
get_lag_data_shift started
get_lag_data_shift finished
Function get_lag_data_shift Took 0.1708 seconds
-------------------------------------------
get_lag_data_aqi started
New columns count: 900


IntProgress(value=0, max=225)

get_lag_data_aqi finished
Function get_lag_data_aqi Took 947.9074 seconds
-------------------------------------------
get_lag_data_concentration started for mean
New columns count: 180


IntProgress(value=0, max=90)

get_lag_data_concentration finished
Function get_lag_data_concentration Took 9.8658 seconds
-------------------------------------------
get_lag_data_concentration started for median
New columns count: 180


IntProgress(value=0, max=90)

get_lag_data_concentration finished
Function get_lag_data_concentration Took 12.0582 seconds


Unnamed: 0_level_0,AQI_O3,AQI_PM2.5,AQI_PM10,AQI_CO,Pollutant,AQI,C_mean_O3,C_median_O3,C_min_O3,C_max_O3,...,C_median_PM10_lag21d_win90D_ag:median_filt:month,C_median_CO_lag21d_win90D_ag:median_filt:month,C_median_O3_lag28d_ewm90_filt:month,C_median_PM2.5_lag28d_ewm90_filt:month,C_median_PM10_lag28d_ewm90_filt:month,C_median_CO_lag28d_ewm90_filt:month,C_median_O3_lag28d_win90D_ag:median_filt:month,C_median_PM2.5_lag28d_win90D_ag:median_filt:month,C_median_PM10_lag28d_win90D_ag:median_filt:month,C_median_CO_lag28d_win90D_ag:median_filt:month
DatetimeEnd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-01,26,64,30,4,PM2.5,64,48.331818,51.10,23.5,59.9,...,,,,,,,,,,
2013-01-02,26,31,21,4,PM2.5,31,33.912500,31.90,16.5,54.2,...,,,,,,,,,,
2013-01-03,16,47,23,4,PM2.5,47,25.217391,25.10,15.2,37.1,...,,,,,,,,,,
2013-01-04,15,54,26,3,PM2.5,54,22.108333,20.65,10.4,44.3,...,,,,,,,,,,
2013-01-05,17,56,22,4,PM2.5,56,19.282609,22.80,2.5,38.8,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-10-24,28,17,12,4,O3,28,50.305000,55.70,22.9,68.4,...,16.750,0.2825,33.91433,10.084203,18.371554,0.383856,,,,
2022-10-25,22,24,15,5,PM2.5,24,30.691667,25.60,13.1,53.2,...,18.450,0.3425,33.91433,10.084203,18.371554,0.383856,,,,
2022-10-26,18,26,15,4,PM2.5,26,25.591304,22.40,11.4,41.9,...,16.750,0.2885,33.91433,10.084203,18.371554,0.383856,,,,
2022-10-27,16,33,17,5,PM2.5,33,18.720833,17.90,9.5,30.3,...,16.275,0.2855,33.91433,10.084203,18.371554,0.383856,,,,


In [24]:
# NOTE(review): get_lag_data_pollutant is not defined in any cell visible in this
# notebook, so this cell cannot run on a fresh kernel. The recorded output below
# ends with AttributeError: 'mode' is not a valid function for 'DataFrameGroupBy' object.
gm = get_lag_data_pollutant(g)

-------------------------------------------
get_lag_data_pollutant started
New columns count: 45


IntProgress(value=0, max=45)

AttributeError: 'mode' is not a valid function for 'DataFrameGroupBy' object