In [14]:
import os
import configparser
import warnings
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import acf
warnings.filterwarnings('ignore')

In [15]:
BASE_DIR = os.getcwd()
CONFIG = configparser.ConfigParser()
CONFIG.read(os.path.join(BASE_DIR, 'script_config.ini'))

BASE_PATH = os.path.abspath(os.path.join(os.getcwd(), '..', 'data'))

DATA_RAW = os.path.join(BASE_PATH, 'raw')
DATA_RESULTS = os.path.join(BASE_PATH, '..', 'results')

In [16]:
data_path = os.path.join(DATA_RESULTS, 'processed', 
                         'complete_cdc_pulmonary_data.csv')

df = pd.read_csv(data_path)
df = (df.groupby(['sex', 'race_recode3', 'age_cat', 'fileyear'], 
                 as_index = False).agg({'mort_count': 'sum'}))
df['mort_count'] = df['mort_count'].fillna(0)

First we calculate the first differences.

In [17]:
df['deaths_diff'] = df.groupby(['sex', 'race_recode3', 
                    'age_cat'])['mort_count'].diff()
df

Unnamed: 0,sex,race_recode3,age_cat,fileyear,mort_count,deaths_diff
0,F,Black,9 years or below,2005,1,
1,F,Black,9 years or below,2007,1,0.0
2,F,Black,9 years or below,2008,1,0.0
3,F,Black,9 years or below,2009,1,0.0
4,F,Black,9 years or below,2012,1,0.0
...,...,...,...,...,...,...
321,M,White,70 years or above,2016,2758,-117.0
322,M,White,70 years or above,2017,2863,105.0
323,M,White,70 years or above,2018,2868,5.0
324,M,White,70 years or above,2019,2817,-51.0


Next we compute Autocorrelation Function (ACF) up to lag 5 for raw and differenced series

In [18]:
results = []
max_lag = 5

groups = df.groupby(['sex', 'race_recode3', 'age_cat'])
for (sex, race, age_group), group_data in groups:
    
    group_data = group_data.sort_values("fileyear")

    ts_raw = group_data['mort_count'].values
    ts_diff = group_data['deaths_diff'].dropna().values 

    # Raw ACF
    acf_raw = acf(ts_raw, nlags = max_lag, fft = False)

    # Differenced ACF
    if len(ts_diff) >= 2:
        
        acf_diff = acf(ts_diff, nlags = min(max_lag, 
                   len(ts_diff)-1), fft = False)
    else:
        
        acf_diff = [np.nan] * (max_lag + 1)

   
    for lag in range(len(acf_raw)):
        results.append({
            "sex": sex,
            "race": race,
            "age_group": age_group,
            "lag": lag,
            "acf_raw": acf_raw[lag],
            "acf_diff": acf_diff[lag] if lag < len(acf_diff) else np.nan
        })

acf_df = pd.DataFrame(results)

In [19]:
folder_out = os.path.join(DATA_RESULTS, 'stat_test')
filename = 'acf_raw_diff_lag5.csv'
path_out = os.path.join(folder_out, filename)
acf_df.to_csv(path_out, index = False)