In [1]:
import os
import pandas as pd
import numpy as np
import sqlalchemy as sa
from urllib.parse import quote
import datetime
import seaborn as sns
from datetime import datetime, timedelta
import datetime
import matplotlib.pyplot as plt

import statsmodels.api as sm
from pylab import rcParams

import warnings; 
# NOTE(review): this silences every warning for the rest of the session,
# including pandas deprecation notices — consider a narrower filter.
warnings.filterwarnings("ignore")

In [2]:
# Timestamp of this run, formatted as 'YYYY-MM-DD HH:MM:SS' for display
run_started = datetime.datetime.now()
cur_time = f"{run_started:%Y-%m-%d %H:%M:%S}"
print("current times:", cur_time)

current times: 2024-07-29 23:38:38


In [3]:
# Directory that is searched recursively for the station CSV files
directory = r'data/'

# Station files to load; any other CSV found under `directory` is ignored
amp = [
    'BUKT.csv',
    'CIHO.csv',
    'SAY001.csv',
    'SAY002.csv',
    'STH005.csv',
    'STH007.csv',
    'STH010.csv',
    'STH011.csv',
    'STH013.csv',
    'STH014.csv',
    'STH019.csv',
    'STH021.csv',
    'STH022.csv',
    'STH023.csv',
    'STH025.csv',
    'STH026.csv'
]

# Read each listed CSV into a DataFrame, tag its rows with the originating
# file name, and collect the frames in a list.
wanted_files = set(amp)  # every entry ends in '.csv', so no separate suffix test is needed
loaded_files = set()
dataframes = []
for dirpath, dirnames, filenames in os.walk(directory):
    dirnames.sort()  # visit sub-directories in a deterministic order
    for file in sorted(filenames):  # deterministic row order in the combined frame
        if file not in wanted_files:
            continue
        file_path = os.path.join(dirpath, file)
        try:
            df = pd.read_csv(file_path)
            df['source_file'] = file
            dataframes.append(df)
            loaded_files.add(file)
        except FileNotFoundError:
            print(f"File not found: {file_path}")
        except pd.errors.EmptyDataError:
            print(f"File is empty: {file_path}")
        except Exception as e:
            print(f"Error reading {file_path}: {e}")

# os.walk only yields files that exist, so a listed station whose file is
# absent (or failed to load) would otherwise go unnoticed.
missing_files = [name for name in amp if name not in loaded_files]
if missing_files:
    print(f"Listed files not loaded from {directory}: {missing_files}")

# Stack all station frames into one table with a fresh 0..n-1 index
if dataframes:
    all_data = pd.concat(dataframes, ignore_index=True)
    print(all_data)
else:
    print("No dataframes to concatenate.")

              date   time   press source_file
0       2019-01-01  00:00  -999.0    BUKT.csv
1       2019-01-01  01:00  -999.0    BUKT.csv
2       2019-01-01  02:00  -999.0    BUKT.csv
3       2019-01-01  03:00  -999.0    BUKT.csv
4       2019-01-01  04:00  1010.7    BUKT.csv
...            ...    ...     ...         ...
667339  2023-12-31  19:00  1004.3  STH026.csv
667340  2023-12-31  20:00  1005.5  STH026.csv
667341  2023-12-31  21:00  1006.7  STH026.csv
667342  2023-12-31  22:00  1006.8  STH026.csv
667343  2023-12-31  23:00  1006.8  STH026.csv

[667344 rows x 4 columns]


In [4]:
# Work on an independent copy: pd.DataFrame(all_data) does not copy the data
# by default, so column edits made to `pressure` could reach `all_data`.
pressure = all_data.copy()

In [5]:
# Summarise column dtypes and non-null counts of the combined frame
pressure.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 667344 entries, 0 to 667343
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   date         667344 non-null  object
 1   time         667344 non-null  object
 2   press        667344 non-null  object
 3   source_file  667344 non-null  object
dtypes: object(4)
memory usage: 20.4+ MB


In [6]:
def calculate_week_number(str_date):
    """Return the custom week number of a timestamp.

    Week 1 begins on 7 January of the timestamp's own calendar year and each
    following block of seven days increments the number by one.

    Args:
        str_date: A ``pd.Timestamp`` (despite the name, not a string); it must
            expose ``.year`` and support comparison/subtraction with
            ``pd.Timestamp``.

    Returns:
        int: 52 for any date earlier than 7 January of its year, otherwise the
        1-based count of seven-day blocks since 7 January.
    """
    jan_seventh = pd.Timestamp(year=str_date.year, month=1, day=7)
    if str_date >= jan_seventh:
        whole_weeks = (str_date - jan_seventh).days // 7
        return whole_weeks + 1
    # Early-January dates are labelled 52 within the same calendar year.
    return 52

In [7]:
# Dates arrive as text: trim surrounding whitespace, then parse the
# 'YYYY-MM-DD' strings into datetime values.
date_text = pressure['date'].str.strip()
pressure['date'] = pd.to_datetime(date_text, format='%Y-%m-%d')

In [8]:
# Label every row with its custom week (week 1 begins on 7 January)
pressure['week_number'] = pressure['date'].map(calculate_week_number)

In [9]:
# Mask the -999.0 missing-value sentinel.
# 'press' is still object dtype here, so a sentinel stored as text would not
# match a float replace; convert to numbers first (unparseable values become
# NaN), then replace the sentinel.
pressure['press'] = pd.to_numeric(pressure['press'], errors='coerce').replace(-999.0, np.nan)

In [10]:
# Force 'press' to a numeric dtype; anything unparseable becomes NaN
press_as_number = pd.to_numeric(pressure['press'], errors='coerce')
pressure['press'] = press_as_number

In [11]:
pressure['press'] = pd.to_numeric(pressure['press'])

# Mean pressure per calendar day and station; week_number rides along as a key
per_day_station = pressure.groupby(['date', 'source_file', 'week_number'])["press"]
temp_mean = per_day_station.mean().rename('press_mean').reset_index()
temp_mean.head(30)

Unnamed: 0,date,source_file,week_number,press_mean
0,2019-01-01,BUKT.csv,52,1011.273333
1,2019-01-01,CIHO.csv,52,1011.3
2,2019-01-01,SAY001.csv,52,
3,2019-01-01,SAY002.csv,52,1009.908333
4,2019-01-01,STH005.csv,52,
5,2019-01-01,STH007.csv,52,
6,2019-01-01,STH010.csv,52,
7,2019-01-01,STH011.csv,52,1011.458333
8,2019-01-01,STH013.csv,52,1006.652174
9,2019-01-01,STH014.csv,52,997.608696


In [12]:
# Mean pressure per station and custom week, pooled over all years.
# Stored under its own name: the following cell binds `pressure_mean_week`
# to a per-day table, which previously overwrote this result.
station_week_mean = pressure.groupby(['source_file','week_number'])["press"].mean().reset_index(name='press_mean')
station_week_mean.tail(30)

Unnamed: 0,source_file,week_number,press_mean
802,STH026.csv,23,1002.286907
803,STH026.csv,24,1002.605297
804,STH026.csv,25,1002.25
805,STH026.csv,26,1002.213318
806,STH026.csv,27,1003.073944
807,STH026.csv,28,1001.553548
808,STH026.csv,29,1002.680106
809,STH026.csv,30,1002.754247
810,STH026.csv,31,1002.908743
811,STH026.csv,32,1003.688796


In [13]:
# Average all rows sharing a calendar date.
# NOTE: despite its name, this frame has one row per day, not per week.
daily_press = pressure.groupby(['date'])["press"]
pressure_mean_week = daily_press.mean().rename('press_mean').reset_index()
pressure_mean_week.head(10)

Unnamed: 0,date,press_mean
0,2019-01-01,1007.495588
1,2019-01-02,1006.871981
2,2019-01-03,1005.368182
3,2019-01-04,1004.4825
4,2019-01-05,1005.462712
5,2019-01-06,1006.626738
6,2019-01-07,1005.987079
7,2019-01-08,1006.177838
8,2019-01-09,1005.894969
9,2019-01-10,1005.332099


In [14]:
# Attach the custom week number (week 1 begins on 7 January) to each day
pressure_mean_week['week_number'] = pressure_mean_week['date'].map(calculate_week_number)

In [15]:
# Calendar year of each day, used together with week_number as a grouping key
year_of_day = pressure_mean_week['date'].dt.year
pressure_mean_week['YEAR'] = year_of_day
pressure_mean_week.head()

Unnamed: 0,date,press_mean,week_number,YEAR
0,2019-01-01,1007.495588,52,2019
1,2019-01-02,1006.871981,52,2019
2,2019-01-03,1005.368182,52,2019
3,2019-01-04,1004.4825,52,2019
4,2019-01-05,1005.462712,52,2019


In [16]:
# Average the daily means within each (custom week, calendar year) pair
weekly_groups = pressure_mean_week.groupby(['week_number','YEAR'])["press_mean"]
pressure_on_week_in_year = weekly_groups.mean().rename('press_mean').reset_index()
print(pressure_on_week_in_year)

     week_number  YEAR   press_mean
0              1  2019  1005.820689
1              1  2020  1005.739333
2              1  2021  1006.024304
3              1  2022  1008.889319
4              1  2023  1007.043226
..           ...   ...          ...
247           52  2019  1006.581647
248           52  2020  1007.449375
249           52  2021  1006.856205
250           52  2022  1009.008944
251           52  2023  1008.525427

[252 rows x 3 columns]


In [17]:
def find_week_number(date):
    """
    Return the 1-based week number of a date, counting from the most recent 7 January.

    Dates that fall before 7 January are counted from 7 January of the
    previous year, so they yield week 52 or 53.

    Args:
    date (datetime.date): The date to calculate the week number for.

    Returns:
    int: The week number of the date.
    """
    # Pick the year whose 7 January is the latest one not after `date`
    on_or_after_anchor = date >= datetime.date(date.year, 1, 7)
    anchor_year = date.year if on_or_after_anchor else date.year - 1

    # Whole seven-day blocks elapsed since that anchor, shifted to start at 1
    elapsed = date - datetime.date(anchor_year, 1, 7)
    return elapsed.days // 7 + 1

def get_start_of_week(week_number, year):
    """
    Return the first day of a custom week, where week 1 starts on 7 January of `year`.

    Args:
    week_number (int): The week number (1-based).
    year (int): The year whose 7 January anchors week 1.

    Returns:
    datetime.date: The start date of the week.
    """
    week_one = datetime.date(year, 1, 7)
    # Each week after the first starts seven days after the previous one
    offset = datetime.timedelta(days=7 * (week_number - 1))
    return week_one + offset

def create_weeks_dataframe(start_date_str, end_date_str):
    """
    Create a DataFrame of custom weeks (week 1 starts on 7 January) with their start and end dates.

    The range is walked one week at a time. Each row's end date is six days
    after its start date. Only the first row for each week number is kept,
    so a range spanning more than one year yields at most one row per week
    number.

    Args:
    start_date_str (str): The start date string for the DataFrame in 'YYYY-MM-DD' format.
    end_date_str (str): The end date string for the DataFrame in 'YYYY-MM-DD' format.

    Returns:
    pd.DataFrame: Columns 'week_number', 'start_date', 'end_date'
    (datetime.date values). Empty, with those columns, when the start date
    is after the end date.

    Raises:
    ValueError: If either string does not match 'YYYY-MM-DD'.
    """
    start_date = datetime.datetime.strptime(start_date_str, '%Y-%m-%d').date()
    end_date = datetime.datetime.strptime(end_date_str, '%Y-%m-%d').date()

    weeks = []
    current_date = start_date

    while current_date <= end_date:
        week_number = find_week_number(current_date)

        # find_week_number counts from the previous year's 7 January for dates
        # before 7 January, so the week start must be anchored to that same
        # year; using current_date.year there placed the start about a year
        # too late and made the loop skip the rest of the range.
        week_year = current_date.year
        if current_date < datetime.date(week_year, 1, 7):
            week_year -= 1

        start_of_week = get_start_of_week(week_number, week_year)
        end_of_week = start_of_week + datetime.timedelta(days=6)
        weeks.append({
            'week_number': week_number,
            'start_date': start_of_week,
            'end_date': end_of_week
        })
        current_date = end_of_week + datetime.timedelta(days=1)

    # Explicit columns keep drop_duplicates from raising KeyError on an empty range
    weeks_df = pd.DataFrame(
        weeks, columns=['week_number', 'start_date', 'end_date']
    ).drop_duplicates(subset=['week_number'])
    return weeks_df

In [18]:
# Build the week calendar for the span 2024-01-07 .. 2025-01-04
start_date, end_date = '2024-01-07', '2025-01-04'
weeks_df = create_weeks_dataframe(start_date, end_date)
weeks_df

Unnamed: 0,week_number,start_date,end_date
0,1,2024-01-07,2024-01-13
1,2,2024-01-14,2024-01-20
2,3,2024-01-21,2024-01-27
3,4,2024-01-28,2024-02-03
4,5,2024-02-04,2024-02-10
5,6,2024-02-11,2024-02-17
6,7,2024-02-18,2024-02-24
7,8,2024-02-25,2024-03-02
8,9,2024-03-03,2024-03-09
9,10,2024-03-10,2024-03-16


In [19]:
# Left-join: keep every weekly mean and attach the week calendar's dates
ndf = pd.merge(pressure_on_week_in_year, weeks_df, how='left', on='week_number')
ndf.head()

Unnamed: 0,week_number,YEAR,press_mean,start_date,end_date
0,1,2019,1005.820689,2024-01-07,2024-01-13
1,1,2020,1005.739333,2024-01-07,2024-01-13
2,1,2021,1006.024304,2024-01-07,2024-01-13
3,1,2022,1008.889319,2024-01-07,2024-01-13
4,1,2023,1007.043226,2024-01-07,2024-01-13


In [20]:
# Inner-join the weekly means with the week calendar, then fix the column order
column_order = ['week_number','YEAR', 'start_date', 'end_date', 'press_mean']
new_df2 = pressure_on_week_in_year.merge(weeks_df, on=['week_number'])[column_order]
new_df2.head()

Unnamed: 0,week_number,YEAR,start_date,end_date,press_mean
0,1,2019,2024-01-07,2024-01-13,1005.820689
1,1,2020,2024-01-07,2024-01-13,1005.739333
2,1,2021,2024-01-07,2024-01-13,1006.024304
3,1,2022,2024-01-07,2024-01-13,1008.889319
4,1,2023,2024-01-07,2024-01-13,1007.043226


In [21]:
# Convert 'start_date' to datetime.
# The column holds datetime.date objects produced by create_weeks_dataframe,
# so no format string applies; the former '%d/%m/%Y' did not describe them.
new_df2['start_date'] = pd.to_datetime(new_df2['start_date'])

# Month and day of each week's start date
new_df2['MONTH'] = new_df2['start_date'].dt.month
new_df2['DAY'] = new_df2['start_date'].dt.day
new_df2.head()

Unnamed: 0,week_number,YEAR,start_date,end_date,press_mean,MONTH,DAY
0,1,2019,2024-01-07,2024-01-13,1005.820689,1,7
1,1,2020,2024-01-07,2024-01-13,1005.739333,1,7
2,1,2021,2024-01-07,2024-01-13,1006.024304,1,7
3,1,2022,2024-01-07,2024-01-13,1008.889319,1,7
4,1,2023,2024-01-07,2024-01-13,1007.043226,1,7


In [22]:
# Start date of each (YEAR, week_number) pair: 7 January of that year plus
# (week_number - 1) whole weeks, matching how the week numbers were assigned.
# Combining YEAR with the month/day of the 2024 week calendar was one day
# early for weeks after February in non-leap years, because 2024 has 29 Feb.
week_one_start = pd.to_datetime(
    pd.DataFrame({'year': new_df2['YEAR'], 'month': 1, 'day': 7})
)
new_df2['date'] = week_one_start + pd.to_timedelta((new_df2['week_number'] - 1) * 7, unit='D')

In [23]:
# Keep only the date and weekly mean, in chronological order
df2 = new_df2.sort_values('date')[['date','press_mean']]

In [24]:
# Rename the mean column to 'press'; 'date' keeps its name
df2 = df2.rename(columns={'press_mean': "press"})
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 252 entries, 0 to 251
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    252 non-null    datetime64[ns]
 1   press   252 non-null    float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 5.9 KB


In [27]:
# Build the output path with os.path.join: the backslash-separated literal
# only resolves to nested folders on Windows. Create the folder if needed.
output_path = os.path.join("data", "dataset", "press.csv")
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df2.to_csv(output_path, index=False)