In [1]:
import os
import pandas as pd
import numpy as np
import sqlalchemy as sa
from urllib.parse import quote
import datetime
from datetime import datetime, timedelta
import datetime

from pylab import rcParams

import warnings; 
warnings.filterwarnings("ignore")

In [2]:
# Specify the directory containing the CSV files
directory = r'data/'  # Use raw string to handle backslashes
# amp = [
#     'BUKT.csv',
#     'CIHO.csv',
#     'SAY001.csv',
#     'SAY002.csv',
#     'STH005.csv',
#     'STH007.csv',
#     'STH010.csv',
#     'STH011.csv',
#     'STH013.csv',
#     'STH014.csv',
#     'STH019.csv',
#     'STH021.csv',
#     'STH022.csv',
#     'STH023.csv',
#     'STH025.csv',
#     'STH026.csv'
# ]
amp = [
    'BUKT.csv',
    'STH005.csv',
    'STH007.csv',
    'STH010.csv',
    'STH011.csv',
    'STH014.csv',
    'STH019.csv',
    'STH021.csv',
    'STH022.csv',
    'STH023.csv',
    'STH025.csv',
    'STH026.csv'
]
# Read each CSV file into a DataFrame, add a new column, and store them in a list
dataframes = []
for dirpath, _, filenames in os.walk(directory):
    for file in filenames:
        if file.endswith('.csv'):
            if file in amp:
                # print(file)
                file_path = os.path.join(dirpath, file)
                try:
                    df = pd.read_csv(file_path)
                    # df['source_file'] = os.path.relpath(file_path, directory)  # Add a new column with the relative file path
                    df['source_file'] = file
                    dataframes.append(df)
                except FileNotFoundError:
                    print(f"File not found: {file_path}")
                except pd.errors.EmptyDataError:
                    print(f"File is empty: {file_path}")
                except Exception as e:
                    print(f"Error reading {file_path}: {e}")

# Optionally, concatenate all DataFrames into a single DataFrame
if dataframes:
    all_data = pd.concat(dataframes, ignore_index=True)
    # Now 'all_data' contains all the data from the CSV files with an additional column 'source_file'
    # print(all_data)
else:
    print("No dataframes to concatenate.")

              date   time   press source_file
0       2022-01-01  00:00    -999  STH007.csv
1       2022-01-01  01:00    -999  STH007.csv
2       2022-01-01  02:00    -999  STH007.csv
3       2022-01-01  03:00    -999  STH007.csv
4       2022-01-01  04:00    -999  STH007.csv
...            ...    ...     ...         ...
500131  2020-07-31  19:00  1003.7    BUKT.csv
500132  2020-07-31  20:00  1004.5    BUKT.csv
500133  2020-07-31  21:00  1005.3    BUKT.csv
500134  2020-07-31  22:00  -999.0    BUKT.csv
500135  2020-07-31  23:00  -999.0    BUKT.csv

[500136 rows x 4 columns]


In [5]:
press = pd.DataFrame(all_data)

In [6]:
# press.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500136 entries, 0 to 500135
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   date         500136 non-null  object
 1   time         500136 non-null  object
 2   press        500136 non-null  object
 3   source_file  500136 non-null  object
dtypes: object(4)
memory usage: 15.3+ MB


In [7]:
# press.tail()

Unnamed: 0,date,time,press,source_file
500131,2020-07-31,19:00,1003.7,BUKT.csv
500132,2020-07-31,20:00,1004.5,BUKT.csv
500133,2020-07-31,21:00,1005.3,BUKT.csv
500134,2020-07-31,22:00,-999.0,BUKT.csv
500135,2020-07-31,23:00,-999.0,BUKT.csv


In [8]:
# Strip any leading/trailing whitespace from the date strings
press['date'] = press['date'].str.strip()
# temp['date'] = datetime.strptime(temp['date'], '%Y-%m-%d')
# Convert 'DATESICK' column to datetime
press['date'] = pd.to_datetime(press['date'], format='%Y-%m-%d')

In [9]:
# Replace -999.0 with NaN
press['press'] = press['press'].replace(-999.0, np.nan)

In [10]:
# Convert 'temp' column to numeric, coercing errors to NaN
press['press'] = pd.to_numeric(press['press'], errors='coerce')

In [12]:
press.drop(['time'], axis=1, inplace=True)

In [13]:
# press.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500136 entries, 0 to 500135
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   date         500136 non-null  datetime64[ns]
 1   press        363906 non-null  float64       
 2   source_file  500136 non-null  object        
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 11.4+ MB


In [14]:
# press.head()

Unnamed: 0,date,press,source_file
0,2022-01-01,,STH007.csv
1,2022-01-01,,STH007.csv
2,2022-01-01,,STH007.csv
3,2022-01-01,,STH007.csv
4,2022-01-01,,STH007.csv


In [15]:
# print('Series has {} missing values'.format(press.isna().sum()))
date_missed = press[press.isna()].index

Series has date                0
press          136230
source_file         0
dtype: int64 missing values


In [16]:
# Group by the relevant column(s), ensure that 'date' is part of the index
press.set_index(['source_file', 'date'], inplace=True)

In [17]:
# Resetting just the date index to work with it directly
df_resampled = press.groupby(level='source_file').apply(
    lambda x: x.droplevel('source_file').resample('W-mon').mean().interpolate(method='linear')
)

# Reassigning the group level back to the resampled data
# df_resampled['source_file'] = df_resampled.index.get_level_values(0)
# df_resampled.set_index(['source_file', df_resampled.index], inplace=True)

In [18]:
# Optional: Reset the index if needed
df_resampled = df_resampled.reset_index()

In [19]:
# print('Series has {} missing values'.format(df_resampled.isna().sum()))
date_missed = df_resampled[df_resampled.isna()].index

Series has source_file     0
date            0
press          62
dtype: int64 missing values


In [20]:
# Reset the MultiIndex to work with 'date' as a regular column
df_reset = press.reset_index()

# Set 'date' as the index for resampling
df_reset.set_index('date', inplace=True)

# Resample the DataFrame based on the 'date' index
df_resampled = df_reset.groupby('source_file').resample('W-mon').mean().interpolate(method='linear')

# Reset index to reintroduce 'group' as a column
df_resampled = df_resampled.reset_index(level=0)

In [21]:
# print('Series has {} missing values'.format(df_resampled.isna().sum()))
date_missed = df_resampled[df_resampled.isna()].index

Series has source_file    0
press          0
dtype: int64 missing values


In [22]:
newdf = df_resampled

In [23]:
# newdf.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3132 entries, 2019-01-07 to 2024-01-01
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   source_file  3132 non-null   object 
 1   press        3132 non-null   float64
dtypes: float64(1), object(1)
memory usage: 73.4+ KB


In [24]:
newdf = newdf.reset_index()

In [26]:
ST9601 =  newdf.groupby('date')["press"].mean().reset_index()

In [27]:
ST9601.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 261 entries, 0 to 260
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    261 non-null    datetime64[ns]
 1   press   261 non-null    float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 4.2 KB


In [28]:
ST9601['source_file'] = 'ST9601'

In [29]:
ST9601.head()

Unnamed: 0,date,press,source_file
0,2019-01-07,1008.002678,ST9601
1,2019-01-14,1007.790922,ST9601
2,2019-01-21,1007.519514,ST9601
3,2019-01-28,1008.248065,ST9601
4,2019-02-04,1008.213114,ST9601


In [30]:
newdf3 = pd.concat([newdf, ST9601], ignore_index=True)

In [31]:
# newdf3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3393 entries, 0 to 3392
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date         3393 non-null   datetime64[ns]
 1   source_file  3393 non-null   object        
 2   press        3393 non-null   float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 79.6+ KB


In [32]:
newdf3['source_file'] = newdf3['source_file'].replace('ST9601', '9601')
newdf3['source_file'] = newdf3['source_file'].replace('STH007.csv', '9602')
newdf3['source_file'] = newdf3['source_file'].replace('STH023.csv', '9603')
newdf3['source_file'] = newdf3['source_file'].replace('STH011.csv', '9604')
newdf3['source_file'] = newdf3['source_file'].replace('STH005.csv', '9605')
newdf3['source_file'] = newdf3['source_file'].replace('STH022.csv', '9606')
newdf3['source_file'] = newdf3['source_file'].replace('STH019.csv', '9607')
newdf3['source_file'] = newdf3['source_file'].replace('STH021.csv', '9608')
newdf3['source_file'] = newdf3['source_file'].replace('STH014.csv', '9609')
newdf3['source_file'] = newdf3['source_file'].replace('STH010.csv', '9610')
newdf3['source_file'] = newdf3['source_file'].replace('BUKT.csv', '9611')
newdf3['source_file'] = newdf3['source_file'].replace('STH026.csv', '9612')
newdf3['source_file'] = newdf3['source_file'].replace('STH025.csv', '9613')

In [33]:
newdf3.rename(columns={'source_file': "station"}, inplace=True)

In [34]:
# newdf3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3393 entries, 0 to 3392
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   date     3393 non-null   datetime64[ns]
 1   station  3393 non-null   object        
 2   press    3393 non-null   float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 79.6+ KB


In [None]:
DIALECT = "mysql"
SQL_DRIVER = "pymysql"
USERNAME = "user"
PASSWORD = "user"
HOST = "dengue-db"
PORT = 3306
DBNAME = "dengue"

conn_str = DIALECT + "+" + SQL_DRIVER + "://" + USERNAME + ":" +quote(PASSWORD) + "@" + HOST + ":" +str(PORT) + "/" + DBNAME

In [None]:
with sa.create_engine(conn_str).connect() as con:
  newdf3.to_sql("pressure",con,index=None, if_exists='replace')

In [None]:
print('uploaded pressure success...')

In [36]:
newdf3.to_csv(r"data/dataset/press_all.csv", index=False)