# Analyse and visualise rainfall data collected by region

In [1]:
import pandas as pd 
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
sns.set(style="whitegrid", font='Helvetica')
import numpy as np

## Load data

In [57]:
def read_data(filepath):
    """Load a rainfall CSV export into a DataFrame indexed by timestamp.

    Steps, in order: the ' Validation' column is dropped, ' Minutes' is
    renamed to 'rainfall' and ' Rainfall' to 'Validation', ' Date' is parsed
    with the '%Y%m%d %H%M' format into a 'date' column, ' Date' is dropped,
    and 'date' becomes the index.

    NOTE(review): the rename suggests the export's header row is shifted by
    one column relative to its values -- confirm against a raw file.
    """
    column_names = {' Minutes': 'rainfall', ' Rainfall': 'Validation'}
    return (
        pd.read_csv(filepath)
        .drop(columns=' Validation')
        .rename(columns=column_names)
        .assign(date=lambda frame: pd.to_datetime(frame[' Date'], format='%Y%m%d %H%M'))
        .drop(columns=[' Date'])
        .set_index('date')
    )

In [63]:
# Station export to analyse; the path is specific to the author's machine.
station_file = '/Users/kang/Desktop/Hackathon_2023/Data/1503016'
df = read_data(station_file)

# Plot the monthly sum of rainfall for a particular year

In [49]:
def plot_monthly_sum_of_year(df, start_date, end_date, station_name,
                             output_dir='/Users/kang/Desktop/Hackathon_2023/Images'):
    """Save a bar chart of monthly rainfall totals for a twelve-month window.

    Parameters
    ----------
    df : DataFrame
        Must have a DatetimeIndex and a 'rainfall' column.
    start_date, end_date : str or Timestamp
        Only the first ten characters of ``str(value)`` ('YYYY-MM-DD') are
        used. Both days are included in full.
    station_name : str
        Used in the plot title and in the output file name.
    output_dir : str, optional
        Directory that receives the JPEG. Defaults to the location the
        notebook has always written to.

    Returns
    -------
    None
        Prints 'Year Not Completed' and returns without plotting when the
        window does not resample to exactly twelve monthly totals.
    """
    start_date = str(start_date)[0:10]
    end_date = str(end_date)[0:10]

    # Half-open window [start, day after end). A bare 'YYYY-MM-DD' upper bound
    # is compared as midnight of that day, so `<= end_date` would discard every
    # reading taken on the final day after 00:00.
    day_after_end = (pd.Timestamp(end_date) + pd.Timedelta(days=1)).strftime('%Y-%m-%d')
    in_window = (df.index >= start_date) & (df.index < day_after_end)

    # Aggregate only the column that is plotted. An offset object is used for
    # the rule because the 'M' string alias is deprecated in recent pandas.
    monthly_totals = df.loc[in_window, 'rainfall'].resample(pd.offsets.MonthEnd()).sum()

    month = np.array(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                      'July', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
    if len(monthly_totals) != len(month):
        print('Year Not Completed')
        return

    # Mean of the twelve monthly totals, drawn as a horizontal reference line.
    annual_mean_of_sum = monthly_totals.mean()

    # Label each bar from its own timestamp so the names stay correct even if
    # the twelve months do not start in January.
    labels = month[monthly_totals.index.month.to_numpy() - 1]
    df_plot = pd.DataFrame({'rainfall': monthly_totals.values, 'month': labels})

    year = start_date[0:4]

    plt.figure(figsize=(10, 6))
    try:
        sns.barplot(data=df_plot, x='month', y='rainfall', label='Monthly Sum')

        plt.title(f'Monthly Sum of Rainfall Across {year} ({station_name})\n {start_date} to {end_date}')
        plt.xlabel('Month')
        plt.ylabel('mm')

        plt.axhline(y=annual_mean_of_sum, color='r', linestyle='--', label='Annual Mean')

        # Write each bar's value just above it.
        for position, total in enumerate(df_plot['rainfall']):
            plt.text(position, total, str(round(total, 2)), ha='center', va='bottom')

        plt.legend()
        plt.savefig(f'{output_dir}/monthly_sum_rainfall_{year}_{station_name}.jpg', dpi=300)

        print(f'Saving for monthly_sum_rainfall_{year}_{station_name}.jpg')
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close()

In [64]:
# Year-by-year windows: 1 January and 31 December of each year, 2013-2022.
# Offset objects are used instead of string aliases because the 'Y' alias for
# year-end is deprecated in recent pandas releases.
start_list = pd.date_range(start='2013-01-01', end='2022-01-01', freq=pd.offsets.YearBegin())
end_list = pd.date_range(start='2013-12-31', end='2022-12-31', freq=pd.offsets.YearEnd())


In [65]:
# One chart per (year start, year end) pair.
for start_date, end_date in zip(start_list, end_list):
    plot_monthly_sum_of_year(df, start_date, end_date, station_name='Satok Bridge')

Saving for monthly_sum_rainfall_2013_Satok Bridge.jpg
Saving for monthly_sum_rainfall_2014_Satok Bridge.jpg
Saving for monthly_sum_rainfall_2015_Satok Bridge.jpg
Saving for monthly_sum_rainfall_2016_Satok Bridge.jpg
Saving for monthly_sum_rainfall_2017_Satok Bridge.jpg
Saving for monthly_sum_rainfall_2018_Satok Bridge.jpg
Saving for monthly_sum_rainfall_2019_Satok Bridge.jpg
Saving for monthly_sum_rainfall_2020_Satok Bridge.jpg
Saving for monthly_sum_rainfall_2021_Satok Bridge.jpg
Year Not Completed


## Plot the daily sum of rainfall for a particular month of the year

In [53]:
def plot_daily_sum_of_month(df, start_year, end_year, start_date, end_date, station_name,
                            output_dir='/Users/kang/Desktop/Hackathon_2023/Images'):
    """Save a bar chart of daily rainfall totals for a date window.

    Two horizontal reference lines show the maximum and the mean of the daily
    totals over the reference period ``start_year``..``end_year``.

    Parameters
    ----------
    df : DataFrame
        Must have a DatetimeIndex and a 'rainfall' column.
    start_year, end_year : str or int
        First and last calendar year of the reference period (inclusive).
    start_date, end_date : str or Timestamp
        Only the first ten characters of ``str(value)`` ('YYYY-MM-DD') are
        used. Both days are included in full.
    station_name : str
        Used in the plot title and in the output file name.
    output_dir : str, optional
        Directory that receives the JPEG. Defaults to the location the
        notebook has always written to.

    Returns
    -------
    None
    """
    # Reference period as a half-open interval ending on 1 January of the
    # following year. A `<= 'YYYY-12-31'` bound is compared as midnight and
    # would drop almost all of 31 December.
    reference_stop = f'{int(end_year) + 1}-01-01'
    in_reference = (df.index >= f'{start_year}-01-01') & (df.index < reference_stop)

    # Resample once and reuse the result for both statistics.
    reference_daily = df.loc[in_reference, 'rainfall'].resample('1D').sum()
    daily_mean = reference_daily.mean()
    daily_max = reference_daily.max()

    start_date = str(start_date)[0:10]
    end_date = str(end_date)[0:10]

    # Same half-open treatment for the plotted window so its last day is whole.
    day_after_end = (pd.Timestamp(end_date) + pd.Timedelta(days=1)).strftime('%Y-%m-%d')
    in_window = (df.index >= start_date) & (df.index < day_after_end)

    # One total per calendar day inside the window.
    daily_totals = df.loc[in_window, 'rainfall'].resample('1D').sum().to_frame()

    year = start_date[0:4]
    month = start_date[5:7]

    plt.figure(figsize=(10, 6))
    try:
        ax = sns.barplot(data=daily_totals, x=daily_totals.index, y='rainfall', color='royalblue')

        plt.title(f'Daily Sum of Rainfall Across {year} ({station_name})\n {start_date} to {end_date}')
        plt.xlabel('Day')
        plt.ylabel('Rainfall (mm)')

        plt.axhline(y=daily_max, color='red', linestyle='--', label='Annual Daily Max')
        plt.axhline(y=daily_mean, color='orange', linestyle='--', label='Annual Daily Mean')

        # Bars sit at categorical positions 0..n-1, so a date formatter would
        # format those positions rather than the real dates. Label every bar
        # with the day-of-month taken from its own timestamp instead.
        ax.set_xticks(range(len(daily_totals)))
        ax.set_xticklabels(list(daily_totals.index.strftime('%d')))

        plt.legend()

        plt.savefig(f'{output_dir}/daily_sum_rainfall_{year}_{month}_{station_name}.jpg', dpi=300)

        print(f'Saving for daily_sum_rainfall_{year}_{month}_{station_name}.jpg')
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close()

In [66]:
# Month-by-month windows for 2021: first and last day of each month.
# Offset objects are used instead of string aliases because the 'M' alias for
# month-end is deprecated in recent pandas releases.
start_list = pd.date_range(start='2021-01-01', end='2021-12-01', freq=pd.offsets.MonthBegin())
end_list = pd.date_range(start='2021-01-31', end='2021-12-31', freq=pd.offsets.MonthEnd())

# One chart per (month start, month end) pair, with 2021 as the reference year.
for start_date, end_date in zip(start_list, end_list):
    plot_daily_sum_of_month(
        df,
        start_year='2021',
        end_year='2021',
        start_date=start_date,
        end_date=end_date,
        station_name='Satok Bridge',
    )

Saving for daily_sum_rainfall_2021_01_Satok Bridge.jpg
Saving for daily_sum_rainfall_2021_02_Satok Bridge.jpg
Saving for daily_sum_rainfall_2021_03_Satok Bridge.jpg
Saving for daily_sum_rainfall_2021_04_Satok Bridge.jpg
Saving for daily_sum_rainfall_2021_05_Satok Bridge.jpg
Saving for daily_sum_rainfall_2021_06_Satok Bridge.jpg
Saving for daily_sum_rainfall_2021_07_Satok Bridge.jpg
Saving for daily_sum_rainfall_2021_08_Satok Bridge.jpg
Saving for daily_sum_rainfall_2021_09_Satok Bridge.jpg
Saving for daily_sum_rainfall_2021_10_Satok Bridge.jpg
Saving for daily_sum_rainfall_2021_11_Satok Bridge.jpg
Saving for daily_sum_rainfall_2021_12_Satok Bridge.jpg


## Plot the hourly sum of rainfall of the selected day

In [55]:
def plot_hourly_sum_of_day(df, date, station_name,
                           output_dir='/Users/kang/Desktop/Hackathon_2023/Images'):
    """Save a bar chart of hourly rainfall totals for a single day.

    Parameters
    ----------
    df : DataFrame
        Must have a DatetimeIndex and a 'rainfall' column.
    date : str or Timestamp
        Only the first ten characters of ``str(date)`` ('YYYY-MM-DD') are used.
    station_name : str
        Used in the plot title and in the output file name.
    output_dir : str, optional
        Directory that receives the JPEG. Defaults to the location the
        notebook has always written to.

    Returns
    -------
    None
        The chart always has 24 bars; hours without any reading are drawn as
        zero.
    """
    date_of_data = str(date)[0:10]
    day_start = pd.Timestamp(date_of_data)

    # Half-open window [00:00, next day 00:00) so every reading of the day is
    # kept, whatever the logging interval. The previous upper bound of
    # 23:55:00 dropped anything recorded after that minute.
    next_day = (day_start + pd.Timedelta(days=1)).strftime('%Y-%m-%d')
    in_day = (df.index >= date_of_data) & (df.index < next_day)

    # Hourly totals. An offset object is used for the rule because the 'H'
    # string alias is deprecated in recent pandas.
    hourly_totals = df.loc[in_day, 'rainfall'].resample(pd.offsets.Hour()).sum()

    # resample() only spans the first to the last reading, so a day that is
    # empty, starts late or ends early yields fewer than 24 rows. Align to all
    # 24 hours so the plotting frame can always be built.
    hour_slots = pd.date_range(day_start, periods=24, freq=pd.offsets.Hour(), tz=df.index.tz)
    hourly_totals = hourly_totals.reindex(hour_slots, fill_value=0)

    # 'HH:MM:SS' labels for the x axis: '00:00:00' .. '23:00:00'.
    df_plot = pd.DataFrame({'hour': hour_slots.strftime('%H:%M:%S'),
                            'rainfall': hourly_totals.values})

    plt.figure(figsize=(10, 6))
    try:
        sns.barplot(data=df_plot, x='hour', y='rainfall', color='royalblue')

        plt.title(f'Hourly Sum of Rainfall on {date_of_data} ({station_name})')
        plt.xlabel('Hour')
        plt.ylabel('Rainfall (mm)')
        plt.xticks(rotation='vertical')

        plt.tight_layout()
        plt.savefig(f'{output_dir}/hourly_sum_rainfall_{date_of_data}_{station_name}.jpg', dpi=300)

        print(f'Saving for hourly_sum_rainfall_{date_of_data}_{station_name}.jpg')
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close()

In [67]:
# One hourly chart per day, for the last three days of 2021.
date_list = pd.date_range(start='2021-12-29', end='2021-12-31', freq='D')
for date in date_list:
    plot_hourly_sum_of_day(df, date=date, station_name='Satok Bridge')

Saving for hourly_sum_rainfall_2021-12-29_Satok Bridge.jpg
Saving for hourly_sum_rainfall_2021-12-30_Satok Bridge.jpg
Saving for hourly_sum_rainfall_2021-12-31_Satok Bridge.jpg
