## **Installing Necessary Libraries**

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import calendar
import os

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## **Data Loading & Make ready for Preprocessing**

In [3]:
## Loading the data
path ='/content/drive/MyDrive/Thesis/Research Data/Original_Data/3_Hourly_Rainfall_Data.csv'
df = pd.read_csv(path, low_memory=False)
df

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,1,2,3,...,22,23,24,25,26,27,28,29,30,31
0,Saidpur,41858,25Deg.47Mts.N,88Deg.53Mts.E,2003,1,0,0.6,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.8,0,0.0
1,Saidpur,41858,25Deg.47Mts.N,88Deg.53Mts.E,2003,1,3,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0
2,Saidpur,41858,25Deg.47Mts.N,88Deg.53Mts.E,2003,1,6,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,2,0,0.0
3,Saidpur,41858,25Deg.47Mts.N,88Deg.53Mts.E,2003,1,9,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.6,0,0.0
4,Saidpur,41858,25Deg.47Mts.N,88Deg.53Mts.E,2003,1,12,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,2.4,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69691,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,12,9,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0
69692,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,12,12,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0
69693,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,12,15,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0
69694,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,12,18,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0


In [4]:
# Melt the DataFrame into a long format
df_rain = pd.melt(df, id_vars=['StationName','StationID','Latitude','Longitude', 'Year', 'Month', 'Time'], var_name='Day', value_name='Rainfall')
df_rain


Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,Rainfall
0,Saidpur,41858,25Deg.47Mts.N,88Deg.53Mts.E,2003,1,0,1,0.6
1,Saidpur,41858,25Deg.47Mts.N,88Deg.53Mts.E,2003,1,3,1,0.0
2,Saidpur,41858,25Deg.47Mts.N,88Deg.53Mts.E,2003,1,6,1,0.0
3,Saidpur,41858,25Deg.47Mts.N,88Deg.53Mts.E,2003,1,9,1,0.0
4,Saidpur,41858,25Deg.47Mts.N,88Deg.53Mts.E,2003,1,12,1,0.0
...,...,...,...,...,...,...,...,...,...
2160571,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,12,9,31,0.0
2160572,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,12,12,31,0.0
2160573,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,12,15,31,0.0
2160574,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,12,18,31,0.0


In [5]:
## Find the datatypes of the columns
df_rain.dtypes

Unnamed: 0,0
StationName,object
StationID,int64
Latitude,object
Longitude,object
Year,int64
Month,int64
Time,int64
Day,object
Rainfall,object


In [6]:
df_rain.shape

(2160576, 9)

In [7]:
# Find unique values in 'Month column
df_rain['StationName'].unique()

array(['Saidpur', 'Rangpur', 'Dinajpur', 'Sylhet', 'Bogura', 'Mymensingh',
       'Rajshahi', 'Srimangal', 'Tangail', 'Ishurdi', 'Dhaka',
       'Chuadanga', 'Faridpur', 'Sitakunda', 'Cumilla', 'Chandpur',
       'Jessore', 'Madaripur', 'Feni', 'Maijdee_court', 'Khulna',
       'Barishal', 'Satkhira', 'Bhola', 'Rangamati', 'Sandwip', 'Hatiya',
       'Ambagan(Ctg.)', 'Mongla', 'Patuakhali', 'Chittagong', 'Khepupara',
       'Kutubdia', "Cox'sBazar", 'Teknaf'], dtype=object)

In [8]:
# Count unique StationIDs
num_unique_station_name = df_rain['StationName'].nunique()

print(f"Count of unique Station IDs: {num_unique_station_name}")


Count of unique Station IDs: 35


In [9]:
# Find unique values in 'Year column
df_rain['StationID'].unique()

array([41858, 10208, 10120, 10705, 10408, 10609, 10320, 10724, 41909,
       10910, 11111, 41926, 11505, 11912, 11313, 11316, 11407, 11513,
       11805, 11809, 11604, 11704, 11610, 11706, 12007, 11916, 11814,
       41977, 41958, 12103, 11921, 12110, 11925, 11927, 11929])

In [10]:
# Count unique StationIDs
num_unique_station_ids = df_rain['StationID'].nunique()

print(f"Count of unique Station IDs: {num_unique_station_ids}")


Count of unique Station IDs: 35


In [11]:
# Find unique values in 'Month column
df_rain['Latitude'].unique()

array(['25Deg.47Mts.N', '25Deg.44Mts.N', '25Deg.39Mts.N', '24Deg.54Mts.N',
       '24Deg.51Mts.N', '24Deg.43Mts.N', '24Deg.22Mts.N', '24Deg.18Mts.N',
       '24Deg.15Mts.N', '24Deg. 8Mts.N', '23Deg.46Mts.N', '23Deg.39Mts.N',
       '23Deg.36Mts.N', '23Deg.35Mts.N', '23Deg.26Mts.N', '23Deg.16Mts.N',
       '23Deg.11Mts.N', '23Deg.10Mts.N ', '23Deg.2Mts.N', '22Deg.52Mts.N',
       '22Deg.47Mts.N', '22Deg.45Mts.N', '22Deg.43Mts.N', '22Deg.41Mts.N',
       '22Deg.32Mts.N', '22Deg.29Mts.N', '22Deg.26Mts.N', '22Deg.21Mts.N',
       '22Deg.20Mts.N', '22Deg.16Mts.N', '21Deg.59Mts.N', '21Deg.49Mts.N',
       '21Deg.26Mts.N', '20Deg.52Mts.N'], dtype=object)

In [12]:
# Count unique StationIDs
num_unique_latitude = df_rain['Latitude'].nunique()

print(f"Count of unique Station IDs: {num_unique_latitude}")


Count of unique Station IDs: 34


In [13]:
# Find unique values in 'Month column
df_rain['Longitude'].unique()

array(['88Deg.53Mts.E', '89Deg.14Mts.E', '88Deg.41Mts.E', '91Deg.53Mts.E',
       '89Deg.22Mts.E', '90Deg.26Mts.E', '88Deg.42Mts.E', '91Deg.44Mts.E',
       '89Deg.55Mts.E', '89Deg.3Mts.E', '90Deg.23Mts.E', '88Deg.52Mts.E',
       '89Deg.51Mts.E', '91Deg.42Mts.E', '91Deg.11Mts.E', '90Deg.42Mts.E',
       '89Deg.10Mts.E', '90Deg.11Mts.E', '91Deg.25Mts.E', '91Deg.6Mts.E',
       '89Deg.32Mts.E', '90Deg.20Mts.E', '89Deg.5Mts.E', '90Deg.39Mts.E',
       '92Deg.12Mts.E', '91Deg.26Mts.E', '91Deg.49Mts.E', '89Deg.36Mts.E',
       '90Deg.14Mts.E', '91Deg.51Mts.E', '91Deg.56Mts.E', '92Deg.18Mts.E'],
      dtype=object)

In [14]:
df_rain['Longitude'].nunique()

32

In [15]:
df_rain['Year'].unique()

array([2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013,
       2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023])

In [16]:
df_rain['Year'].nunique()

21

In [17]:
# Find unique values in 'Month column
df_rain['Month'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [18]:
# Find unique values in 'Day column
df_rain['Day'].unique()

array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23',
       '24', '25', '26', '27', '28', '29', '30', '31'], dtype=object)

In [19]:
# Find data types 'Day column
df_rain['Day'].dtypes

dtype('O')

In [20]:
# Convert the Day column to numeric
df_rain['Day'] = pd.to_numeric(df_rain['Day'])
df_rain

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,Rainfall
0,Saidpur,41858,25Deg.47Mts.N,88Deg.53Mts.E,2003,1,0,1,0.6
1,Saidpur,41858,25Deg.47Mts.N,88Deg.53Mts.E,2003,1,3,1,0.0
2,Saidpur,41858,25Deg.47Mts.N,88Deg.53Mts.E,2003,1,6,1,0.0
3,Saidpur,41858,25Deg.47Mts.N,88Deg.53Mts.E,2003,1,9,1,0.0
4,Saidpur,41858,25Deg.47Mts.N,88Deg.53Mts.E,2003,1,12,1,0.0
...,...,...,...,...,...,...,...,...,...
2160571,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,12,9,31,0.0
2160572,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,12,12,31,0.0
2160573,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,12,15,31,0.0
2160574,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,12,18,31,0.0


In [21]:
## Find the datatypes of the columns again
df_rain.dtypes

Unnamed: 0,0
StationName,object
StationID,int64
Latitude,object
Longitude,object
Year,int64
Month,int64
Time,int64
Day,int64
Rainfall,object


In [22]:
# Find unique values in 'Time' column
df_rain['Time'].unique()

array([ 0,  3,  6,  9, 12, 15, 18, 21])

In [23]:
# Find unique values in 'Rainfall' column
df_rain['Rainfall'].unique()

array([0.6, 0.0, 1.0, ..., 85.4, 63.9, 71.4], dtype=object)

## **Handling Missing value and Unwanted Symbol**

In [24]:
## Count the number of '****' in the value column
df_rain['Rainfall'][df_rain['Rainfall'] == '*****'].count()

np.int64(21)

In [25]:
# Filter rows where 'Rainfall' is '*****'
rows_with_asterisks = df_rain[df_rain['Rainfall'] == '*****']

# Display the rows
rows_with_asterisks


Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,Rainfall
277151,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2006,12,21,4,*****
309245,Chandpur,11316,23Deg.16Mts.N,90Deg.42Mts.E,2006,4,15,5,*****
309246,Chandpur,11316,23Deg.16Mts.N,90Deg.42Mts.E,2006,4,18,5,*****
378941,Chandpur,11316,23Deg.16Mts.N,90Deg.42Mts.E,2006,4,15,6,*****
1426557,Jessore,11407,23Deg.11Mts.N,89Deg.10Mts.E,2007,12,15,21,*****
1870966,Patuakhali,12103,22Deg.20Mts.N,90Deg.20Mts.E,2010,3,18,27,*****
1983881,Jessore,11407,23Deg.11Mts.N,89Deg.10Mts.E,2005,6,3,29,*****
1983882,Jessore,11407,23Deg.11Mts.N,89Deg.10Mts.E,2005,6,6,29,*****
1983883,Jessore,11407,23Deg.11Mts.N,89Deg.10Mts.E,2005,6,9,29,*****
1983884,Jessore,11407,23Deg.11Mts.N,89Deg.10Mts.E,2005,6,12,29,*****


In [26]:
# Filter rows where 'Rainfall' is '*****'
rows_with_asterisks = df_rain[df_rain['Rainfall'] == '*****']

# Group by 'StationName' and display the results
station_specific_results = rows_with_asterisks.groupby('StationName')

# Display results for each station
for station, data in station_specific_results:
    print(f"Station: {station}")
    print(data, '\n')


Station: Chandpur
       StationName  StationID       Latitude      Longitude  Year  Month  \
309245    Chandpur      11316  23Deg.16Mts.N  90Deg.42Mts.E  2006      4   
309246    Chandpur      11316  23Deg.16Mts.N  90Deg.42Mts.E  2006      4   
378941    Chandpur      11316  23Deg.16Mts.N  90Deg.42Mts.E  2006      4   

        Time  Day Rainfall  
309245    15    5    *****  
309246    18    5    *****  
378941    15    6    *****   

Station: Jessore
        StationName  StationID       Latitude      Longitude  Year  Month  \
1426557     Jessore      11407  23Deg.11Mts.N  89Deg.10Mts.E  2007     12   
1983881     Jessore      11407  23Deg.11Mts.N  89Deg.10Mts.E  2005      6   
1983882     Jessore      11407  23Deg.11Mts.N  89Deg.10Mts.E  2005      6   
1983883     Jessore      11407  23Deg.11Mts.N  89Deg.10Mts.E  2005      6   
1983884     Jessore      11407  23Deg.11Mts.N  89Deg.10Mts.E  2005      6   
1983885     Jessore      11407  23Deg.11Mts.N  89Deg.10Mts.E  2005      6   
198

In [27]:
# To find the null values
df_rain.isnull().sum()

Unnamed: 0,0
StationName,0
StationID,0
Latitude,0
Longitude,0
Year,0
Month,0
Time,0
Day,0
Rainfall,39264


In [28]:
# Replace '*****' with NaN in the original DataFrame
df_rain.loc[df_rain['Rainfall'] == '*****', 'Rainfall'] = np.nan

In [29]:
# To find the null values
df_rain.isnull().sum()

Unnamed: 0,0
StationName,0
StationID,0
Latitude,0
Longitude,0
Year,0
Month,0
Time,0
Day,0
Rainfall,39285


In [30]:
# Step 2: Convert 'Rainfall' to numeric, coercing errors (turn non-numeric to NaN)
df_rain['Rainfall'] = pd.to_numeric(df_rain['Rainfall'], errors='coerce')

# Group by StationName, Month, Day, and Time, and calculate the mean for each group
rainfall_mean = df_rain.groupby(['StationName', 'Month', 'Day', 'Time'])['Rainfall'].transform('mean')

# Replace '*****' (NaN values) with the mean of the same station, month, day, and time
df_rain['Rainfall'] = df_rain['Rainfall'].fillna(rainfall_mean)

# Verify the result
print(df_rain)


        StationName  StationID       Latitude      Longitude  Year  Month  \
0           Saidpur      41858  25Deg.47Mts.N  88Deg.53Mts.E  2003      1   
1           Saidpur      41858  25Deg.47Mts.N  88Deg.53Mts.E  2003      1   
2           Saidpur      41858  25Deg.47Mts.N  88Deg.53Mts.E  2003      1   
3           Saidpur      41858  25Deg.47Mts.N  88Deg.53Mts.E  2003      1   
4           Saidpur      41858  25Deg.47Mts.N  88Deg.53Mts.E  2003      1   
...             ...        ...            ...            ...   ...    ...   
2160571      Teknaf      11929  20Deg.52Mts.N  92Deg.18Mts.E  2023     12   
2160572      Teknaf      11929  20Deg.52Mts.N  92Deg.18Mts.E  2023     12   
2160573      Teknaf      11929  20Deg.52Mts.N  92Deg.18Mts.E  2023     12   
2160574      Teknaf      11929  20Deg.52Mts.N  92Deg.18Mts.E  2023     12   
2160575      Teknaf      11929  20Deg.52Mts.N  92Deg.18Mts.E  2023     12   

         Time  Day  Rainfall  
0           0    1       0.6  
1           3

In [31]:
# Check it again to  find the null values
df_rain.isnull().sum()

Unnamed: 0,0
StationName,0
StationID,0
Latitude,0
Longitude,0
Year,0
Month,0
Time,0
Day,0
Rainfall,33872


In [32]:
# Check rows where 'Rainfall' is still NaN after replacement
rows_with_nan = df_rain[df_rain['Rainfall'].isna()]

# If there are still rows with NaN, inspect them
rows_with_nan

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,Rainfall
2021192,Saidpur,41858,25Deg.47Mts.N,88Deg.53Mts.E,2003,2,0,30,
2021193,Saidpur,41858,25Deg.47Mts.N,88Deg.53Mts.E,2003,2,3,30,
2021194,Saidpur,41858,25Deg.47Mts.N,88Deg.53Mts.E,2003,2,6,30,
2021195,Saidpur,41858,25Deg.47Mts.N,88Deg.53Mts.E,2003,2,9,30,
2021196,Saidpur,41858,25Deg.47Mts.N,88Deg.53Mts.E,2003,2,12,30,
...,...,...,...,...,...,...,...,...,...
2160563,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,11,9,31,
2160564,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,11,12,31,
2160565,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,11,15,31,
2160566,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,11,18,31,


In [33]:
# Sample: Define valid days for each month (for non-leap years)
valid_days_per_month = {
    1: 31, 2: 28, 3: 31, 4: 30, 5: 31, 6: 30,
    7: 31, 8: 31, 9: 30, 10: 31, 11: 30, 12: 31
}

# Adjust for leap years (for February only)
def is_leap_year(year):
    return (year % 4 == 0 and (year % 100 != 0 or year % 400 == 0))

# Add a column to check if a year is a leap year
df_rain['IsLeapYear'] = df_rain['Year'].apply(is_leap_year)

# Adjust February days for leap years
df_rain.loc[df_rain['IsLeapYear'], 'ValidDays'] = df_rain.loc[df_rain['IsLeapYear'], 'Month'].apply(lambda month: 29 if month == 2 else valid_days_per_month[month])
df_rain.loc[~df_rain['IsLeapYear'], 'ValidDays'] = df_rain.loc[~df_rain['IsLeapYear'], 'Month'].apply(lambda month: valid_days_per_month[month])

In [34]:
# Remove rows where 'Day' is greater than the valid days for that month
df_rain = df_rain[df_rain['Day'] <= df_rain['ValidDays']]

In [35]:
df_rain

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,Rainfall,IsLeapYear,ValidDays
0,Saidpur,41858,25Deg.47Mts.N,88Deg.53Mts.E,2003,1,0,1,0.6,False,31.0
1,Saidpur,41858,25Deg.47Mts.N,88Deg.53Mts.E,2003,1,3,1,0.0,False,31.0
2,Saidpur,41858,25Deg.47Mts.N,88Deg.53Mts.E,2003,1,6,1,0.0,False,31.0
3,Saidpur,41858,25Deg.47Mts.N,88Deg.53Mts.E,2003,1,9,1,0.0,False,31.0
4,Saidpur,41858,25Deg.47Mts.N,88Deg.53Mts.E,2003,1,12,1,0.0,False,31.0
...,...,...,...,...,...,...,...,...,...,...,...
2160571,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,12,9,31,0.0,False,31.0
2160572,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,12,12,31,0.0,False,31.0
2160573,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,12,15,31,0.0,False,31.0
2160574,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,12,18,31,0.0,False,31.0


In [36]:
# Check rows where 'Rainfall' is still NaN after replacement
rows_with_nan = df_rain[df_rain['Rainfall'].isna()]

# If there are still rows with NaN, inspect them
rows_with_nan

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,Rainfall,IsLeapYear,ValidDays
2073512,Hatiya,11814,22Deg.26Mts.N,91Deg.6Mts.E,2004,12,0,30,,True,31.0
2073513,Hatiya,11814,22Deg.26Mts.N,91Deg.6Mts.E,2004,12,3,30,,True,31.0
2073514,Hatiya,11814,22Deg.26Mts.N,91Deg.6Mts.E,2004,12,6,30,,True,31.0
2073515,Hatiya,11814,22Deg.26Mts.N,91Deg.6Mts.E,2004,12,9,30,,True,31.0
2073516,Hatiya,11814,22Deg.26Mts.N,91Deg.6Mts.E,2004,12,12,30,,True,31.0
...,...,...,...,...,...,...,...,...,...,...,...
2144995,Hatiya,11814,22Deg.26Mts.N,91Deg.6Mts.E,2023,7,9,31,,False,31.0
2144996,Hatiya,11814,22Deg.26Mts.N,91Deg.6Mts.E,2023,7,12,31,,False,31.0
2144997,Hatiya,11814,22Deg.26Mts.N,91Deg.6Mts.E,2023,7,15,31,,False,31.0
2144998,Hatiya,11814,22Deg.26Mts.N,91Deg.6Mts.E,2023,7,18,31,,False,31.0


In [37]:
# Check it again to  find the null values
df_rain.isnull().sum()

Unnamed: 0,0
StationName,0
StationID,0
Latitude,0
Longitude,0
Year,0
Month,0
Time,0
Day,0
Rainfall,464
IsLeapYear,0


In [38]:
# Check if all Rainfall values within each group are NaN
nan_groups = df_rain.groupby(['StationName', 'Month', 'Day', 'Time'])['Rainfall'].apply(lambda x: x.isna().all())

# Get the combinations that have all NaN Rainfall values
print(nan_groups[nan_groups].index)  # This will show you the groups where all Rainfall values are NaN


MultiIndex([('Hatiya',  7, 31,  0),
            ('Hatiya',  7, 31,  3),
            ('Hatiya',  7, 31,  6),
            ('Hatiya',  7, 31,  9),
            ('Hatiya',  7, 31, 12),
            ('Hatiya',  7, 31, 15),
            ('Hatiya',  7, 31, 18),
            ('Hatiya',  7, 31, 21),
            ('Hatiya', 12, 30,  0),
            ('Hatiya', 12, 30,  3),
            ('Hatiya', 12, 30,  6),
            ('Hatiya', 12, 30,  9),
            ('Hatiya', 12, 30, 12),
            ('Hatiya', 12, 30, 15),
            ('Hatiya', 12, 30, 18),
            ('Hatiya', 12, 30, 21),
            ('Hatiya', 12, 31,  0),
            ('Hatiya', 12, 31,  3),
            ('Hatiya', 12, 31,  6),
            ('Hatiya', 12, 31,  9),
            ('Hatiya', 12, 31, 12),
            ('Hatiya', 12, 31, 15),
            ('Hatiya', 12, 31, 18),
            ('Hatiya', 12, 31, 21)],
           names=['StationName', 'Month', 'Day', 'Time'])


In [39]:
# Forward fill missing values in 'Rainfall'
df_rain['Rainfall'] = df_rain['Rainfall'].fillna(method='ffill')

# Verify the result by checking if there are any NaN values left
print(df_rain.isnull().sum())


  df_rain['Rainfall'] = df_rain['Rainfall'].fillna(method='ffill')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rain['Rainfall'] = df_rain['Rainfall'].fillna(method='ffill')


StationName    0
StationID      0
Latitude       0
Longitude      0
Year           0
Month          0
Time           0
Day            0
Rainfall       0
IsLeapYear     0
ValidDays      0
dtype: int64


In [40]:
# Check it again to  find the null values
df_rain.isnull().sum()

Unnamed: 0,0
StationName,0
StationID,0
Latitude,0
Longitude,0
Year,0
Month,0
Time,0
Day,0
Rainfall,0
IsLeapYear,0


In [41]:
df_rain = df_rain.drop(columns=['IsLeapYear', 'ValidDays'])


In [42]:
df_rain

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,Rainfall
0,Saidpur,41858,25Deg.47Mts.N,88Deg.53Mts.E,2003,1,0,1,0.6
1,Saidpur,41858,25Deg.47Mts.N,88Deg.53Mts.E,2003,1,3,1,0.0
2,Saidpur,41858,25Deg.47Mts.N,88Deg.53Mts.E,2003,1,6,1,0.0
3,Saidpur,41858,25Deg.47Mts.N,88Deg.53Mts.E,2003,1,9,1,0.0
4,Saidpur,41858,25Deg.47Mts.N,88Deg.53Mts.E,2003,1,12,1,0.0
...,...,...,...,...,...,...,...,...,...
2160571,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,12,9,31,0.0
2160572,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,12,12,31,0.0
2160573,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,12,15,31,0.0
2160574,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,12,18,31,0.0


## **Coverting Latitude and Longitude to Numerical Value**

In [43]:
df_rain.dtypes

Unnamed: 0,0
StationName,object
StationID,int64
Latitude,object
Longitude,object
Year,int64
Month,int64
Time,int64
Day,int64
Rainfall,float64


In [44]:
def convert_to_decimal(deg_min, direction):
    """
    Converts degree-minute format to decimal degrees.
    Example Input: "23Deg.46Mts.N" -> Output: 23.7667
    """
    parts = deg_min.split("Deg.")
    degrees = float(parts[0])
    minutes = float(parts[1].split("Mts.")[0])

    decimal = degrees + (minutes / 60)

    # Convert South/West to negative
    if direction in ["S", "W"]:
        decimal = -decimal

    return decimal

# Example usage
latitude = convert_to_decimal("23Deg.46Mts.", "N")
longitude = convert_to_decimal("90Deg.23Mts.", "E")

print(latitude, longitude)  # Output: 23.7667, 90.3833


23.766666666666666 90.38333333333334


In [45]:
df_rain["Latitude"] = df_rain["Latitude"].apply(lambda x: convert_to_decimal(x[:-1], x[-1]))  # Extract last character (N/S)
df_rain["Longitude"] = df_rain["Longitude"].apply(lambda x: convert_to_decimal(x[:-1], x[-1]))  # Extract last character (E/W)
df_rain

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,Rainfall
0,Saidpur,41858,25.783333,88.883333,2003,1,0,1,0.6
1,Saidpur,41858,25.783333,88.883333,2003,1,3,1,0.0
2,Saidpur,41858,25.783333,88.883333,2003,1,6,1,0.0
3,Saidpur,41858,25.783333,88.883333,2003,1,9,1,0.0
4,Saidpur,41858,25.783333,88.883333,2003,1,12,1,0.0
...,...,...,...,...,...,...,...,...,...
2160571,Teknaf,11929,20.866667,92.300000,2023,12,9,31,0.0
2160572,Teknaf,11929,20.866667,92.300000,2023,12,12,31,0.0
2160573,Teknaf,11929,20.866667,92.300000,2023,12,15,31,0.0
2160574,Teknaf,11929,20.866667,92.300000,2023,12,18,31,0.0


In [46]:
# Check for invalid days (e.g., day > 31 or day > max days in the month)
invalid_dates = df_rain[(df_rain['Day'] > 31) | (df_rain['Month'] > 12)]
print(invalid_dates)


Empty DataFrame
Columns: [StationName, StationID, Latitude, Longitude, Year, Month, Time, Day, Rainfall]
Index: []


## **Creating Date Time Column**

In [47]:
import calendar

# Function to check if a date is valid
def is_invalid_date(row):
    days_in_month = calendar.monthrange(row['Year'], row['Month'])[1]
    return row['Day'] > days_in_month

# Remove invalid rows
df_rain = df_rain[~df_rain.apply(is_invalid_date, axis=1)].reset_index(drop=True)

# Convert datetime again
df_rain['Datetime'] = pd.to_datetime(df_rain[['Year', 'Month', 'Day']]) + pd.to_timedelta(df_rain['Time'], unit='h')

print(df_rain.info())  # Verify that datetime is correctly created


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2121304 entries, 0 to 2121303
Data columns (total 10 columns):
 #   Column       Dtype         
---  ------       -----         
 0   StationName  object        
 1   StationID    int64         
 2   Latitude     float64       
 3   Longitude    float64       
 4   Year         int64         
 5   Month        int64         
 6   Time         int64         
 7   Day          int64         
 8   Rainfall     float64       
 9   Datetime     datetime64[ns]
dtypes: datetime64[ns](1), float64(3), int64(5), object(1)
memory usage: 161.8+ MB
None


In [48]:
df_rain

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,Rainfall,Datetime
0,Saidpur,41858,25.783333,88.883333,2003,1,0,1,0.6,2003-01-01 00:00:00
1,Saidpur,41858,25.783333,88.883333,2003,1,3,1,0.0,2003-01-01 03:00:00
2,Saidpur,41858,25.783333,88.883333,2003,1,6,1,0.0,2003-01-01 06:00:00
3,Saidpur,41858,25.783333,88.883333,2003,1,9,1,0.0,2003-01-01 09:00:00
4,Saidpur,41858,25.783333,88.883333,2003,1,12,1,0.0,2003-01-01 12:00:00
...,...,...,...,...,...,...,...,...,...,...
2121299,Teknaf,11929,20.866667,92.300000,2023,12,9,31,0.0,2023-12-31 09:00:00
2121300,Teknaf,11929,20.866667,92.300000,2023,12,12,31,0.0,2023-12-31 12:00:00
2121301,Teknaf,11929,20.866667,92.300000,2023,12,15,31,0.0,2023-12-31 15:00:00
2121302,Teknaf,11929,20.866667,92.300000,2023,12,18,31,0.0,2023-12-31 18:00:00


In [49]:
print(df_rain["Datetime"].apply(type).value_counts())


Datetime
<class 'pandas._libs.tslibs.timestamps.Timestamp'>    2121304
Name: count, dtype: int64


In [50]:
print(df_rain.duplicated().sum())  # Check for duplicate rows


143


In [51]:
# Find duplicate rows
duplicates = df_rain[df_rain.duplicated()]

# Show the duplicate rows
duplicates


Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,Rainfall,Datetime
45841,Satkhira,11610,22.716667,89.083333,2020,7,3,1,0.0,2020-07-01 03:00:00
45842,Satkhira,11610,22.716667,89.083333,2020,7,6,1,0.0,2020-07-01 06:00:00
45843,Satkhira,11610,22.716667,89.083333,2020,7,9,1,0.0,2020-07-01 09:00:00
45844,Satkhira,11610,22.716667,89.083333,2020,7,12,1,0.0,2020-07-01 12:00:00
45846,Satkhira,11610,22.716667,89.083333,2020,7,18,1,0.0,2020-07-01 18:00:00
...,...,...,...,...,...,...,...,...,...,...
2058778,Satkhira,11610,22.716667,89.083333,2020,7,6,30,0.0,2020-07-30 06:00:00
2058779,Satkhira,11610,22.716667,89.083333,2020,7,9,30,0.0,2020-07-30 09:00:00
2058780,Satkhira,11610,22.716667,89.083333,2020,7,12,30,0.0,2020-07-30 12:00:00
2058782,Satkhira,11610,22.716667,89.083333,2020,7,18,30,0.0,2020-07-30 18:00:00


In [52]:
# Remove all duplicate rows (considering all columns)
df_rain = df_rain.drop_duplicates()

# Display the cleaned DataFrame
df_rain


Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,Rainfall,Datetime
0,Saidpur,41858,25.783333,88.883333,2003,1,0,1,0.6,2003-01-01 00:00:00
1,Saidpur,41858,25.783333,88.883333,2003,1,3,1,0.0,2003-01-01 03:00:00
2,Saidpur,41858,25.783333,88.883333,2003,1,6,1,0.0,2003-01-01 06:00:00
3,Saidpur,41858,25.783333,88.883333,2003,1,9,1,0.0,2003-01-01 09:00:00
4,Saidpur,41858,25.783333,88.883333,2003,1,12,1,0.0,2003-01-01 12:00:00
...,...,...,...,...,...,...,...,...,...,...
2121299,Teknaf,11929,20.866667,92.300000,2023,12,9,31,0.0,2023-12-31 09:00:00
2121300,Teknaf,11929,20.866667,92.300000,2023,12,12,31,0.0,2023-12-31 12:00:00
2121301,Teknaf,11929,20.866667,92.300000,2023,12,15,31,0.0,2023-12-31 15:00:00
2121302,Teknaf,11929,20.866667,92.300000,2023,12,18,31,0.0,2023-12-31 18:00:00


In [53]:
print(df_rain.duplicated().sum())  # Check for duplicate rows

0


In [54]:
print(df_rain.groupby("StationID")["Datetime"].nunique())  # Check number of unique timestamps per station


StationID
10120    61360
10208    61360
10320    61360
10408    61360
10609    61360
10705    61360
10724    58440
10910    61360
11111    61360
11313    61360
11316    61360
11407    61360
11505    61360
11513    61360
11604    61360
11610    61120
11704    61360
11706    61360
11805    58440
11809    61360
11814    58424
11912    61360
11916    61360
11921    46752
11925    61360
11927    58440
11929    61360
12007    61360
12103    61360
12110    61360
41858    61360
41909    61360
41926    61360
41958    61360
41977    61360
Name: Datetime, dtype: int64


In [55]:
# Reorganizing columns
df_rain = df_rain[[
    'Datetime', 'StationName',
    'StationID', 'Latitude', 'Longitude',
    'Year', 'Month', 'Day', 'Time',
    'Rainfall'
]]

# Display the first few rows to confirm
df_rain.head()

Unnamed: 0,Datetime,StationName,StationID,Latitude,Longitude,Year,Month,Day,Time,Rainfall
0,2003-01-01 00:00:00,Saidpur,41858,25.783333,88.883333,2003,1,1,0,0.6
1,2003-01-01 03:00:00,Saidpur,41858,25.783333,88.883333,2003,1,1,3,0.0
2,2003-01-01 06:00:00,Saidpur,41858,25.783333,88.883333,2003,1,1,6,0.0
3,2003-01-01 09:00:00,Saidpur,41858,25.783333,88.883333,2003,1,1,9,0.0
4,2003-01-01 12:00:00,Saidpur,41858,25.783333,88.883333,2003,1,1,12,0.0


## **Data Summary**

In [56]:
print(df_rain.info())  # Check data types and missing values
print(df_rain.describe())  # Summary statistics for numeric columns
print(df_rain["Datetime"].min(), df_rain["Datetime"].max())  # Verify datetime range
print(df_rain.duplicated().sum())  # Check for duplicate rows


<class 'pandas.core.frame.DataFrame'>
Index: 2121161 entries, 0 to 2121303
Data columns (total 10 columns):
 #   Column       Dtype         
---  ------       -----         
 0   Datetime     datetime64[ns]
 1   StationName  object        
 2   StationID    int64         
 3   Latitude     float64       
 4   Longitude    float64       
 5   Year         int64         
 6   Month        int64         
 7   Day          int64         
 8   Time         int64         
 9   Rainfall     float64       
dtypes: datetime64[ns](1), float64(3), int64(5), object(1)
memory usage: 178.0+ MB
None
                            Datetime     StationID      Latitude  \
count                        2121161  2.121161e+06  2.121161e+06   
mean   2013-07-28 07:29:41.846073856  1.580979e+04  2.332438e+01   
min              2003-01-01 00:00:00  1.012000e+04  2.086667e+01   
25%              2008-05-29 15:00:00  1.111100e+04  2.243333e+01   
50%              2013-08-05 12:00:00  1.170600e+04  2.316667e+01   


In [57]:
print(df_rain.isnull().sum())


Datetime       0
StationName    0
StationID      0
Latitude       0
Longitude      0
Year           0
Month          0
Day            0
Time           0
Rainfall       0
dtype: int64


In [58]:
print(df_rain.duplicated(subset=['StationID', 'Datetime']).sum())


105


In [59]:
print(df_rain['StationName'].unique())


['Saidpur' 'Rangpur' 'Dinajpur' 'Sylhet' 'Bogura' 'Mymensingh' 'Rajshahi'
 'Srimangal' 'Tangail' 'Ishurdi' 'Dhaka' 'Chuadanga' 'Faridpur'
 'Sitakunda' 'Cumilla' 'Chandpur' 'Jessore' 'Madaripur' 'Feni'
 'Maijdee_court' 'Khulna' 'Barishal' 'Satkhira' 'Bhola' 'Rangamati'
 'Sandwip' 'Hatiya' 'Ambagan(Ctg.)' 'Mongla' 'Patuakhali' 'Chittagong'
 'Khepupara' 'Kutubdia' "Cox'sBazar" 'Teknaf']


In [60]:
print(df_rain['Datetime'].is_monotonic_increasing)


False


In [61]:
df_rain['Rainfall'] = df_rain['Rainfall'].round(1)

## **Saving the Data Finally**

In [62]:
# Save the DataFrame to a CSV file
df_rain.to_csv('/content/drive/MyDrive/Thesis/Research Data/Preprocessed Data/Processed_rain_data.csv', index=False)
