## **Importing Necessary Libraries**

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import calendar
import os

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## **Data Loading & Preparation for Preprocessing**

In [3]:
## Loading the data
## The raw file is in wide format: one row per station/year/month/time slot,
## with one column per day of the month ('1' .. '31').
## low_memory=False makes pandas infer each column's dtype from the whole file
## rather than chunk by chunk.
path ='/content/drive/MyDrive/Thesis/Research Data/Original_Data/3-Hourly Humidity.csv'
df = pd.read_csv(path, low_memory=False)
df

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,1,2,3,...,22,23,24,25,26,27,28,29,30,31
0,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,0,90,87,97,...,86,90,93,86,93,93,86,88,81,86
1,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,3,84,76,94,...,70,70,80,67,80,73,87,79,49,59
2,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,6,49,66,74,...,36,56,43,37,51,42,69,32,29,22
3,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,9,44,86,63,...,31,47,37,35,49,43,31,29,25,33
4,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,12,61,97,80,...,51,69,56,59,62,59,53,41,46,51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141211,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,12,9,43,43,45,...,55,50,53,45,45,44,39,38,33,28
141212,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,12,12,77,76,72,...,82,76,76,75,74,81,78,74,82,71
141213,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,12,15,79,78,76,...,78,79,77,78,80,78,76,77,85,72
141214,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,12,18,79,79,80,...,78,82,78,77,77,78,81,81,78,74


In [4]:
# Reshape from wide (one column per day of the month) to long (one row per reading):
# the former column names land in 'Day' and the cell values in 'Humidity'.
df_hum = pd.melt(
    df,
    id_vars=[
        'StationName', 'StationID', 'Latitude', 'Longitude',
        'Year', 'Month', 'Time',
    ],
    var_name='Day',
    value_name='Humidity',
)
df_hum


Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,Humidity
0,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,0,1,90
1,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,3,1,84
2,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,6,1,49
3,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,9,1,44
4,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,12,1,61
...,...,...,...,...,...,...,...,...,...
4377691,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,12,9,31,28
4377692,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,12,12,31,71
4377693,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,12,15,31,72
4377694,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,12,18,31,74


In [5]:
## Find the datatypes of the columns
df_hum.dtypes

Unnamed: 0,0
StationName,object
StationID,int64
Latitude,object
Longitude,object
Year,int64
Month,int64
Time,int64
Day,object
Humidity,object


In [6]:
df_hum.shape

(4377696, 9)

In [7]:
# Find unique values in the 'StationName' column
df_hum['StationName'].unique()

array(['Dhaka', 'Tangail', 'Mymensingh', 'Faridpur', 'Madaripur',
       'Srimangal', 'Sylhet', 'Bogura', 'Dinajpur', 'Ishurdi', 'Rajshahi',
       'Rangpur', 'Saidpur', 'Chadanga', 'Jessore', 'Khulna', 'Mongla',
       'Satkhira', 'Barishal', 'Bhola', 'Khepupara', 'Patuakhali',
       'Chandpur', 'Teknaf', 'Ambagan(Ctg.)', 'Cumilla', "Cox'sBazar",
       'Feni', 'Hatiya', 'Kutubdia', 'Maijdee_court', 'Rangamati',
       'Sandwip', 'Chittagong', 'Sitakunda'], dtype=object)

In [8]:
# Count unique station names (the label now matches the column being counted)
num_unique_station_name = df_hum['StationName'].nunique()

print(f"Count of unique Station Names: {num_unique_station_name}")


Count of unique Station IDs: 35


In [9]:
# Find unique values in the 'StationID' column
df_hum['StationID'].unique()

array([11111, 41909, 10609, 11505, 11513, 10724, 10705, 10408, 10120,
       10910, 10320, 10208, 41858, 41926, 11407, 11604, 41958, 11610,
       11704, 11706, 12110, 12103, 11316, 11929, 41977, 11313, 11927,
       11805, 11814, 11925, 11809, 12007, 11916, 11921, 11912])

In [10]:
# Count unique StationIDs
num_unique_station_ids = df_hum['StationID'].nunique()

print(f"Count of unique Station IDs: {num_unique_station_ids}")


Count of unique Station IDs: 35


In [11]:
# Find unique values in the 'Latitude' column
df_hum['Latitude'].unique()

array(['23Deg.46Mts.N', '24Deg.15Mts.N', '24Deg.43Mts.N', '23Deg.36Mts.N',
       '23Deg.10Mts.N', '24Deg.18Mts.N', '24Deg.54Mts.N', '24Deg.51Mts.N',
       '25Deg.39Mts.N', '24Deg.8Mts.N', '24Deg.22Mts.N', '25Deg.44Mts.N',
       '25Deg.47Mts.N', '23Deg.39Mts.N', '23Deg.11Mts.N', '22Deg.47Mts.N',
       '22Deg.20Mts.N', '22Deg.43Mts.N', '22Deg.45Mts.N', '22Deg.41Mts.N',
       '21Deg.59Mts.N', '23Deg.16Mts.N', '20Deg.52Mts.N', '22Deg.21Mts.N',
       '23Deg.26Mts.N', '21Deg.26Mts.N', '23Deg.2Mts.N', '22Deg.26Mts.N',
       '21Deg.49Mts.N', '22Deg.52Mts.N', '22Deg.32Mts.N', '22Deg.29Mts.N',
       '22Deg.16Mts.N', '23Deg.35Mts.N'], dtype=object)

In [12]:
# Count unique latitude strings (the label now matches the column being counted).
# A count below the number of stations means some stations share the same latitude value.
num_unique_latitude = df_hum['Latitude'].nunique()

print(f"Count of unique Latitudes: {num_unique_latitude}")


Count of unique Station IDs: 34


In [13]:
# Find unique values in the 'Longitude' column
df_hum['Longitude'].unique()

array(['90Deg.23Mts.E', '89Deg.55Mts.E', '90Deg.26Mts.E', '89Deg.51Mts.E',
       '90Deg.11Mts.E', '91Deg.44Mts.E', '91Deg.53Mts.E', '89Deg.22Mts.E',
       '88Deg.41Mts.E', '89Deg.3Mts.E', '88Deg.42Mts.E', '89Deg.14Mts.E',
       '88Deg.53Mts.E', '88Deg.52Mts.E', '89Deg.10Mts.E', '89Deg.32Mts.E',
       '89Deg.36Mts.E', '89Deg.5Mts.E', '90Deg.20Mts.E', '90Deg.39Mts.E',
       '90Deg.14Mts.E', '90Deg.42Mts.E', '92Deg.18Mts.E', '91Deg.49Mts.E',
       '91Deg.11Mts.E', '91Deg.56Mts.E', '91Deg.25Mts.E', '91Deg.6Mts.E',
       '91Deg.51Mts.E', '92Deg.12Mts.E', '91Deg.26Mts.E', '91Deg.42Mts.E'],
      dtype=object)

In [14]:
df_hum['Longitude'].nunique()

32

In [15]:
df_hum['Year'].unique()

array([1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990,
       1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
       2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
       2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023])

In [16]:
df_hum['Year'].nunique()

44

In [17]:
# Find unique values in 'Month column
df_hum['Month'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [18]:
# Find unique values in 'Day column
df_hum['Day'].unique()

array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23',
       '24', '25', '26', '27', '28', '29', '30', '31'], dtype=object)

In [19]:
# Find data types 'Day column
df_hum['Day'].dtypes

dtype('O')

In [20]:
# Convert the Day column to numeric
df_hum['Day'] = pd.to_numeric(df_hum['Day'])
df_hum

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,Humidity
0,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,0,1,90
1,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,3,1,84
2,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,6,1,49
3,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,9,1,44
4,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,12,1,61
...,...,...,...,...,...,...,...,...,...
4377691,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,12,9,31,28
4377692,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,12,12,31,71
4377693,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,12,15,31,72
4377694,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,12,18,31,74


In [21]:
## Find the datatypes of the columns again
df_hum.dtypes

Unnamed: 0,0
StationName,object
StationID,int64
Latitude,object
Longitude,object
Year,int64
Month,int64
Time,int64
Day,int64
Humidity,object


In [22]:
# Find unique values in 'Time' column
# NOTE(review): besides the 3-hourly slots (0, 3, ..., 21) the recorded output also
# lists hour 1. No later cell in this notebook treats it specially — confirm whether
# it is a data-entry error in the source file.
df_hum['Time'].unique()

array([ 0,  3,  6,  9, 12, 15, 18, 21,  1])

In [23]:
# Find unique values in 'Humidity' column
df_hum['Humidity'].unique()

array(['90', '84', '49', '44', '61', '83', '88', '87', '89', '67', '39',
       '38', '55', '73', '78', '95', '82', '97', '94', '***', '85', '53',
       '27', '28', '54', '68', '63', '69', '77', '100', '74', '98', '91',
       '93', '64', '79', '96', '81', '72', '76', '75', '66', '80', '43',
       '52', '99', '59', '56', '92', '65', '58', '86', '71', '60', '46',
       '42', '57', '50', '37', '62', '45', '35', '25', '48', '70', '41',
       '51', '40', '34', '26', '29', '30', '31', '32', '33', '47', '23',
       '36', '24', '17', '22', '13', '15', '18', '20', '21', '19', '16',
       '14', '11', '12', '9', '10', '8', '7', '5', '4', '6', '3', '2',
       '1', nan], dtype=object)

## **Handling Missing Values and Unwanted Symbols**

In [24]:
## Count how many 'Humidity' entries are the '***' placeholder
## (summing the boolean mask counts the True values)
(df_hum['Humidity'] == '***').sum()

np.int64(131291)

In [25]:
# Filter rows where 'Humidity' is '***'
rows_with_asterisks = df_hum[df_hum['Humidity'] == '***']

# Display the rows
rows_with_asterisks


Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,Humidity
23,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,3,21,1,***
31,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,4,21,1,***
47,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,6,21,1,***
55,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,7,21,1,***
63,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,8,21,1,***
...,...,...,...,...,...,...,...,...,...
4374319,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,1988,10,21,31,***
4374334,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,1988,12,18,31,***
4374335,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,1988,12,21,31,***
4374342,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,1989,1,18,31,***


In [26]:
# Filter rows where 'Humidity' is '***'
rows_with_asterisks = df_hum[df_hum['Humidity'] == '***']

# Group by 'StationName' and display the results
station_specific_results = rows_with_asterisks.groupby('StationName')

# Display results for each station
for station, data in station_specific_results:
    print(f"Station: {station}")
    print(data, '\n')


Station: Ambagan(Ctg.)
           StationName  StationID       Latitude      Longitude  Year  Month  \
803801   Ambagan(Ctg.)      41977  22Deg.21Mts.N  91Deg.49Mts.E  1999     12   
2358519  Ambagan(Ctg.)      41977  22Deg.21Mts.N  91Deg.49Mts.E  2013     11   

         Time  Day Humidity  
803801      3    6      ***  
2358519    21   17      ***   

Station: Barishal
        StationName  StationID       Latitude      Longitude  Year  Month  \
2191432    Barishal      11704  22Deg.45Mts.N  90Deg.20Mts.E  1987      6   
2191433    Barishal      11704  22Deg.45Mts.N  90Deg.20Mts.E  1987      6   
2191434    Barishal      11704  22Deg.45Mts.N  90Deg.20Mts.E  1987      6   
2191435    Barishal      11704  22Deg.45Mts.N  90Deg.20Mts.E  1987      6   
2191436    Barishal      11704  22Deg.45Mts.N  90Deg.20Mts.E  1987      6   
2191437    Barishal      11704  22Deg.45Mts.N  90Deg.20Mts.E  1987      6   
2191438    Barishal      11704  22Deg.45Mts.N  90Deg.20Mts.E  1987      6   
2191439   

In [27]:
# To find the null values
df_hum.isnull().sum()

Unnamed: 0,0
StationName,0
StationID,0
Latitude,0
Longitude,0
Year,0
Month,0
Time,0
Day,0
Humidity,79464


In [28]:
# Replace '***' with NaN in the original DataFrame
df_hum.loc[df_hum['Humidity'] == '***', 'Humidity'] = np.nan

In [29]:
# To find the null values
df_hum.isnull().sum()

Unnamed: 0,0
StationName,0
StationID,0
Latitude,0
Longitude,0
Year,0
Month,0
Time,0
Day,0
Humidity,210755


In [30]:
# Convert 'Humidity' to numeric; errors='coerce' turns any remaining non-numeric entry into NaN
df_hum['Humidity'] = pd.to_numeric(df_hum['Humidity'], errors='coerce')

# For every row, compute the mean Humidity of its (StationName, Month, Day, Time) group.
# 'Year' is not part of the key, so each mean is taken across all years for that
# station and calendar slot.
Humidity_mean = df_hum.groupby(['StationName', 'Month', 'Day', 'Time'])['Humidity'].transform('mean')

# Fill missing readings (former '***' entries and original NaN) with that group mean.
# A group without a single observed value — e.g. day 30 of February — has a NaN mean,
# so its rows stay NaN after this step.
df_hum['Humidity'] = df_hum['Humidity'].fillna(Humidity_mean)

# Verify the result
print(df_hum)


        StationName  StationID       Latitude      Longitude  Year  Month  \
0             Dhaka      11111  23Deg.46Mts.N  90Deg.23Mts.E  1980      1   
1             Dhaka      11111  23Deg.46Mts.N  90Deg.23Mts.E  1980      1   
2             Dhaka      11111  23Deg.46Mts.N  90Deg.23Mts.E  1980      1   
3             Dhaka      11111  23Deg.46Mts.N  90Deg.23Mts.E  1980      1   
4             Dhaka      11111  23Deg.46Mts.N  90Deg.23Mts.E  1980      1   
...             ...        ...            ...            ...   ...    ...   
4377691   Sitakunda      11912  23Deg.35Mts.N  91Deg.42Mts.E  2023     12   
4377692   Sitakunda      11912  23Deg.35Mts.N  91Deg.42Mts.E  2023     12   
4377693   Sitakunda      11912  23Deg.35Mts.N  91Deg.42Mts.E  2023     12   
4377694   Sitakunda      11912  23Deg.35Mts.N  91Deg.42Mts.E  2023     12   
4377695   Sitakunda      11912  23Deg.35Mts.N  91Deg.42Mts.E  2023     12   

         Time  Day  Humidity  
0           0    1      90.0  
1           3

In [31]:
# Check it again to  find the null values
df_hum.isnull().sum()

Unnamed: 0,0
StationName,0
StationID,0
Latitude,0
Longitude,0
Year,0
Month,0
Time,0
Day,0
Humidity,70608


In [32]:
# Check rows where 'Humidity' is still NaN after replacement
rows_with_nan = df_hum[df_hum['Humidity'].isna()]

# If there are still rows with NaN, inspect them
rows_with_nan

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,Humidity
4095272,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,2,0,30,
4095273,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,2,3,30,
4095274,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,2,6,30,
4095275,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,2,9,30,
4095276,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,2,12,30,
...,...,...,...,...,...,...,...,...,...
4377683,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,11,9,31,
4377684,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,11,12,31,
4377685,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,11,15,31,
4377686,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,11,18,31,


In [33]:
# Sample: Define valid days for each month (for non-leap years)
valid_days_per_month = {
    1: 31, 2: 28, 3: 31, 4: 30, 5: 31, 6: 30,
    7: 31, 8: 31, 9: 30, 10: 31, 11: 30, 12: 31
}

# Adjust for leap years (for February only)
def is_leap_year(year):
    """Return True when `year` is a leap year under the Gregorian rule.

    A year divisible by 400 is a leap year, any other century year is not,
    and every remaining year is a leap year exactly when it is divisible by 4.
    """
    if year % 400 == 0:
        return True
    if year % 100 == 0:
        return False
    return year % 4 == 0

# Flag leap years with vectorized arithmetic instead of a Python call per row:
# divisible by 4, except century years that are not divisible by 400.
df_hum['IsLeapYear'] = (df_hum['Year'] % 4 == 0) & (
    (df_hum['Year'] % 100 != 0) | (df_hum['Year'] % 400 == 0)
)

# Look up each row's month length from the non-leap-year table, then give February
# 29 days in leap years. The column is kept as float, the dtype it had when it was
# filled through .loc on a column that did not exist yet.
df_hum['ValidDays'] = df_hum['Month'].map(valid_days_per_month).astype(float)
df_hum.loc[df_hum['IsLeapYear'] & (df_hum['Month'] == 2), 'ValidDays'] = 29.0

In [34]:
# Remove rows where 'Day' is greater than the valid days for that month
df_hum = df_hum[df_hum['Day'] <= df_hum['ValidDays']]

In [35]:
df_hum

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,Humidity,IsLeapYear,ValidDays
0,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,0,1,90.0,True,31.0
1,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,3,1,84.0,True,31.0
2,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,6,1,49.0,True,31.0
3,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,9,1,44.0,True,31.0
4,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,12,1,61.0,True,31.0
...,...,...,...,...,...,...,...,...,...,...,...
4377691,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,12,9,31,28.0,False,31.0
4377692,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,12,12,31,71.0,False,31.0
4377693,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,12,15,31,72.0,False,31.0
4377694,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,12,18,31,74.0,False,31.0


In [36]:
# Check it again to  find the null values
df_hum.isnull().sum()

Unnamed: 0,0
StationName,0
StationID,0
Latitude,0
Longitude,0
Year,0
Month,0
Time,0
Day,0
Humidity,0
IsLeapYear,0


In [37]:
df_hum = df_hum.drop(columns=['IsLeapYear', 'ValidDays'])


In [38]:
df_hum

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,Humidity
0,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,0,1,90.0
1,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,3,1,84.0
2,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,6,1,49.0
3,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,9,1,44.0
4,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,12,1,61.0
...,...,...,...,...,...,...,...,...,...
4377691,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,12,9,31,28.0
4377692,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,12,12,31,71.0
4377693,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,12,15,31,72.0
4377694,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,12,18,31,74.0


## **Converting Latitude and Longitude to Numerical Values**

In [39]:
df_hum.dtypes

Unnamed: 0,0
StationName,object
StationID,int64
Latitude,object
Longitude,object
Year,int64
Month,int64
Time,int64
Day,int64
Humidity,float64


In [40]:
def convert_to_decimal(deg_min, direction):
    """
    Convert a "<degrees>Deg.<minutes>Mts." coordinate string to signed decimal degrees.

    Parameters:
        deg_min: text such as "23Deg.46Mts."; the number before "Deg." is the degrees,
            the number between "Deg." and "Mts." is the minutes. Anything after
            "Mts." is ignored.
        direction: hemisphere letter. "S" and "W" produce a negative result; any
            other value leaves the result positive.

    Returns:
        Degrees plus minutes / 60 as a float, negated for "S" / "W".

    Example: convert_to_decimal("23Deg.46Mts.", "N") -> 23.7667 (rounded)
    """
    pieces = deg_min.split("Deg.")
    whole_degrees = float(pieces[0])
    minute_text = pieces[1].split("Mts.")[0]
    magnitude = whole_degrees + (float(minute_text) / 60)

    # Southern and western hemispheres are represented by negative values
    return -magnitude if direction in ("S", "W") else magnitude

# Example usage
latitude = convert_to_decimal("23Deg.46Mts.", "N")
longitude = convert_to_decimal("90Deg.23Mts.", "E")

print(latitude, longitude)  # Output: 23.7667, 90.3833


23.766666666666666 90.38333333333334


In [41]:
# Every station has one fixed coordinate string, so the column holds only a few dozen
# distinct values. Convert each distinct string once and map the result back, instead
# of running the parser for every one of the millions of rows.
for coord_col in ("Latitude", "Longitude"):
    decimal_by_text = {
        # the last character is the hemisphere letter (N/S or E/W)
        text: convert_to_decimal(text[:-1], text[-1])
        for text in df_hum[coord_col].unique()
    }
    df_hum[coord_col] = df_hum[coord_col].map(decimal_by_text)
df_hum

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,Humidity
0,Dhaka,11111,23.766667,90.383333,1980,1,0,1,90.0
1,Dhaka,11111,23.766667,90.383333,1980,1,3,1,84.0
2,Dhaka,11111,23.766667,90.383333,1980,1,6,1,49.0
3,Dhaka,11111,23.766667,90.383333,1980,1,9,1,44.0
4,Dhaka,11111,23.766667,90.383333,1980,1,12,1,61.0
...,...,...,...,...,...,...,...,...,...
4377691,Sitakunda,11912,23.583333,91.700000,2023,12,9,31,28.0
4377692,Sitakunda,11912,23.583333,91.700000,2023,12,12,31,71.0
4377693,Sitakunda,11912,23.583333,91.700000,2023,12,15,31,72.0
4377694,Sitakunda,11912,23.583333,91.700000,2023,12,18,31,74.0


In [42]:
# Sanity check on the upper bounds only: no Day above 31 and no Month above 12.
# Per-month lengths (e.g. 30 February) are not checked by this filter.
invalid_dates = df_hum[(df_hum['Day'] > 31) | (df_hum['Month'] > 12)]
print(invalid_dates)


Empty DataFrame
Columns: [StationName, StationID, Latitude, Longitude, Year, Month, Time, Day, Humidity]
Index: []


## **Creating the Datetime Column**

In [43]:
import calendar

# Function to check if a date is valid
def is_invalid_date(row):
    """Return True when row['Day'] exceeds the length of row['Month'] in row['Year'].

    `row` is any mapping-like object with 'Year', 'Month' and 'Day' entries
    (for example a DataFrame row). Month lengths come from calendar.monthrange,
    so leap years are taken into account.
    """
    _, last_day_of_month = calendar.monthrange(row['Year'], row['Month'])
    return last_day_of_month < row['Day']

# Build the timestamp in one vectorized pass instead of a Python call per row.
# errors='coerce' turns impossible calendar dates (e.g. 30 February) into NaT instead
# of raising, and NaT plus a timedelta stays NaT.
df_hum['Datetime'] = (
    pd.to_datetime(df_hum[['Year', 'Month', 'Day']], errors='coerce')
    + pd.to_timedelta(df_hum['Time'], unit='h')
)

# Remove rows whose calendar date does not exist
df_hum = df_hum[df_hum['Datetime'].notna()].reset_index(drop=True)

# Verify that Datetime was created. info() prints its report itself and returns None,
# so it is not wrapped in print() (which would add a stray "None" line).
df_hum.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4298232 entries, 0 to 4298231
Data columns (total 10 columns):
 #   Column       Dtype         
---  ------       -----         
 0   StationName  object        
 1   StationID    int64         
 2   Latitude     float64       
 3   Longitude    float64       
 4   Year         int64         
 5   Month        int64         
 6   Time         int64         
 7   Day          int64         
 8   Humidity     float64       
 9   Datetime     datetime64[ns]
dtypes: datetime64[ns](1), float64(3), int64(5), object(1)
memory usage: 327.9+ MB
None


In [44]:
df_hum

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,Humidity,Datetime
0,Dhaka,11111,23.766667,90.383333,1980,1,0,1,90.0,1980-01-01 00:00:00
1,Dhaka,11111,23.766667,90.383333,1980,1,3,1,84.0,1980-01-01 03:00:00
2,Dhaka,11111,23.766667,90.383333,1980,1,6,1,49.0,1980-01-01 06:00:00
3,Dhaka,11111,23.766667,90.383333,1980,1,9,1,44.0,1980-01-01 09:00:00
4,Dhaka,11111,23.766667,90.383333,1980,1,12,1,61.0,1980-01-01 12:00:00
...,...,...,...,...,...,...,...,...,...,...
4298227,Sitakunda,11912,23.583333,91.700000,2023,12,9,31,28.0,2023-12-31 09:00:00
4298228,Sitakunda,11912,23.583333,91.700000,2023,12,12,31,71.0,2023-12-31 12:00:00
4298229,Sitakunda,11912,23.583333,91.700000,2023,12,15,31,72.0,2023-12-31 15:00:00
4298230,Sitakunda,11912,23.583333,91.700000,2023,12,18,31,74.0,2023-12-31 18:00:00


In [45]:
print(df_hum["Datetime"].apply(type).value_counts())


Datetime
<class 'pandas._libs.tslibs.timestamps.Timestamp'>    4298232
Name: count, dtype: int64


In [46]:
print(df_hum.duplicated().sum())  # Check for duplicate rows


2920


In [47]:
# Find duplicate rows
duplicates = df_hum[df_hum.duplicated()]

# Show the duplicate rows
duplicates


Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,Humidity,Datetime
18720,Madaripur,11513,23.166667,90.183333,2005,1,0,1,97.0,2005-01-01 00:00:00
18721,Madaripur,11513,23.166667,90.183333,2005,1,3,1,94.0,2005-01-01 03:00:00
18722,Madaripur,11513,23.166667,90.183333,2005,1,6,1,46.0,2005-01-01 06:00:00
18723,Madaripur,11513,23.166667,90.183333,2005,1,9,1,54.0,2005-01-01 09:00:00
18724,Madaripur,11513,23.166667,90.183333,2005,1,12,1,79.0,2005-01-01 12:00:00
...,...,...,...,...,...,...,...,...,...,...
4226827,Madaripur,11513,23.166667,90.183333,2005,12,9,31,46.0,2005-12-31 09:00:00
4226828,Madaripur,11513,23.166667,90.183333,2005,12,12,31,82.0,2005-12-31 12:00:00
4226829,Madaripur,11513,23.166667,90.183333,2005,12,15,31,91.0,2005-12-31 15:00:00
4226830,Madaripur,11513,23.166667,90.183333,2005,12,18,31,93.0,2005-12-31 18:00:00


In [48]:
# Remove all duplicate rows (considering all columns)
df_hum = df_hum.drop_duplicates()

# Display the cleaned DataFrame
df_hum


Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,Humidity,Datetime
0,Dhaka,11111,23.766667,90.383333,1980,1,0,1,90.0,1980-01-01 00:00:00
1,Dhaka,11111,23.766667,90.383333,1980,1,3,1,84.0,1980-01-01 03:00:00
2,Dhaka,11111,23.766667,90.383333,1980,1,6,1,49.0,1980-01-01 06:00:00
3,Dhaka,11111,23.766667,90.383333,1980,1,9,1,44.0,1980-01-01 09:00:00
4,Dhaka,11111,23.766667,90.383333,1980,1,12,1,61.0,1980-01-01 12:00:00
...,...,...,...,...,...,...,...,...,...,...
4298227,Sitakunda,11912,23.583333,91.700000,2023,12,9,31,28.0,2023-12-31 09:00:00
4298228,Sitakunda,11912,23.583333,91.700000,2023,12,12,31,71.0,2023-12-31 12:00:00
4298229,Sitakunda,11912,23.583333,91.700000,2023,12,15,31,72.0,2023-12-31 15:00:00
4298230,Sitakunda,11912,23.583333,91.700000,2023,12,18,31,74.0,2023-12-31 18:00:00


In [49]:
print(df_hum.duplicated().sum())  # Check for duplicate rows

0


In [50]:
print(df_hum.groupby("StationID")["Datetime"].nunique())  # Check number of unique timestamps per station


StationID
10120    125640
10208    128568
10320    128568
10408    128568
10609    128568
10705    128568
10724    125648
10910    128568
11111    128568
11313    128568
11316    125640
11407    128568
11505    128568
11513    128568
11604    128568
11610    128568
11704    128568
11706    128568
11805    128568
11809    128568
11814    125648
11912    128568
11916    125648
11921    113960
11925    116880
11927    128568
11929    128568
12007    128568
12103    125640
12110    128568
41858     96424
41909    108112
41926    102264
41958    102264
41977     73048
Name: Datetime, dtype: int64


In [51]:
# Reorganizing columns
df_hum = df_hum[[
    'Datetime', 'StationName',
    'StationID', 'Latitude', 'Longitude',
    'Year', 'Month', 'Day', 'Time',
    'Humidity'
]]

# Display the first few rows to confirm
df_hum.head()

Unnamed: 0,Datetime,StationName,StationID,Latitude,Longitude,Year,Month,Day,Time,Humidity
0,1980-01-01 00:00:00,Dhaka,11111,23.766667,90.383333,1980,1,1,0,90.0
1,1980-01-01 03:00:00,Dhaka,11111,23.766667,90.383333,1980,1,1,3,84.0
2,1980-01-01 06:00:00,Dhaka,11111,23.766667,90.383333,1980,1,1,6,49.0
3,1980-01-01 09:00:00,Dhaka,11111,23.766667,90.383333,1980,1,1,9,44.0
4,1980-01-01 12:00:00,Dhaka,11111,23.766667,90.383333,1980,1,1,12,61.0


## **Data Summary**

In [52]:
# info() writes its report directly and returns None, so calling it inside print()
# would append a stray "None" line to the output.
df_hum.info()  # Check data types and missing values
print(df_hum.describe())  # Summary statistics for numeric columns
print(df_hum["Datetime"].min(), df_hum["Datetime"].max())  # Verify datetime range
print(df_hum.duplicated().sum())  # Check for duplicate rows


<class 'pandas.core.frame.DataFrame'>
Index: 4295312 entries, 0 to 4298231
Data columns (total 10 columns):
 #   Column       Dtype         
---  ------       -----         
 0   Datetime     datetime64[ns]
 1   StationName  object        
 2   StationID    int64         
 3   Latitude     float64       
 4   Longitude    float64       
 5   Year         int64         
 6   Month        int64         
 7   Day          int64         
 8   Time         int64         
 9   Humidity     float64       
dtypes: datetime64[ns](1), float64(3), int64(5), object(1)
memory usage: 360.5+ MB
None
                            Datetime     StationID      Latitude  \
count                        4295312  4.295312e+06  4.295312e+06   
mean   2002-09-12 09:49:46.714350976  1.482217e+04  2.331501e+01   
min              1980-01-01 00:00:00  1.012000e+04  2.086667e+01   
25%              1992-02-18 09:00:00  1.111100e+04  2.248333e+01   
50%              2002-10-30 03:00:00  1.170400e+04  2.316667e+01   


In [53]:
print(df_hum.isnull().sum())


Datetime       0
StationName    0
StationID      0
Latitude       0
Longitude      0
Year           0
Month          0
Day            0
Time           0
Humidity       0
dtype: int64


In [54]:
# Check whether rows are in chronological order. The frame still has the row order
# produced by pd.melt, which stacks the day-of-month columns one after another
# (all day-1 rows first, day-31 rows last), so the timestamps are not increasing.
# NOTE(review): sort by station and Datetime before any order-dependent time-series step.
print(df_hum['Datetime'].is_monotonic_increasing)


False


In [55]:
# Round to one decimal place (mean-imputed readings are fractional).
# assign() returns a new frame, so nothing is written into the column-subset frame
# created when the columns were reordered; writing into that subset directly can
# raise pandas' SettingWithCopyWarning.
df_hum = df_hum.assign(Humidity=df_hum['Humidity'].round(1))


## **Saving the Final Data**

In [56]:
# Save the DataFrame to a CSV file
df_hum.to_csv('/content/drive/MyDrive/Thesis/Research Data/Preprocessed Data/Processed_humidity_data.csv', index=False)
