## **Installing Necessary Libraries**

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import calendar
import os

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## **Data Loading & Make ready for Preprocessing**

In [3]:
## Loading the data
path ='/content/drive/MyDrive/Thesis/Research Data/Original_Data/3-Hourly Station Level Pressure.csv'
df = pd.read_csv(path, low_memory=False)
df

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,1,2,3,...,22,23,24,25,26,27,28,29,30,31
0,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,0,1011.7,1011.9,1013,...,1013.7,1011.3,1011.2,1013.8,1011.3,1010.4,1006.6,1006.3,1008.3,1011.2
1,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,3,1014.8,1014.3,1015.4,...,1016.4,1013.3,1014,1016.1,1013.6,1012.3,1009.2,1008.3,1012,1013.5
2,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,6,1013.4,1014,1015,...,1014.7,1012.7,1013.4,1014.8,1012.4,1011.2,1007.8,1007.8,1011.2,1012.7
3,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,9,1011.2,1013.3,1012.6,...,1012,1011.1,1011.2,1011.8,1010.2,1008.1,1005.4,1006.4,1009.3,1010.9
4,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,12,1011.4,1012.2,1013.1,...,1011.9,1009.9,1011.5,1012.6,1009,1006.8,1005.7,1006.1,1009.9,1011.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139003,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2022,12,9,1008,1007.5,1007.9,...,1008.5,1007.7,1007.7,1010.5,1010.6,1012.7,1013.1,1014.7,1015.1,1014.7
139004,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2022,12,12,1009.1,1008.6,1010.1,...,1009.4,1008.1,1007.8,1009.8,1010.6,1013.5,1013.2,1014.1,1015.5,1014.9
139005,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2022,12,15,1010.3,1010.6,1011.2,...,1009,1008.8,1005.4,1008.2,1016.7,1012.9,1013.9,1015.9,1016.8,1016.5
139006,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2022,12,18,1009.8,1009.8,1010.7,...,1008.7,1008.5,1006.6,1008.1,1016.3,1012.6,1013.8,1015.7,1016.5,1016.4


In [4]:
# Melt the DataFrame into a long format
df_pressure = pd.melt(df, id_vars=['StationName','StationID','Latitude','Longitude', 'Year', 'Month', 'Time'], var_name='Day', value_name='StationLevelPressure')
df_pressure


Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,StationLevelPressure
0,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,0,1,1011.7
1,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,3,1,1014.8
2,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,6,1,1013.4
3,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,9,1,1011.2
4,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,12,1,1011.4
...,...,...,...,...,...,...,...,...,...
4309243,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2022,12,9,31,1014.7
4309244,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2022,12,12,31,1014.9
4309245,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2022,12,15,31,1016.5
4309246,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2022,12,18,31,1016.4


In [5]:
## Find the datatypes of the columns
df_pressure.dtypes

Unnamed: 0,0
StationName,object
StationID,int64
Latitude,object
Longitude,object
Year,int64
Month,int64
Time,int64
Day,object
StationLevelPressure,object


In [6]:
df_pressure.shape

(4309248, 9)

In [7]:
# Find unique values in 'Month column
df_pressure['StationName'].unique()

array(['Dhaka', 'Tangail', 'Mymensingh', 'Faridpur', 'Madaripur',
       'Srimangal', 'Sylhet', 'Bogura', 'Dinajpur', 'Ishurdi', 'Rajshahi',
       'Rangpur', 'Saidpur', 'Chuadanga', 'Jessore', 'Khulna', 'Mongla',
       'Satkhira', 'Barishal', 'Bhola', 'Khepupara', 'Patuakhali',
       'Chandpur', 'Ambagan(Ctg.)', 'Chittagong', 'Cumilla', "Cox'sBazar",
       'Feni', 'Hatiya', 'Kutubdia', 'Maijdee_court', 'Rangamati',
       'Sandwip', 'Sitakunda', 'Teknaf'], dtype=object)

In [8]:
# Count unique StationIDs
num_unique_station_name = df_pressure['StationName'].nunique()

print(f"Count of unique Station IDs: {num_unique_station_name}")


Count of unique Station IDs: 35


In [9]:
# Find unique values in 'Year column
df_pressure['StationID'].unique()

array([11111, 41909, 10609, 11505, 11513, 10724, 10705, 10408, 10120,
       10910, 10320, 10208, 41858, 41926, 11407, 11604, 41958, 11610,
       11704, 11706, 12110, 12103, 11316, 41977, 11921, 11313, 11927,
       11805, 11814, 11925, 11809, 12007, 11916, 11912, 11929])

In [10]:
# Count unique StationIDs
num_unique_station_ids = df_pressure['StationID'].nunique()

print(f"Count of unique Station IDs: {num_unique_station_ids}")


Count of unique Station IDs: 35


In [11]:
# Find unique values in 'Month column
df_pressure['Latitude'].unique()

array(['23Deg.46Mts.N', '24Deg.15Mts.N', '24Deg.43Mts.N', '23Deg.36Mts.N',
       '23Deg.10Mts.N', '24Deg.18Mts.N', '24Deg.54Mts.N', '24Deg.51Mts.N',
       '25Deg.39Mts.N', '24Deg.8Mts.N', '24Deg.22Mts.N', '25Deg.44Mts.N',
       '25Deg.47Mts.N', '23Deg.39Mts.N', '23Deg.11Mts.N', '22Deg.47Mts.N',
       '22Deg.20Mts.N', '22Deg.43Mts.N', '22Deg.45Mts.N', '22Deg.41Mts.N',
       '21Deg.59Mts.N', '23Deg.16Mts.N', '22Deg.21Mts.N', '22Deg.16Mts.N',
       '23Deg.26Mts.N', '21Deg.26Mts.N', '23Deg.2Mts.N', '22Deg.26Mts.N',
       '21Deg.49Mts.N', '22Deg.52Mts.N', '22Deg.32Mts.N', '22Deg.29Mts.N',
       '23Deg.35Mts.N', '20Deg.52Mts.N'], dtype=object)

In [12]:
# Count unique StationIDs
num_unique_latitude = df_pressure['Latitude'].nunique()

print(f"Count of unique Station IDs: {num_unique_latitude}")


Count of unique Station IDs: 34


In [13]:
# Find unique values in 'Month column
df_pressure['Longitude'].unique()

array(['90Deg.23Mts.E', '89Deg.55Mts.E', '90Deg.26Mts.E', '89Deg.51Mts.E',
       '90Deg.11Mts.E', '91Deg.44Mts.E', '91Deg.53Mts.E', '89Deg.22Mts.E',
       '88Deg.41Mts.E', '89Deg.3Mts.E', '88Deg.42Mts.E', '89Deg.14Mts.E',
       '88Deg.53Mts.E', '88Deg.52Mts.E', '89Deg.10Mts.E', '89Deg.32Mts.E',
       '89Deg.36Mts.E', '89Deg.5Mts.E', '90Deg.20Mts.E', '90Deg.39Mts.E',
       '90Deg.14Mts.E', '90Deg.42Mts.E', '91Deg.49Mts.E', '91Deg.11Mts.E',
       '91Deg.56Mts.E', '91Deg.25Mts.E', '91Deg.6Mts.E', '91Deg.51Mts.E',
       '92Deg.12Mts.E', '91Deg.26Mts.E', '91Deg.42Mts.E', '92Deg.18Mts.E'],
      dtype=object)

In [14]:
df_pressure['Longitude'].nunique()

32

In [15]:
df_pressure['Year'].unique()

array([1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990,
       1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
       2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
       2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023])

In [16]:
df_pressure['Year'].nunique()

44

In [17]:
# Find unique values in 'Month column
df_pressure['Month'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [18]:
# Find unique values in 'Day column
df_pressure['Day'].unique()

array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23',
       '24', '25', '26', '27', '28', '29', '30', '31'], dtype=object)

In [19]:
# Find data types 'Day column
df_pressure['Day'].dtypes

dtype('O')

In [20]:
# Convert the Day column to numeric
df_pressure['Day'] = pd.to_numeric(df_pressure['Day'])
df_pressure

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,StationLevelPressure
0,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,0,1,1011.7
1,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,3,1,1014.8
2,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,6,1,1013.4
3,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,9,1,1011.2
4,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,12,1,1011.4
...,...,...,...,...,...,...,...,...,...
4309243,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2022,12,9,31,1014.7
4309244,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2022,12,12,31,1014.9
4309245,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2022,12,15,31,1016.5
4309246,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2022,12,18,31,1016.4


In [21]:
## Find the datatypes of the columns again
df_pressure.dtypes

Unnamed: 0,0
StationName,object
StationID,int64
Latitude,object
Longitude,object
Year,int64
Month,int64
Time,int64
Day,int64
StationLevelPressure,object


In [22]:
# Find unique values in 'Time' column
df_pressure['Time'].unique()

array([ 0,  3,  6,  9, 12, 15, 18, 21])

In [23]:
# Find unique values in 'StationLevelPressure' column
df_pressure['StationLevelPressure'].unique()

array(['1011.7', '1014.8', '1013.4', '1011.2', '1011.4', '1013.8',
       '1012.9', '1012.3', '1014.4', '1013.2', '1011.6', '1014.3',
       '1013.6', '1010.2', '1009.5', '1009.7', '1010.8', '1008.9',
       '1005.6', '1012.4', '1009.1', '1006.5', '1009', '1010.4', '1009.9',
       '1007.4', '1008.3', '1005', '1004.6', '1007.1', '1007', '1005.8',
       '1004.7', '1003.6', '1003.2', '1005.3', '1005.1', '997.8', '998.8',
       '999', '996.5', '996.1', '999.3', '999.2', '998.9', '997.9',
       '999.9', '998.6', '999.4', '998.3', '1002', '1003', '1001.9',
       '1001.1', '1001.5', '999.8', '1007.2', '1005.9', '1005.4',
       '1005.5', '1013', '1015.1', '1013.3', '1011', '1011.9', '1013.1',
       '1011.1', '1014', '1012.1', '1015', '1012.8', '******', '1015.9',
       '1013.9', '1016.3', '1014.9', '1009.3', '1010.5', '1008.2', '1008',
       '1008.5', '1007.6', '1006.6', '1006', '1008.7', '1010.1', '1001.4',
       '1003.5', '1002.5', '1000.8', '1001.3', '996', '997', '996.9',
       

## **Handling Missing value and Unwanted Symbol**

In [24]:
## Count the number of '******' in the value column
df_pressure['StationLevelPressure'][df_pressure['StationLevelPressure'] == '******'].count()

np.int64(130725)

In [25]:
# Filter rows where 'StationLevelPressure' is '*******'
rows_with_asterisks = df_pressure[df_pressure['StationLevelPressure'] == '******']

# Display the rows
rows_with_asterisks


Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,StationLevelPressure
103,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1981,1,21,1,******
111,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1981,2,21,1,******
119,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1981,3,21,1,******
127,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1981,4,21,1,******
135,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1981,5,21,1,******
...,...,...,...,...,...,...,...,...,...
4308187,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2011,12,9,31,******
4308188,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2011,12,12,31,******
4308189,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2011,12,15,31,******
4308190,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2011,12,18,31,******


In [26]:
# Filter rows where 'StationLevelPressure' is '******'
rows_with_asterisks = df_pressure[df_pressure['StationLevelPressure'] == '******']

# Group by 'StationName' and display the results
station_specific_results = rows_with_asterisks.groupby('StationName')

# Display results for each station
for station, data in station_specific_results:
    print(f"Station: {station}")
    print(data, '\n')


Station: Barishal
        StationName  StationID       Latitude      Longitude  Year  Month  \
2156488    Barishal      11704  22Deg.45Mts.N  90Deg.20Mts.E  1987      6   
2156489    Barishal      11704  22Deg.45Mts.N  90Deg.20Mts.E  1987      6   
2156490    Barishal      11704  22Deg.45Mts.N  90Deg.20Mts.E  1987      6   
2156491    Barishal      11704  22Deg.45Mts.N  90Deg.20Mts.E  1987      6   
2156492    Barishal      11704  22Deg.45Mts.N  90Deg.20Mts.E  1987      6   
2156493    Barishal      11704  22Deg.45Mts.N  90Deg.20Mts.E  1987      6   
2156494    Barishal      11704  22Deg.45Mts.N  90Deg.20Mts.E  1987      6   
2156495    Barishal      11704  22Deg.45Mts.N  90Deg.20Mts.E  1987      6   
2294914    Barishal      11704  22Deg.45Mts.N  90Deg.20Mts.E  1981      5   
2295496    Barishal      11704  22Deg.45Mts.N  90Deg.20Mts.E  1987      6   
2295497    Barishal      11704  22Deg.45Mts.N  90Deg.20Mts.E  1987      6   
2295498    Barishal      11704  22Deg.45Mts.N  90Deg.20Mts

In [27]:
# To find the null values
df_pressure.isnull().sum()

Unnamed: 0,0
StationName,0
StationID,0
Latitude,0
Longitude,0
Year,0
Month,0
Time,0
Day,0
StationLevelPressure,78200


In [28]:
# Replace '******' with NaN in the original DataFrame
df_pressure.loc[df_pressure['StationLevelPressure'] == '******', 'StationLevelPressure'] = np.nan

In [29]:
# To find the null values
df_pressure.isnull().sum()

Unnamed: 0,0
StationName,0
StationID,0
Latitude,0
Longitude,0
Year,0
Month,0
Time,0
Day,0
StationLevelPressure,208925


In [30]:
# Step 2: Convert 'StationLevelPressure' to numeric, coercing errors (turn non-numeric to NaN)
df_pressure['StationLevelPressure'] = pd.to_numeric(df_pressure['StationLevelPressure'], errors='coerce')

# Group by StationName, Month, Day, and Time, and calculate the mean for each group
StationLevelPressure_mean = df_pressure.groupby(['StationName', 'Month', 'Day', 'Time'])['StationLevelPressure'].transform('mean')

# Replace '******' (NaN values) with the mean of the same station, month, day, and time
df_pressure['StationLevelPressure'] = df_pressure['StationLevelPressure'].fillna(StationLevelPressure_mean)

# Verify the result
print(df_pressure)


        StationName  StationID       Latitude      Longitude  Year  Month  \
0             Dhaka      11111  23Deg.46Mts.N  90Deg.23Mts.E  1980      1   
1             Dhaka      11111  23Deg.46Mts.N  90Deg.23Mts.E  1980      1   
2             Dhaka      11111  23Deg.46Mts.N  90Deg.23Mts.E  1980      1   
3             Dhaka      11111  23Deg.46Mts.N  90Deg.23Mts.E  1980      1   
4             Dhaka      11111  23Deg.46Mts.N  90Deg.23Mts.E  1980      1   
...             ...        ...            ...            ...   ...    ...   
4309243      Teknaf      11929  20Deg.52Mts.N  92Deg.18Mts.E  2022     12   
4309244      Teknaf      11929  20Deg.52Mts.N  92Deg.18Mts.E  2022     12   
4309245      Teknaf      11929  20Deg.52Mts.N  92Deg.18Mts.E  2022     12   
4309246      Teknaf      11929  20Deg.52Mts.N  92Deg.18Mts.E  2022     12   
4309247      Teknaf      11929  20Deg.52Mts.N  92Deg.18Mts.E  2022     12   

         Time  Day  StationLevelPressure  
0           0    1              

In [31]:
# Check it again to  find the null values
df_pressure.isnull().sum()

Unnamed: 0,0
StationName,0
StationID,0
Latitude,0
Longitude,0
Year,0
Month,0
Time,0
Day,0
StationLevelPressure,66488


In [32]:
# Check rows where 'StationLevelPressure' is still NaN after replacement
rows_with_nan = df_pressure[df_pressure['StationLevelPressure'].isna()]

# If there are still rows with NaN, inspect them
rows_with_nan

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,StationLevelPressure
4031240,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,2,0,30,
4031241,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,2,3,30,
4031242,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,2,6,30,
4031243,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,2,9,30,
4031244,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,2,12,30,
...,...,...,...,...,...,...,...,...,...
4309235,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2022,11,9,31,
4309236,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2022,11,12,31,
4309237,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2022,11,15,31,
4309238,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2022,11,18,31,


In [33]:
# Sample: Define valid days for each month (for non-leap years)
valid_days_per_month = {
    1: 31, 2: 28, 3: 31, 4: 30, 5: 31, 6: 30,
    7: 31, 8: 31, 9: 30, 10: 31, 11: 30, 12: 31
}

# Adjust for leap years (for February only)
def is_leap_year(year):
    return (year % 4 == 0 and (year % 100 != 0 or year % 400 == 0))

# Add a column to check if a year is a leap year
df_pressure['IsLeapYear'] = df_pressure['Year'].apply(is_leap_year)

# Adjust February days for leap years
df_pressure.loc[df_pressure['IsLeapYear'], 'ValidDays'] = df_pressure.loc[df_pressure['IsLeapYear'], 'Month'].apply(lambda month: 29 if month == 2 else valid_days_per_month[month])
df_pressure.loc[~df_pressure['IsLeapYear'], 'ValidDays'] = df_pressure.loc[~df_pressure['IsLeapYear'], 'Month'].apply(lambda month: valid_days_per_month[month])

In [34]:
# Remove rows where 'Day' is greater than the valid days for that month
df_pressure = df_pressure[df_pressure['Day'] <= df_pressure['ValidDays']]

In [35]:
df_pressure

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,StationLevelPressure,IsLeapYear,ValidDays
0,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,0,1,1011.7,True,31.0
1,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,3,1,1014.8,True,31.0
2,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,6,1,1013.4,True,31.0
3,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,9,1,1011.2,True,31.0
4,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,12,1,1011.4,True,31.0
...,...,...,...,...,...,...,...,...,...,...,...
4309243,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2022,12,9,31,1014.7,False,31.0
4309244,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2022,12,12,31,1014.9,False,31.0
4309245,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2022,12,15,31,1016.5,False,31.0
4309246,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2022,12,18,31,1016.4,False,31.0


In [36]:
# Check rows where 'StationLevelPressure' is still NaN after replacement
rows_with_nan = df_pressure[df_pressure['StationLevelPressure'].isna()]

# If there are still rows with NaN, inspect them
rows_with_nan

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,StationLevelPressure,IsLeapYear,ValidDays


In [37]:
# Check it again to  find the null values
df_pressure.isnull().sum()

Unnamed: 0,0
StationName,0
StationID,0
Latitude,0
Longitude,0
Year,0
Month,0
Time,0
Day,0
StationLevelPressure,0
IsLeapYear,0


In [38]:
df_pressure = df_pressure.drop(columns=['IsLeapYear', 'ValidDays'])


In [39]:
df_pressure

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,StationLevelPressure
0,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,0,1,1011.7
1,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,3,1,1014.8
2,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,6,1,1013.4
3,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,9,1,1011.2
4,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,12,1,1011.4
...,...,...,...,...,...,...,...,...,...
4309243,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2022,12,9,31,1014.7
4309244,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2022,12,12,31,1014.9
4309245,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2022,12,15,31,1016.5
4309246,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2022,12,18,31,1016.4


## **Coverting Latitude and Longitude to Numerical Value**

In [40]:
df_pressure.dtypes

Unnamed: 0,0
StationName,object
StationID,int64
Latitude,object
Longitude,object
Year,int64
Month,int64
Time,int64
Day,int64
StationLevelPressure,float64


In [41]:
def convert_to_decimal(deg_min, direction):
    """
    Converts degree-minute format to decimal degrees.
    Example Input: "23Deg.46Mts.N" -> Output: 23.7667
    """
    parts = deg_min.split("Deg.")
    degrees = float(parts[0])
    minutes = float(parts[1].split("Mts.")[0])

    decimal = degrees + (minutes / 60)

    # Convert South/West to negative
    if direction in ["S", "W"]:
        decimal = -decimal

    return decimal

# Example usage
latitude = convert_to_decimal("23Deg.46Mts.", "N")
longitude = convert_to_decimal("90Deg.23Mts.", "E")

print(latitude, longitude)  # Output: 23.7667, 90.3833


23.766666666666666 90.38333333333334


In [42]:
df_pressure["Latitude"] = df_pressure["Latitude"].apply(lambda x: convert_to_decimal(x[:-1], x[-1]))  # Extract last character (N/S)
df_pressure["Longitude"] = df_pressure["Longitude"].apply(lambda x: convert_to_decimal(x[:-1], x[-1]))  # Extract last character (E/W)
df_pressure

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,StationLevelPressure
0,Dhaka,11111,23.766667,90.383333,1980,1,0,1,1011.7
1,Dhaka,11111,23.766667,90.383333,1980,1,3,1,1014.8
2,Dhaka,11111,23.766667,90.383333,1980,1,6,1,1013.4
3,Dhaka,11111,23.766667,90.383333,1980,1,9,1,1011.2
4,Dhaka,11111,23.766667,90.383333,1980,1,12,1,1011.4
...,...,...,...,...,...,...,...,...,...
4309243,Teknaf,11929,20.866667,92.300000,2022,12,9,31,1014.7
4309244,Teknaf,11929,20.866667,92.300000,2022,12,12,31,1014.9
4309245,Teknaf,11929,20.866667,92.300000,2022,12,15,31,1016.5
4309246,Teknaf,11929,20.866667,92.300000,2022,12,18,31,1016.4


In [43]:
# Check for invalid days (e.g., day > 31 or day > max days in the month)
invalid_dates = df_pressure[(df_pressure['Day'] > 31) | (df_pressure['Month'] > 12)]
print(invalid_dates)


Empty DataFrame
Columns: [StationName, StationID, Latitude, Longitude, Year, Month, Time, Day, StationLevelPressure]
Index: []


## **Creating Date Time Column**

In [44]:
import calendar

# Function to check if a date is valid
def is_invalid_date(row):
    days_in_month = calendar.monthrange(row['Year'], row['Month'])[1]
    return row['Day'] > days_in_month

# Remove invalid rows
df_pressure = df_pressure[~df_pressure.apply(is_invalid_date, axis=1)].reset_index(drop=True)

# Convert datetime again
df_pressure['Datetime'] = pd.to_datetime(df_pressure[['Year', 'Month', 'Day']]) + pd.to_timedelta(df_pressure['Time'], unit='h')

print(df_pressure.info())  # Verify that datetime is correctly created


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4231032 entries, 0 to 4231031
Data columns (total 10 columns):
 #   Column                Dtype         
---  ------                -----         
 0   StationName           object        
 1   StationID             int64         
 2   Latitude              float64       
 3   Longitude             float64       
 4   Year                  int64         
 5   Month                 int64         
 6   Time                  int64         
 7   Day                   int64         
 8   StationLevelPressure  float64       
 9   Datetime              datetime64[ns]
dtypes: datetime64[ns](1), float64(3), int64(5), object(1)
memory usage: 322.8+ MB
None


In [45]:
df_pressure

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,StationLevelPressure,Datetime
0,Dhaka,11111,23.766667,90.383333,1980,1,0,1,1011.7,1980-01-01 00:00:00
1,Dhaka,11111,23.766667,90.383333,1980,1,3,1,1014.8,1980-01-01 03:00:00
2,Dhaka,11111,23.766667,90.383333,1980,1,6,1,1013.4,1980-01-01 06:00:00
3,Dhaka,11111,23.766667,90.383333,1980,1,9,1,1011.2,1980-01-01 09:00:00
4,Dhaka,11111,23.766667,90.383333,1980,1,12,1,1011.4,1980-01-01 12:00:00
...,...,...,...,...,...,...,...,...,...,...
4231027,Teknaf,11929,20.866667,92.300000,2022,12,9,31,1014.7,2022-12-31 09:00:00
4231028,Teknaf,11929,20.866667,92.300000,2022,12,12,31,1014.9,2022-12-31 12:00:00
4231029,Teknaf,11929,20.866667,92.300000,2022,12,15,31,1016.5,2022-12-31 15:00:00
4231030,Teknaf,11929,20.866667,92.300000,2022,12,18,31,1016.4,2022-12-31 18:00:00


In [46]:
print(df_pressure["Datetime"].apply(type).value_counts())


Datetime
<class 'pandas._libs.tslibs.timestamps.Timestamp'>    4231032
Name: count, dtype: int64


In [47]:
print(df_pressure.duplicated().sum())  # Check for duplicate rows


3241


In [48]:
# Find duplicate rows
duplicates = df_pressure[df_pressure.duplicated()]

# Show the duplicate rows
duplicates


Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,StationLevelPressure,Datetime
103968,Cox'sBazar,11927,21.433333,91.933333,2002,1,0,1,1015.600000,2002-01-01 00:00:00
103969,Cox'sBazar,11927,21.433333,91.933333,2002,1,3,1,1017.900000,2002-01-01 03:00:00
103970,Cox'sBazar,11927,21.433333,91.933333,2002,1,6,1,1016.100000,2002-01-01 06:00:00
103971,Cox'sBazar,11927,21.433333,91.933333,2002,1,9,1,1014.200000,2002-01-01 09:00:00
103972,Cox'sBazar,11927,21.433333,91.933333,2002,1,12,1,1015.400000,2002-01-01 12:00:00
...,...,...,...,...,...,...,...,...,...,...
4216623,Kutubdia,11925,21.816667,91.850000,1980,7,21,31,1001.882051,1980-07-31 21:00:00
4224368,Sandwip,11916,22.483333,91.433333,1989,12,12,31,1011.700000,1989-12-31 12:00:00
4224369,Sandwip,11916,22.483333,91.433333,1989,12,15,31,1013.300000,1989-12-31 15:00:00
4224370,Sandwip,11916,22.483333,91.433333,1989,12,18,31,1013.000000,1989-12-31 18:00:00


In [49]:
# Remove all duplicate rows (considering all columns)
df_pressure = df_pressure.drop_duplicates()

# Display the cleaned DataFrame
df_pressure


Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Time,Day,StationLevelPressure,Datetime
0,Dhaka,11111,23.766667,90.383333,1980,1,0,1,1011.7,1980-01-01 00:00:00
1,Dhaka,11111,23.766667,90.383333,1980,1,3,1,1014.8,1980-01-01 03:00:00
2,Dhaka,11111,23.766667,90.383333,1980,1,6,1,1013.4,1980-01-01 06:00:00
3,Dhaka,11111,23.766667,90.383333,1980,1,9,1,1011.2,1980-01-01 09:00:00
4,Dhaka,11111,23.766667,90.383333,1980,1,12,1,1011.4,1980-01-01 12:00:00
...,...,...,...,...,...,...,...,...,...,...
4231027,Teknaf,11929,20.866667,92.300000,2022,12,9,31,1014.7,2022-12-31 09:00:00
4231028,Teknaf,11929,20.866667,92.300000,2022,12,12,31,1014.9,2022-12-31 12:00:00
4231029,Teknaf,11929,20.866667,92.300000,2022,12,15,31,1016.5,2022-12-31 15:00:00
4231030,Teknaf,11929,20.866667,92.300000,2022,12,18,31,1016.4,2022-12-31 18:00:00


In [50]:
print(df_pressure.duplicated().sum())  # Check for duplicate rows

0


In [51]:
print(df_pressure.groupby("StationID")["Datetime"].nunique())  # Check number of unique timestamps per station


StationID
10120    125640
10208    128568
10320    128568
10408    128568
10609    128568
10705    128568
10724    122728
10910    128568
11111    128568
11313    125648
11316    125640
11407    128568
11505    128568
11513    125648
11604    128568
11610    128568
11704    128568
11706    128568
11805    128568
11809    128568
11814    122712
11912    125648
11916    125648
11921    113960
11925    116632
11927    125648
11929    125648
12007    128568
12103    122720
12110    128568
41858     96424
41909     84736
41926     96424
41958     84736
41977     73048
Name: Datetime, dtype: int64


In [52]:
# Reorganizing columns
df_pressure = df_pressure[[
    'Datetime', 'StationName',
    'StationID', 'Latitude', 'Longitude',
    'Year', 'Month', 'Day', 'Time',
    'StationLevelPressure'
]]

# Display the first few rows to confirm
df_pressure.head()

Unnamed: 0,Datetime,StationName,StationID,Latitude,Longitude,Year,Month,Day,Time,StationLevelPressure
0,1980-01-01 00:00:00,Dhaka,11111,23.766667,90.383333,1980,1,1,0,1011.7
1,1980-01-01 03:00:00,Dhaka,11111,23.766667,90.383333,1980,1,1,3,1014.8
2,1980-01-01 06:00:00,Dhaka,11111,23.766667,90.383333,1980,1,1,6,1013.4
3,1980-01-01 09:00:00,Dhaka,11111,23.766667,90.383333,1980,1,1,9,1011.2
4,1980-01-01 12:00:00,Dhaka,11111,23.766667,90.383333,1980,1,1,12,1011.4


## **Data Summary**

In [53]:
print(df_pressure.info())  # Check data types and missing values
print(df_pressure.describe())  # Summary statistics for numeric columns
print(df_pressure["Datetime"].min(), df_pressure["Datetime"].max())  # Verify datetime range
print(df_pressure.duplicated().sum())  # Check for duplicate rows


<class 'pandas.core.frame.DataFrame'>
Index: 4227791 entries, 0 to 4231031
Data columns (total 10 columns):
 #   Column                Dtype         
---  ------                -----         
 0   Datetime              datetime64[ns]
 1   StationName           object        
 2   StationID             int64         
 3   Latitude              float64       
 4   Longitude             float64       
 5   Year                  int64         
 6   Month                 int64         
 7   Day                   int64         
 8   Time                  int64         
 9   StationLevelPressure  float64       
dtypes: datetime64[ns](1), float64(3), int64(5), object(1)
memory usage: 354.8+ MB
None
                            Datetime     StationID      Latitude  \
count                        4227791  4.227791e+06  4.227791e+06   
mean   2002-10-14 07:06:16.387194752  1.453819e+04  2.331640e+01   
min              1980-01-01 00:00:00  1.012000e+04  2.086667e+01   
25%              1992-03-19 

In [54]:
print(df_pressure.isnull().sum())


Datetime                0
StationName             0
StationID               0
Latitude                0
Longitude               0
Year                    0
Month                   0
Day                     0
Time                    0
StationLevelPressure    0
dtype: int64


In [55]:
print(df_pressure.duplicated(subset=['StationID', 'Datetime']).sum())


2847


In [56]:
print(df_pressure['StationName'].unique())


['Dhaka' 'Tangail' 'Mymensingh' 'Faridpur' 'Madaripur' 'Srimangal'
 'Sylhet' 'Bogura' 'Dinajpur' 'Ishurdi' 'Rajshahi' 'Rangpur' 'Saidpur'
 'Chuadanga' 'Jessore' 'Khulna' 'Mongla' 'Satkhira' 'Barishal' 'Bhola'
 'Khepupara' 'Patuakhali' 'Chandpur' 'Ambagan(Ctg.)' 'Chittagong'
 'Cumilla' "Cox'sBazar" 'Feni' 'Hatiya' 'Kutubdia' 'Maijdee_court'
 'Rangamati' 'Sandwip' 'Sitakunda' 'Teknaf']


In [57]:
print(df_pressure['Datetime'].is_monotonic_increasing)


False


In [58]:
df_pressure['StationLevelPressure'] = df_pressure['StationLevelPressure'].round(1)


## **Saving the Data Finally**

In [59]:
# Save the DataFrame to a CSV file
df_pressure.to_csv('/content/drive/MyDrive/Thesis/Research Data/Preprocessed Data/Processed_station_level_pressure_data.csv', index=False)
