## **Installing Necessary Libraries**

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import calendar
import os

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## **Data Loading & Make ready for Preprocessing**

In [3]:
## Loading the data
path ='/content/drive/MyDrive/Thesis/Research Data/Original_Data/Daily Maximum Temperature.csv'
df = pd.read_csv(path, low_memory=False)
df

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,1,2,3,4,...,22,23,24,25,26,27,28,29,30,31
0,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,24.7,21.1,22.8,25,...,26.7,24.2,24.4,25.6,22.2,26.7,28.9,27.8,23.3,24.1
1,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,2,24.4,26.1,27.1,28.4,...,30,30.8,32.2,30.6,30.7,30.8,31.2,30.4,,
2,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,3,22.2,26.1,20,28.9,...,36.6,37.9,35.3,33.6,35.8,32.1,34.3,34.7,29,32.8
3,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,4,35,35.6,36.4,36.4,...,35,35,35.6,35.6,35.7,35.8,35.3,34.7,33.9,
4,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,5,35.3,35.3,30.3,34.2,...,33.7,34.4,35.4,34.4,32.2,31.1,30.1,30.6,29.4,30.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17575,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,8,****,****,****,****,...,****,****,****,****,****,****,****,****,****,****
17576,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,9,****,****,****,****,...,****,****,****,****,****,****,****,****,****,
17577,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,10,****,****,****,****,...,****,****,****,****,****,****,****,****,****,****
17578,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,11,****,****,****,****,...,****,****,****,****,****,****,****,****,****,


In [4]:
# Melt the DataFrame into a long format
df_max = pd.melt(df, id_vars=['StationName','StationID','Latitude','Longitude', 'Year', 'Month'], var_name='Day', value_name='MaximumTemperature')
df_max


Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Day,MaximumTemperature
0,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,1,24.7
1,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,2,1,24.4
2,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,3,1,22.2
3,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,4,1,35
4,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,5,1,35.3
...,...,...,...,...,...,...,...,...
544975,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,8,31,****
544976,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,9,31,
544977,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,10,31,****
544978,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,11,31,


In [5]:
## Find the datatypes of the columns
df_max.dtypes

Unnamed: 0,0
StationName,object
StationID,int64
Latitude,object
Longitude,object
Year,int64
Month,int64
Day,object
MaximumTemperature,object


In [6]:
df_max.shape

(544980, 8)

In [7]:
# Find unique values in 'Month column
df_max['StationName'].unique()

array(['Dhaka', 'Tangail', 'Mymensingh', 'Faridpur', 'Madaripur',
       'Srimangal', 'Sylhet', 'Chuadanga', 'Jessore', 'Khulna', 'Mongla',
       'Satkhira', 'Barishal', 'Bhola', 'Khepupara', 'Patuakhali',
       'Chandpur', 'Ambagan(Ctg.)', 'Chittagong', 'Cumilla', "Cox'sBazar",
       'Feni', 'Hatiya', 'Kutubdia', 'Maijdee_court', 'Rangamati',
       'Sandwip', 'Sitakunda', 'Teknaf'], dtype=object)

In [8]:
# Count unique StationIDs
num_unique_station_name = df_max['StationName'].nunique()

print(f"Count of unique Station IDs: {num_unique_station_name}")


Count of unique Station IDs: 29


In [9]:
# Find unique values in 'Year column
df_max['StationID'].unique()

array([11111, 41909, 10609, 11505, 11513, 10724, 10705, 10408, 10120,
       10910, 10320, 10208, 41858, 41926, 11407, 11604, 41958, 11610,
       11704, 11706, 12110, 12103, 11316, 41977, 11921, 11313, 11927,
       11805, 11814, 11925, 11809, 12007, 11916, 11912, 11929])

In [10]:
# Count unique StationIDs
num_unique_station_ids = df_max['StationID'].nunique()

print(f"Count of unique Station IDs: {num_unique_station_ids}")


Count of unique Station IDs: 35


In [11]:
# Find unique values in 'Month column
df_max['Latitude'].unique()

array(['23Deg.46Mts.N', '24Deg.15Mts.N', '24Deg.43Mts.N', '23Deg.36Mts.N',
       '23Deg.10Mts.N', '24Deg.18Mts.N', '24Deg.54Mts.N', '23Deg.39Mts.N',
       '23Deg.11Mts.N', '22Deg.47Mts.N', '22Deg.20Mts.N', '22Deg.43Mts.N',
       '22Deg.45Mts.N', '22Deg.41Mts.N', '21Deg.59Mts.N', '23Deg.16Mts.N',
       '22Deg.21Mts.N', '22Deg.16Mts.N', '23Deg.26Mts.N', '21Deg.26Mts.N',
       '23Deg.2Mts.N', '22Deg.26Mts.N', '21Deg.49Mts.N', '22Deg.52Mts.N',
       '22Deg.32Mts.N', '22Deg.29Mts.N', '23Deg.35Mts.N', '20Deg.52Mts.N'],
      dtype=object)

In [12]:
# Count unique StationIDs
num_unique_latitude = df_max['Latitude'].nunique()

print(f"Count of unique Station IDs: {num_unique_latitude}")


Count of unique Station IDs: 28


In [13]:
# Find unique values in 'Month column
df_max['Longitude'].unique()

array(['90Deg.23Mts.E', '89Deg.55Mts.E', '90Deg.26Mts.E', '89Deg.51Mts.E',
       '90Deg.11Mts.E', '91Deg.44Mts.E', '91Deg.53Mts.E', '88Deg.52Mts.E',
       '89Deg.10Mts.E', '89Deg.32Mts.E', '89Deg.36Mts.E', '89Deg.5Mts.E',
       '90Deg.20Mts.E', '90Deg.39Mts.E', '90Deg.14Mts.E', '90Deg.42Mts.E',
       '91Deg.49Mts.E', '91Deg.11Mts.E', '91Deg.56Mts.E', '91Deg.25Mts.E',
       '91Deg.6Mts.E', '91Deg.51Mts.E', '92Deg.12Mts.E', '91Deg.26Mts.E',
       '91Deg.42Mts.E', '92Deg.18Mts.E'], dtype=object)

In [14]:
df_max['Longitude'].nunique()

26

In [15]:
df_max['Year'].unique()

array([1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990,
       1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
       2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
       2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023])

In [16]:
df_max['Year'].nunique()

44

In [17]:
# Find unique values in 'Month column
df_max['Month'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [18]:
# Find unique values in 'Day column
df_max['Day'].unique()

array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23',
       '24', '25', '26', '27', '28', '29', '30', '31'], dtype=object)

In [19]:
# Find data types 'Day column
df_max['Day'].dtypes

dtype('O')

In [20]:
# Convert the Day column to numeric
df_max['Day'] = pd.to_numeric(df_max['Day'])
df_max

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Day,MaximumTemperature
0,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,1,24.7
1,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,2,1,24.4
2,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,3,1,22.2
3,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,4,1,35
4,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,5,1,35.3
...,...,...,...,...,...,...,...,...
544975,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,8,31,****
544976,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,9,31,
544977,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,10,31,****
544978,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,11,31,


In [21]:
## Find the datatypes of the columns again
df_max.dtypes

Unnamed: 0,0
StationName,object
StationID,int64
Latitude,object
Longitude,object
Year,int64
Month,int64
Day,int64
MaximumTemperature,object


In [22]:
# Find unique values in 'MaximumTemperature' column
df_max['MaximumTemperature'].unique()

array(['24.7', '24.4', '22.2', '35', '35.3', '31.4', '32.2', '31.2',
       '32.8', '31.9', '31.1', '28.9', '23.9', '25', '32', '26.7', '33.3',
       '32.6', '27.7', '24.3', '30.8', '****', '34.2', '35.6', '33.1',
       '22.3', '21.3', '29.6', '34.3', '27.8', '31.8', '30.3', '28.4',
       '26.6', '37.5', '33.4', '35.8', '31.3', '30.2', '31.7', '21.1',
       '26.2', '33.2', '32.5', '30', '33', '29', '23.3', '34.4', '33.7',
       '36.4', '31.5', '32.3', '24.2', '29.4', '30.1', '30.5', '34.8',
       '28.1', '28', '38.2', '33.8', '34', '23.1', '26.4', '34.5', '28.2',
       '23.5', '27', '26.3', '32.7', '30.7', '29.9', '25.6', '28.5',
       '32.4', '33.5', '29.2', '26.8', '20.5', '24.6', '34.7', '29.5',
       '28.6', '29.1', '26.5', '35.4', '34.1', '25.7', '37.2', '36.5',
       '27.6', '35.1', '25.5', '33.6', '31', '24.8', '30.6', '26', '27.4',
       '36.3', '36.6', '32.1', '27.3', '29.7', '25.9', '25.3', '24.5',
       '28.7', '23', '27.5', '17', '36', '29.8', '32.9', '35.5', '2

## **Handling Missing value and Unwanted Symbol**

In [23]:
## Count the number of '****' in the value column
df_max['MaximumTemperature'][df_max['MaximumTemperature'] == '****'].count()

np.int64(5834)

In [24]:
# Filter rows where 'MaximumTemperature' is '****'
rows_with_asterisks = df_max[df_max['MaximumTemperature'] == '****']

# Display the rows
rows_with_asterisks


Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Day,MaximumTemperature
27,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1982,4,1,****
528,Tangail,41909,24Deg.15Mts.N,89Deg.55Mts.E,1987,1,1,****
529,Tangail,41909,24Deg.15Mts.N,89Deg.55Mts.E,1987,2,1,****
530,Tangail,41909,24Deg.15Mts.N,89Deg.55Mts.E,1987,3,1,****
531,Tangail,41909,24Deg.15Mts.N,89Deg.55Mts.E,1987,4,1,****
...,...,...,...,...,...,...,...,...
544502,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,1984,3,31,****
544674,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,1998,7,31,****
544975,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,8,31,****
544977,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,10,31,****


In [25]:
# Filter rows where 'MaximumTemperature' is '****'
rows_with_asterisks = df_max[df_max['MaximumTemperature'] == '****']

# Group by 'StationName' and display the results
station_specific_results = rows_with_asterisks.groupby('StationName')

# Display results for each station
for station, data in station_specific_results:
    print(f"Station: {station}")
    print(data, '\n')


Station: Ambagan(Ctg.)
          StationName  StationID       Latitude      Longitude  Year  Month  \
310745  Ambagan(Ctg.)      41977  22Deg.21Mts.N  91Deg.49Mts.E  2018      6   
328325  Ambagan(Ctg.)      41977  22Deg.21Mts.N  91Deg.49Mts.E  2018      6   
345905  Ambagan(Ctg.)      41977  22Deg.21Mts.N  91Deg.49Mts.E  2018      6   

        Day MaximumTemperature  
310745   18               ****  
328325   19               ****  
345905   20               ****   

Station: Barishal
       StationName  StationID       Latitude      Longitude  Year  Month  Day  \
26619     Barishal      11704  22Deg.45Mts.N  90Deg.20Mts.E  1980      4    2   
44236     Barishal      11704  22Deg.45Mts.N  90Deg.20Mts.E  1983      5    3   
61852     Barishal      11704  22Deg.45Mts.N  90Deg.20Mts.E  1986      5    4   
79476     Barishal      11704  22Deg.45Mts.N  90Deg.20Mts.E  1990      1    5   
114519    Barishal      11704  22Deg.45Mts.N  90Deg.20Mts.E  1980      4    7   
132125    Barishal    

In [26]:
# To find the null values
df_max.isnull().sum()

Unnamed: 0,0
StationName,0
StationID,0
Latitude,0
Longitude,0
Year,0
Month,0
Day,0
MaximumTemperature,9893


In [27]:
# Replace '****' with NaN in the original DataFrame
df_max.loc[df_max['MaximumTemperature'] == '****', 'MaximumTemperature'] = np.nan

In [28]:
# To find the null values
df_max.isnull().sum()

Unnamed: 0,0
StationName,0
StationID,0
Latitude,0
Longitude,0
Year,0
Month,0
Day,0
MaximumTemperature,15727


In [29]:
# Step 2: Convert 'MaximumTemperature' to numeric, coercing errors (turn non-numeric to NaN)
df_max['MaximumTemperature'] = pd.to_numeric(df_max['MaximumTemperature'], errors='coerce')

# Group by StationName, Month, Day, and Time, and calculate the mean for each group
MaximumTemperature_mean = df_max.groupby(['StationName', 'Month', 'Day'])['MaximumTemperature'].transform('mean')

# Replace '****' (NaN values) with the mean of the same station, month, day, and time
df_max['MaximumTemperature'] = df_max['MaximumTemperature'].fillna(MaximumTemperature_mean)

# Verify the result
print(df_max)


       StationName  StationID       Latitude      Longitude  Year  Month  Day  \
0            Dhaka      11111  23Deg.46Mts.N  90Deg.23Mts.E  1980      1    1   
1            Dhaka      11111  23Deg.46Mts.N  90Deg.23Mts.E  1980      2    1   
2            Dhaka      11111  23Deg.46Mts.N  90Deg.23Mts.E  1980      3    1   
3            Dhaka      11111  23Deg.46Mts.N  90Deg.23Mts.E  1980      4    1   
4            Dhaka      11111  23Deg.46Mts.N  90Deg.23Mts.E  1980      5    1   
...            ...        ...            ...            ...   ...    ...  ...   
544975      Teknaf      11929  20Deg.52Mts.N  92Deg.18Mts.E  2023      8   31   
544976      Teknaf      11929  20Deg.52Mts.N  92Deg.18Mts.E  2023      9   31   
544977      Teknaf      11929  20Deg.52Mts.N  92Deg.18Mts.E  2023     10   31   
544978      Teknaf      11929  20Deg.52Mts.N  92Deg.18Mts.E  2023     11   31   
544979      Teknaf      11929  20Deg.52Mts.N  92Deg.18Mts.E  2023     12   31   

        MaximumTemperature 

In [30]:
# Check it again to  find the null values
df_max.isnull().sum()

Unnamed: 0,0
StationName,0
StationID,0
Latitude,0
Longitude,0
Year,0
Month,0
Day,0
MaximumTemperature,8790


In [31]:
# Check rows where 'MaximumTemperature' is still NaN after replacement
rows_with_nan = df_max[df_max['MaximumTemperature'].isna()]

# If there are still rows with NaN, inspect them
rows_with_nan

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Day,MaximumTemperature
509821,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,2,30,
509833,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1981,2,30,
509845,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1982,2,30,
509857,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1983,2,30,
509869,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1984,2,30,
...,...,...,...,...,...,...,...,...
544969,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,2,31,
544971,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,4,31,
544973,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,6,31,
544976,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,9,31,


In [32]:
# Sample: Define valid days for each month (for non-leap years)
valid_days_per_month = {
    1: 31, 2: 28, 3: 31, 4: 30, 5: 31, 6: 30,
    7: 31, 8: 31, 9: 30, 10: 31, 11: 30, 12: 31
}

# Adjust for leap years (for February only)
def is_leap_year(year):
    return (year % 4 == 0 and (year % 100 != 0 or year % 400 == 0))

# Add a column to check if a year is a leap year
df_max['IsLeapYear'] = df_max['Year'].apply(is_leap_year)

# Adjust February days for leap years
df_max.loc[df_max['IsLeapYear'], 'ValidDays'] = df_max.loc[df_max['IsLeapYear'], 'Month'].apply(lambda month: 29 if month == 2 else valid_days_per_month[month])
df_max.loc[~df_max['IsLeapYear'], 'ValidDays'] = df_max.loc[~df_max['IsLeapYear'], 'Month'].apply(lambda month: valid_days_per_month[month])

In [33]:
# Remove rows where 'Day' is greater than the valid days for that month
df_max = df_max[df_max['Day'] <= df_max['ValidDays']]

In [34]:
df_max

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Day,MaximumTemperature,IsLeapYear,ValidDays
0,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,1,24.700000,True,31.0
1,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,2,1,24.400000,True,29.0
2,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,3,1,22.200000,True,31.0
3,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,4,1,35.000000,True,30.0
4,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,5,1,35.300000,True,31.0
...,...,...,...,...,...,...,...,...,...,...
544972,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,5,31,34.200000,False,31.0
544974,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,7,31,32.800000,False,31.0
544975,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,8,31,30.665116,False,31.0
544977,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,10,31,30.832558,False,31.0


In [35]:
# Check it again to  find the null values
df_max.isnull().sum()

Unnamed: 0,0
StationName,0
StationID,0
Latitude,0
Longitude,0
Year,0
Month,0
Day,0
MaximumTemperature,0
IsLeapYear,0
ValidDays,0


In [36]:
df_max = df_max.drop(columns=['IsLeapYear', 'ValidDays'])


In [37]:
df_max

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Day,MaximumTemperature
0,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,1,24.700000
1,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,2,1,24.400000
2,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,3,1,22.200000
3,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,4,1,35.000000
4,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,5,1,35.300000
...,...,...,...,...,...,...,...,...
544972,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,5,31,34.200000
544974,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,7,31,32.800000
544975,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,8,31,30.665116
544977,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,10,31,30.832558


## **Coverting Latitude and Longitude to Numerical Value**

In [38]:
df_max.dtypes

Unnamed: 0,0
StationName,object
StationID,int64
Latitude,object
Longitude,object
Year,int64
Month,int64
Day,int64
MaximumTemperature,float64


In [39]:
def convert_to_decimal(deg_min, direction):
    """
    Converts degree-minute format to decimal degrees.
    Example Input: "23Deg.46Mts.N" -> Output: 23.7667
    """
    parts = deg_min.split("Deg.")
    degrees = float(parts[0])
    minutes = float(parts[1].split("Mts.")[0])

    decimal = degrees + (minutes / 60)

    # Convert South/West to negative
    if direction in ["S", "W"]:
        decimal = -decimal

    return decimal

# Example usage
latitude = convert_to_decimal("23Deg.46Mts.", "N")
longitude = convert_to_decimal("90Deg.23Mts.", "E")

print(latitude, longitude)  # Output: 23.7667, 90.3833


23.766666666666666 90.38333333333334


In [40]:
df_max["Latitude"] = df_max["Latitude"].apply(lambda x: convert_to_decimal(x[:-1], x[-1]))  # Extract last character (N/S)
df_max["Longitude"] = df_max["Longitude"].apply(lambda x: convert_to_decimal(x[:-1], x[-1]))  # Extract last character (E/W)
df_max

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Day,MaximumTemperature
0,Dhaka,11111,23.766667,90.383333,1980,1,1,24.700000
1,Dhaka,11111,23.766667,90.383333,1980,2,1,24.400000
2,Dhaka,11111,23.766667,90.383333,1980,3,1,22.200000
3,Dhaka,11111,23.766667,90.383333,1980,4,1,35.000000
4,Dhaka,11111,23.766667,90.383333,1980,5,1,35.300000
...,...,...,...,...,...,...,...,...
544972,Teknaf,11929,20.866667,92.300000,2023,5,31,34.200000
544974,Teknaf,11929,20.866667,92.300000,2023,7,31,32.800000
544975,Teknaf,11929,20.866667,92.300000,2023,8,31,30.665116
544977,Teknaf,11929,20.866667,92.300000,2023,10,31,30.832558


In [41]:
df_max.dtypes

Unnamed: 0,0
StationName,object
StationID,int64
Latitude,float64
Longitude,float64
Year,int64
Month,int64
Day,int64
MaximumTemperature,float64


## **Creating Date Time Column**

In [42]:
import calendar

# Function to check if a date is valid
def is_invalid_date(row):
    days_in_month = calendar.monthrange(row['Year'], row['Month'])[1]
    return row['Day'] > days_in_month

# Remove invalid rows
df_max = df_max[~df_max.apply(is_invalid_date, axis=1)].reset_index(drop=True)

# Convert datetime again
df_max['Datetime'] = pd.to_datetime(df_max[['Year', 'Month', 'Day']])

print(df_max.info())  # Verify that datetime is correctly created


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 535087 entries, 0 to 535086
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   StationName         535087 non-null  object        
 1   StationID           535087 non-null  int64         
 2   Latitude            535087 non-null  float64       
 3   Longitude           535087 non-null  float64       
 4   Year                535087 non-null  int64         
 5   Month               535087 non-null  int64         
 6   Day                 535087 non-null  int64         
 7   MaximumTemperature  535087 non-null  float64       
 8   Datetime            535087 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(3), int64(4), object(1)
memory usage: 36.7+ MB
None


In [43]:
df_max

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Day,MaximumTemperature,Datetime
0,Dhaka,11111,23.766667,90.383333,1980,1,1,24.700000,1980-01-01
1,Dhaka,11111,23.766667,90.383333,1980,2,1,24.400000,1980-02-01
2,Dhaka,11111,23.766667,90.383333,1980,3,1,22.200000,1980-03-01
3,Dhaka,11111,23.766667,90.383333,1980,4,1,35.000000,1980-04-01
4,Dhaka,11111,23.766667,90.383333,1980,5,1,35.300000,1980-05-01
...,...,...,...,...,...,...,...,...,...
535082,Teknaf,11929,20.866667,92.300000,2023,5,31,34.200000,2023-05-31
535083,Teknaf,11929,20.866667,92.300000,2023,7,31,32.800000,2023-07-31
535084,Teknaf,11929,20.866667,92.300000,2023,8,31,30.665116,2023-08-31
535085,Teknaf,11929,20.866667,92.300000,2023,10,31,30.832558,2023-10-31


In [44]:
print(df_max["Datetime"].apply(type).value_counts())


Datetime
<class 'pandas._libs.tslibs.timestamps.Timestamp'>    535087
Name: count, dtype: int64


In [45]:
print(df_max.groupby("StationID")["Datetime"].nunique())  # Check number of unique timestamps per station


StationID
10120    15705
10208    16071
10320    16071
10408    16071
10609    16071
10705    16071
10724    15340
10910    16071
11111    16071
11313    16071
11316    15705
11407    16071
11505    16071
11513    16071
11604    16071
11610    16071
11704    16071
11706    16071
11805    16071
11809    16071
11814    14611
11912    16071
11916    15706
11921    14245
11925    14244
11927    16071
11929    16071
12007    16071
12103    15705
12110    16071
41858    12053
41909    13514
41926    12783
41958    12783
41977     9131
Name: Datetime, dtype: int64


In [46]:
# Reorganizing columns
df_max = df_max[[
    'Datetime', 'StationName',
    'StationID', 'Latitude', 'Longitude',
    'Year', 'Month', 'Day',
    'MaximumTemperature'
]]

# Display the first few rows to confirm
df_max.head()

Unnamed: 0,Datetime,StationName,StationID,Latitude,Longitude,Year,Month,Day,MaximumTemperature
0,1980-01-01,Dhaka,11111,23.766667,90.383333,1980,1,1,24.7
1,1980-02-01,Dhaka,11111,23.766667,90.383333,1980,2,1,24.4
2,1980-03-01,Dhaka,11111,23.766667,90.383333,1980,3,1,22.2
3,1980-04-01,Dhaka,11111,23.766667,90.383333,1980,4,1,35.0
4,1980-05-01,Dhaka,11111,23.766667,90.383333,1980,5,1,35.3


In [47]:
# Count the number of duplicate rows
duplicate_count = df_max.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

Number of duplicate rows: 0


## **Data Summary**

In [48]:
print(df_max.info())  # Check data types and missing values
print(df_max.describe())  # Summary statistics for numeric columns
print(df_max["Datetime"].min(), df_max["Datetime"].max())  # Verify datetime range
print(df_max.duplicated().sum())  # Check for duplicate rows


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 535087 entries, 0 to 535086
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   Datetime            535087 non-null  datetime64[ns]
 1   StationName         535087 non-null  object        
 2   StationID           535087 non-null  int64         
 3   Latitude            535087 non-null  float64       
 4   Longitude           535087 non-null  float64       
 5   Year                535087 non-null  int64         
 6   Month               535087 non-null  int64         
 7   Day                 535087 non-null  int64         
 8   MaximumTemperature  535087 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(4), object(1)
memory usage: 36.7+ MB
None
                            Datetime      StationID       Latitude  \
count                         535087  535087.000000  535087.000000   
mean   2002-09-26 04:48:14.952035584

In [49]:
print(df_max.isnull().sum())


Datetime              0
StationName           0
StationID             0
Latitude              0
Longitude             0
Year                  0
Month                 0
Day                   0
MaximumTemperature    0
dtype: int64


In [50]:
print(df_max.duplicated(subset=['StationID', 'Datetime']).sum())


0


In [51]:
print(df_max['StationName'].unique())


['Dhaka' 'Tangail' 'Mymensingh' 'Faridpur' 'Madaripur' 'Srimangal'
 'Sylhet' 'Chuadanga' 'Jessore' 'Khulna' 'Mongla' 'Satkhira' 'Barishal'
 'Bhola' 'Khepupara' 'Patuakhali' 'Chandpur' 'Ambagan(Ctg.)' 'Chittagong'
 'Cumilla' "Cox'sBazar" 'Feni' 'Hatiya' 'Kutubdia' 'Maijdee_court'
 'Rangamati' 'Sandwip' 'Sitakunda' 'Teknaf']


In [52]:
print(df_max['Datetime'].is_monotonic_increasing)


False


In [54]:
df_max['MaximumTemperature'] = df_max['MaximumTemperature'].round(1)


## **Saving the Data Finally**

In [55]:
# Save the DataFrame to a CSV file
df_max.to_csv('/content/drive/MyDrive/Thesis/Research Data/Preprocessed Data/Processed_max_temp_data.csv', index=False)
