## **Importing Necessary Libraries**

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import calendar
import os

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## **Data Loading & Preparation for Preprocessing**

In [3]:
## Load the raw daily minimum-temperature table from Google Drive.
## The file is in wide format: one row per station/year/month and one column per day of month (1-31).
path ='/content/drive/MyDrive/Thesis/Research Data/Original_Data/Daily Minimum Temperature.csv'
# low_memory=False reads the file in a single pass instead of in chunks
df = pd.read_csv(path, low_memory=False)
df

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,1,2,3,4,...,22,23,24,25,26,27,28,29,30,31
0,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,11.8,13.4,14,13.9,...,11.1,12.2,12.4,9.6,11.7,11.7,17.8,12.2,13.3,9.9
1,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,2,9.8,10.4,11.1,17.5,...,16.2,20.8,21.9,22.1,16.2,15.7,20.8,22.8,,
2,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,3,19.2,17.2,18.3,20.3,...,20.7,23.4,24.4,25.6,22.2,21.1,23.3,24.4,23.9,20.4
3,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,4,21.4,23.3,25.4,24.7,...,25,27.8,27.8,27.3,27.3,24.4,27.3,20.1,21.7,
4,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,5,23.1,23.9,20.7,23.3,...,27.8,23.8,27.2,25.4,23.3,25,22.2,21.9,24.7,23.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17575,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,8,****,****,****,****,...,****,****,****,****,****,****,****,****,****,****
17576,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,9,****,****,****,****,...,****,****,****,****,****,****,****,****,****,
17577,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,10,****,****,****,****,...,****,****,****,****,****,****,****,****,****,****
17578,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,11,****,****,****,****,...,****,****,****,****,****,****,****,****,****,


In [4]:
# Reshape wide -> long: every day-of-month column becomes its own row,
# with the column label stored in 'Day' and the cell value in 'MinimumTemperature'
id_columns = ['StationName', 'StationID', 'Latitude', 'Longitude', 'Year', 'Month']
df_min = df.melt(
    id_vars=id_columns,
    var_name='Day',
    value_name='MinimumTemperature',
)
df_min


Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Day,MinimumTemperature
0,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,1,11.8
1,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,2,1,9.8
2,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,3,1,19.2
3,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,4,1,21.4
4,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,5,1,23.1
...,...,...,...,...,...,...,...,...
544975,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,8,31,****
544976,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,9,31,
544977,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,10,31,****
544978,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,11,31,


In [5]:
## Find the datatypes of the columns
df_min.dtypes

Unnamed: 0,0
StationName,object
StationID,int64
Latitude,object
Longitude,object
Year,int64
Month,int64
Day,object
MinimumTemperature,object


In [6]:
df_min.shape

(544980, 8)

In [7]:
# Find unique values in the 'StationName' column
df_min['StationName'].unique()

array(['Dhaka', 'Tangail', 'Mymensingh', 'Faridpur', 'Madaripur',
       'Srimangal', 'Sylhet', 'Bogura', 'Dinajpur', 'Ishurdi', 'Rajshahi',
       'Rangpur', 'Saidpur', 'Chuadanga', 'Jessore', 'Khulna', 'Mongla',
       'Satkhira', 'Barishal', 'Bhola', 'Khepupara', 'Patuakhali',
       'Chandpur', 'Ambagan(Ctg.)', 'Chittagong', 'Cumilla', "Cox'sBazar",
       'Feni', 'Hatiya', 'Kutubdia', 'Maijdee_court', 'Rangamati',
       'Sandwip', 'Sitakunda', 'Teknaf'], dtype=object)

In [8]:
# Count unique station names
num_unique_station_name = df_min['StationName'].nunique()

# The label names the column that is actually counted (StationName, not StationID)
print(f"Count of unique station names: {num_unique_station_name}")


Count of unique Station IDs: 35


In [9]:
# Find unique values in the 'StationID' column
df_min['StationID'].unique()

array([11111, 41909, 10609, 11505, 11513, 10724, 10705, 10408, 10120,
       10910, 10320, 10208, 41858, 41926, 11407, 11604, 41958, 11610,
       11704, 11706, 12110, 12103, 11316, 41977, 11921, 11313, 11927,
       11805, 11814, 11925, 11809, 12007, 11916, 11912, 11929])

In [10]:
# Count unique StationIDs
num_unique_station_ids = df_min['StationID'].nunique()

print(f"Count of unique Station IDs: {num_unique_station_ids}")


Count of unique Station IDs: 35


In [11]:
# Find unique values in the 'Latitude' column
df_min['Latitude'].unique()

array(['23Deg.46Mts.N', '24Deg.15Mts.N', '24Deg.43Mts.N', '23Deg.36Mts.N',
       '23Deg.10Mts.N', '24Deg.18Mts.N', '24Deg.54Mts.N', '24Deg.51Mts.N',
       '25Deg.39Mts.N', '24Deg.8Mts.N', '24Deg.22Mts.N', '25Deg.44Mts.N',
       '25Deg.47Mts.N', '23Deg.39Mts.N', '23Deg.11Mts.N', '22Deg.47Mts.N',
       '22Deg.20Mts.N', '22Deg.43Mts.N', '22Deg.45Mts.N', '22Deg.41Mts.N',
       '21Deg.59Mts.N', '23Deg.16Mts.N', '22Deg.21Mts.N', '22Deg.16Mts.N',
       '23Deg.26Mts.N', '21Deg.26Mts.N', '23Deg.2Mts.N', '22Deg.26Mts.N',
       '21Deg.49Mts.N', '22Deg.52Mts.N', '22Deg.32Mts.N', '22Deg.29Mts.N',
       '23Deg.35Mts.N', '20Deg.52Mts.N'], dtype=object)

In [12]:
# Count unique latitude values
num_unique_latitude = df_min['Latitude'].nunique()

# The label names the column that is actually counted (Latitude, not StationID)
print(f"Count of unique latitudes: {num_unique_latitude}")


Count of unique Station IDs: 34


In [13]:
# Find unique values in the 'Longitude' column
df_min['Longitude'].unique()

array(['90Deg.23Mts.E', '89Deg.55Mts.E', '90Deg.26Mts.E', '89Deg.51Mts.E',
       '90Deg.11Mts.E', '91Deg.44Mts.E', '91Deg.53Mts.E', '89Deg.22Mts.E',
       '88Deg.41Mts.E', '89Deg.3Mts.E', '88Deg.42Mts.E', '89Deg.14Mts.E',
       '88Deg.53Mts.E', '88Deg.52Mts.E', '89Deg.10Mts.E', '89Deg.32Mts.E',
       '89Deg.36Mts.E', '89Deg.5Mts.E', '90Deg.20Mts.E', '90Deg.39Mts.E',
       '90Deg.14Mts.E', '90Deg.42Mts.E', '91Deg.49Mts.E', '91Deg.11Mts.E',
       '91Deg.56Mts.E', '91Deg.25Mts.E', '91Deg.6Mts.E', '91Deg.51Mts.E',
       '92Deg.12Mts.E', '91Deg.26Mts.E', '91Deg.42Mts.E', '92Deg.18Mts.E'],
      dtype=object)

In [14]:
df_min['Longitude'].nunique()

32

In [15]:
df_min['Year'].unique()

array([1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990,
       1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
       2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
       2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023])

In [16]:
df_min['Year'].nunique()

44

In [17]:
# Find unique values in the 'Month' column
df_min['Month'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [18]:
# Find unique values in the 'Day' column
df_min['Day'].unique()

array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23',
       '24', '25', '26', '27', '28', '29', '30', '31'], dtype=object)

In [19]:
# Check the dtype of the 'Day' column
df_min['Day'].dtypes

dtype('O')

In [20]:
# Convert the Day column to numeric
df_min['Day'] = pd.to_numeric(df_min['Day'])
df_min

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Day,MinimumTemperature
0,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,1,11.8
1,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,2,1,9.8
2,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,3,1,19.2
3,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,4,1,21.4
4,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,5,1,23.1
...,...,...,...,...,...,...,...,...
544975,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,8,31,****
544976,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,9,31,
544977,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,10,31,****
544978,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,11,31,


In [21]:
## Find the datatypes of the columns again
df_min.dtypes

Unnamed: 0,0
StationName,object
StationID,int64
Latitude,object
Longitude,object
Year,int64
Month,int64
Day,int64
MinimumTemperature,object


In [22]:
# Find unique values in 'MinimumTemperature' column
df_min['MinimumTemperature'].unique()

array(['11.8', '9.8', '19.2', '21.4', '23.1', '25.6', '27.8', '26.4',
       '26.1', '24.3', '21.7', '14', '10.7', '17', '22', '18.4', '27.2',
       '23.6', '26.7', '25', '23.9', '15', '11', '15.7', '16.7', '25.3',
       '28.1', '25.8', '20.3', '12.8', '10.3', '13.3', '15.8', '22.8',
       '23.3', '28.3', '26.9', '26.3', '22.1', '15.6', '12.1', '11.5',
       '14.2', '25.4', '18.9', '28.4', '27.3', '21.2', '16.6', '13.1',
       '19.6', '21', '25.9', '24.8', '23', '20.7', '16.9', '12.2', '19.5',
       '16.3', '19', '20.8', '26.8', '24.4', '13.9', '18', '28.8', '22.2',
       '12.9', '11.7', '22.9', '23.8', '22.4', '24.2', '27.4', '27',
       '20.6', '20.2', '14.7', '11.6', '19.7', '22.3', '27.5', '22.6',
       '11.4', '27.7', '21.5', '14.8', '15.2', '16.8', '26.2', '24.6',
       '26.6', '13.7', '18.5', '23.2', '21.8', '24', '23.5', '16.4',
       '17.8', '28', '21.6', '13.4', '18.7', '28.2', '17.3', '28.5',
       '25.7', '23.4', '12.7', '15.4', '26', '12', '15.3', '17.2', '24.5

## **Handling Missing Values and Unwanted Symbols**

In [23]:
## Count how many entries of 'MinimumTemperature' are the '****' placeholder
(df_min['MinimumTemperature'] == '****').sum()

np.int64(7038)

In [24]:
# Filter rows where 'MinimumTemperature' is '****'
rows_with_asterisks = df_min[df_min['MinimumTemperature'] == '****']

# Display the rows
rows_with_asterisks


Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Day,MinimumTemperature
528,Tangail,41909,24Deg.15Mts.N,89Deg.55Mts.E,1987,1,1,****
529,Tangail,41909,24Deg.15Mts.N,89Deg.55Mts.E,1987,2,1,****
530,Tangail,41909,24Deg.15Mts.N,89Deg.55Mts.E,1987,3,1,****
531,Tangail,41909,24Deg.15Mts.N,89Deg.55Mts.E,1987,4,1,****
936,Tangail,41909,24Deg.15Mts.N,89Deg.55Mts.E,2021,1,1,****
...,...,...,...,...,...,...,...,...
544627,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,1994,8,31,****
544629,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,1994,10,31,****
544975,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,8,31,****
544977,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,10,31,****


In [25]:
# Select every row whose 'MinimumTemperature' is the '****' placeholder
is_placeholder = df_min['MinimumTemperature'] == '****'
rows_with_asterisks = df_min[is_placeholder]

# Bucket the placeholder rows by station
station_specific_results = rows_with_asterisks.groupby('StationName')

# Print one labelled block of rows per station
for station, data in station_specific_results:
    print(f"Station: {station}")
    print(data, '\n')


Station: Ambagan(Ctg.)
          StationName  StationID       Latitude      Longitude  Year  Month  \
99563   Ambagan(Ctg.)      41977  22Deg.21Mts.N  91Deg.49Mts.E  1999     12   
504067  Ambagan(Ctg.)      41977  22Deg.21Mts.N  91Deg.49Mts.E  2013      8   

        Day MinimumTemperature  
99563     6               ****  
504067   29               ****   

Station: Barishal
       StationName  StationID       Latitude      Longitude  Year  Month  Day  \
26794     Barishal      11704  22Deg.45Mts.N  90Deg.20Mts.E  1994     11    2   
114549    Barishal      11704  22Deg.45Mts.N  90Deg.20Mts.E  1982     10    7   
114701    Barishal      11704  22Deg.45Mts.N  90Deg.20Mts.E  1995      6    7   
202418    Barishal      11704  22Deg.45Mts.N  90Deg.20Mts.E  1980      3   12   
272825    Barishal      11704  22Deg.45Mts.N  90Deg.20Mts.E  1987      6   16   
290316    Barishal      11704  22Deg.45Mts.N  90Deg.20Mts.E  1980      1   17   
290405    Barishal      11704  22Deg.45Mts.N  90Deg.2

In [26]:
# To find the null values
df_min.isnull().sum()

Unnamed: 0,0
StationName,0
StationID,0
Latitude,0
Longitude,0
Year,0
Month,0
Day,0
MinimumTemperature,9893


In [27]:
# Replace '****' with NaN in the original DataFrame
df_min.loc[df_min['MinimumTemperature'] == '****', 'MinimumTemperature'] = np.nan

In [28]:
# To find the null values
df_min.isnull().sum()

Unnamed: 0,0
StationName,0
StationID,0
Latitude,0
Longitude,0
Year,0
Month,0
Day,0
MinimumTemperature,16931


In [29]:
# Convert 'MinimumTemperature' to numeric; errors='coerce' turns any non-numeric entry into NaN
df_min['MinimumTemperature'] = pd.to_numeric(df_min['MinimumTemperature'], errors='coerce')

# For every row, compute the mean over all rows that share its StationName, Month and Day
# (transform returns a Series aligned with df_min)
MinimumTemperature_mean = df_min.groupby(['StationName', 'Month', 'Day'])['MinimumTemperature'].transform('mean')

# Fill missing values (including the former '****' placeholders) with that station/month/day mean.
# Groups that have no observed value at all (e.g. 30 February) stay NaN.
df_min['MinimumTemperature'] = df_min['MinimumTemperature'].fillna(MinimumTemperature_mean)

# Verify the result
print(df_min)


       StationName  StationID       Latitude      Longitude  Year  Month  Day  \
0            Dhaka      11111  23Deg.46Mts.N  90Deg.23Mts.E  1980      1    1   
1            Dhaka      11111  23Deg.46Mts.N  90Deg.23Mts.E  1980      2    1   
2            Dhaka      11111  23Deg.46Mts.N  90Deg.23Mts.E  1980      3    1   
3            Dhaka      11111  23Deg.46Mts.N  90Deg.23Mts.E  1980      4    1   
4            Dhaka      11111  23Deg.46Mts.N  90Deg.23Mts.E  1980      5    1   
...            ...        ...            ...            ...   ...    ...  ...   
544975      Teknaf      11929  20Deg.52Mts.N  92Deg.18Mts.E  2023      8   31   
544976      Teknaf      11929  20Deg.52Mts.N  92Deg.18Mts.E  2023      9   31   
544977      Teknaf      11929  20Deg.52Mts.N  92Deg.18Mts.E  2023     10   31   
544978      Teknaf      11929  20Deg.52Mts.N  92Deg.18Mts.E  2023     11   31   
544979      Teknaf      11929  20Deg.52Mts.N  92Deg.18Mts.E  2023     12   31   

        MinimumTemperature 

In [30]:
# Check it again to  find the null values
df_min.isnull().sum()

Unnamed: 0,0
StationName,0
StationID,0
Latitude,0
Longitude,0
Year,0
Month,0
Day,0
MinimumTemperature,8790


In [31]:
# Check rows where 'MinimumTemperature' is still NaN after replacement
rows_with_nan = df_min[df_min['MinimumTemperature'].isna()]

# If there are still rows with NaN, inspect them
rows_with_nan

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Day,MinimumTemperature
509821,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,2,30,
509833,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1981,2,30,
509845,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1982,2,30,
509857,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1983,2,30,
509869,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1984,2,30,
...,...,...,...,...,...,...,...,...
544969,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,2,31,
544971,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,4,31,
544973,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,6,31,
544976,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,9,31,


In [32]:
# Number of days in each month of a non-leap year, keyed by month number (1-12)
valid_days_per_month = {
    1: 31, 2: 28, 3: 31, 4: 30, 5: 31, 6: 30,
    7: 31, 8: 31, 9: 30, 10: 31, 11: 30, 12: 31
}

# Leap-year test, used to give February 29 days in leap years
def is_leap_year(year):
    """Return True if ``year`` is a leap year in the Gregorian calendar.

    Delegates to the standard library's ``calendar.isleap`` (the notebook
    already imports ``calendar``), which applies the usual rule: divisible
    by 4, except century years that are not divisible by 400.
    """
    return calendar.isleap(year)

# Add a column flagging whether each row's year is a leap year
df_min['IsLeapYear'] = df_min['Year'].apply(is_leap_year)

# Days in each row's month: look every month up in the non-leap table in one
# vectorised pass (this also keeps the column integer-typed) ...
df_min['ValidDays'] = df_min['Month'].map(valid_days_per_month)

# ... then give February its 29th day in leap years
df_min.loc[df_min['IsLeapYear'] & (df_min['Month'] == 2), 'ValidDays'] = 29

In [33]:
# Drop impossible calendar days (e.g. 30 February): keep a row only when its Day fits within ValidDays
day_exists = df_min['Day'].le(df_min['ValidDays'])
df_min = df_min[day_exists]

In [34]:
df_min

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Day,MinimumTemperature,IsLeapYear,ValidDays
0,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,1,11.800000,True,31.0
1,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,2,1,9.800000,True,29.0
2,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,3,1,19.200000,True,31.0
3,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,4,1,21.400000,True,30.0
4,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,5,1,23.100000,True,31.0
...,...,...,...,...,...,...,...,...,...,...
544972,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,5,31,28.800000,False,31.0
544974,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,7,31,26.700000,False,31.0
544975,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,8,31,25.411905,False,31.0
544977,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,10,31,23.288095,False,31.0


In [35]:
# Check it again to  find the null values
df_min.isnull().sum()

Unnamed: 0,0
StationName,0
StationID,0
Latitude,0
Longitude,0
Year,0
Month,0
Day,0
MinimumTemperature,0
IsLeapYear,0
ValidDays,0


In [36]:
df_min = df_min.drop(columns=['IsLeapYear', 'ValidDays'])


In [37]:
df_min

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Day,MinimumTemperature
0,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,1,11.800000
1,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,2,1,9.800000
2,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,3,1,19.200000
3,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,4,1,21.400000
4,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,5,1,23.100000
...,...,...,...,...,...,...,...,...
544972,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,5,31,28.800000
544974,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,7,31,26.700000
544975,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,8,31,25.411905
544977,Teknaf,11929,20Deg.52Mts.N,92Deg.18Mts.E,2023,10,31,23.288095


## **Converting Latitude and Longitude to Numerical Values**

In [38]:
df_min.dtypes

Unnamed: 0,0
StationName,object
StationID,int64
Latitude,object
Longitude,object
Year,int64
Month,int64
Day,int64
MinimumTemperature,float64


In [39]:
def convert_to_decimal(deg_min, direction=None):
    """
    Convert a "<degrees>Deg.<minutes>Mts." coordinate string to signed decimal degrees.

    Parameters
    ----------
    deg_min : str
        Coordinate text such as "23Deg.46Mts." or "23Deg.46Mts.N". Anything
        that follows "Mts." is ignored when the numbers are parsed.
    direction : str, optional
        Hemisphere letter ("N", "S", "E" or "W", case-insensitive). "S" and
        "W" produce a negative result. When omitted, the last character of
        ``deg_min`` is used instead and must be one of those four letters.

    Returns
    -------
    float
        ``degrees + minutes / 60``, negated for the southern/western hemisphere.

    Raises
    ------
    ValueError
        If the text does not follow the "<degrees>Deg.<minutes>Mts." pattern,
        or if ``direction`` is omitted and ``deg_min`` does not end in a
        hemisphere letter.

    Examples
    --------
    convert_to_decimal("23Deg.46Mts.", "N")  ->  23.7666...
    convert_to_decimal("90Deg.23Mts.E")      ->  90.3833...
    """
    text = deg_min.strip()

    # No explicit hemisphere: fall back to the trailing letter of the text itself
    if direction is None:
        direction = text[-1:]
        if direction.upper() not in ("N", "S", "E", "W"):
            raise ValueError(
                f"No hemisphere letter given or found at the end of {deg_min!r}"
            )

    try:
        degree_text, remainder = text.split("Deg.", 1)
        minute_text = remainder.split("Mts.", 1)[0]
        degrees = float(degree_text)
        minutes = float(minute_text)
    except ValueError as exc:
        # Covers both a missing "Deg." marker and non-numeric degree/minute text
        raise ValueError(f"Unrecognised coordinate format: {deg_min!r}") from exc

    decimal = degrees + (minutes / 60)

    # Convert South/West to negative
    if direction.upper() in ("S", "W"):
        decimal = -decimal

    return decimal

# Example usage: the hemisphere letter is passed separately from the coordinate text
latitude = convert_to_decimal("23Deg.46Mts.", "N")
longitude = convert_to_decimal("90Deg.23Mts.", "E")

print(latitude, longitude)  # Prints approximately 23.7667 90.3833 (full float precision is shown)


23.766666666666666 90.38333333333334


In [40]:
# Convert each distinct coordinate string once and map the results back onto the rows,
# rather than re-parsing the same few strings for every row.
# The last character is the hemisphere letter (N/S or E/W); the rest is the degree/minute text.
for coord_column in ("Latitude", "Longitude"):
    decimal_by_text = {
        text: convert_to_decimal(text[:-1], text[-1])
        for text in df_min[coord_column].unique()
    }
    df_min[coord_column] = df_min[coord_column].map(decimal_by_text)
df_min

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Day,MinimumTemperature
0,Dhaka,11111,23.766667,90.383333,1980,1,1,11.800000
1,Dhaka,11111,23.766667,90.383333,1980,2,1,9.800000
2,Dhaka,11111,23.766667,90.383333,1980,3,1,19.200000
3,Dhaka,11111,23.766667,90.383333,1980,4,1,21.400000
4,Dhaka,11111,23.766667,90.383333,1980,5,1,23.100000
...,...,...,...,...,...,...,...,...
544972,Teknaf,11929,20.866667,92.300000,2023,5,31,28.800000
544974,Teknaf,11929,20.866667,92.300000,2023,7,31,26.700000
544975,Teknaf,11929,20.866667,92.300000,2023,8,31,25.411905
544977,Teknaf,11929,20.866667,92.300000,2023,10,31,23.288095


In [41]:
df_min.dtypes

Unnamed: 0,0
StationName,object
StationID,int64
Latitude,float64
Longitude,float64
Year,int64
Month,int64
Day,int64
MinimumTemperature,float64


## **Creating Date Time Column**

In [42]:
import calendar

# Predicate: does this row name a day that does not exist in its month?
def is_invalid_date(row):
    """Return True when ``row['Day']`` lies past the last day of ``row['Month']`` in ``row['Year']``."""
    _, last_day = calendar.monthrange(row['Year'], row['Month'])
    return last_day < row['Day']

# Build the timestamp column in one vectorised pass. errors='coerce' turns any
# impossible Year/Month/Day combination into NaT, so dropping NaT removes rows with
# non-existent dates without a slow row-by-row apply(axis=1).
df_min = (
    df_min
    .assign(Datetime=pd.to_datetime(df_min[['Year', 'Month', 'Day']], errors='coerce'))
    .dropna(subset=['Datetime'])
    .reset_index(drop=True)
)

# info() prints its report itself and returns None, so it is not wrapped in print()
df_min.info()  # Verify that datetime is correctly created


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 535087 entries, 0 to 535086
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   StationName         535087 non-null  object        
 1   StationID           535087 non-null  int64         
 2   Latitude            535087 non-null  float64       
 3   Longitude           535087 non-null  float64       
 4   Year                535087 non-null  int64         
 5   Month               535087 non-null  int64         
 6   Day                 535087 non-null  int64         
 7   MinimumTemperature  535087 non-null  float64       
 8   Datetime            535087 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(3), int64(4), object(1)
memory usage: 36.7+ MB
None


In [43]:
df_min

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Day,MinimumTemperature,Datetime
0,Dhaka,11111,23.766667,90.383333,1980,1,1,11.800000,1980-01-01
1,Dhaka,11111,23.766667,90.383333,1980,2,1,9.800000,1980-02-01
2,Dhaka,11111,23.766667,90.383333,1980,3,1,19.200000,1980-03-01
3,Dhaka,11111,23.766667,90.383333,1980,4,1,21.400000,1980-04-01
4,Dhaka,11111,23.766667,90.383333,1980,5,1,23.100000,1980-05-01
...,...,...,...,...,...,...,...,...,...
535082,Teknaf,11929,20.866667,92.300000,2023,5,31,28.800000,2023-05-31
535083,Teknaf,11929,20.866667,92.300000,2023,7,31,26.700000,2023-07-31
535084,Teknaf,11929,20.866667,92.300000,2023,8,31,25.411905,2023-08-31
535085,Teknaf,11929,20.866667,92.300000,2023,10,31,23.288095,2023-10-31


In [44]:
print(df_min["Datetime"].apply(type).value_counts())


Datetime
<class 'pandas._libs.tslibs.timestamps.Timestamp'>    535087
Name: count, dtype: int64


In [45]:
print(df_min.groupby("StationID")["Datetime"].nunique())  # Check number of unique timestamps per station


StationID
10120    15705
10208    16071
10320    16071
10408    16071
10609    16071
10705    16071
10724    15340
10910    16071
11111    16071
11313    16071
11316    15705
11407    16071
11505    16071
11513    16071
11604    16071
11610    16071
11704    16071
11706    16071
11805    16071
11809    16071
11814    14611
11912    16071
11916    15706
11921    14245
11925    14244
11927    16071
11929    16071
12007    16071
12103    15705
12110    16071
41858    12053
41909    13514
41926    12783
41958    12783
41977     9131
Name: Datetime, dtype: int64


In [46]:
# Put the timestamp first, followed by station metadata, the date parts and the measurement
column_order = [
    'Datetime',
    'StationName', 'StationID',
    'Latitude', 'Longitude',
    'Year', 'Month', 'Day',
    'MinimumTemperature',
]
df_min = df_min[column_order]

# Preview the first rows to confirm the new layout
df_min.head()

Unnamed: 0,Datetime,StationName,StationID,Latitude,Longitude,Year,Month,Day,MinimumTemperature
0,1980-01-01,Dhaka,11111,23.766667,90.383333,1980,1,1,11.8
1,1980-02-01,Dhaka,11111,23.766667,90.383333,1980,2,1,9.8
2,1980-03-01,Dhaka,11111,23.766667,90.383333,1980,3,1,19.2
3,1980-04-01,Dhaka,11111,23.766667,90.383333,1980,4,1,21.4
4,1980-05-01,Dhaka,11111,23.766667,90.383333,1980,5,1,23.1


In [47]:
# Count the number of duplicate rows
duplicate_count = df_min.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

Number of duplicate rows: 0


## **Data Summary**

In [48]:
# info() writes its report directly and returns None, so wrapping it in print() adds a stray "None"
df_min.info()  # Check data types and missing values
print(df_min.describe())  # Summary statistics for numeric columns
print(df_min["Datetime"].min(), df_min["Datetime"].max())  # Verify datetime range
print(df_min.duplicated().sum())  # Check for duplicate rows


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 535087 entries, 0 to 535086
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   Datetime            535087 non-null  datetime64[ns]
 1   StationName         535087 non-null  object        
 2   StationID           535087 non-null  int64         
 3   Latitude            535087 non-null  float64       
 4   Longitude           535087 non-null  float64       
 5   Year                535087 non-null  int64         
 6   Month               535087 non-null  int64         
 7   Day                 535087 non-null  int64         
 8   MinimumTemperature  535087 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(4), object(1)
memory usage: 36.7+ MB
None
                            Datetime      StationID       Latitude  \
count                         535087  535087.000000  535087.000000   
mean   2002-09-26 04:48:14.952035584

In [49]:
print(df_min.isnull().sum())


Datetime              0
StationName           0
StationID             0
Latitude              0
Longitude             0
Year                  0
Month                 0
Day                   0
MinimumTemperature    0
dtype: int64


In [50]:
print(df_min.duplicated(subset=['StationID', 'Datetime']).sum())


0


In [51]:
print(df_min['StationName'].unique())


['Dhaka' 'Tangail' 'Mymensingh' 'Faridpur' 'Madaripur' 'Srimangal'
 'Sylhet' 'Bogura' 'Dinajpur' 'Ishurdi' 'Rajshahi' 'Rangpur' 'Saidpur'
 'Chuadanga' 'Jessore' 'Khulna' 'Mongla' 'Satkhira' 'Barishal' 'Bhola'
 'Khepupara' 'Patuakhali' 'Chandpur' 'Ambagan(Ctg.)' 'Chittagong'
 'Cumilla' "Cox'sBazar" 'Feni' 'Hatiya' 'Kutubdia' 'Maijdee_court'
 'Rangamati' 'Sandwip' 'Sitakunda' 'Teknaf']


In [52]:
print(df_min['Datetime'].is_monotonic_increasing)


False


In [53]:
df_min

Unnamed: 0,Datetime,StationName,StationID,Latitude,Longitude,Year,Month,Day,MinimumTemperature
0,1980-01-01,Dhaka,11111,23.766667,90.383333,1980,1,1,11.800000
1,1980-02-01,Dhaka,11111,23.766667,90.383333,1980,2,1,9.800000
2,1980-03-01,Dhaka,11111,23.766667,90.383333,1980,3,1,19.200000
3,1980-04-01,Dhaka,11111,23.766667,90.383333,1980,4,1,21.400000
4,1980-05-01,Dhaka,11111,23.766667,90.383333,1980,5,1,23.100000
...,...,...,...,...,...,...,...,...,...
535082,2023-05-31,Teknaf,11929,20.866667,92.300000,2023,5,31,28.800000
535083,2023-07-31,Teknaf,11929,20.866667,92.300000,2023,7,31,26.700000
535084,2023-08-31,Teknaf,11929,20.866667,92.300000,2023,8,31,25.411905
535085,2023-10-31,Teknaf,11929,20.866667,92.300000,2023,10,31,23.288095


In [54]:
df_min['MinimumTemperature'] = df_min['MinimumTemperature'].round(1)

In [56]:
df_min

Unnamed: 0,Datetime,StationName,StationID,Latitude,Longitude,Year,Month,Day,MinimumTemperature
0,1980-01-01,Dhaka,11111,23.766667,90.383333,1980,1,1,11.8
1,1980-02-01,Dhaka,11111,23.766667,90.383333,1980,2,1,9.8
2,1980-03-01,Dhaka,11111,23.766667,90.383333,1980,3,1,19.2
3,1980-04-01,Dhaka,11111,23.766667,90.383333,1980,4,1,21.4
4,1980-05-01,Dhaka,11111,23.766667,90.383333,1980,5,1,23.1
...,...,...,...,...,...,...,...,...,...
535082,2023-05-31,Teknaf,11929,20.866667,92.300000,2023,5,31,28.8
535083,2023-07-31,Teknaf,11929,20.866667,92.300000,2023,7,31,26.7
535084,2023-08-31,Teknaf,11929,20.866667,92.300000,2023,8,31,25.4
535085,2023-10-31,Teknaf,11929,20.866667,92.300000,2023,10,31,23.3


## **Saving the Final Data**

In [55]:
# Save the processed DataFrame to a CSV file on Google Drive.
# index=False leaves the row index out of the file.
df_min.to_csv('/content/drive/MyDrive/Thesis/Research Data/Preprocessed Data/Processed_min_temp_data.csv', index=False)
