## **Importing Necessary Libraries**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import calendar
import os

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## **Data Loading & Preparation for Preprocessing**

In [None]:
## Load the raw daily sunshine-hours CSV from the mounted Google Drive.
## The file is in wide format: one row per station/year/month, with one column per day of the month (1-31).
path ='/content/drive/MyDrive/Thesis/Research Data/Original_Data/Daily Total Sunshine Hours.csv'
# low_memory=False: parse the file in a single pass rather than in internal chunks
df = pd.read_csv(path, low_memory=False)
df

Unnamed: 0,StationID,Latitude,Longitude,StationName,Year,Month,1,2,3,4,...,22,23,24,25,26,27,28,29,30,31
0,11111,23Deg.46Mts.N,90Deg.23Mts.E,Dhaka,1980,1,9.3,3.2,2.8,9.3,...,8.6,2.8,8.7,9.1,9.1,8,6.5,7,9.6,9.3
1,11111,23Deg.46Mts.N,90Deg.23Mts.E,Dhaka,1980,2,9.2,9,8.6,5.6,...,9.7,7.3,9.5,9.3,8.6,8.8,7.5,5.4,,
2,11111,23Deg.46Mts.N,90Deg.23Mts.E,Dhaka,1980,3,0,9.9,7.1,4.7,...,11,10.5,8.7,4.5,10.4,4.4,9.3,7.2,6.1,10.9
3,11111,23Deg.46Mts.N,90Deg.23Mts.E,Dhaka,1980,4,11.1,11.3,11.2,7.4,...,9.8,8.4,9.2,9.5,9.5,9.4,8.7,7,9.9,
4,11111,23Deg.46Mts.N,90Deg.23Mts.E,Dhaka,1980,5,9.2,10.3,6.7,8.9,...,7.7,9.4,10,7.9,8.3,11.5,6.4,4.8,0.2,6.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,11912,23Deg.35Mts.N,91Deg.42Mts.E,Sitakunda,2023,3,9.4,9.1,9.7,9.6,...,3.9,6.2,9.7,9.3,8.9,1.2,7.3,2.5,7.2,1.4
16277,11912,23Deg.35Mts.N,91Deg.42Mts.E,Sitakunda,2023,4,0,9.4,9,7.3,...,7.7,6.4,8.5,9.3,8.8,8.8,7.2,8,6.6,
16278,11912,23Deg.35Mts.N,91Deg.42Mts.E,Sitakunda,2023,5,6.8,5.9,0.2,0.2,...,8.6,6.7,0.2,4.7,8.1,6.3,7.6,8,7.8,7.7
16279,11912,23Deg.35Mts.N,91Deg.42Mts.E,Sitakunda,2023,6,7.9,7.7,6.1,7.1,...,8.2,1.9,3.5,7.8,0.8,0,7,3.5,4.1,


In [None]:
# Reshape from wide (one column per day of the month) to long (one row per station/year/month/day).
# Every column not listed in id_vars is treated as a day column: its name goes to 'Day',
# its value to 'SunshineHours'.
df_sunshine = df.melt(
    id_vars=['StationName', 'StationID', 'Latitude', 'Longitude', 'Year', 'Month'],
    var_name='Day',
    value_name='SunshineHours',
)
df_sunshine


Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Day,SunshineHours
0,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,1,9.3
1,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,2,1,9.2
2,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,3,1,0
3,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,4,1,11.1
4,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,5,1,9.2
...,...,...,...,...,...,...,...,...
504706,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,3,31,1.4
504707,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,4,31,
504708,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,5,31,7.7
504709,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,6,31,


In [None]:
## Find the datatypes of the columns
df_sunshine.dtypes

Unnamed: 0,0
StationName,object
StationID,int64
Latitude,object
Longitude,object
Year,int64
Month,int64
Day,object
SunshineHours,object


In [None]:
df_sunshine.shape

(504711, 8)

In [None]:
# Find unique values in the 'StationName' column (reveals any stray whitespace in the names)
df_sunshine['StationName'].unique()

array(['Dhaka       ', 'Tangail     ', 'Mymensingh  ', 'Faridpur    ',
       'Madaripur   ', 'Srimangal   ', 'Sylhet      ', 'Bogura       ',
       'Dinajpur    ', 'Ishurdi     ', 'Rajshahi    ', 'Rangpur     ',
       'Saidpur      ', 'Chuadanga   ', 'Jessore     ', 'Khulna      ',
       'Mongla      ', 'Satkhira    ', 'Barishal     ', 'Bhola       ',
       'Khepupara   ', 'Patuakhali  ', 'Chandpur    ', 'Teknaf      ',
       'Ambagan(Ctg.)', 'Cumilla     ', "Cox'sBazar ", 'Feni        ',
       'Hatiya      ', 'Kutubdia    ', 'Maijdee_court', 'Rangamati   ',
       'Sandwip     ', 'Chittagong  ', 'Sitakunda   '], dtype=object)

In [None]:
# Trim the extra spaces from 'StationName' column
df_sunshine['StationName'] = df_sunshine['StationName'].str.strip()

# Verify if spaces are removed
print(df_sunshine['StationName'].unique())


['Dhaka' 'Tangail' 'Mymensingh' 'Faridpur' 'Madaripur' 'Srimangal'
 'Sylhet' 'Bogura' 'Dinajpur' 'Ishurdi' 'Rajshahi' 'Rangpur' 'Saidpur'
 'Chuadanga' 'Jessore' 'Khulna' 'Mongla' 'Satkhira' 'Barishal' 'Bhola'
 'Khepupara' 'Patuakhali' 'Chandpur' 'Teknaf' 'Ambagan(Ctg.)' 'Cumilla'
 "Cox'sBazar" 'Feni' 'Hatiya' 'Kutubdia' 'Maijdee_court' 'Rangamati'
 'Sandwip' 'Chittagong' 'Sitakunda']


In [None]:
# Re-check the unique values in the 'StationName' column after trimming
df_sunshine['StationName'].unique()

array(['Dhaka', 'Tangail', 'Mymensingh', 'Faridpur', 'Madaripur',
       'Srimangal', 'Sylhet', 'Bogura', 'Dinajpur', 'Ishurdi', 'Rajshahi',
       'Rangpur', 'Saidpur', 'Chuadanga', 'Jessore', 'Khulna', 'Mongla',
       'Satkhira', 'Barishal', 'Bhola', 'Khepupara', 'Patuakhali',
       'Chandpur', 'Teknaf', 'Ambagan(Ctg.)', 'Cumilla', "Cox'sBazar",
       'Feni', 'Hatiya', 'Kutubdia', 'Maijdee_court', 'Rangamati',
       'Sandwip', 'Chittagong', 'Sitakunda'], dtype=object)

In [None]:
# Count unique station names
num_unique_station_name = df_sunshine['StationName'].nunique()

# The label names the column that was actually counted (it previously said "Station IDs")
print(f"Count of unique station names: {num_unique_station_name}")


Count of unique Station IDs: 35


In [None]:
# Find unique values in the 'StationID' column
df_sunshine['StationID'].unique()

array([11111, 41909, 10609, 11505, 11513, 10704, 10705, 10408, 10120,
       10910, 10320, 10208, 41858, 41926, 11407, 11604, 41958, 11610,
       11704, 11706, 12110, 12103, 11316, 11929, 41977, 11313, 11927,
       11805, 11814, 11925, 11809, 12007, 11916, 11921, 11912])

In [None]:
# Count unique StationIDs
num_unique_station_ids = df_sunshine['StationID'].nunique()

print(f"Count of unique Station IDs: {num_unique_station_ids}")


Count of unique Station IDs: 35


In [None]:
# Find unique values in the 'Latitude' column (still degree/minute strings at this point)
df_sunshine['Latitude'].unique()

array(['23Deg.46Mts.N', '24Deg.15Mts.N', '24Deg.43Mts.N', '23Deg.36Mts.N',
       '23Deg.10Mts.N', '24Deg.18Mts.N', '24Deg.54Mts.N', '24Deg.51Mts.N',
       '25Deg.39Mts.N', '24Deg.8Mts.N', '24Deg.22Mts.N', '25Deg.44Mts.N',
       '25Deg.47Mts.N', '23Deg.39Mts.N', '23Deg.11Mts.N', '22Deg.47Mts.N',
       '22Deg.20Mts.N', '22Deg.43Mts.N', '22Deg.45Mts.N', '22Deg.41Mts.N',
       '21Deg.59Mts.N', '23Deg.16Mts.N', '20Deg.52Mts.N', '22Deg.21Mts.N',
       '23Deg.26Mts.N', '21Deg.26Mts.N', '23Deg.2Mts.N', '22Deg.26Mts.N',
       '21Deg.49Mts.N', '22Deg.52Mts.N', '22Deg.32Mts.N', '22Deg.29Mts.N',
       '22Deg.16Mts.N', '23Deg.35Mts.N'], dtype=object)

In [None]:
# Count unique latitude values
num_unique_latitude = df_sunshine['Latitude'].nunique()

# The label names the column that was actually counted (it previously said "Station IDs")
print(f"Count of unique latitudes: {num_unique_latitude}")


Count of unique Station IDs: 34


In [None]:
# Find unique values in the 'Longitude' column (still degree/minute strings at this point)
df_sunshine['Longitude'].unique()

array(['90Deg.23Mts.E', '89Deg.55Mts.E', '90Deg.26Mts.E', '89Deg.51Mts.E',
       '90Deg.11Mts.E', '91Deg.44Mts.E', '91Deg.53Mts.E', '89Deg.22Mts.E',
       '88Deg.41Mts.E', '89Deg.3Mts.E', '88Deg.42Mts.E', '89Deg.14Mts.E',
       '88Deg.53Mts.E', '88Deg.52Mts.E', '89Deg.10Mts.E', '89Deg.32Mts.E',
       '89Deg.36Mts.E', '89Deg.5Mts.E', '90Deg.20Mts.E', '90Deg.39Mts.E',
       '90Deg.14Mts.E', '90Deg.42Mts.E', '92Deg.18Mts.E', '91Deg.49Mts.E',
       '91Deg.11Mts.E', '91Deg.56Mts.E', '91Deg.25Mts.E', '91Deg.6Mts.E',
       '91Deg.51Mts.E', '92Deg.12Mts.E', '91Deg.26Mts.E', '91Deg.42Mts.E'],
      dtype=object)

In [None]:
df_sunshine['Longitude'].nunique()

32

In [None]:
df_sunshine['Year'].unique()

array([1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1992,
       1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
       2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014,
       2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 1990, 1991])

In [None]:
df_sunshine['Year'].nunique()

44

In [None]:
# Find unique values in the 'Month' column
df_sunshine['Month'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [None]:
# Find unique values in the 'Day' column (strings here, because they came from the melted column names)
df_sunshine['Day'].unique()

array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23',
       '24', '25', '26', '27', '28', '29', '30', '31'], dtype=object)

In [None]:
# Check the data type of the 'Day' column
df_sunshine['Day'].dtypes

dtype('O')

In [None]:
# Convert the Day column to numeric
df_sunshine['Day'] = pd.to_numeric(df_sunshine['Day'])
df_sunshine

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Day,SunshineHours
0,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,1,9.3
1,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,2,1,9.2
2,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,3,1,0
3,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,4,1,11.1
4,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,5,1,9.2
...,...,...,...,...,...,...,...,...
504706,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,3,31,1.4
504707,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,4,31,
504708,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,5,31,7.7
504709,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,6,31,


In [None]:
## Find the datatypes of the columns again
df_sunshine.dtypes

Unnamed: 0,0
StationName,object
StationID,int64
Latitude,object
Longitude,object
Year,int64
Month,int64
Day,int64
SunshineHours,object


In [None]:
# Find unique values in 'SunshineHours' column
df_sunshine['SunshineHours'].unique()

array(['9.3', '9.2', '0', '11.1', '3.2', '4.9', '8.7', '5.8', '10.1', '9',
       '6', '5.1', '10.3', '5.7', '0.1', '7.6', '2.8', '8.5', '8.3',
       '9.1', '1.7', '8.4', '4.1', '6.5', '10.2', '7.2', '9.6', '8.8',
       '4.4', '9.4', '6.7', '3.8', '8.2', '****', '7.1', '8.1', '9.8',
       '11.7', '2', '4', '10', '1.4', '8.6', '10.6', '4.6', '6.9', '7.4',
       '10.7', '6.1', '3.4', '7.8', '6.6', '5', '0.2', '0.4', '3.6',
       '7.9', '1.6', '8', '5.3', '6.2', '9.5', '4.5', '0.7', '6.8', '1.3',
       '0.3', '5.9', '7.5', '2.4', '2.3', '3.5', '2.7', '1.8', '8.9',
       '2.6', '3.9', '3.1', '5.6', '2.9', '10.4', '4.3', '7.3', '10.9',
       '0.6', '3.3', '2.1', '2.2', '7', '5.2', '6.4', '0.8', '5.5', '7.7',
       '3.7', '3', '5.4', '10.8', '9.7', '1.2', '6.3', '2.5', '0.5', '1',
       '1.9', '11.5', '11.6', '4.7', '0.9', '4.8', '4.2', '9.9', '11',
       '1.1', '10.5', '12', '11.3', '11.4', '11.2', '11.8', '1.5', '13.2',
       '12.6', '11.9', '12.2', '12.5', '13.1', '12.1', '12.

## **Handling Missing Values and Unwanted Symbols**

In [None]:
## Count how many 'SunshineHours' entries are the '****' placeholder
(df_sunshine['SunshineHours'] == '****').sum()

np.int64(26963)

In [None]:
# Filter rows where 'SunshineHours' is '****'
rows_with_asterisks = df_sunshine[df_sunshine['SunshineHours'] == '****']

# Display the rows
rows_with_asterisks


Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Day,SunshineHours
42,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1983,7,1,****
46,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1983,11,1,****
78,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1986,7,1,****
79,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1986,8,1,****
120,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1992,1,1,****
...,...,...,...,...,...,...,...,...
504315,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,1990,8,31,****
504327,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,1991,8,31,****
504435,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2000,8,31,****
504449,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2001,10,31,****


In [None]:
# Select the rows whose 'SunshineHours' value is the '****' placeholder
# (same filter as the previous cell, repeated so this cell can run on its own)
rows_with_asterisks = df_sunshine[df_sunshine['SunshineHours'] == '****']

# Group the placeholder rows by 'StationName'
station_specific_results = rows_with_asterisks.groupby('StationName')

# Print each station's placeholder rows in turn, separated by a blank line
for station, data in station_specific_results:
    print(f"Station: {station}")
    print(data, '\n')


Station: Ambagan(Ctg.)
          StationName  StationID       Latitude      Longitude  Year  Month  \
11430   Ambagan(Ctg.)      41977  22Deg.21Mts.N  91Deg.49Mts.E  2010      7   
11559   Ambagan(Ctg.)      41977  22Deg.21Mts.N  91Deg.49Mts.E  2021      4   
11564   Ambagan(Ctg.)      41977  22Deg.21Mts.N  91Deg.49Mts.E  2021      9   
27711   Ambagan(Ctg.)      41977  22Deg.21Mts.N  91Deg.49Mts.E  2010      7   
27840   Ambagan(Ctg.)      41977  22Deg.21Mts.N  91Deg.49Mts.E  2021      4   
...               ...        ...            ...            ...   ...    ...   
467432  Ambagan(Ctg.)      41977  22Deg.21Mts.N  91Deg.49Mts.E  2021      9   
483579  Ambagan(Ctg.)      41977  22Deg.21Mts.N  91Deg.49Mts.E  2010      7   
483708  Ambagan(Ctg.)      41977  22Deg.21Mts.N  91Deg.49Mts.E  2021      4   
483713  Ambagan(Ctg.)      41977  22Deg.21Mts.N  91Deg.49Mts.E  2021      9   
499860  Ambagan(Ctg.)      41977  22Deg.21Mts.N  91Deg.49Mts.E  2010      7   

        Day SunshineHours  


In [None]:
# To find the null values
df_sunshine.isnull().sum()

Unnamed: 0,0
StationName,0
StationID,0
Latitude,0
Longitude,0
Year,0
Month,0
Day,0
SunshineHours,9196


In [None]:
# Replace '****' with NaN in the original DataFrame
df_sunshine.loc[df_sunshine['SunshineHours'] == '****', 'SunshineHours'] = np.nan

In [None]:
# To find the null values
df_sunshine.isnull().sum()

Unnamed: 0,0
StationName,0
StationID,0
Latitude,0
Longitude,0
Year,0
Month,0
Day,0
SunshineHours,36159


In [None]:
# Convert 'SunshineHours' to numeric; any remaining non-numeric entry is coerced to NaN
df_sunshine['SunshineHours'] = pd.to_numeric(df_sunshine['SunshineHours'], errors='coerce')

# Mean sunshine hours of each (StationName, Month, Day) group across all years;
# transform('mean') returns one value per row, aligned with df_sunshine
SunshineHours_mean = df_sunshine.groupby(['StationName', 'Month', 'Day'])['SunshineHours'].transform('mean')

# Fill missing values (including the former '****' entries) with the mean of the same station, month and day.
# Groups with no observed value at all (e.g. day 30 of February) have a NaN mean, so those rows stay NaN.
df_sunshine['SunshineHours'] = df_sunshine['SunshineHours'].fillna(SunshineHours_mean)

# Verify the result
print(df_sunshine)


       StationName  StationID       Latitude      Longitude  Year  Month  Day  \
0            Dhaka      11111  23Deg.46Mts.N  90Deg.23Mts.E  1980      1    1   
1            Dhaka      11111  23Deg.46Mts.N  90Deg.23Mts.E  1980      2    1   
2            Dhaka      11111  23Deg.46Mts.N  90Deg.23Mts.E  1980      3    1   
3            Dhaka      11111  23Deg.46Mts.N  90Deg.23Mts.E  1980      4    1   
4            Dhaka      11111  23Deg.46Mts.N  90Deg.23Mts.E  1980      5    1   
...            ...        ...            ...            ...   ...    ...  ...   
504706   Sitakunda      11912  23Deg.35Mts.N  91Deg.42Mts.E  2023      3   31   
504707   Sitakunda      11912  23Deg.35Mts.N  91Deg.42Mts.E  2023      4   31   
504708   Sitakunda      11912  23Deg.35Mts.N  91Deg.42Mts.E  2023      5   31   
504709   Sitakunda      11912  23Deg.35Mts.N  91Deg.42Mts.E  2023      6   31   
504710   Sitakunda      11912  23Deg.35Mts.N  91Deg.42Mts.E  2023      7   31   

        SunshineHours  
0  

In [None]:
# Check it again to  find the null values
df_sunshine.isnull().sum()

Unnamed: 0,0
StationName,0
StationID,0
Latitude,0
Longitude,0
Year,0
Month,0
Day,0
SunshineHours,6416


In [None]:
# Check rows where 'SunshineHours' is still NaN after replacement
rows_with_nan = df_sunshine[df_sunshine['SunshineHours'].isna()]

# If there are still rows with NaN, inspect them
rows_with_nan

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Day,SunshineHours
472150,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,2,30,
472162,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1981,2,30,
472174,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1982,2,30,
472186,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1983,2,30,
472198,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1984,2,30,
...,...,...,...,...,...,...,...,...
504700,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2022,9,31,
504702,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2022,11,31,
504705,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,2,31,
504707,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,4,31,


In [None]:
# Sample: Define valid days for each month (for non-leap years)
valid_days_per_month = {
    1: 31, 2: 28, 3: 31, 4: 30, 5: 31, 6: 30,
    7: 31, 8: 31, 9: 30, 10: 31, 11: 30, 12: 31
}

# Leap-year test, used to give February 29 days instead of 28
def is_leap_year(year):
    """Return True when `year` is a Gregorian leap year.

    A year is a leap year when it is divisible by 4, except for century
    years, which must also be divisible by 400.
    """
    divisible_by_four = year % 4 == 0
    return divisible_by_four and (year % 100 != 0 or year % 400 == 0)

# Flag every row whose 'Year' is a leap year
df_sunshine['IsLeapYear'] = df_sunshine['Year'].apply(is_leap_year)

# 'ValidDays' holds the number of days in the row's month.
# Leap-year rows are filled first (February -> 29, other months from valid_days_per_month) ...
df_sunshine.loc[df_sunshine['IsLeapYear'], 'ValidDays'] = df_sunshine.loc[df_sunshine['IsLeapYear'], 'Month'].apply(lambda month: 29 if month == 2 else valid_days_per_month[month])
# ... then the non-leap-year rows, straight from valid_days_per_month (February -> 28)
df_sunshine.loc[~df_sunshine['IsLeapYear'], 'ValidDays'] = df_sunshine.loc[~df_sunshine['IsLeapYear'], 'Month'].apply(lambda month: valid_days_per_month[month])

In [None]:
# Remove rows where 'Day' is greater than the valid days for that month
df_sunshine = df_sunshine[df_sunshine['Day'] <= df_sunshine['ValidDays']]

In [None]:
df_sunshine

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Day,SunshineHours,IsLeapYear,ValidDays
0,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,1,9.3,True,31.0
1,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,2,1,9.2,True,29.0
2,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,3,1,0.0,True,31.0
3,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,4,1,11.1,True,30.0
4,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,5,1,9.2,True,31.0
...,...,...,...,...,...,...,...,...,...,...
504703,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2022,12,31,8.3,False,31.0
504704,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,1,31,8.4,False,31.0
504706,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,3,31,1.4,False,31.0
504708,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,5,31,7.7,False,31.0


In [None]:
# Check it again to  find the null values
df_sunshine.isnull().sum()

Unnamed: 0,0
StationName,0
StationID,0
Latitude,0
Longitude,0
Year,0
Month,0
Day,0
SunshineHours,0
IsLeapYear,0
ValidDays,0


In [None]:
df_sunshine = df_sunshine.drop(columns=['IsLeapYear', 'ValidDays'])


In [None]:
df_sunshine

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Day,SunshineHours
0,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,1,1,9.3
1,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,2,1,9.2
2,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,3,1,0.0
3,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,4,1,11.1
4,Dhaka,11111,23Deg.46Mts.N,90Deg.23Mts.E,1980,5,1,9.2
...,...,...,...,...,...,...,...,...
504703,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2022,12,31,8.3
504704,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,1,31,8.4
504706,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,3,31,1.4
504708,Sitakunda,11912,23Deg.35Mts.N,91Deg.42Mts.E,2023,5,31,7.7


## **Converting Latitude and Longitude to Numerical Values**

In [None]:
df_sunshine.dtypes

Unnamed: 0,0
StationName,object
StationID,int64
Latitude,object
Longitude,object
Year,int64
Month,int64
Day,int64
SunshineHours,float64


In [None]:
def convert_to_decimal(deg_min, direction):
    """
    Convert a "<degrees>Deg.<minutes>Mts." coordinate string to decimal degrees.

    Parameters:
        deg_min: text such as "23Deg.46Mts."; anything after "Mts." is ignored.
        direction: hemisphere letter; "S" and "W" give a negative result,
            any other value leaves the result positive.

    Returns:
        degrees + minutes / 60 as a float, negated for "S"/"W".

    Example: convert_to_decimal("23Deg.46Mts.", "N") -> 23.7667 (rounded)
    """
    pieces = deg_min.split("Deg.")
    whole_degrees = float(pieces[0])
    # The minutes are whatever sits between "Deg." and "Mts."
    arc_minutes = float(pieces[1].split("Mts.")[0])

    # Southern and western hemispheres are expressed as negative decimal degrees
    sign = -1 if direction in ("S", "W") else 1

    return sign * (whole_degrees + arc_minutes / 60)

# Example usage
latitude = convert_to_decimal("23Deg.46Mts.", "N")
longitude = convert_to_decimal("90Deg.23Mts.", "E")

print(latitude, longitude)  # Output: 23.7667, 90.3833


23.766666666666666 90.38333333333334


In [None]:
df_sunshine["Latitude"] = df_sunshine["Latitude"].apply(lambda x: convert_to_decimal(x[:-1], x[-1]))  # Extract last character (N/S)
df_sunshine["Longitude"] = df_sunshine["Longitude"].apply(lambda x: convert_to_decimal(x[:-1], x[-1]))  # Extract last character (E/W)
df_sunshine

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Day,SunshineHours
0,Dhaka,11111,23.766667,90.383333,1980,1,1,9.3
1,Dhaka,11111,23.766667,90.383333,1980,2,1,9.2
2,Dhaka,11111,23.766667,90.383333,1980,3,1,0.0
3,Dhaka,11111,23.766667,90.383333,1980,4,1,11.1
4,Dhaka,11111,23.766667,90.383333,1980,5,1,9.2
...,...,...,...,...,...,...,...,...
504703,Sitakunda,11912,23.583333,91.700000,2022,12,31,8.3
504704,Sitakunda,11912,23.583333,91.700000,2023,1,31,8.4
504706,Sitakunda,11912,23.583333,91.700000,2023,3,31,1.4
504708,Sitakunda,11912,23.583333,91.700000,2023,5,31,7.7


In [None]:
df_sunshine.dtypes

Unnamed: 0,0
StationName,object
StationID,int64
Latitude,float64
Longitude,float64
Year,int64
Month,int64
Day,int64
SunshineHours,float64


## **Creating the Datetime Column**

In [None]:
import calendar

# Function to check whether a row's date is INVALID (day beyond the end of its month)
def is_invalid_date(row):
    """Return True when row['Day'] exceeds the number of days in row['Year']/row['Month'].

    `row` is anything indexable by 'Year', 'Month' and 'Day'. The month length
    comes from calendar.monthrange, so leap years are handled.
    """
    _, last_day_of_month = calendar.monthrange(row['Year'], row['Month'])
    return row['Day'] > last_day_of_month

# Drop any row whose Day does not exist in its Year/Month, then renumber the index from 0
df_sunshine = df_sunshine[~df_sunshine.apply(is_invalid_date, axis=1)].reset_index(drop=True)

# Build the Datetime column from the Year/Month/Day components
df_sunshine['Datetime'] = pd.to_datetime(df_sunshine[['Year', 'Month', 'Day']])

# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() appended a stray "None" line to the output.
df_sunshine.info()  # Verify that datetime is correctly created


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 495516 entries, 0 to 495515
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   StationName    495516 non-null  object        
 1   StationID      495516 non-null  int64         
 2   Latitude       495516 non-null  float64       
 3   Longitude      495516 non-null  float64       
 4   Year           495516 non-null  int64         
 5   Month          495516 non-null  int64         
 6   Day            495516 non-null  int64         
 7   SunshineHours  495516 non-null  float64       
 8   Datetime       495516 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(3), int64(4), object(1)
memory usage: 34.0+ MB
None


In [None]:
df_sunshine

Unnamed: 0,StationName,StationID,Latitude,Longitude,Year,Month,Day,SunshineHours,Datetime
0,Dhaka,11111,23.766667,90.383333,1980,1,1,9.3,1980-01-01
1,Dhaka,11111,23.766667,90.383333,1980,2,1,9.2,1980-02-01
2,Dhaka,11111,23.766667,90.383333,1980,3,1,0.0,1980-03-01
3,Dhaka,11111,23.766667,90.383333,1980,4,1,11.1,1980-04-01
4,Dhaka,11111,23.766667,90.383333,1980,5,1,9.2,1980-05-01
...,...,...,...,...,...,...,...,...,...
495511,Sitakunda,11912,23.583333,91.700000,2022,12,31,8.3,2022-12-31
495512,Sitakunda,11912,23.583333,91.700000,2023,1,31,8.4,2023-01-31
495513,Sitakunda,11912,23.583333,91.700000,2023,3,31,1.4,2023-03-31
495514,Sitakunda,11912,23.583333,91.700000,2023,5,31,7.7,2023-05-31


In [None]:
print(df_sunshine["Datetime"].apply(type).value_counts())


Datetime
<class 'pandas._libs.tslibs.timestamps.Timestamp'>    495516
Name: count, dtype: int64


In [None]:
print(df_sunshine.groupby("StationID")["Datetime"].nunique())  # Check number of unique timestamps per station


StationID
10120    12630
10208    15887
10320    15552
10408    15553
10609    15918
10704    13726
10705    15887
10910    14060
11111    15188
11313    14822
11316    15553
11407    15918
11505    14060
11513    13726
11604    14457
11610    14426
11704    15918
11706    15552
11805    14060
11809    14091
11814    12630
11912    15918
11916    13361
11921    14426
11925    14457
11927    15918
11929    15918
12007    13361
12103    14091
12110    12996
41858    14091
41909    13361
41926    14091
41958     8247
41977     5449
Name: Datetime, dtype: int64


In [None]:
# Reorganizing columns
df_sunshine = df_sunshine[[
    'Datetime', 'StationName',
    'StationID', 'Latitude', 'Longitude',
    'Year', 'Month', 'Day',
    'SunshineHours'
]]

# Display the first few rows to confirm
df_sunshine.head()

Unnamed: 0,Datetime,StationName,StationID,Latitude,Longitude,Year,Month,Day,SunshineHours
0,1980-01-01,Dhaka,11111,23.766667,90.383333,1980,1,1,9.3
1,1980-02-01,Dhaka,11111,23.766667,90.383333,1980,2,1,9.2
2,1980-03-01,Dhaka,11111,23.766667,90.383333,1980,3,1,0.0
3,1980-04-01,Dhaka,11111,23.766667,90.383333,1980,4,1,11.1
4,1980-05-01,Dhaka,11111,23.766667,90.383333,1980,5,1,9.2


## **Data Summary**

In [None]:
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() appended a stray "None" line to the output.
df_sunshine.info()  # Check data types and missing values
print(df_sunshine.describe())  # Summary statistics for numeric columns
print(df_sunshine["Datetime"].min(), df_sunshine["Datetime"].max())  # Verify datetime range
print(df_sunshine.duplicated().sum())  # Check for duplicate rows


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 495516 entries, 0 to 495515
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   Datetime       495516 non-null  datetime64[ns]
 1   StationName    495516 non-null  object        
 2   StationID      495516 non-null  int64         
 3   Latitude       495516 non-null  float64       
 4   Longitude      495516 non-null  float64       
 5   Year           495516 non-null  int64         
 6   Month          495516 non-null  int64         
 7   Day            495516 non-null  int64         
 8   SunshineHours  495516 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(4), object(1)
memory usage: 34.0+ MB
None
                            Datetime      StationID       Latitude  \
count                         495516  495516.000000  495516.000000   
mean   2003-08-14 03:51:29.114377728   14791.944044      23.348205   
min              1980

In [None]:
print(df_sunshine.isnull().sum())


Datetime         0
StationName      0
StationID        0
Latitude         0
Longitude        0
Year             0
Month            0
Day              0
SunshineHours    0
dtype: int64


In [None]:
# Count the number of duplicate rows
duplicate_count = df_sunshine.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")


Number of duplicate rows: 199


In [None]:
# Remove rows that are exact copies of an earlier row (all columns equal)
df_sunshine_clean = df_sunshine.drop_duplicates()

# Keep a single row per station per calendar day. The subset has to identify one
# daily observation: de-duplicating on ['StationName', 'Year', 'Month'] would keep
# only the first row of every station-month and silently discard the other days.
# keep='first' retains the first occurrence when duplicated dates carry different values.
df_sunshine_clean = df_sunshine_clean.drop_duplicates(subset=['StationID', 'Datetime'], keep='first')

# NOTE(review): the cells below (including the final save) still read df_sunshine;
# switch them to df_sunshine_clean if the de-duplicated data is what should be exported.


In [None]:
print(df_sunshine.duplicated(subset=['StationID', 'Datetime']).sum())


217


In [None]:
print(df_sunshine['StationName'].unique())


['Dhaka' 'Tangail' 'Mymensingh' 'Faridpur' 'Madaripur' 'Srimangal'
 'Sylhet' 'Bogura' 'Dinajpur' 'Ishurdi' 'Rajshahi' 'Rangpur' 'Saidpur'
 'Chuadanga' 'Jessore' 'Khulna' 'Mongla' 'Satkhira' 'Barishal' 'Bhola'
 'Khepupara' 'Patuakhali' 'Chandpur' 'Teknaf' 'Ambagan(Ctg.)' 'Cumilla'
 "Cox'sBazar" 'Feni' 'Hatiya' 'Kutubdia' 'Maijdee_court' 'Rangamati'
 'Sandwip' 'Chittagong' 'Sitakunda']


In [None]:
print(df_sunshine['Datetime'].is_monotonic_increasing)


False


In [None]:
df_sunshine['SunshineHours'] = df_sunshine['SunshineHours'].round(1)


## **Saving the Final Data**

In [None]:
# Save the DataFrame to a CSV file (index=False: the row index is not written as a column)
# NOTE(review): this exports df_sunshine, not df_sunshine_clean. The duplicate check above
# reported 217 repeated (StationID, Datetime) pairs in df_sunshine - confirm whether the
# exported file is meant to contain them.
df_sunshine.to_csv('/content/drive/MyDrive/Thesis/Research Data/Preprocessed Data/Processed_daily_Sunshine_data.csv', index=False)
