In [1]:
import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Location of the raw CSV export, relative to this notebook.
file_path = '../../data/raw data.csv'

# Read the whole file into memory, then preview the first five rows.
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(5)

Unnamed: 0,time,weathercode,temperature_2m_max,temperature_2m_min,temperature_2m_mean,apparent_temperature_max,apparent_temperature_min,apparent_temperature_mean,sunrise,sunset,...,precipitation_hours,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,et0_fao_evapotranspiration,latitude,longitude,elevation,country,city
0,1/1/2010,2,30.0,22.7,26.1,34.4,25.2,29.2,6:22:00 AM,6:05:00 PM,...,0,11.7,27.4,20,4.58,7.0,79.899994,16,Sri Lanka,Colombo
1,1/2/2010,51,29.9,23.5,26.2,33.8,26.2,29.8,6:22:00 AM,6:06:00 PM,...,1,13.0,27.0,24,3.84,7.0,79.899994,16,Sri Lanka,Colombo
2,1/3/2010,51,29.5,23.2,26.0,34.3,26.3,29.9,6:23:00 AM,6:06:00 PM,...,3,12.3,27.4,16,3.65,7.0,79.899994,16,Sri Lanka,Colombo
3,1/4/2010,2,28.9,21.9,25.3,31.6,23.4,27.8,6:23:00 AM,6:07:00 PM,...,0,17.0,34.6,356,3.79,7.0,79.899994,16,Sri Lanka,Colombo
4,1/5/2010,1,28.1,21.3,24.5,30.1,23.1,26.1,6:23:00 AM,6:07:00 PM,...,0,18.7,37.1,355,4.97,7.0,79.899994,16,Sri Lanka,Colombo


In [10]:
# 🧹 Step 3: Initial data inspection
# Column dtypes and non-null counts; info() prints its report directly.
df.info()

# Only the last expression of a cell is rendered automatically, so the summary
# statistics have to be displayed explicitly -- as a bare expression in the
# middle of the cell the result was computed and then thrown away.
display(df.describe(include='all'))

# Per-column count of missing values; rendered as the cell's output.
df.isnull().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147480 entries, 0 to 147479
Data columns (total 24 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   time                        147480 non-null  object 
 1   weathercode                 147480 non-null  int64  
 2   temperature_2m_max          147480 non-null  float64
 3   temperature_2m_min          147480 non-null  float64
 4   temperature_2m_mean         147480 non-null  float64
 5   apparent_temperature_max    147480 non-null  float64
 6   apparent_temperature_min    147480 non-null  float64
 7   apparent_temperature_mean   147480 non-null  float64
 8   sunrise                     147480 non-null  object 
 9   sunset                      147480 non-null  object 
 10  shortwave_radiation_sum     147480 non-null  float64
 11  precipitation_sum           147480 non-null  float64
 12  rain_sum                    147480 non-null  float64
 13  snowfall_sum  

time                          0
weathercode                   0
temperature_2m_max            0
temperature_2m_min            0
temperature_2m_mean           0
apparent_temperature_max      0
apparent_temperature_min      0
apparent_temperature_mean     0
sunrise                       0
sunset                        0
shortwave_radiation_sum       0
precipitation_sum             0
rain_sum                      0
snowfall_sum                  0
precipitation_hours           0
windspeed_10m_max             0
windgusts_10m_max             0
winddirection_10m_dominant    0
et0_fao_evapotranspiration    0
latitude                      0
longitude                     0
elevation                     0
country                       0
city                          0
dtype: int64

In [11]:
# 🛠️ Step 4: Convert the sunrise/sunset strings (e.g. "6:22:00 AM") to datetime.time objects

def convert_time_column(time_str):
    """Parse a 12-hour clock string such as ``'6:22:00 AM'`` into a ``datetime.time``.

    Parameters
    ----------
    time_str : object
        Normally a string in ``%I:%M:%S %p`` format. A value that is already
        a ``datetime.time`` is returned unchanged, so applying this function
        to an already-converted column is a no-op instead of turning every
        entry into NaN.

    Returns
    -------
    datetime.time or float
        The parsed time of day, or ``np.nan`` when the value is missing or
        cannot be parsed with the expected format.
    """
    # Already converted (e.g. the cell was re-run): pass through untouched.
    if isinstance(time_str, datetime.time):
        return time_str
    try:
        return pd.to_datetime(time_str, format='%I:%M:%S %p').time()
    except (ValueError, TypeError, AttributeError):
        # Only parse/conversion failures are mapped to NaN. Unrelated
        # exceptions such as KeyboardInterrupt now propagate, so a long
        # .apply() over the column can still be interrupted.
        return np.nan

# Run the element-wise parser over both sun-event columns, overwriting the
# raw strings with the parsed values.
for sun_col in ('sunrise', 'sunset'):
    df[sun_col] = df[sun_col].apply(convert_time_column)


In [12]:
# 📏 Step 5: Check for and remove duplicates (if any)
# Report the (rows, columns) shape on either side of the de-duplication so
# the number of dropped rows can be read off directly.
print("Before removing duplicates: {}".format(df.shape))
# Keep the first occurrence of each fully identical row; mutate df in place.
df.drop_duplicates(keep='first', inplace=True)
print("After removing duplicates: {}".format(df.shape))


Before removing duplicates: (147480, 24)
After removing duplicates: (147480, 24)


In [13]:
# 🧽 Step 6: Handle missing values

# Collect every row that has a null in at least one column and show it.
missing_df = df.loc[df.isna().any(axis='columns')]
print("Rows with missing values:\n", missing_df)

# Drop, in place, each row containing any null value.
df.dropna(axis=0, how='any', inplace=True)

# Per-column null counts after the drop (rendered as the cell's output).
df.isna().sum()


Rows with missing values:
 Empty DataFrame
Columns: [time, weathercode, temperature_2m_max, temperature_2m_min, temperature_2m_mean, apparent_temperature_max, apparent_temperature_min, apparent_temperature_mean, sunrise, sunset, shortwave_radiation_sum, precipitation_sum, rain_sum, snowfall_sum, precipitation_hours, windspeed_10m_max, windgusts_10m_max, winddirection_10m_dominant, et0_fao_evapotranspiration, latitude, longitude, elevation, country, city]
Index: []

[0 rows x 24 columns]


time                          0
weathercode                   0
temperature_2m_max            0
temperature_2m_min            0
temperature_2m_mean           0
apparent_temperature_max      0
apparent_temperature_min      0
apparent_temperature_mean     0
sunrise                       0
sunset                        0
shortwave_radiation_sum       0
precipitation_sum             0
rain_sum                      0
snowfall_sum                  0
precipitation_hours           0
windspeed_10m_max             0
windgusts_10m_max             0
winddirection_10m_dominant    0
et0_fao_evapotranspiration    0
latitude                      0
longitude                     0
elevation                     0
country                       0
city                          0
dtype: int64

In [14]:

# Destination of the cleaned dataset, relative to this notebook.
cleaned_path = '../FC212025 udayanga/cleaned_data.csv'

# Write without the index column so the CSV holds only the data columns.
df.to_csv(path_or_buf=cleaned_path, index=False)
print("Cleaned dataset saved to {}".format(cleaned_path))

Cleaned dataset saved to ../FC212025 udayanga/cleaned_data.csv
