Solar Radiation Measurement Data EDA

In [5]:
# Load the data 
import pandas as pd

togo_data = pd.read_csv('../../data/data/togo-dapaong_qc.csv')

benin_data = pd.read_csv('../../data/data/benin-malanville.csv')

sierra_leone_data = pd.read_csv('../../data/data/sierraleone-bumbuna.csv')

In [6]:
# initial data exploration

print(togo_data.head())
print(benin_data.head())
print(sierra_leone_data.head())

print(togo_data.info())
print(benin_data.info())
print(sierra_leone_data.info())


          Timestamp  GHI  DNI  DHI  ModA  ModB  Tamb    RH   WS  WSgust  \
0  2021-10-25 00:01 -1.3  0.0  0.0   0.0   0.0  24.8  94.5  0.9     1.1   
1  2021-10-25 00:02 -1.3  0.0  0.0   0.0   0.0  24.8  94.4  1.1     1.6   
2  2021-10-25 00:03 -1.3  0.0  0.0   0.0   0.0  24.8  94.4  1.2     1.4   
3  2021-10-25 00:04 -1.2  0.0  0.0   0.0   0.0  24.8  94.3  1.2     1.6   
4  2021-10-25 00:05 -1.2  0.0  0.0   0.0   0.0  24.8  94.0  1.3     1.6   

   WSstdev     WD  WDstdev   BP  Cleaning  Precipitation  TModA  TModB  \
0      0.4  227.6      1.1  977         0            0.0   24.7   24.4   
1      0.4  229.3      0.7  977         0            0.0   24.7   24.4   
2      0.3  228.5      2.9  977         0            0.0   24.7   24.4   
3      0.3  229.1      4.6  977         0            0.0   24.7   24.4   
4      0.4  227.5      1.6  977         0            0.0   24.7   24.4   

   Comments  
0       NaN  
1       NaN  
2       NaN  
3       NaN  
4       NaN  
          Timestamp 

In [7]:
# Missing values check
print(togo_data.isnull().sum())
print(benin_data.isnull().sum())
print(sierra_leone_data.isnull().sum())

Timestamp             0
GHI                   0
DNI                   0
DHI                   0
ModA                  0
ModB                  0
Tamb                  0
RH                    0
WS                    0
WSgust                0
WSstdev               0
WD                    0
WDstdev               0
BP                    0
Cleaning              0
Precipitation         0
TModA                 0
TModB                 0
Comments         525600
dtype: int64
Timestamp             0
GHI                   0
DNI                   0
DHI                   0
ModA                  0
ModB                  0
Tamb                  0
RH                    0
WS                    0
WSgust                0
WSstdev               0
WD                    0
WDstdev               0
BP                    0
Cleaning              0
Precipitation         0
TModA                 0
TModB                 0
Comments         525600
dtype: int64
Timestamp             0
GHI                   0
DNI           

Data Cleaning


In [8]:
# Convert 'Timestamp' to datetime
for df in [togo_data, benin_data, sierra_leone_data]:
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])


In [10]:
# Handle missing values by forward filling
for df in [togo_data, benin_data, sierra_leone_data]:
    df.fillna(method='ffill', inplace=True)


  df.fillna(method='ffill', inplace=True)


In [11]:
# Dropping the empty 'Comments' column
for df in [togo_data, benin_data, sierra_leone_data]:
    df.drop(columns=['Comments'], inplace=True)


In [12]:
import numpy as np

def clean_negative_values(df):
    for col in ['GHI', 'DNI', 'DHI']:
        df[col] = df[col].apply(lambda x: np.nan if x < 0 else x)
        df[col] = df[col].apply(lambda x: np.nan if x == 0 else x)
    return df

# Clean negative and zero values in GHI, DNI, and DHI
togo_data = clean_negative_values(togo_data)
benin_data = clean_negative_values(benin_data)
sierra_leone_data = clean_negative_values(sierra_leone_data)


In [13]:
from scipy.stats import zscore

# Calculate Z-scores for each dataset
togo_z_scores = togo_data.select_dtypes(include=[float]).apply(zscore)
benin_z_scores = benin_data.select_dtypes(include=[float]).apply(zscore)
sierra_leone_z_scores = sierra_leone_data.select_dtypes(include=[float]).apply(zscore)


In [14]:
# Identify outliers (Z-score > 3 or < -3)
togo_outliers = (togo_z_scores.abs() > 3).any(axis=1)
benin_outliers = (benin_z_scores.abs() > 3).any(axis=1)
sierra_leone_outliers = (sierra_leone_z_scores.abs() > 3).any(axis=1)

# Removing outliers
togo_cleaned = togo_data[~togo_outliers]
benin_cleaned = benin_data[~benin_outliers]
sierra_leone_cleaned = sierra_leone_data[~sierra_leone_outliers]


In [None]:
Summary statistics

In [15]:
# Summary statistics
togo_summary = togo_cleaned.describe()
benin_summary = benin_cleaned.describe()
sierra_leone_summary = sierra_leone_cleaned.describe()

print(togo_summary)
print(benin_summary)
print(sierra_leone_summary)


                           Timestamp            GHI            DNI  \
count                         510913  256462.000000  212094.000000   
mean   2022-04-25 02:58:24.396795136     449.362385     353.565783   
min              2021-10-25 00:01:00       0.100000       0.100000   
25%              2022-01-23 06:17:00     156.300000      79.600000   
50%              2022-04-24 20:29:00     421.900000     340.600000   
75%              2022-07-25 19:14:00     734.500000     589.600000   
max              2022-10-25 00:00:00    1279.000000    1004.500000   
std                              NaN     316.024174     271.915452   

                 DHI           ModA           ModB           Tamb  \
count  261660.000000  510913.000000  510913.000000  510913.000000   
mean      223.298455     220.062755     213.661351      27.731760   
min         0.200000       0.000000       0.000000      14.900000   
25%       105.100000       0.000000       0.000000      24.200000   
50%       205.900000    