In [2]:
import pandas as pd
import numpy as np
import os

class DataFolderProcessor:
    """Locate the CSV files inside a data folder and load them on request.

    Attributes:
        current_directory: Working directory at construction time.
        data_folder_path: ``current_directory`` joined with ``folder_name``.
        csv_files: Sorted names (not full paths) of the regular files in
            ``data_folder_path`` whose name ends with ``.csv``.
    """

    def __init__(self, folder_name='cleaned_data'):
        """Resolve the data folder and list the CSV files it contains.

        Args:
            folder_name: Folder to scan, joined onto the current working
                directory.

        Raises:
            OSError: Propagated from ``os.listdir`` when the folder cannot
                be listed (for example, it does not exist).
        """
        self.current_directory = os.getcwd()
        self.data_folder_path = os.path.join(self.current_directory, folder_name)
        # os.listdir() yields names in arbitrary order and also returns
        # sub-directories. Sort so every run handles files in the same order,
        # and keep regular files only so process_file() never receives a
        # directory that merely happens to be named "*.csv".
        self.csv_files = sorted(
            entry
            for entry in os.listdir(self.data_folder_path)
            if entry.endswith('.csv')
            and os.path.isfile(os.path.join(self.data_folder_path, entry))
        )

    def process_file(self, csv_file):
        """Read one CSV file from the data folder.

        Args:
            csv_file: File name relative to ``data_folder_path``.

        Returns:
            The file's contents as a ``pandas.DataFrame``.
        """
        file_path = os.path.join(self.data_folder_path, csv_file)
        return pd.read_csv(file_path)

# Function to check data quality
def data_quality_check(df):
    """Print a data-quality report for ``df``.

    The report lists the per-column count of missing values, the rows with
    negative GHI, DNI or DHI readings, and the rows whose z-score magnitude
    exceeds 3 for each of GHI, DNI, DHI, ModA, ModB, WS and WSgust.

    Side effect: a ``<column>_zscore`` column is added to ``df`` (in place)
    for every column that is checked for outliers.

    Args:
        df: Frame containing the columns named above.

    Returns:
        None.
    """
    irradiance_columns = ['GHI', 'DNI', 'DHI']
    zscore_columns = irradiance_columns + ['ModA', 'ModB', 'WS', 'WSgust']

    # Missing values per column.
    null_counts = df.isnull().sum()
    print("Missing Values:\n", null_counts)

    # Rows with a negative irradiance reading.
    for name in irradiance_columns:
        below_zero = df.loc[df[name] < 0]
        print(f"Negative values in {name}:\n", below_zero)

    # Rows lying more than three standard deviations from the column mean.
    for name in zscore_columns:
        z_name = f'{name}_zscore'
        values = df[name]
        df[z_name] = (values - values.mean()) / values.std()
        flagged = df.loc[df[z_name].abs() > 3]
        print(f"Outliers in {name}:\n", flagged)

# Function to clean data
def data_cleaning(df):
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Cleaning steps, in order:
      1. Missing values in numeric columns are replaced by the column mean.
         Non-numeric columns are left as they are.
      2. Negative GHI, DNI and DHI readings are raised to 0.
      3. Rows with a z-score magnitude above 3 in any of GHI, DNI, DHI,
         ModA, ModB, WS or WSgust are dropped.

    For step 3 an existing ``<column>_zscore`` column is used when present.
    Otherwise the z-score is computed here from the filled and clipped
    values, so the function no longer requires ``data_quality_check`` to
    have run first. A z-score that is undefined (NaN, e.g. for a constant
    column whose standard deviation is 0) is not treated as an outlier.

    Args:
        df: Frame containing GHI, DNI, DHI, ModA, ModB, WS and WSgust.

    Returns:
        A new DataFrame with the same columns as ``df`` and the outlier rows
        removed.
    """
    irradiance_cols = ['GHI', 'DNI', 'DHI']
    outlier_cols = irradiance_cols + ['ModA', 'ModB', 'WS', 'WSgust']

    # Work on a copy so the caller's frame is never mutated.
    cleaned = df.copy()

    # numeric_only=True: averaging text columns raises TypeError on
    # pandas >= 2.0, and a mean is not a meaningful fill for them anyway.
    cleaned = cleaned.fillna(cleaned.mean(numeric_only=True))

    # Vectorised replacement for the per-element max(x, 0).
    cleaned[irradiance_cols] = cleaned[irradiance_cols].clip(lower=0)

    # Build one positional row mask across all outlier columns.
    keep = np.ones(len(cleaned), dtype=bool)
    for col in outlier_cols:
        z_col = f'{col}_zscore'
        if z_col in cleaned.columns:
            zscores = cleaned[z_col]
        else:
            zscores = (cleaned[col] - cleaned[col].mean()) / cleaned[col].std()
        # "not (|z| > 3)" keeps rows with an undefined z-score, whereas
        # "|z| <= 3" would discard every row of a constant column.
        keep &= ~(zscores.abs() > 3).to_numpy()

    return cleaned[keep]

# Example usage
processor = DataFolderProcessor()
csv_files = processor.csv_files

# Results are written next to the inputs as 'cleaned_<name>'. Without a guard,
# re-running this cell would pick up its own results as new inputs and emit
# 'cleaned_cleaned_<name>', and so on. Any file whose name equals the output
# name of another listed file is therefore treated as a previous result.
previous_outputs = {f'cleaned_{name}' for name in csv_files}

for csv_file in csv_files:
    if csv_file in previous_outputs:
        print(f"Skipping {csv_file}: it is the output of a previous run")
        continue

    df = processor.process_file(csv_file)

    # Convert Timestamp to datetime
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])

    # Perform data quality check (also adds the *_zscore columns to df)
    print(f"Data Quality Check for {csv_file}")
    data_quality_check(df)

    # Clean the data
    df_cleaned = data_cleaning(df)

    # Save the cleaned data alongside the input file
    cleaned_file_path = os.path.join(processor.data_folder_path, f'cleaned_{csv_file}')
    df_cleaned.to_csv(cleaned_file_path, index=False)
    print(f"Cleaned data saved to {cleaned_file_path}")

Data Quality Check for cleaned_benin-malanville.csv
Missing Values:
 Timestamp        0
GHI              0
DNI              0
DHI              0
ModA             0
ModB             0
Tamb             0
RH               0
WS               0
WSgust           0
WSstdev          0
WD               0
WDstdev          0
BP               0
Cleaning         0
Precipitation    0
TModA            0
TModB            0
dtype: int64
Negative values in GHI:
                  Timestamp  GHI  DNI  DHI  ModA  ModB  Tamb    RH   WS  \
0      2021-08-09 00:01:00 -1.2 -0.2 -1.1   0.0   0.0  26.2  93.4  0.0   
1      2021-08-09 00:02:00 -1.1 -0.2 -1.1   0.0   0.0  26.2  93.6  0.0   
2      2021-08-09 00:03:00 -1.1 -0.2 -1.1   0.0   0.0  26.2  93.7  0.3   
3      2021-08-09 00:04:00 -1.1 -0.1 -1.0   0.0   0.0  26.2  93.3  0.2   
4      2021-08-09 00:05:00 -1.0 -0.1 -1.0   0.0   0.0  26.2  93.3  0.1   
...                    ...  ...  ...  ...   ...   ...   ...   ...  ...   
525595 2022-08-08 23:56:00 -5.5 -

In [6]:
# Example usage
# Re-scan the data folder so files written since the last scan are included.
processor = DataFolderProcessor()
csv_files = processor.csv_files

# Preview each file: a header line followed by its first 100 rows.
for csv_file in csv_files:
    df = processor.process_file(csv_file)
    heading = f"Contents of {csv_file}:"
    print(heading)
    preview = df.head(100)
    display(preview)

Contents of cleaned_cleaned_benin-malanville.csv:


Unnamed: 0,Timestamp,GHI,DNI,DHI,ModA,ModB,Tamb,RH,WS,WSgust,...,Precipitation,TModA,TModB,GHI_zscore,DNI_zscore,DHI_zscore,ModA_zscore,ModB_zscore,WS_zscore,WSgust_zscore
0,2021-08-09 00:01:00,0.0,0.0,0.0,0.0,0.0,26.2,93.4,0.0,0.4,...,0.0,26.3,26.2,-0.730101,-0.639590,-0.733872,-0.723748,-0.723087,-1.322830,-1.187310
1,2021-08-09 00:02:00,0.0,0.0,0.0,0.0,0.0,26.2,93.6,0.0,0.0,...,0.0,26.3,26.2,-0.729799,-0.639590,-0.733872,-0.723748,-0.723087,-1.322830,-1.384440
2,2021-08-09 00:03:00,0.0,0.0,0.0,0.0,0.0,26.2,93.7,0.3,1.1,...,0.0,26.4,26.2,-0.729799,-0.639590,-0.733872,-0.723748,-0.723087,-1.135735,-0.842333
3,2021-08-09 00:04:00,0.0,0.0,0.0,0.0,0.0,26.2,93.3,0.2,0.7,...,0.0,26.4,26.3,-0.729799,-0.639208,-0.733242,-0.723748,-0.723087,-1.198100,-1.039463
4,2021-08-09 00:05:00,0.0,0.0,0.0,0.0,0.0,26.2,93.3,0.1,0.7,...,0.0,26.4,26.3,-0.729497,-0.639208,-0.733242,-0.723748,-0.723087,-1.260465,-1.039463
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2021-08-09 01:36:00,0.0,0.0,0.0,0.0,0.0,25.8,94.3,0.5,1.1,...,0.0,25.7,25.6,-0.731007,-0.639972,-0.735763,-0.723748,-0.723087,-1.011005,-0.842333
96,2021-08-09 01:37:00,0.0,0.0,0.0,0.0,0.0,25.8,94.5,0.7,1.1,...,0.0,25.6,25.6,-0.731007,-0.639972,-0.735763,-0.723748,-0.723087,-0.886276,-0.842333
97,2021-08-09 01:38:00,0.0,0.0,0.0,0.0,0.0,25.8,94.5,0.4,1.1,...,0.0,25.6,25.5,-0.731007,-0.639972,-0.736393,-0.723748,-0.723087,-1.073370,-0.842333
98,2021-08-09 01:39:00,0.0,0.0,0.0,0.0,0.0,25.8,94.5,0.1,0.7,...,0.0,25.6,25.5,-0.731007,-0.639972,-0.735763,-0.723748,-0.723087,-1.260465,-1.039463


Contents of cleaned_cleaned_sierraleone-bumbuna.csv:


Unnamed: 0,Timestamp,GHI,DNI,DHI,ModA,ModB,Tamb,RH,WS,WSgust,...,Precipitation,TModA,TModB,GHI_zscore,DNI_zscore,DHI_zscore,ModA_zscore,ModB_zscore,WS_zscore,WSgust_zscore
0,2021-10-30 00:01:00,0.0,0.0,0.0,0.0,0.0,21.9,99.1,0.0,0.0,...,0.0,22.3,22.6,-0.678931,-0.532700,-0.720500,-0.686757,-0.685781,-0.924845,-1.046104
1,2021-10-30 00:02:00,0.0,0.0,0.0,0.0,0.0,21.9,99.2,0.0,0.0,...,0.0,22.3,22.6,-0.678931,-0.532700,-0.720500,-0.686757,-0.685781,-0.924845,-1.046104
2,2021-10-30 00:03:00,0.0,0.0,0.0,0.0,0.0,21.9,99.2,0.0,0.0,...,0.0,22.3,22.6,-0.678931,-0.532700,-0.720500,-0.686757,-0.685781,-0.924845,-1.046104
3,2021-10-30 00:04:00,0.0,0.0,0.0,0.0,0.0,21.9,99.3,0.0,0.0,...,0.1,22.3,22.6,-0.678931,-0.532243,-0.720500,-0.686757,-0.685781,-0.924845,-1.046104
4,2021-10-30 00:05:00,0.0,0.0,0.0,0.0,0.0,21.9,99.3,0.0,0.0,...,0.0,22.3,22.6,-0.678931,-0.532700,-0.720500,-0.686757,-0.685781,-0.924845,-1.046104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2021-10-30 01:36:00,0.0,0.0,0.0,0.0,0.0,21.9,99.5,0.0,0.0,...,0.0,22.3,22.7,-0.679266,-0.532700,-0.721129,-0.686757,-0.685781,-0.924845,-1.046104
96,2021-10-30 01:37:00,0.0,0.0,0.0,0.0,0.0,21.9,99.6,0.1,0.7,...,0.0,22.3,22.8,-0.679266,-0.532700,-0.721129,-0.686757,-0.685781,-0.844151,-0.613218
97,2021-10-30 01:38:00,0.0,0.0,0.0,0.0,0.0,21.9,99.5,0.1,0.7,...,0.0,22.3,22.7,-0.679266,-0.532700,-0.721129,-0.686757,-0.685781,-0.844151,-0.613218
98,2021-10-30 01:39:00,0.0,0.0,0.0,0.0,0.0,21.9,99.5,0.6,1.1,...,0.0,22.3,22.7,-0.679266,-0.532700,-0.721129,-0.686757,-0.685781,-0.440681,-0.365854


Contents of cleaned_cleaned_togo-dapaong_qc.csv:


Unnamed: 0,Timestamp,GHI,DNI,DHI,ModA,ModB,Tamb,RH,WS,WSgust,...,Precipitation,TModA,TModB,GHI_zscore,DNI_zscore,DHI_zscore,ModA_zscore,ModB_zscore,WS_zscore,WSgust_zscore
0,2021-10-25 00:01:00,0.0,0.0,0.0,0.0,0.0,24.8,94.5,0.9,1.1,...,0.0,24.7,24.4,-0.718858,-0.602727,-0.743955,-0.712609,-0.713041,-1.003709,-1.131164
1,2021-10-25 00:02:00,0.0,0.0,0.0,0.0,0.0,24.8,94.4,1.1,1.6,...,0.0,24.7,24.4,-0.718858,-0.602727,-0.743955,-0.712609,-0.713041,-0.866973,-0.865569
2,2021-10-25 00:03:00,0.0,0.0,0.0,0.0,0.0,24.8,94.4,1.2,1.4,...,0.0,24.7,24.4,-0.718858,-0.602727,-0.743955,-0.712609,-0.713041,-0.798605,-0.971807
3,2021-10-25 00:04:00,0.0,0.0,0.0,0.0,0.0,24.8,94.3,1.2,1.6,...,0.0,24.7,24.4,-0.718548,-0.602727,-0.743955,-0.712609,-0.713041,-0.798605,-0.865569
4,2021-10-25 00:05:00,0.0,0.0,0.0,0.0,0.0,24.8,94.0,1.3,1.6,...,0.0,24.7,24.4,-0.718548,-0.602727,-0.743955,-0.712609,-0.713041,-0.730237,-0.865569
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2021-10-25 01:36:00,0.0,0.0,0.0,0.0,0.0,25.1,95.4,0.0,0.0,...,0.0,25.0,24.8,-0.718238,-0.602727,-0.743955,-0.712609,-0.713041,-1.619024,-1.715473
96,2021-10-25 01:37:00,0.0,0.0,0.0,0.0,0.0,25.1,95.6,0.0,0.0,...,0.0,25.0,24.8,-0.718238,-0.602727,-0.743955,-0.712609,-0.713041,-1.619024,-1.715473
97,2021-10-25 01:38:00,0.0,0.0,0.0,0.0,0.0,25.1,95.8,0.0,0.0,...,0.0,25.1,24.8,-0.718238,-0.602727,-0.743955,-0.712609,-0.713041,-1.619024,-1.715473
98,2021-10-25 01:39:00,0.0,0.0,0.0,0.0,0.0,25.1,95.9,0.0,0.0,...,0.0,25.1,24.8,-0.717928,-0.602727,-0.743955,-0.712609,-0.713041,-1.619024,-1.715473
