In [1]:
import os
import pandas as pd

# Define the DataQualityCheck class
class DataQualityCheck:
    """Simple data-quality checks over a pandas DataFrame.

    The instance wraps a DataFrame and offers two read-only checks: a
    per-column count of missing values and a per-column selection of rows
    holding negative values.
    """

    # Columns screened by check_outliers() when the caller names none.
    DEFAULT_OUTLIER_COLUMNS = ('GHI', 'DNI', 'DHI')

    def __init__(self, data):
        """Store the DataFrame to be checked.

        Args:
            data: The pandas DataFrame the checks operate on. It is kept by
                reference and never modified by this class.
        """
        self.data = data

    def check_missing_values(self):
        """Count missing values in every column.

        Returns:
            A pandas Series indexed by column name whose values are the
            number of null entries in that column.
        """
        return self.data.isnull().sum()

    def check_outliers(self, columns=None):
        """Select the rows whose value is negative in each screened column.

        A value counts as an outlier here only when it is strictly below
        zero; missing values compare as not-negative and are not selected.

        Args:
            columns: Optional column name or iterable of column names to
                screen. Defaults to DEFAULT_OUTLIER_COLUMNS, which matches
                the columns this check has always used.

        Returns:
            A dict mapping each screened column that exists in the data to
            a DataFrame of the full rows where that column is negative.
            Requested columns absent from the data are silently skipped,
            and a column with no negative values maps to an empty DataFrame.
        """
        if columns is None:
            columns = self.DEFAULT_OUTLIER_COLUMNS
        elif isinstance(columns, str):
            # A bare string would otherwise be iterated character by character.
            columns = [columns]

        outliers = {}
        for column in columns:
            if column in self.data.columns:
                outliers[column] = self.data[self.data[column] < 0]
        return outliers

# Function to perform data quality check and display results
def data_quality_check(data_folder_path=None):
    """Run the data-quality checks on every CSV file in a folder and print a report.

    For each file the report lists the per-column missing-value counts and,
    for each screened column, either the rows flagged as outliers or a note
    that none were found.

    Args:
        data_folder_path: Folder to scan for files ending in '.csv'.
            Defaults to the 'data' folder under the current working
            directory.

    Returns:
        None. All results are written to standard output.
    """
    if data_folder_path is None:
        data_folder_path = os.path.join(os.getcwd(), 'data')

    # os.listdir raises on a missing folder; report it instead of crashing.
    if not os.path.isdir(data_folder_path):
        print(f"Data folder not found at path: {data_folder_path}")
        return

    # os.listdir returns names in arbitrary order; sort so the report is
    # reproducible from run to run.
    csv_files = sorted(f for f in os.listdir(data_folder_path) if f.endswith('.csv'))
    if not csv_files:
        print(f"No CSV files found in: {data_folder_path}")
        return

    for csv_file in csv_files:
        file_path = os.path.join(data_folder_path, csv_file)
        print(f"\nProcessing file: {csv_file}")

        # Only the read is guarded, so a problem in one file is reported
        # and the remaining files are still processed.
        try:
            data = pd.read_csv(file_path)
        except FileNotFoundError:
            # The file can vanish between the listing and the read.
            print(f"File not found at path: {file_path}")
            continue
        except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
            print(f"Could not read {csv_file}: {exc}")
            continue

        data_checker = DataQualityCheck(data)
        missing_values = data_checker.check_missing_values()
        outliers = data_checker.check_outliers()

        print("Missing Values:")
        print(missing_values)

        print("Outliers:")
        for column, outlier_data in outliers.items():
            if not outlier_data.empty:
                print(f"Outliers in {column} column:")
                print(outlier_data)
            else:
                print(f"No outliers found in {column} column")

# Run the data quality check: it scans the 'data' folder under the current
# working directory and prints a report for each CSV file found there.
data_quality_check()



Processing file: benin-malanville.csv
Missing Values:
Timestamp             0
GHI                   0
DNI                   0
DHI                   0
ModA                  0
ModB                  0
Tamb                  0
RH                    0
WS                    0
WSgust                0
WSstdev               0
WD                    0
WDstdev               0
BP                    0
Cleaning              0
Precipitation         0
TModA                 0
TModB                 0
Comments         525600
dtype: int64
Outliers:
Outliers in GHI column:
               Timestamp  GHI  DNI  DHI  ModA  ModB  Tamb    RH   WS  WSgust  \
0       2021-08-09 00:01 -1.2 -0.2 -1.1   0.0   0.0  26.2  93.4  0.0     0.4   
1       2021-08-09 00:02 -1.1 -0.2 -1.1   0.0   0.0  26.2  93.6  0.0     0.0   
2       2021-08-09 00:03 -1.1 -0.2 -1.1   0.0   0.0  26.2  93.7  0.3     1.1   
3       2021-08-09 00:04 -1.1 -0.1 -1.0   0.0   0.0  26.2  93.3  0.2     0.7   
4       2021-08-09 00:05 -1.0 -0.1 -1.0   