## Data Cleaning 

In [20]:
import os
import pandas as pd

class DataFolderProcessor:
    """Locate the CSV files inside a folder and load them as DataFrames.

    Attributes (all set in ``__init__``):
        current_directory: working directory at construction time.
        data_folder_path: ``folder_name`` joined onto ``current_directory``.
        csv_files: sorted names of the folder entries that end in ``.csv``.
    """

    def __init__(self, folder_name='data'):
        """Record the folder location and list the CSV files it contains.

        Args:
            folder_name: Folder to scan, joined onto the current working
                directory with ``os.path.join``.

        Raises:
            OSError: Propagated from ``os.listdir`` when the folder cannot
                be listed (for example, it does not exist).
        """
        self.current_directory = os.getcwd()
        self.data_folder_path = os.path.join(self.current_directory, folder_name)
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # processing order (and therefore the printed output) stable
        # between runs and machines.
        self.csv_files = sorted(
            f for f in os.listdir(self.data_folder_path) if f.endswith('.csv')
        )

    def process_file(self, csv_file):
        """Load a single CSV file from the data folder.

        Args:
            csv_file: File name relative to ``data_folder_path``.

        Returns:
            The DataFrame produced by ``pd.read_csv`` for that file.
        """
        file_path = os.path.join(self.data_folder_path, csv_file)
        return pd.read_csv(file_path)

# Build the processor for the default 'data' folder and report, for every
# CSV file found there, how many missing values each column contains.
processor = DataFolderProcessor()
csv_files = processor.csv_files
for csv_file in csv_files:
    df = processor.process_file(csv_file)
    # Per-column count of null cells (isna is the alias of isnull).
    missing_per_column = df.isna().sum()
    print(f'The dataset of: {csv_file}:\n{missing_per_column}')

The dataset of: benin-malanville.csv:
Timestamp             0
GHI                   0
DNI                   0
DHI                   0
ModA                  0
ModB                  0
Tamb                  0
RH                    0
WS                    0
WSgust                0
WSstdev               0
WD                    0
WDstdev               0
BP                    0
Cleaning              0
Precipitation         0
TModA                 0
TModB                 0
Comments         525600
dtype: int64
The dataset of: sierraleone-bumbuna.csv:
Timestamp             0
GHI                   0
DNI                   0
DHI                   0
ModA                  0
ModB                  0
Tamb                  0
RH                    0
WS                    0
WSgust                0
WSstdev               0
WD                    0
WDstdev               0
BP                    0
Cleaning              0
Precipitation         0
TModA                 0
TModB                 0
Comments         525

#### Remove missing values from the dataset

In [21]:
for csv_file in csv_files:
    df = processor.process_file(csv_file)
    # dropna() returns a new DataFrame rather than modifying df, so the
    # result has to be kept; the bare call previously discarded it.
    cleaned_df = df.dropna()
    # Report the effect: a column that is null in every row makes dropna()
    # remove every row, which is worth seeing before relying on the result.
    print(f'{csv_file}: {len(df)} rows before dropna(), {len(cleaned_df)} rows after')

In [26]:
import pandas as pd

class DataFolderProcessor:
    """Find the CSV files of a data folder and read them with pandas.

    Attributes (all set in ``__init__``):
        current_directory: working directory at construction time.
        data_folder_path: ``folder_name`` joined onto ``current_directory``.
        csv_files: sorted names of the folder entries that end in ``.csv``.
    """

    def __init__(self, folder_name='data'):
        """Resolve the folder path and collect the CSV file names in it.

        Args:
            folder_name: Folder to scan, joined onto the current working
                directory with ``os.path.join``.

        Raises:
            OSError: Propagated from ``os.listdir`` when the folder cannot
                be listed (for example, it does not exist).
        """
        self.current_directory = os.getcwd()
        self.data_folder_path = os.path.join(self.current_directory, folder_name)
        # Sort because os.listdir gives no ordering guarantee; a fixed order
        # makes the cleaning run and its log output reproducible.
        self.csv_files = sorted(
            f for f in os.listdir(self.data_folder_path) if f.endswith('.csv')
        )

    def process_file(self, csv_file):
        """Read one CSV file from the data folder.

        Args:
            csv_file: File name relative to ``data_folder_path``.

        Returns:
            The DataFrame produced by ``pd.read_csv`` for that file.
        """
        file_path = os.path.join(self.data_folder_path, csv_file)
        return pd.read_csv(file_path)

# Example usage
processor = DataFolderProcessor()
csv_files = processor.csv_files

# Cleaned files go into 'cleaned_data' (relative to the working directory).
# The following cell loads its input from a folder with that name, so
# writing next to the notebook would leave that folder missing or stale.
cleaned_folder = 'cleaned_data'
os.makedirs(cleaned_folder, exist_ok=True)

# Process each CSV file
for csv_file in csv_files:
    df = processor.process_file(csv_file)

    # Step 1: Identify Missing Values in 'Comments' column
    if 'Comments' in df.columns:
        missing_comments = df['Comments'].isnull().all()

        # Step 2: Handle Missing Values
        if missing_comments:
            # Drop the 'Comments' column if all values are missing; otherwise
            # the row-wise dropna() below would remove every row.
            # Rebinding instead of inplace=True keeps the step re-runnable.
            df = df.drop(columns=['Comments'])
            print(f"The 'Comments' column in {csv_file} was entirely null and has been dropped.")

    # Drop rows with any missing values
    cleaned_df = df.dropna()

    # Save the cleaned DataFrame
    output_path = os.path.join(cleaned_folder, f'cleaned_{csv_file}')
    cleaned_df.to_csv(output_path, index=False)
    print(f'Cleaned data saved to {output_path}')

The 'Comments' column in benin-malanville.csv was entirely null and has been dropped.
Cleaned data saved to cleaned_benin-malanville.csv
The 'Comments' column in sierraleone-bumbuna.csv was entirely null and has been dropped.
Cleaned data saved to cleaned_sierraleone-bumbuna.csv
The 'Comments' column in togo-dapaong_qc.csv was entirely null and has been dropped.
Cleaned data saved to cleaned_togo-dapaong_qc.csv


In [27]:
class DataFolderProcessor:
    """List and load the CSV files stored in a folder.

    This definition defaults to the ``cleaned_data`` folder.

    Attributes (all set in ``__init__``):
        current_directory: working directory at construction time.
        data_folder_path: ``folder_name`` joined onto ``current_directory``.
        csv_files: sorted names of the folder entries that end in ``.csv``.
    """

    def __init__(self, folder_name='cleaned_data'):
        """Resolve the folder path and collect the CSV file names in it.

        Args:
            folder_name: Folder to scan, joined onto the current working
                directory with ``os.path.join``.

        Raises:
            OSError: Propagated from ``os.listdir`` when the folder cannot
                be listed (for example, it does not exist).
        """
        self.current_directory = os.getcwd()
        self.data_folder_path = os.path.join(self.current_directory, folder_name)
        # os.listdir makes no promise about ordering, so sort to get the
        # same file sequence on every run.
        self.csv_files = sorted(
            f for f in os.listdir(self.data_folder_path) if f.endswith('.csv')
        )

    def process_file(self, csv_file):
        """Read one CSV file from the folder.

        Args:
            csv_file: File name relative to ``data_folder_path``.

        Returns:
            The DataFrame produced by ``pd.read_csv`` for that file.
        """
        file_path = os.path.join(self.data_folder_path, csv_file)
        return pd.read_csv(file_path)

# Load every file from the default 'cleaned_data' folder and print its
# structural summary.
processor = DataFolderProcessor()
csv_files = processor.csv_files
for csv_file in csv_files:
    df = processor.process_file(csv_file)
    # info() writes the column list, non-null counts and dtypes to stdout.
    df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525600 entries, 0 to 525599
Data columns (total 18 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Timestamp      525600 non-null  object 
 1   GHI            525600 non-null  float64
 2   DNI            525600 non-null  float64
 3   DHI            525600 non-null  float64
 4   ModA           525600 non-null  float64
 5   ModB           525600 non-null  float64
 6   Tamb           525600 non-null  float64
 7   RH             525600 non-null  float64
 8   WS             525600 non-null  float64
 9   WSgust         525600 non-null  float64
 10  WSstdev        525600 non-null  float64
 11  WD             525600 non-null  float64
 12  WDstdev        525600 non-null  float64
 13  BP             525600 non-null  int64  
 14  Cleaning       525600 non-null  int64  
 15  Precipitation  525600 non-null  float64
 16  TModA          525600 non-null  float64
 17  TModB          525600 non-nul