In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Load the preprocessed energy dataset; the path is relative to the
# notebook's working directory.
df = pd.read_csv("preprocessed_energy_data.csv")

In [4]:
def data_quality_report(df):
    """Summarise per-column completeness and cardinality of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``Column``,
        ``Data_Type``, ``Non_Null_Count``, ``Missing_Count``, ``Missing_%``
        and ``Unique_Values`` (distinct non-null values).
    """
    # Count nulls once and reuse the result for both the count and the share.
    missing_counts = df.isnull().sum().values
    n_rows = len(df)

    if n_rows:
        missing_pct = (missing_counts / n_rows) * 100
    else:
        # A frame without rows has nothing missing; report 0.0 instead of
        # dividing by zero (which would yield NaN and a NumPy warning).
        missing_pct = np.zeros(len(missing_counts))

    report = pd.DataFrame({
        'Column': df.columns,
        'Data_Type': df.dtypes.values,
        'Non_Null_Count': df.count().values,
        'Missing_Count': missing_counts,
        'Missing_%': missing_pct,
        'Unique_Values': df.nunique().values,
    })

    return report
# Build the quality report on the frame as loaded; the cells below
# de-duplicate, impute and filter the data afterwards.
dq_report = data_quality_report(df)
print("\nData Quality Summary: - data_validating.ipynb:13")
print(dq_report)


Data Quality Summary: - data_validating.ipynb:13
                                               Column Data_Type  \
0                                        Country_Name    object   
1                                        Country_Code    object   
2                                                Time   float64   
3                                           Time_Code    object   
4   Access_to_Clean_Fuels_and_Technologies_for_coo...   float64   
5   Access_to_electricity_percent_of_rural_populat...   float64   
6   Access_to_electricity_percent_of_total_populat...   float64   
7   Access_to_electricity_percent_of_urban_populat...   float64   
8   Energy_intensity_level_of_primary_energy_MJ/20...   float64   
9   Renewable_electricity_output_GWh_[4.1.2_REN.EL...   float64   
10  Renewable_electricity_share_of_total_electrici...   float64   
11  Renewable_energy_consumption_TJ_[3.1_RE.CONSUM...   float64   
12  Renewable_energy_share_of_TFEC_percent_[2.1_SH...   float64   
13  Total_el

In [5]:
# Duplicate check: count rows that exactly repeat an earlier row.

duplicates = df.duplicated(keep="first").sum()
print(f"\nDuplicate Records: {duplicates} - data_validating.ipynb:4")

# Keep the first occurrence of each row and drop the repeats (no-op when the count is 0).
df = df.drop_duplicates(keep="first")


Duplicate Records: 0 - data_validating.ipynb:4


In [6]:
# Numeric columns: fill missing values with the column median
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())

# Categorical columns: fill missing values with the most frequent value
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    col_modes = df[col].mode()
    # mode() is empty when every value in the column is missing; indexing [0]
    # would then raise, so such a column is left untouched instead of
    # aborting the cell.
    if not col_modes.empty:
        df[col] = df[col].fillna(col_modes.iloc[0])

In [7]:
 #Missing Value Handling

# NOTE(review): this cell repeats the imputation from the previous cell; on
# data that has already been imputed it is effectively a no-op.

# Numeric columns: fill missing values with the column median
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())

# Categorical columns: fill missing values with the most frequent value
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    col_modes = df[col].mode()
    # mode() is empty when every value in the column is missing; indexing [0]
    # would then raise, so such a column is left untouched instead of
    # aborting the cell.
    if not col_modes.empty:
        df[col] = df[col].fillna(col_modes.iloc[0])

In [8]:
#Outlier Detection (IQR Method)

def remove_outliers_iqr(df, cols, factor=1.5):
    """Drop rows whose values lie outside the IQR fences of the given columns.

    Columns are processed one after another, and each column's quartiles are
    computed on the rows that survived the previous columns, so the result
    can depend on the order of ``cols``. Rows with a missing value in any of
    ``cols`` are dropped as well, because a null never falls inside the fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a filtered frame is returned.
    cols : iterable of column labels
        Numeric columns to filter on.
    factor : float, default 1.5
        Multiplier applied to the IQR to place the lower and upper fences.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that lie within ``[Q1 - factor*IQR, Q3 + factor*IQR]``
        for every column in ``cols``.
    """
    filtered = df
    for col in cols:
        q1 = filtered[col].quantile(0.25)
        q3 = filtered[col].quantile(0.75)
        iqr = q3 - q1

        lower = q1 - factor * iqr
        upper = q3 + factor * iqr

        # between() is inclusive on both ends, matching >= lower and <= upper.
        filtered = filtered[filtered[col].between(lower, upper)]

    return filtered

# Filter on every numeric column found by the imputation cell.
# NOTE(review): numeric_cols includes `Time` (float64 in the dtype listing),
# so rows are also filtered on that column — confirm this is intended.
df_cleaned = remove_outliers_iqr(df, numeric_cols)

print("\nShape after outlier removal: - data_validating.ipynb:18", df_cleaned.shape)


Shape after outlier removal: - data_validating.ipynb:18 (449, 15)


In [9]:
# Final validation checks on the cleaned frame

print("\nFinal Validation Checks - data_validating.ipynb:3")

# Total null cells across all columns, and number of fully repeated rows.
missing_total = df_cleaned.isnull().sum().sum()
duplicate_total = df_cleaned.duplicated().sum()
print("Remaining Missing Values: - data_validating.ipynb:5", missing_total)
print("Remaining Duplicates: - data_validating.ipynb:6", duplicate_total)

# Schema overview: dtype of each remaining column
final_dtypes = df_cleaned.dtypes
print("\nFinal Data Types: - data_validating.ipynb:9")
print(final_dtypes)


Final Validation Checks - data_validating.ipynb:3
Remaining Missing Values: - data_validating.ipynb:5 0
Remaining Duplicates: - data_validating.ipynb:6 0

Final Data Types: - data_validating.ipynb:9
Country_Name                                                                                            object
Country_Code                                                                                            object
Time                                                                                                   float64
Time_Code                                                                                               object
Access_to_Clean_Fuels_and_Technologies_for_cooking_percent_of_total_population_[2.1_ACCESS.CFT.TOT]    float64
Access_to_electricity_percent_of_rural_population_with_access_[1.2_ACCESS.ELECTRICITY.RURAL]           float64
Access_to_electricity_percent_of_total_population_[1.1_ACCESS.ELECTRICITY.TOT]                         float64
Access_to_electricity_p

In [11]:
# Save the cleaned dataset as CSV; the path is relative to the working
# directory and index=False keeps the DataFrame index out of the file.
cleaned_file_path = "final_energy_dataset_cleaned.csv"
df_cleaned.to_csv(cleaned_file_path, index=False)

print(f"\nCleaned dataset saved at: {cleaned_file_path} - data_validating.ipynb:5")


Cleaned dataset saved at: final_energy_dataset_cleaned.csv - data_validating.ipynb:5


In [13]:
# Generate data dictionary: one row per column of the cleaned dataset

data_dictionary = pd.DataFrame({
    "Column_Name": df_cleaned.columns,
    "Data_Type": df_cleaned.dtypes.values,
    # Placeholder text, to be filled in by hand.
    "Description": ["Add description here"] * len(df_cleaned.columns),
    "Unique_Values": df_cleaned.nunique().values,
    # Up to five distinct non-null values per column, stored as a single
    # delimited string. Raw NumPy arrays would be written to the CSV as
    # their array repr, which is hard to read back and can wrap across lines.
    "Sample_Values": [
        " | ".join(map(str, df_cleaned[col].dropna().unique()[:5]))
        for col in df_cleaned.columns
    ],
})

data_dict_path = "data_dictionary.csv"
data_dictionary.to_csv(data_dict_path, index=False)

print(f"Data dictionary saved at: {data_dict_path} - data_validating.ipynb:14")


Data dictionary saved at: data_dictionary.csv - data_validating.ipynb:14


In [14]:
# Save the data quality report as CSV.
# NOTE(review): dq_report was built right after loading, before the
# duplicate/imputation/outlier cells, so it describes the data as loaded
# rather than df_cleaned — confirm this is intended.

dq_report_path = "data_quality_report.csv"
dq_report.to_csv(dq_report_path, index=False)

print(f"Data Quality report saved at: {dq_report_path} - data_validating.ipynb:6")

print("\n✅ DATA QUALITY & FINAL VALIDATION COMPLETED SUCCESSFULLY - data_validating.ipynb:8")

Data Quality report saved at: data_quality_report.csv - data_validating.ipynb:6

✅ DATA QUALITY & FINAL VALIDATION COMPLETED SUCCESSFULLY - data_validating.ipynb:8
