In [4]:
import pandas as pd
import numpy as np

#1 Use the filename directly
file_path = 'household_power_consumption.txt'

print(f"Loading Raw Data from {file_path}...")

# Load the dataset. '?' entries are mapped to NaN so they can be filled below.
# Date and Time are read as plain strings and combined explicitly afterwards:
# the nested-dict form of `parse_dates` (merging several columns into one) is
# deprecated in recent pandas releases and triggers a FutureWarning.
df = pd.read_csv(file_path,
                 sep=';',
                 dtype={'Date': str, 'Time': str},
                 low_memory=False,
                 na_values=['?'])

# Build the 'dt' index with an explicit day-first format so the result never
# depends on format inference.
# NOTE(review): assumes timestamps look like "16/12/2006 17:24:00" — confirm
# against the raw file if the data source changes.
df.index = pd.DatetimeIndex(
    pd.to_datetime(df.pop('Date') + ' ' + df.pop('Time'),
                   format='%d/%m/%Y %H:%M:%S'),
    name='dt')

print("Initial Data Shape:", df.shape)
# NOTE: a bare expression is only rendered by a notebook when it is the last
# statement of the cell, so this preview is not displayed here.
df.head()

#2 Check for missing values
null_count = df.isnull().sum().sum()
print(f"Missing values found: {null_count}. Filling...")

# Fill gaps using linear interpolation between the neighbouring readings
df = df.interpolate(method='linear')

print("Missing values after cleaning:", df.isnull().sum().sum())

print("Resampling data to Daily frequency...")

#3 Convert minute-by-minute data to Daily averages
df_daily = df.resample('D').mean()

# Add simple date features derived from the daily index
df_daily['year'] = df_daily.index.year
df_daily['month'] = df_daily.index.month
df_daily['day_of_week'] = df_daily.index.dayofweek

print("New Daily Data Shape:", df_daily.shape)

#4 Save the final result
df_daily.to_csv('cleaned_daily_data.csv')

print("Success! 'cleaned_daily_data.csv' is ready.")

Loading Raw Data from household_power_consumption.txt...


  df = pd.read_csv(file_path,


Initial Data Shape: (2075259, 7)
Missing values found: 181853. Filling...
Missing values after cleaning: 0
Resampling data to Daily frequency...
New Daily Data Shape: (1442, 10)
Success! 'cleaned_daily_data.csv' is ready.


In [None]:
# Alternative: the same pipeline as above, wrapped in a reusable function
import pandas as pd
import numpy as np

def process_energy_data(file_path, output_file='cleaned_daily_data.csv'):
    """Clean a raw power-consumption file and save it as daily averages.

    The file is read as ';'-separated text with 'Date' and 'Time' columns,
    missing readings ('?') are filled by linear interpolation, the data is
    averaged per calendar day, and calendar features are appended.

    Args:
        file_path: Path of the raw ';'-separated input file.
        output_file: Path the cleaned daily CSV is written to. Defaults to
            'cleaned_daily_data.csv' in the current working directory.

    Returns:
        The daily DataFrame, indexed by a DatetimeIndex named 'dt', with one
        mean column per input measurement plus 'year', 'month' and
        'day_of_week'.

    Raises:
        ValueError: If a timestamp does not match day/month/year
            hour:minute:second.
    """
    print(f"--- Step 1: Loading Raw Data from {file_path} ---")

    # 1. Load the raw dataset
    # We use na_values=['?'] because the original dataset uses '?' for missing sensor readings.
    # Date and Time stay plain strings here; combining them through the
    # nested-dict form of `parse_dates` and `infer_datetime_format` are both
    # deprecated in recent pandas releases.
    df = pd.read_csv(
        file_path,
        sep=';',
        dtype={'Date': str, 'Time': str},
        low_memory=False,
        na_values=['?'],
    )

    # Parse with an explicit day-first format. Relying on inference would let
    # an ambiguous first row such as "01/02/2007" be read month-first.
    timestamps = pd.to_datetime(
        df.pop('Date') + ' ' + df.pop('Time'),
        format='%d/%m/%Y %H:%M:%S',
    )
    df.index = pd.DatetimeIndex(timestamps, name='dt')

    print("Initial Data Shape:", df.shape)

    # 2. Data Cleaning (Handling Missing Values)
    # Energy data is a time-series; we use linear interpolation to fill gaps
    # so we don't break the 'flow' of time.
    null_count = df.isnull().sum().sum()
    print(f"Missing values found: {null_count}. Filling with Linear Interpolation...")
    df = df.interpolate(method='linear')

    # 3. Downsampling / Resampling
    # Converting the per-row readings to daily averages.
    print("Resampling data to Daily frequency...")
    df_daily = df.resample('D').mean()

    # 4. Basic Feature Extraction
    # These are derived from the date index alone, so they don't depend on 'future' data.
    df_daily['year'] = df_daily.index.year
    df_daily['month'] = df_daily.index.month
    df_daily['day_of_week'] = df_daily.index.dayofweek

    # 5. Save the Clean Dataset
    df_daily.to_csv(output_file)

    print("--- Step 1 Complete! ---")
    print(f"Cleaned data saved to: {output_file}")
    print(f"New Data Shape: {df_daily.shape}")
    return df_daily

if __name__ == "__main__":
    # Script entry point: run the cleaning pipeline on the raw dataset file.
    raw_data_path = 'household_power_consumption.txt'
    process_energy_data(raw_data_path)