In [24]:
import pandas as pd
import numpy as np 
data = {
    "Energy Source": ["Solar", "Wind", "Hydropower", "Geothermal", "Biomass", "Nuclear"],
    "Energy Consumption (MWh)": [1200, np.nan, 2900, np.nan, 2500, 3200],
    "Cost (Million $)": [200, 400, np.nan, 150, 250, np.nan]
}

energy_df = pd.DataFrame(data)
print('Original energy data with missing values:')
energy_df.head()

Original energy data with missing values:


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,1200.0,200.0
1,Wind,,400.0
2,Hydropower,2900.0,
3,Geothermal,,150.0
4,Biomass,2500.0,250.0


In [26]:
cleaned_df = energy_df.dropna()
print('\nData after removing rows with missing values:')
cleaned_df.head()


Data after removing rows with missing values:


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,1200.0,200.0
4,Biomass,2500.0,250.0


In [28]:
ec_mean = energy_df['Energy Consumption (MWh)'].mean()
print('Mean of Energy Consumption (MWh)', ec_mean)
cost_mean = energy_df['Cost (Million $)'].mean()
print('Mean of Cost (Million $)', cost_mean)

Mean of Energy Consumption (MWh) 2450.0
Mean of Cost (Million $) 250.0


In [30]:
#Impute missing values in ' Energy consumption (MWh)' with the mean
energy_df['Energy Consumption (MWh)'].fillna(ec_mean, inplace = True)

#Impute missing values in 'Cost (Million $)' with the mean
energy_df['Cost (Million $)'].fillna(cost_mean, inplace = True)

print('\nData after Imputing Values with Mean:')
energy_df.head()


Data after Imputing Values with Mean:


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  energy_df['Energy Consumption (MWh)'].fillna(ec_mean, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  energy_df['Cost (Million $)'].fillna(cost_mean, inplace = True)


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,1200.0,200.0
1,Wind,2450.0,400.0
2,Hydropower,2900.0,250.0
3,Geothermal,2450.0,150.0
4,Biomass,2500.0,250.0


In [32]:
# Forward fill missing values
forward_filled_df = energy_df.fillna(method = 'ffill')
print('\nData Before Forward Fillin:')
print(energy_df)
print('\nData After Forward Fillin:')
forward_filled_df.head()


Data Before Forward Fillin:
  Energy Source  Energy Consumption (MWh)  Cost (Million $)
0         Solar                    1200.0             200.0
1          Wind                    2450.0             400.0
2    Hydropower                    2900.0             250.0
3    Geothermal                    2450.0             150.0
4       Biomass                    2500.0             250.0
5       Nuclear                    3200.0             250.0

Data After Forward Fillin:


  forward_filled_df = energy_df.fillna(method = 'ffill')


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,1200.0,200.0
1,Wind,2450.0,400.0
2,Hydropower,2900.0,250.0
3,Geothermal,2450.0,150.0
4,Biomass,2500.0,250.0


In [34]:
# Create a flag column indicating missing values in 'Energy Consumption (MWh)'
energy_df["Missing Consumption"] = energy_df["Energy Consumption (MWh)"].isna().astype(int)
energy_df["Missing Cost"] = energy_df["Cost (Million $)"].isna().astype(int)

print("\nData with Missing Values Flagged:")
energy_df.head()


Data with Missing Values Flagged:


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $),Missing Consumption,Missing Cost
0,Solar,1200.0,200.0,0,0
1,Wind,2450.0,400.0,0,0
2,Hydropower,2900.0,250.0,0,0
3,Geothermal,2450.0,150.0,0,0
4,Biomass,2500.0,250.0,0,0


In [None]:
xmin_ec = np.min(energy_df[['Energy Consumption (MWh)']])
print('Min Energy Consumption (MWh):')

In [42]:
from sklearn.preprocessing import MinMaxScaler

# Normalize the 'Energy Consumption (MWh)' and 'Cost (Million $)'
scaler = MinMaxScaler()
energy_df[["Energy Consumption (MWh)", "Cost (Million $)"]] = scaler.fit_transform(energy_df[["Energy Consumption (MWh)", "Cost (Million $)"]])

print("\nData After Normalization (Min-Max Scaling):")
print(energy_df)



Data After Normalization (Min-Max Scaling):
  Energy Source  Energy Consumption (MWh)  Cost (Million $)  \
0         Solar                     0.000               0.2   
1          Wind                     0.625               1.0   
2    Hydropower                     0.850               0.4   
3    Geothermal                     0.625               0.0   
4       Biomass                     0.650               0.4   
5       Nuclear                     1.000               0.4   

   Missing Consumption  Missing Cost  
0                    0             0  
1                    0             0  
2                    0             0  
3                    0             0  
4                    0             0  
5                    0             0  
