In [None]:
import pandas as pd
import numpy as np

# Raw sample data: one record per energy source. np.nan marks the entries
# that are missing and are handled by the cleaning/imputation cells below.
data = {
    "Energy Source": ["Solar", "Wind", "Hydropower", "Geothermal", "Biomass", "Nuclear"],
    "Energy Consumption (MWh)": [1200, np.nan, 2900, np.nan, 2500, 3200],
    "Cost (Million $)": [200, 400, np.nan, 150, 250, np.nan]
}

# The dict is only the raw input. Build the DataFrame once and bind it to
# `data_df` -- the name every later cell reads -- so the notebook survives
# "Restart Kernel -> Run All" instead of failing with a NameError.
data_df = pd.DataFrame(data)

print("Original data")
data_df


Original data


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,1200.0,200.0
1,Wind,,400.0
2,Hydropower,2900.0,
3,Geothermal,,150.0
4,Biomass,2500.0,250.0
5,Nuclear,3200.0,


In [None]:
# Listwise deletion: keep only the rows in which every column has a value.
# The result is a new frame; data_df itself is left untouched.
cleaned_df = data_df.dropna(axis=0, how="any")

print("\nCleaned data (rows with missing values removed)")
cleaned_df.head()


Cleaned data (rows with missing values removed)


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,1200.0,200.0
4,Biomass,2500.0,250.0


In [None]:
# Per-column averages that serve as the fill values for mean imputation.
# Series.mean() ignores NaN entries, so only the observed values contribute.
ec_mean = data_df.loc[:, "Energy Consumption (MWh)"].mean()
cost_mean = data_df.loc[:, "Cost (Million $)"].mean()

print("Mean of energy consumption(Mwh)", ec_mean)
print("Mean of cost (Million $)", cost_mean)

Mean of energy consumption(Mwh) 2450.0
Mean of cost (Million $) 250.0


In [None]:
# Mean imputation: replace each NaN with the mean of its own column
# (ec_mean / cost_mean are computed in the preceding cell).
#
# The filled column is assigned back explicitly. The previous form,
# `data_df[col].fillna(value, inplace=True)`, calls an in-place method on an
# intermediate object; pandas emits a FutureWarning for it and, from
# pandas 3.0 on, it no longer updates data_df at all.
data_df["Energy Consumption (MWh)"] = data_df["Energy Consumption (MWh)"].fillna(ec_mean)
data_df["Cost (Million $)"] = data_df["Cost (Million $)"].fillna(cost_mean)

print("\nData after imputing missing values with mean:")
data_df.head()


Data after imputing missing values with mean:


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_df["Energy Consumption (MWh)"].fillna(ec_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_df["Cost (Million $)"].fillna(cost_mean,inplace=True)


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,1200.0,200.0
1,Wind,2450.0,400.0
2,Hydropower,2900.0,250.0
3,Geothermal,2450.0,150.0
4,Biomass,2500.0,250.0


In [None]:
# Forward fill: propagate the last valid value in each column down into
# any NaN that follows it.
#
# `DataFrame.ffill()` replaces `fillna(method="ffill")`, whose `method`
# argument is deprecated and triggers a FutureWarning.
#
# NOTE(review): data_df has already been mean-imputed by the cell above, so
# it holds no NaN at this point and the forward fill leaves every value
# unchanged. To actually demonstrate ffill, apply it to the raw frame.
forward_fill = data_df.ffill()
print("Data before forward fill:")
print(data_df)
print("\nData after forward fill:")
forward_fill.head()

Data before forward fill:
  Energy Source  Energy Consumption (MWh)  Cost (Million $)
0         Solar                    1200.0             200.0
1          Wind                    2450.0             400.0
2    Hydropower                    2900.0             250.0
3    Geothermal                    2450.0             150.0
4       Biomass                    2500.0             250.0
5       Nuclear                    3200.0             250.0

Data after forward fill:


  forward_fill=data_df.fillna(method="ffill")


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,1200.0,200.0
1,Wind,2450.0,400.0
2,Hydropower,2900.0,250.0
3,Geothermal,2450.0,150.0
4,Biomass,2500.0,250.0


In [None]:
# Smallest and largest value of the energy-consumption column, i.e. the
# bounds a min-max rescaling of that column is based on.
xmin_ec = data_df["Energy Consumption (MWh)"].min()
xmax_ec = data_df["Energy Consumption (MWh)"].max()

Data scaling using two methods: min-max normalization and z-score standardization

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max normalization of the two numeric columns: with the scaler's
# default settings each column is mapped linearly onto the range [0, 1].
# The scaled values are written back into data_df, replacing the originals.
scaler = MinMaxScaler()
numeric_cols = ["Energy Consumption (MWh)", "Cost (Million $)"]
data_df[numeric_cols] = scaler.fit_transform(data_df[numeric_cols])

print("\nNormalized data:")
data_df


Normalized data:


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,0.0,0.2
1,Wind,0.625,1.0
2,Hydropower,0.85,0.4
3,Geothermal,0.625,0.0
4,Biomass,0.65,0.4
5,Nuclear,1.0,0.4


In [None]:
# Z-score standardization: centre each numeric column on mean 0 and scale it
# to unit variance.
# data_df is overwritten in place, so this cell standardizes whatever values
# the two columns hold when it runs.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
zscore_cols = ["Energy Consumption (MWh)", "Cost (Million $)"]
standardized = scaler.fit_transform(data_df[zscore_cols])
data_df[zscore_cols] = standardized

print("\nData after z-score scaling:")
data_df


Data after z-score scaling:


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,-2.005893,-0.6546537
1,Wind,3.563181e-16,1.963961
2,Hydropower,0.7221213,1.817029e-16
3,Geothermal,3.563181e-16,-1.309307
4,Biomass,0.0802357,1.817029e-16
5,Nuclear,1.203536,1.817029e-16


In [None]:
  # One-hot encode the categorical "Energy Source" column: it is replaced by
  # one indicator column per category, named "Energy Source_<category>".
  # The numeric columns are carried over unchanged.
  energy_encoded_df = pd.get_dummies(data=data_df, columns=["Energy Source"])
  print("data after encoding")
  energy_encoded_df.head()

data after encoding


Unnamed: 0,Energy Consumption (MWh),Cost (Million $),Energy Source_Biomass,Energy Source_Geothermal,Energy Source_Hydropower,Energy Source_Nuclear,Energy Source_Solar,Energy Source_Wind
0,-2.005893,-0.6546537,False,False,False,False,True,False
1,3.563181e-16,1.963961,False,False,False,False,False,True
2,0.7221213,1.817029e-16,False,False,True,False,False,False
3,3.563181e-16,-1.309307,False,True,False,False,False,False
4,0.0802357,1.817029e-16,True,False,False,False,False,False
