In [1]:
import pandas as pd
import numpy as np

data = {
    "Energy Source": ["Solar", "Wind", "Hydropower", "Geothermal", "Biomass", "Nuclear"],
    "Energy Consumption (MWh)": [1200, np.nan, 2900, np.nan, 2500, 3200],
    "Cost (Million $)": [200, 400, np.nan, 150, 250, np.nan]
}

frame = pd.DataFrame(data)
frame

Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,1200.0,200.0
1,Wind,,400.0
2,Hydropower,2900.0,
3,Geothermal,,150.0
4,Biomass,2500.0,250.0
5,Nuclear,3200.0,


In [5]:
frame.isnull()

Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,False,False,False
1,False,True,False
2,False,False,True
3,False,True,False
4,False,False,False
5,False,False,True


### TREATING NULL VALUES

#### 1. DROPING THE NULL VALUE ROW

In [7]:
# removing the rows with the null values
r = frame.dropna(axis = 0) # axis = 0 means deleting the row if axis = 1 the it will delete the complete column containing the null values
r.head()

Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,1200.0,200.0
4,Biomass,2500.0,250.0


#### 2. TAKING THE MEAN OF COLUMNS

In [14]:
#### 2. REPLACING THE NULL VALUE BY MEAN OF THAT COLUMN

In [34]:
ec_m = frame["Energy Consumption (MWh)"].mean()
c_m = frame["Cost (Million $)"].mean()

print(ec_m)
print(c_m)

0.0
2.4560480230214043e-16


In [33]:
frame["Energy Consumption (MWh)"].fillna(ec_m , inplace = True)
frame["Cost (Million $)"].fillna(c_m , inplace = True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  frame["Energy Consumption (MWh)"].fillna(ec_m , inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  frame["Cost (Million $)"].fillna(c_m , inplace = True)


#### 3. FORWARD FILLING AND BACKWARD FILLING

In [12]:
# forward fill --> used to treat null values 
# it fiils the previous value in place of null

frame = pd.DataFrame(data)
forwrd_fill = frame.ffill()

forwrd_fill

Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,1200.0,200.0
1,Wind,1200.0,400.0
2,Hydropower,2900.0,400.0
3,Geothermal,2900.0,150.0
4,Biomass,2500.0,250.0
5,Nuclear,3200.0,250.0


In [13]:
# backward fill --> used to treat null values 
# it fiils the next value in place of null


frame = pd.DataFrame(data)
bkwrd_fill = frame.bfill()

bkwrd_fill

Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,1200.0,200.0
1,Wind,2900.0,400.0
2,Hydropower,2900.0,150.0
3,Geothermal,2500.0,150.0
4,Biomass,2500.0,250.0
5,Nuclear,3200.0,


### SCALING METHOS

#### 1. NORMALIZATON

In [23]:
# we use min-max scaler when we know the range of data
from sklearn.preprocessing import MinMaxScaler

# treating the null values
ec_m = frame["Energy Consumption (MWh)"].mean()
c_m = frame["Cost (Million $)"].mean()

frame["Energy Consumption (MWh)"].fillna(ec_m , inplace = True)
frame["Cost (Million $)"].fillna(c_m , inplace = True)

scaler = MinMaxScaler()

frame[["Energy Consumption (MWh)","Cost (Million $)"]] = scaler.fit_transform(frame[["Energy Consumption (MWh)","Cost (Million $)"]])

print("Data after Normalization (Min-Max Scaling) : ")
frame

Data after Normalization (Min-Max Scaling) : 


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  frame["Energy Consumption (MWh)"].fillna(ec_m , inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  frame["Cost (Million $)"].fillna(c_m , inplace = True)


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,0.0,0.2
1,Wind,0.625,1.0
2,Hydropower,0.85,0.4
3,Geothermal,0.625,0.0
4,Biomass,0.65,0.4
5,Nuclear,1.0,0.4


#### 2. STANDARDIZATION

In [24]:
# standard scaler --> we use standard scaler when we don't know the range of data

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

frame[["Energy Consumption (MWh)","Cost (Million $)"]] = scaler.fit_transform(frame[["Energy Consumption (MWh)","Cost (Million $)"]])

print("Data after Standardization (Z-score Scaling) : ")
frame

Data after Standardization (Z-score Scaling) : 


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,-2.005893,-0.6546537
1,Wind,0.0,1.963961
2,Hydropower,0.722121,1.817029e-16
3,Geothermal,0.0,-1.309307
4,Biomass,0.080236,1.817029e-16
5,Nuclear,1.203536,1.817029e-16


### DATA ENCODING

In [29]:
# CATEGORIAL VARIABLES -> One-hot encoding 

energy_encoded = pd.get_dummies(frame , columns = ["Energy Source"])

print("Data after one-hot encoding : ")
energy_encoded

Data after one-hot encoding : 


Unnamed: 0,Energy Consumption (MWh),Cost (Million $),Energy Source_Biomass,Energy Source_Geothermal,Energy Source_Hydropower,Energy Source_Nuclear,Energy Source_Solar,Energy Source_Wind
0,-2.005893,-0.6546537,False,False,False,False,True,False
1,0.0,1.963961,False,False,False,False,False,True
2,0.722121,1.817029e-16,False,False,True,False,False,False
3,0.0,-1.309307,False,True,False,False,False,False
4,0.080236,1.817029e-16,True,False,False,False,False,False
5,1.203536,1.817029e-16,False,False,False,True,False,False
