# Day 3

In [2]:
import numpy as np


energy_consumption=np.array([1200, 3400, 2900, 1800, 2500])
print("Energy Consumptionin mwh for different energy sources:")
energy_consumption

Energy Consumptionin mwh for different energy sources:


array([1200, 3400, 2900, 1800, 2500])

#### Calculate the total energy consumption

In [4]:
total_consumption = np.sum(energy_consumption)
print(f'The total energy consumption is {total_consumption} in mwh')

The total energy consumption is 11800 in mwh


#### Calculate the mean energy consumption 

In [6]:
mean_consumption=np.mean(energy_consumption)
print(f"The mean energy consumption is {mean_consumption} MWh")

The mean energy consumption is 2360.0 MWh


#### Calculate the standard deviation of energy consumption 

In [11]:
std_ec=np.std(energy_consumption)
std_ec

781.2809993849844

In [13]:
print(f"The standard deviation of energy consumption {std_ec} MWh")

The standard deviation of energy consumption 781.2809993849844 MWh


#### Reshape the array

In [16]:
reshaped_array=energy_consumption.reshape((5,1))
print("Reshaped array is:")
print(reshaped_array)

Reshaped array is:
[[1200]
 [3400]
 [2900]
 [1800]
 [2500]]


### Data preprocessing

In [19]:
import pandas as pd
import numpy as np

# Sample data with missing values
data = {
    "Energy Source": ['Solar', 'Wind', 'Hydropower', 'Geothermal', 'Biomass', 'Nuclear'],
    "Energy Consumption (MWh)": [1200, np.nan, 2900, np.nan, 2500, 3200],
    "Cost (millions $)": [200, 400, np.nan, 150, 250, np.nan]
}


#### Create a DataFrame

In [22]:
df = pd.DataFrame(data)
print(df)

  Energy Source  Energy Consumption (MWh)  Cost (millions $)
0         Solar                    1200.0              200.0
1          Wind                       NaN              400.0
2    Hydropower                    2900.0                NaN
3    Geothermal                       NaN              150.0
4       Biomass                    2500.0              250.0
5       Nuclear                    3200.0                NaN


## Handling the Missing Value:
- Removing(drop) mv

- impute(fill) mv - mean

#### Remove rows with missing values 

In [26]:
cleaned_df= df.dropna()
cleaned_df

Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (millions $)
0,Solar,1200.0,200.0
4,Biomass,2500.0,250.0


## Imputation - to add or fill
 - with mean

 - forward/backward filling

In [33]:
# Impute missing values with the mean for energy consumption
df["Energy Consumption (MWh)"].fillna(df['Energy Consumption (MWh)']. mean(), inplace=True)
df["Cost (millions $)"].fillna(df["Cost (millions $)"].mean(), inplace=True)

df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Energy Consumption (MWh)"].fillna(df['Energy Consumption (MWh)']. mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Cost (millions $)"].fillna(df["Cost (millions $)"].mean(), inplace=True)


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (millions $)
0,Solar,1200.0,200.0
1,Wind,2450.0,400.0
2,Hydropower,2900.0,250.0
3,Geothermal,2450.0,150.0
4,Biomass,2500.0,250.0
5,Nuclear,3200.0,250.0


### Forward filing missing values

In [36]:
ff_df = df.fillna(method="ffill")
ff_df

  ff_df = df.fillna(method="ffill")


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (millions $)
0,Solar,1200.0,200.0
1,Wind,2450.0,400.0
2,Hydropower,2900.0,250.0
3,Geothermal,2450.0,150.0
4,Biomass,2500.0,250.0
5,Nuclear,3200.0,250.0


### Data Preprocessing - Normalization

In [39]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[["Energy Consumption (MWh)", "Cost (millions $)"]] = scaler.fit_transform(df[["Energy Consumption (MWh)", "Cost (millions $)"]])

df

Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (millions $)
0,Solar,0.0,0.2
1,Wind,0.625,1.0
2,Hydropower,0.85,0.4
3,Geothermal,0.625,0.0
4,Biomass,0.65,0.4
5,Nuclear,1.0,0.4


In [41]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

df[['Energy Consumption (MWh)', 'Cost (millions $)']] = scaler.fit_transform(
    df[['Energy Consumption (MWh)', 'Cost (millions $)']])
df

Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (millions $)
0,Solar,-2.005893,-0.6546537
1,Wind,3.563181e-16,1.963961
2,Hydropower,0.7221213,1.817029e-16
3,Geothermal,3.563181e-16,-1.309307
4,Biomass,0.0802357,1.817029e-16
5,Nuclear,1.203536,1.817029e-16


In [43]:
ohe_df = pd.get_dummies(df, columns=["Energy Source"])
print(ohe_df)

   Energy Consumption (MWh)  Cost (millions $)  Energy Source_Biomass  \
0             -2.005893e+00      -6.546537e-01                  False   
1              3.563181e-16       1.963961e+00                  False   
2              7.221213e-01       1.817029e-16                  False   
3              3.563181e-16      -1.309307e+00                  False   
4              8.023570e-02       1.817029e-16                   True   
5              1.203536e+00       1.817029e-16                  False   

   Energy Source_Geothermal  Energy Source_Hydropower  Energy Source_Nuclear  \
0                     False                     False                  False   
1                     False                     False                  False   
2                     False                      True                  False   
3                      True                     False                  False   
4                     False                     False                  False   
5       

### Feature engineering

In [46]:
ohe_df["Consumption per Million $"] = ohe_df['Energy Consumption (MWh)'] / ohe_df['Cost (millions $)']

ohe_df

Unnamed: 0,Energy Consumption (MWh),Cost (millions $),Energy Source_Biomass,Energy Source_Geothermal,Energy Source_Hydropower,Energy Source_Nuclear,Energy Source_Solar,Energy Source_Wind,Consumption per Million $
0,-2.005893,-0.6546537,False,False,False,False,True,False,3.064052
1,3.563181e-16,1.963961,False,False,False,False,False,True,1.814283e-16
2,0.7221213,1.817029e-16,False,False,True,False,False,False,3974187000000000.0
3,3.563181e-16,-1.309307,False,True,False,False,False,False,-2.721424e-16
4,0.0802357,1.817029e-16,True,False,False,False,False,False,441576400000000.0
5,1.203536,1.817029e-16,False,False,False,True,False,False,6623646000000000.0
