In [1]:
import pandas as pd

# Load the raw bioenergy samples; the path is relative to the notebook's
# working directory.
df = pd.read_csv("bioenergy_dataset.csv")

# A notebook cell renders only the value of its final expression. info() prints
# its report itself, but describe() returns a frame, so it has to be handed to
# display() (provided by the IPython kernel) or its summary is silently dropped.
df.info()
display(df.describe())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Sample_ID                 250 non-null    int64  
 1   Year                      250 non-null    int64  
 2   Crop_Type                 250 non-null    object 
 3   Yield_t_ha                250 non-null    float64
 4   Residue_to_Product_Ratio  247 non-null    float64
 5   Residue_Quantity_t        250 non-null    float64
 6   Regional_Adjustment       250 non-null    float64
 7   Moisture_%                235 non-null    float64
 8   Calorific_Value_MJ_kg     241 non-null    float64
 9   Predicted_Energy_MJ       250 non-null    float64
dtypes: float64(7), int64(2), object(1)
memory usage: 19.7+ KB


Unnamed: 0,Sample_ID,Year,Crop_Type,Yield_t_ha,Residue_to_Product_Ratio,Residue_Quantity_t,Regional_Adjustment,Moisture_%,Calorific_Value_MJ_kg,Predicted_Energy_MJ
0,0,2022,Wheat,3.117,1.622,5.1266,1.014,19.13,17.346,71914.46
1,1,2020,Maize,3.198,2.257,7.4416,1.031,22.96,17.137,98246.57
2,2,2020,Sugarcane,81.318,0.344,25.9034,0.926,46.97,20.924,287424.05
3,3,2022,Cotton,0.534,6.1,3.518,1.08,25.03,17.515,46194.84
4,4,2022,Maize,3.326,2.432,7.9513,0.983,26.5,17.412,101759.31


In [2]:
#dropping unnecessary columns
# NOTE(review): the notebook does not record why Regional_Adjustment is
# unnecessary -- confirm it is not needed downstream.
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone returns the frame unchanged instead of raising KeyError.
df = df.drop(columns=['Regional_Adjustment'], errors='ignore')
df.head()

Unnamed: 0,Sample_ID,Year,Crop_Type,Yield_t_ha,Residue_to_Product_Ratio,Residue_Quantity_t,Moisture_%,Calorific_Value_MJ_kg,Predicted_Energy_MJ
0,0,2022,Wheat,3.117,1.622,5.1266,19.13,17.346,71914.46
1,1,2020,Maize,3.198,2.257,7.4416,22.96,17.137,98246.57
2,2,2020,Sugarcane,81.318,0.344,25.9034,46.97,20.924,287424.05
3,3,2022,Cotton,0.534,6.1,3.518,25.03,17.515,46194.84
4,4,2022,Maize,3.326,2.432,7.9513,26.5,17.412,101759.31


In [3]:
#Handling missing values
import warnings  # left in scope for any later cell that relies on it

# The blanket warnings.filterwarnings('ignore') that used to sit here was
# removed: it hid the pandas FutureWarning raised by the chained
# `df[col].fillna(..., inplace=True)` calls. That pattern operates on a
# temporary column object and, under pandas Copy-on-Write, leaves `df`
# untouched. Filling through the frame and assigning the result back works on
# every pandas version.
imputed_columns = ['Moisture_%', 'Calorific_Value_MJ_kg', 'Residue_to_Product_Ratio']
df = df.fillna({
    'Moisture_%': df['Moisture_%'].median(),
    # The mean is used for this column only, as in the original analysis.
    'Calorific_Value_MJ_kg': df['Calorific_Value_MJ_kg'].mean(),
    'Residue_to_Product_Ratio': df['Residue_to_Product_Ratio'].median(),
})

# Fail loudly if the imputation did not take effect rather than passing NaNs on
# to the later cells.
assert df[imputed_columns].notna().all().all(), "imputed columns still contain NaN"
df

Unnamed: 0,Sample_ID,Year,Crop_Type,Yield_t_ha,Residue_to_Product_Ratio,Residue_Quantity_t,Moisture_%,Calorific_Value_MJ_kg,Predicted_Energy_MJ
0,0,2022,Wheat,3.117,1.622,5.1266,19.13,17.346,71914.46
1,1,2020,Maize,3.198,2.257,7.4416,22.96,17.137,98246.57
2,2,2020,Sugarcane,81.318,0.344,25.9034,46.97,20.924,287424.05
3,3,2022,Cotton,0.534,6.100,3.5180,25.03,17.515,46194.84
4,4,2022,Maize,3.326,2.432,7.9513,26.50,17.412,101759.31
...,...,...,...,...,...,...,...,...,...
245,245,2020,Sugarcane,76.901,0.363,27.4963,54.55,19.318,241418.37
246,246,2020,Maize,3.093,2.278,7.5743,14.85,17.075,110125.49
247,247,2023,Sugarcane,83.578,0.355,31.0647,46.31,22.056,367864.03
248,248,2022,Rice,3.775,1.765,7.0960,17.21,16.050,94290.19


In [6]:
#Check and correct data types
# Cast every measurement column to float64 in one pass; columns not listed here
# keep whatever dtype they already have.
float_columns = [
    'Yield_t_ha',
    'Residue_to_Product_Ratio',
    'Residue_Quantity_t',
    'Moisture_%',
    'Calorific_Value_MJ_kg',
    'Predicted_Energy_MJ',
]
df = df.astype(dict.fromkeys(float_columns, 'float64'))
df

Unnamed: 0,Sample_ID,Year,Crop_Type,Yield_t_ha,Residue_to_Product_Ratio,Residue_Quantity_t,Moisture_%,Calorific_Value_MJ_kg,Predicted_Energy_MJ
0,0,2022,Wheat,3.117,1.622,5.1266,19.13,17.346,71914.46
1,1,2020,Maize,3.198,2.257,7.4416,22.96,17.137,98246.57
2,2,2020,Sugarcane,81.318,0.344,25.9034,46.97,20.924,287424.05
3,3,2022,Cotton,0.534,6.100,3.5180,25.03,17.515,46194.84
4,4,2022,Maize,3.326,2.432,7.9513,26.50,17.412,101759.31
...,...,...,...,...,...,...,...,...,...
245,245,2020,Sugarcane,76.901,0.363,27.4963,54.55,19.318,241418.37
246,246,2020,Maize,3.093,2.278,7.5743,14.85,17.075,110125.49
247,247,2023,Sugarcane,83.578,0.355,31.0647,46.31,22.056,367864.03
248,248,2022,Rice,3.775,1.765,7.0960,17.21,16.050,94290.19


In [7]:
#Detect and handle outliers
from scipy import stats

# Columns screened for outliers. Z-scores are computed per column over the
# whole frame (all rows pooled).
numeric_cols = ['Yield_t_ha','Residue_to_Product_Ratio','Residue_Quantity_t','Moisture_%','Calorific_Value_MJ_kg','Predicted_Energy_MJ']
# A row is dropped when any screened value is this many standard deviations
# (or more) away from its column mean.
Z_THRESHOLD = 3

# nan_policy='omit' computes each column's mean/std from its non-missing values.
# With the default ('propagate') a single NaN turns every z-score in that
# column into NaN.
z_scores = stats.zscore(df[numeric_cols], nan_policy='omit')
abs_z_scores = abs(z_scores)
# Keep every row that has no confirmed outlier. On NaN-free data this equals
# `(abs_z_scores < 3).all(axis=1)`, but a NaN score (missing value or
# zero-variance column) compares False and so no longer discards the row; the
# old form would silently drop every row in that situation.
filtered_entries = ~(abs_z_scores >= Z_THRESHOLD).any(axis=1)
df = df[filtered_entries]
df

Unnamed: 0,Sample_ID,Year,Crop_Type,Yield_t_ha,Residue_to_Product_Ratio,Residue_Quantity_t,Moisture_%,Calorific_Value_MJ_kg,Predicted_Energy_MJ
0,0,2022,Wheat,3.117,1.622,5.1266,19.13,17.346,71914.46
1,1,2020,Maize,3.198,2.257,7.4416,22.96,17.137,98246.57
2,2,2020,Sugarcane,81.318,0.344,25.9034,46.97,20.924,287424.05
3,3,2022,Cotton,0.534,6.100,3.5180,25.03,17.515,46194.84
4,4,2022,Maize,3.326,2.432,7.9513,26.50,17.412,101759.31
...,...,...,...,...,...,...,...,...,...
245,245,2020,Sugarcane,76.901,0.363,27.4963,54.55,19.318,241418.37
246,246,2020,Maize,3.093,2.278,7.5743,14.85,17.075,110125.49
247,247,2023,Sugarcane,83.578,0.355,31.0647,46.31,22.056,367864.03
248,248,2022,Rice,3.775,1.765,7.0960,17.21,16.050,94290.19


In [8]:
# Write the cleaned frame to the working directory; index=False leaves the
# pandas row index out of the file.
output_path = "Cleaned_Bioenergy_Dataset_.csv"
df.to_csv(output_path, index=False)
