# OMDENA Millets project - EDA and Feature Engineering


In [6]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

In [2]:
df = pd.read_csv("data_omdena_without_imputation.csv", delimiter= ";", index_col=0)
df.head()

Unnamed: 0_level_0,Types of Millets,Common Name,Category,Drought resistant,Flood Resistant,Min Temperature (ºC),Max Temperature (ºC),pH level of the soil Min,pH level of the soil Max,Soil type,...,Ash (g),Crude Fibre (g),Carbo- hydrates (g),Energy (kcal),Calcium (mg),Iron (mg),Thiamine (mg),Ribo- flavin (mg),Nia- cin (mg),Price (US$ / Kg)
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Pearl millet (Pennisetum glaucum),PEARL,Major,1,0,30,34,6.0,7.0,"LC, SL",...,1.37,11.49,61.78,348.0,27.35,6.42,0.25,0.2,0.86,19.0
2,Finger millet (Eleusine coracana),FINGER,Major,1,0,26,29,4.5,7.5,L,...,2.04,11.18,66.82,320.7,364.0,4.62,0.37,0.17,1.3,19.0
3,Foxtail millet (Setaria italica),FOXTAIL,Minor,1,0,5,35,5.5,7.0,SL,...,0.0,8.0,60.9,331.0,31.0,2.8,0.59,0.11,3.2,24.0
4,Proso millet (Panicum miliaceum) (Chena in India),PROSO,Major,1,0,20,30,5.5,6.5,"SL, A",...,1.72,6.39,66.19,331.7,15.27,2.34,0.29,0.2,1.49,
5,Little millet (Panicum sumatrense),LITTLE,Minor,0,1,25,30,5.5,6.5,"C,L,S",...,1.34,7.72,65.55,346.3,16.06,1.26,0.26,0.05,1.29,20.0


### Exploratory Analysis

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34 entries, 1 to 34
Data columns (total 43 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Types of Millets                     34 non-null     object 
 1   Common Name                          34 non-null     object 
 2   Category                             34 non-null     object 
 3   Drought resistant                    34 non-null     int64  
 4   Flood Resistant                      34 non-null     int64  
 5   Min Temperature (ºC)                 34 non-null     int64  
 6   Max Temperature (ºC)                 34 non-null     int64  
 7   pH level of the soil Min             34 non-null     float64
 8   pH level of the soil Max             34 non-null     float64
 9   Soil type                            34 non-null     object 
 10  Soil Salinity (dS/m) Min             34 non-null     float64
 11  Soil Salinity (dS/m) Max          

In [4]:
#Missing values
df.isna().sum()

Types of Millets                        0
Common Name                             0
Category                                0
Drought resistant                       0
Flood Resistant                         0
Min Temperature (ºC)                    0
Max Temperature (ºC)                    0
pH level of the soil Min                0
pH level of the soil Max                0
Soil type                               0
Soil Salinity (dS/m) Min                0
Soil Salinity (dS/m) Max                0
Rainfall Required (cm) Min              0
Rainfall Required (cm) Max              0
Altitude range (m) Min                  0
Altitude range (m) Max                  0
Soil Temperature (ºC) Min               6
Soil Temperature (ºC) Max               6
Soil moisture\nmin                      0
Soil moisture\nmax                      0
Light Duration (hours) Min             13
Light Duration (hours) Max             12
Land usage for each crop (t/ha) Min     0
Land usage for each crop (t/ha) Ma

### Imputation - Simple imputer

In [8]:
#Identify columns with non-numeric data
non_numeric_cols = df.select_dtypes(exclude=[int, float]).columns

In [9]:
#Select only numeric columns for imputation
numeric_cols = df.columns.difference(non_numeric_cols)

In [10]:
#Perform mean imputation on numeric columns
imputer = SimpleImputer(strategy='mean')
df_imputed = df.copy()
df_imputed[numeric_cols] = imputer.fit_transform(df[numeric_cols])

In [20]:
#Check the imputed DataFrame
df_imputed.head()

Unnamed: 0_level_0,Types of Millets,Common Name,Category,Drought resistant,Flood Resistant,Min Temperature (ºC),Max Temperature (ºC),pH level of the soil Min,pH level of the soil Max,Soil type,...,Ash (g),Crude Fibre (g),Carbo- hydrates (g),Energy (kcal),Calcium (mg),Iron (mg),Thiamine (mg),Ribo- flavin (mg),Nia- cin (mg),Price (US$ / Kg)
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Pearl millet (Pennisetum glaucum),PEARL,Major,1.0,0.0,30.0,34.0,6.0,7.0,"LC, SL",...,1.37,11.49,61.78,348.0,27.35,6.42,0.25,0.2,0.86,19.0
2,Finger millet (Eleusine coracana),FINGER,Major,1.0,0.0,26.0,29.0,4.5,7.5,L,...,2.04,11.18,66.82,320.7,364.0,4.62,0.37,0.17,1.3,19.0
3,Foxtail millet (Setaria italica),FOXTAIL,Minor,1.0,0.0,5.0,35.0,5.5,7.0,SL,...,0.0,8.0,60.9,331.0,31.0,2.8,0.59,0.11,3.2,24.0
4,Proso millet (Panicum miliaceum) (Chena in India),PROSO,Major,1.0,0.0,20.0,30.0,5.5,6.5,"SL, A",...,1.72,6.39,66.19,331.7,15.27,2.34,0.29,0.2,1.49,17.244878
5,Little millet (Panicum sumatrense),LITTLE,Minor,0.0,1.0,25.0,30.0,5.5,6.5,"C,L,S",...,1.34,7.72,65.55,346.3,16.06,1.26,0.26,0.05,1.29,20.0


In [17]:
#Check the count of missing values after imputation
df_imputed.isna().any().any()

False

In [18]:
df_imputed.head()

Unnamed: 0_level_0,Types of Millets,Common Name,Category,Drought resistant,Flood Resistant,Min Temperature (ºC),Max Temperature (ºC),pH level of the soil Min,pH level of the soil Max,Soil type,...,Ash (g),Crude Fibre (g),Carbo- hydrates (g),Energy (kcal),Calcium (mg),Iron (mg),Thiamine (mg),Ribo- flavin (mg),Nia- cin (mg),Price (US$ / Kg)
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Pearl millet (Pennisetum glaucum),PEARL,Major,1.0,0.0,30.0,34.0,6.0,7.0,"LC, SL",...,1.37,11.49,61.78,348.0,27.35,6.42,0.25,0.2,0.86,19.0
2,Finger millet (Eleusine coracana),FINGER,Major,1.0,0.0,26.0,29.0,4.5,7.5,L,...,2.04,11.18,66.82,320.7,364.0,4.62,0.37,0.17,1.3,19.0
3,Foxtail millet (Setaria italica),FOXTAIL,Minor,1.0,0.0,5.0,35.0,5.5,7.0,SL,...,0.0,8.0,60.9,331.0,31.0,2.8,0.59,0.11,3.2,24.0
4,Proso millet (Panicum miliaceum) (Chena in India),PROSO,Major,1.0,0.0,20.0,30.0,5.5,6.5,"SL, A",...,1.72,6.39,66.19,331.7,15.27,2.34,0.29,0.2,1.49,17.244878
5,Little millet (Panicum sumatrense),LITTLE,Minor,0.0,1.0,25.0,30.0,5.5,6.5,"C,L,S",...,1.34,7.72,65.55,346.3,16.06,1.26,0.26,0.05,1.29,20.0


In [19]:
#df_imputed to a CSV file
df_imputed.to_csv("data_omdena_imputed.csv", sep=";", index=True)