# **Data Preprocessing**

## 1. Missing values handling

Missing values are handled by mean(numerical) and mode(categorical)

In [101]:
#import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

In [102]:
#Loading data as df
df = pd.read_csv('/content/Crop Production data.csv')

In [103]:
df.head()

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000.0
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1.0
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321.0
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,641.0
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,165.0


In [104]:
#data description
df.describe()

Unnamed: 0,Crop_Year,Area,Production
count,246091.0,246091.0,242361.0
mean,2005.643018,12002.82,582503.4
std,4.952164,50523.4,17065810.0
min,1997.0,0.04,0.0
25%,2002.0,80.0,88.0
50%,2006.0,582.0,729.0
75%,2010.0,4392.0,7023.0
max,2015.0,8580100.0,1250800000.0


In [105]:
#checking for missing values
df.isna().sum()

State_Name          0
District_Name       0
Crop_Year           0
Season              0
Crop                0
Area                0
Production       3730
dtype: int64

In [106]:
# Fill missing values in numerical columns with the mean
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].mean())

# Fill missing values in categorical columns with the mode if available
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    mode_val = df[col].mode().iloc[0]  # Get the mode value
    df[col] = df[col].fillna(mode_val)  # Fill missing values with the mode value
df

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000.0
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1.0
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321.0
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,641.0
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,165.0
...,...,...,...,...,...,...,...
246086,West Bengal,PURULIA,2014,Summer,Rice,306.0,801.0
246087,West Bengal,PURULIA,2014,Summer,Sesamum,627.0,463.0
246088,West Bengal,PURULIA,2014,Whole Year,Sugarcane,324.0,16250.0
246089,West Bengal,PURULIA,2014,Winter,Rice,279151.0,597899.0


## 2. Handling Duplicates and Inconsistencies

Dropping duplicates and inconsistencies.

In [107]:
# Check for duplicate rows based on selected columns
duplicate_rows = df.duplicated(subset=['State_Name', 'District_Name', 'Crop_Year', 'Crop'], keep=False)

# Print duplicate rows
print("Duplicate rows based on selected columns:")
print(df[duplicate_rows])


Duplicate rows based on selected columns:
            State_Name District_Name  Crop_Year       Season          Crop  \
207     Andhra Pradesh     ANANTAPUR       1997  Kharif       Dry chillies   
208     Andhra Pradesh     ANANTAPUR       1997  Kharif          Groundnut   
209     Andhra Pradesh     ANANTAPUR       1997  Kharif         Horse-gram   
210     Andhra Pradesh     ANANTAPUR       1997  Kharif              Jowar   
211     Andhra Pradesh     ANANTAPUR       1997  Kharif              Korra   
...                ...           ...        ...          ...           ...   
246085     West Bengal       PURULIA       2014  Summer              Maize   
246086     West Bengal       PURULIA       2014  Summer               Rice   
246087     West Bengal       PURULIA       2014  Summer            Sesamum   
246089     West Bengal       PURULIA       2014  Winter               Rice   
246090     West Bengal       PURULIA       2014  Winter            Sesamum   

            Area  Pro

In [108]:
# Remove duplicate rows
df.drop_duplicates(subset=['State_Name', 'District_Name', 'Crop_Year', 'Crop'], inplace=True)
df.isna().sum()

State_Name       0
District_Name    0
Crop_Year        0
Season           0
Crop             0
Area             0
Production       0
dtype: int64

## 3. Encoding categorical values

## Using label encoding for 'season' and 'crop' variables.

In [109]:
# Selecting categorical columns for label encoding
categorical_cols = ['State_Name', 'District_Name', 'Season', 'Crop']

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode each categorical column and replace the original column in the DataFrame
for col in categorical_cols:
    df[col] = label_encoder.fit_transform(df[col])

print(df)

        State_Name  District_Name  Crop_Year  Season  Crop    Area  Production
0                0            427       2000       1     2  1254.0      2000.0
1                0            427       2000       1    74     2.0         1.0
2                0            427       2000       1    95   102.0       321.0
3                0            427       2000       4     7   176.0       641.0
4                0            427       2000       4    22   720.0       165.0
...            ...            ...        ...     ...   ...     ...         ...
246080          32            471       2014       2    87   477.0      9995.0
246081          32            471       2014       2    92  1885.0      1508.0
246082          32            471       2014       2    98    54.0        37.0
246084          32            471       2014       2   119  1622.0      3663.0
246088          32            471       2014       4   106   324.0     16250.0

[214206 rows x 7 columns]


## 4.Normalization of numerical features

Using min-max scaling normalization technique for the normalization of numerical data

In [110]:
# Selecting numerical columns for min-max normalization
numerical_cols = ['Crop_Year', 'Area', 'Production']

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform the selected numerical columns
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Display the normalized DataFrame
print(df)

        State_Name  District_Name  Crop_Year  Season  Crop          Area  \
0                0            427   0.166667       1     2  1.461475e-04   
1                0            427   0.166667       1    74  2.284356e-07   
2                0            427   0.166667       1    95  1.188331e-05   
3                0            427   0.166667       4     7  2.050792e-05   
4                0            427   0.166667       4    22  8.391044e-05   
...            ...            ...        ...     ...   ...           ...   
246080          32            471   0.944444       2    87  5.558910e-05   
246081          32            471   0.944444       2    92  2.196897e-04   
246082          32            471   0.944444       2    98  6.288971e-06   
246084          32            471   0.944444       2   119  1.890374e-04   
246088          32            471   0.944444       4   106  3.775714e-05   

          Production  
0       1.598977e-06  
1       7.994883e-10  
2       2.566358e-