# India Crop Production


## Importing required libraries

In [27]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split

## Getting dataset

In [2]:
# Read the raw crop-production records from disk and preview the first rows.
csv_path = "data/proj73/crop_production.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,index,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
0,0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000.0
1,1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1.0
2,2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321.0
3,3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,641.0
4,4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,165.0


## Cleaning dataset

In [3]:
# Count the missing values in each column to see what needs cleaning.
df.isna().sum()

index               0
State_Name          0
District_Name       0
Crop_Year           0
Season              0
Crop                0
Area                0
Production       3730
dtype: int64

In [13]:
# Fit a KNN imputer on the Production column alone, presented as a
# single-feature 2-D array (one row per record).
# NOTE(review): with only one feature, a row whose Production is missing has no
# observed value left to measure neighbour distances on, so KNNImputer
# presumably falls back to the column mean. Production is also the prediction
# target later in the notebook — confirm that filling it in is intended.
X = df[["Production"]].to_numpy()
imputer = KNNImputer()
imputer.fit(X)

In [14]:
# Run the fitted imputer over the Production array; the result is a new array
# in which the missing entries have been filled in.
imp_prod = imputer.transform(X)

In [20]:
# Overwrite the Production column with its imputed values.
df["Production"] = imp_prod

In [21]:
# Preview the first rows after the Production column has been replaced.
df.head()

Unnamed: 0,index,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
0,0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000.0
1,1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1.0
2,2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321.0
3,3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,641.0
4,4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,165.0


## Pre-Processing dataset

In [23]:
# Remove the redundant "index" column, then replace every object-dtype column
# with integer codes. The original category names for each encoded column are
# kept in `labels` so a code can be mapped back to its name later.
df.drop(columns="index", inplace=True)
labels = {}
for name in df.columns:
    if not pd.api.types.is_object_dtype(df[name]):
        continue  # non-object columns are left untouched
    encoder = LabelEncoder()
    df[name] = encoder.fit_transform(df[name])
    labels[name] = encoder.classes_

In [24]:
# Display the category names recorded for each encoded column; a name's
# position in its array is the integer code the encoder assigned to it.
labels

{'State_Name': array(['Andaman and Nicobar Islands', 'Andhra Pradesh',
        'Arunachal Pradesh', 'Assam', 'Bihar', 'Chandigarh',
        'Chhattisgarh', 'Dadra and Nagar Haveli', 'Goa', 'Gujarat',
        'Haryana', 'Himachal Pradesh', 'Jammu and Kashmir ', 'Jharkhand',
        'Karnataka', 'Kerala', 'Madhya Pradesh', 'Maharashtra', 'Manipur',
        'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha', 'Puducherry',
        'Punjab', 'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana ',
        'Tripura', 'Uttar Pradesh', 'Uttarakhand', 'West Bengal'],
       dtype=object),
 'District_Name': array(['24 PARAGANAS NORTH', '24 PARAGANAS SOUTH', 'ADILABAD',
        'AGAR MALWA', 'AGRA', 'AHMADABAD', 'AHMEDNAGAR', 'AIZAWL', 'AJMER',
        'AKOLA', 'ALAPPUZHA', 'ALIGARH', 'ALIRAJPUR', 'ALLAHABAD',
        'ALMORA', 'ALWAR', 'AMBALA', 'AMBEDKAR NAGAR', 'AMETHI',
        'AMRAVATI', 'AMRELI', 'AMRITSAR', 'AMROHA', 'ANAND', 'ANANTAPUR',
        'ANANTNAG', 'ANJAW', 'ANUGUL', 'ANUPPUR', 'ARARIA', 'A

In [25]:
# Summarise each column's dtype and non-null count after encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246091 entries, 0 to 246090
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   State_Name     246091 non-null  int64  
 1   District_Name  246091 non-null  int64  
 2   Crop_Year      246091 non-null  int64  
 3   Season         246091 non-null  int64  
 4   Crop           246091 non-null  int64  
 5   Area           246091 non-null  float64
 6   Production     246091 non-null  float64
dtypes: float64(2), int64(5)
memory usage: 13.1 MB


In [26]:
# Re-check the per-column missing-value counts before modelling.
df.isna().sum()

State_Name       0
District_Name    0
Crop_Year        0
Season           0
Crop             0
Area             0
Production       0
dtype: int64

In [28]:
# Separate the features (every column except Production) from the target.
X = df.drop(columns="Production")
y = df["Production"]

# Hold out 20% of the rows for evaluation. random_state is pinned so that the
# split, and therefore every score computed from it, is identical on each
# Restart & Run All instead of changing from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Model Training

In [29]:
# Fit a random forest on the training split and display its score (R^2) on the
# held-out split. random_state is pinned because the forest's bootstrap
# sampling is random; without a seed the score and any later prediction differ
# between runs.
rreg = RandomForestRegressor(random_state=42)
rreg.fit(X_train, y_train)
rreg.score(X_test, y_test)

0.9588118976738894

In [30]:
# Train a gradient-boosted regressor on the same split, handing it plain NumPy
# arrays rather than pandas objects, and display its score on the held-out data.
train_features, train_target = X_train.to_numpy(), y_train.to_numpy()
test_features, test_target = X_test.to_numpy(), y_test.to_numpy()

xreg = XGBRegressor()
xreg.fit(train_features, train_target)
xreg.score(test_features, test_target)

0.9692566608632325

## Prediction

In [37]:
# Look up the integer code that the encoder assigned to the crop "Carrot".
list(labels["Crop"]).index("Carrot")

21

In [39]:
# Predict production for one hypothetical record.
#
# rreg was fitted on a DataFrame, so the sample is passed as a DataFrame with
# the same column names and order; a bare nested list makes scikit-learn warn
# about missing feature names and hides which number belongs to which feature.
# Categorical fields hold the integer codes stored in `labels`; the crop code
# is looked up by name rather than hard-coded.
sample_row = {
    "State_Name": 27,
    "District_Name": 595,
    "Crop_Year": 2023,
    "Season": 4,
    "Crop": labels["Crop"].tolist().index("Carrot"),
    "Area": 1200,
}
sample = pd.DataFrame([sample_row], columns=X.columns)
rreg.predict(sample)



array([76171.36249263])