# **Crop Yield Prediction Algorithm**

### Data Preparation

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv("India_Crop_Production.csv")

In [5]:
df.size

1722637

In [6]:
df.sample(5)

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
85402,Karnataka,DAKSHIN KANNAD,2008,Kharif,Dry chillies,62.0,58.0
155105,Odisha,KHORDHA,2006,Summer,Moong(Green Gram),14731.0,2180.2
68190,Haryana,KAITHAL,2010,Rabi,Masoor,107.0,200.0
130956,Maharashtra,LATUR,1998,Kharif,Rice,20300.0,9800.0
74885,Jammu and Kashmir,LEH LADAKH,2001,Whole Year,Onion,8.0,5.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246091 entries, 0 to 246090
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   State_Name     246091 non-null  object 
 1   District_Name  246091 non-null  object 
 2   Crop_Year      246091 non-null  int64  
 3   Season         246091 non-null  object 
 4   Crop           246091 non-null  object 
 5   Area           246091 non-null  float64
 6   Production     246091 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 13.1+ MB


In [8]:
df.describe()

Unnamed: 0,Crop_Year,Area,Production
count,246091.0,246091.0,246091.0
mean,2005.643018,12002.82,582503.4
std,4.952164,50523.4,16935990.0
min,1997.0,0.04,0.0
25%,2002.0,80.0,91.0
50%,2006.0,582.0,788.0
75%,2010.0,4392.0,8000.0
max,2015.0,8580100.0,1250800000.0


In [9]:
df.isnull().sum()

State_Name       0
District_Name    0
Crop_Year        0
Season           0
Crop             0
Area             0
Production       0
dtype: int64

In [10]:
df.isnull().values.any()

np.False_

### Pandas Profiling for data visualization

In [11]:
   !pip install ydata-profiling

In [12]:
from ydata_profiling import ProfileReport

pro = ProfileReport(df)
pro.to_file(output_file='output.html')

ModuleNotFoundError: No module named 'ydata_profiling'

In [13]:
df.isnull().values.any()

np.False_

In [14]:
df['Yield'] = (df['Production'] / df['Area']) #creating yield column for y train and test 
df.sample(1)

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production,Yield
222662,Uttar Pradesh,MEERUT,2001,Rabi,Rapeseed &Mustard,2673.0,3170.0,1.185933


In [15]:
Visual_data = df.drop(['State_Name','District_Name','Season','Crop'], axis = 1)

In [16]:
Visual_data.corr()

Unnamed: 0,Crop_Year,Area,Production,Yield
Crop_Year,1.0,-0.026022,0.006945,0.014464
Area,-0.026022,1.0,0.040545,-0.012387
Production,0.006945,0.040545,1.0,0.004095
Yield,0.014464,-0.012387,0.004095,1.0


In [17]:
df.State_Name.unique()

array(['Andaman and Nicobar Islands', 'Andhra Pradesh',
       'Arunachal Pradesh', 'Assam', 'Bihar', 'Chandigarh',
       'Chhattisgarh', 'Dadra and Nagar Haveli', 'Goa', 'Gujarat',
       'Haryana', 'Himachal Pradesh', 'Jammu and Kashmir ', 'Jharkhand',
       'Karnataka', 'Kerala', 'Madhya Pradesh', 'Maharashtra', 'Manipur',
       'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha', 'Puducherry',
       'Punjab', 'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana ',
       'Tripura', 'Uttar Pradesh', 'Uttarakhand', 'West Bengal'],
      dtype=object)

In [18]:
df.District_Name.unique()

array(['NICOBARS', 'NORTH AND MIDDLE ANDAMAN', 'SOUTH ANDAMANS',
       'ANANTAPUR', 'CHITTOOR', 'EAST GODAVARI', 'GUNTUR', 'KADAPA',
       'KRISHNA', 'KURNOOL', 'PRAKASAM', 'SPSR NELLORE', 'SRIKAKULAM',
       'VISAKHAPATANAM', 'VIZIANAGARAM', 'WEST GODAVARI', 'ANJAW',
       'CHANGLANG', 'DIBANG VALLEY', 'EAST KAMENG', 'EAST SIANG',
       'KURUNG KUMEY', 'LOHIT', 'LONGDING', 'LOWER DIBANG VALLEY',
       'LOWER SUBANSIRI', 'NAMSAI', 'PAPUM PARE', 'TAWANG', 'TIRAP',
       'UPPER SIANG', 'UPPER SUBANSIRI', 'WEST KAMENG', 'WEST SIANG',
       'BAKSA', 'BARPETA', 'BONGAIGAON', 'CACHAR', 'CHIRANG', 'DARRANG',
       'DHEMAJI', 'DHUBRI', 'DIBRUGARH', 'DIMA HASAO', 'GOALPARA',
       'GOLAGHAT', 'HAILAKANDI', 'JORHAT', 'KAMRUP', 'KAMRUP METRO',
       'KARBI ANGLONG', 'KARIMGANJ', 'KOKRAJHAR', 'LAKHIMPUR', 'MARIGAON',
       'NAGAON', 'NALBARI', 'SIVASAGAR', 'SONITPUR', 'TINSUKIA',
       'UDALGURI', 'ARARIA', 'ARWAL', 'AURANGABAD', 'BANKA', 'BEGUSARAI',
       'BHAGALPUR', 'BHOJPUR', 'B

In [19]:
df.Crop.unique()

array(['Arecanut', 'Other Kharif pulses', 'Rice', 'Banana', 'Cashewnut',
       'Coconut ', 'Dry ginger', 'Sugarcane', 'Sweet potato', 'Tapioca',
       'Black pepper', 'Dry chillies', 'other oilseeds', 'Turmeric',
       'Maize', 'Moong(Green Gram)', 'Urad', 'Arhar/Tur', 'Groundnut',
       'Sunflower', 'Bajra', 'Castor seed', 'Cotton(lint)', 'Horse-gram',
       'Jowar', 'Korra', 'Ragi', 'Tobacco', 'Gram', 'Wheat', 'Masoor',
       'Sesamum', 'Linseed', 'Safflower', 'Onion', 'other misc. pulses',
       'Samai', 'Small millets', 'Coriander', 'Potato',
       'Other  Rabi pulses', 'Soyabean', 'Beans & Mutter(Vegetable)',
       'Bhindi', 'Brinjal', 'Citrus Fruit', 'Cucumber', 'Grapes', 'Mango',
       'Orange', 'other fibres', 'Other Fresh Fruits', 'Other Vegetables',
       'Papaya', 'Pome Fruit', 'Tomato', 'Rapeseed &Mustard', 'Mesta',
       'Cowpea(Lobia)', 'Lemon', 'Pome Granet', 'Sapota', 'Cabbage',
       'Peas  (vegetable)', 'Niger seed', 'Bottle Gourd', 'Sannhamp',
       'Va

In [20]:
df.Crop.unique()

array(['Arecanut', 'Other Kharif pulses', 'Rice', 'Banana', 'Cashewnut',
       'Coconut ', 'Dry ginger', 'Sugarcane', 'Sweet potato', 'Tapioca',
       'Black pepper', 'Dry chillies', 'other oilseeds', 'Turmeric',
       'Maize', 'Moong(Green Gram)', 'Urad', 'Arhar/Tur', 'Groundnut',
       'Sunflower', 'Bajra', 'Castor seed', 'Cotton(lint)', 'Horse-gram',
       'Jowar', 'Korra', 'Ragi', 'Tobacco', 'Gram', 'Wheat', 'Masoor',
       'Sesamum', 'Linseed', 'Safflower', 'Onion', 'other misc. pulses',
       'Samai', 'Small millets', 'Coriander', 'Potato',
       'Other  Rabi pulses', 'Soyabean', 'Beans & Mutter(Vegetable)',
       'Bhindi', 'Brinjal', 'Citrus Fruit', 'Cucumber', 'Grapes', 'Mango',
       'Orange', 'other fibres', 'Other Fresh Fruits', 'Other Vegetables',
       'Papaya', 'Pome Fruit', 'Tomato', 'Rapeseed &Mustard', 'Mesta',
       'Cowpea(Lobia)', 'Lemon', 'Pome Granet', 'Sapota', 'Cabbage',
       'Peas  (vegetable)', 'Niger seed', 'Bottle Gourd', 'Sannhamp',
       'Va

In [21]:
df.Season.unique()

array(['Kharif     ', 'Whole Year ', 'Autumn     ', 'Rabi       ',
       'Summer     ', 'Winter     '], dtype=object)

### Converting categorical to numeric data

In [22]:
dummy_df = pd.get_dummies(df, dtype=int)
dummy_df.head(5)

Unnamed: 0,Crop_Year,Area,Production,Yield,State_Name_Andaman and Nicobar Islands,State_Name_Andhra Pradesh,State_Name_Arunachal Pradesh,State_Name_Assam,State_Name_Bihar,State_Name_Chandigarh,...,Crop_Turmeric,Crop_Turnip,Crop_Urad,Crop_Varagu,Crop_Water Melon,Crop_Wheat,Crop_Yam,Crop_other fibres,Crop_other misc. pulses,Crop_other oilseeds
0,2000,1254.0,2000.0,1.594896,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2000,2.0,1.0,0.5,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2000,102.0,321.0,3.147059,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2000,176.0,641.0,3.642045,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2000,720.0,165.0,0.229167,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Train test and split 

In [23]:
from sklearn.model_selection import train_test_split

X = dummy_df.drop(["Production","Yield"], axis=1)
y = dummy_df["Production"]

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2, random_state=40)

print("X_train :",X_train.shape)
print("X_test :",X_test.shape)
print("y_train :",y_train.shape)
print("y_test :",y_test.shape)

X_train : (196872, 811)
X_test : (49219, 811)
y_train : (196872,)
y_test : (49219,)


In [24]:
print(X_train)
print(y_train)

        Crop_Year    Area  State_Name_Andaman and Nicobar Islands  \
28856        2000     1.0                                       0   
108679       2008   832.0                                       0   
171919       2003    15.0                                       0   
230706       2000   396.0                                       0   
116440       1999   289.0                                       0   
...           ...     ...                                     ...   
93176        1998  6314.0                                       0   
211109       1999    36.0                                       0   
30727        2008  4522.0                                       0   
112859       2007   163.0                                       0   
142662       2007    70.0                                       0   

        State_Name_Andhra Pradesh  State_Name_Arunachal Pradesh  \
28856                           0                             0   
108679                          0    

### Model selection and prediction

In [27]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators = 11)
model.fit(X_train,y_train)
rf_predict = model.predict(X_test)
rf_predict

array([  678.54545455, 18168.54545455,   373.72727273, ...,
         881.81818182,  1115.54545455,   817.63636364])

In [None]:
model.score(X_test,y_test)

0.9327905371070915

In [28]:
# Calculating R2 score

from sklearn.metrics import r2_score
r1 = r2_score(y_test,rf_predict)
print("R2 score : ",r1)

R2 score :  0.9409065483478346


In [29]:
# Calculating Adj. R2 score:

Adjr2_1 = 1 - (1-r1)*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1)
print("Adj. R-Squared : {}".format(Adjr2_1))

Adj. R-Squared : 0.939916509938309


### Pickle Model 

In [None]:
import pickle

In [26]:
pickle.dump(model, open("model.pkl", "wb"))

NameError: name 'pickle' is not defined