In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Data gathering

In [None]:
# Load the raw crop-production records.
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in 2.0; malformed rows are still skipped with a
# warning instead of aborting the read.
df = pd.read_csv('crop_production.csv', on_bad_lines='warn')
# Preview the first rows rather than rendering the whole frame.
df.head()

In [None]:
 # Show the last five rows of the loaded frame.
 df.tail()

## EDA and data preprocessing 

In [None]:
# Print the column dtypes, non-null counts and memory usage of the frame.
df.info()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how='any')
df

# Data Visualization

In [None]:
# Production against crop year.
# x/y are passed by keyword: seaborn 0.12+ no longer accepts data vectors as
# positional arguments and raises TypeError for the old call form.
sns.lineplot(data=df, x='Crop_Year', y='Production')

In [None]:
# Production per state on a wide figure; tick labels are rotated so the state
# names do not overlap.
plt.figure(figsize=(15, 10))
# Keyword x/y: positional data vectors are rejected by seaborn 0.12+.
sns.barplot(data=df, x='State_Name', y='Production')
plt.xticks(rotation=90)

In [None]:
# Horizontal bars: production on the x axis, one bar per season.
# Keyword x/y: positional data vectors are rejected by seaborn 0.12+.
sns.barplot(data=df, x='Production', y='Season')

In [None]:
# The five crops with the most records (value_counts sorts descending).
df['Crop'].value_counts().iloc[:5]

In [None]:
# Visualisation of the top 5 crops

In [None]:
# Keep only the rice records and preview them.
df_rice = df.loc[df['Crop'].eq('Rice')]
df_rice.head(5)

In [None]:
# Rice production per season.
# Keyword x/y: positional data vectors are rejected by seaborn 0.12+.
sns.barplot(data=df_rice, x='Season', y='Production')

In [None]:
# Insight - mean rice production is highest in winter (the bar plot shows the per-season mean)

In [None]:
# Rice production per state on a wide figure with rotated state labels.
plt.figure(figsize=(15, 10))
# Keyword x/y: positional data vectors are rejected by seaborn 0.12+.
sns.barplot(data=df_rice, x='State_Name', y='Production')
plt.xticks(rotation=90)

In [None]:
# Insight - Punjab has the highest mean rice production (the bar plot shows the per-state mean)

In [None]:
# Area vs. production for rice with a fitted regression line.
# Column names are passed as x=/y= keywords: seaborn 0.12+ treats the first
# positional argument as `data` and raises TypeError for the old call form.
sns.jointplot(data=df_rice, x="Area", y="Production", kind="reg")

In [None]:
# Total rice production per district, largest first.
dist_rice_df = (
    df_rice
    .groupby('District_Name')['Production']
    .sum()
    .reset_index()
    .sort_values(by='Production', ascending=False)
)

In [None]:
# First five rows of the per-district rice totals (sorted by production, descending).
dist_rice_df.head()

In [None]:
# Total rice production per district, in the descending order of dist_rice_df.
plt.figure(figsize=(15, 10))
# Keyword x/y: positional data vectors are rejected by seaborn 0.12+.
sns.barplot(data=dist_rice_df, x='District_Name', y='Production')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Keep only the maize records and preview them.
df_maize = df.loc[df['Crop'].eq('Maize')]
df_maize.head(5)

In [None]:
# Maize production per season.
# Keyword x/y: positional data vectors are rejected by seaborn 0.12+.
sns.barplot(data=df_maize, x='Season', y='Production')

In [None]:
# Maize production per state on a wide figure with rotated state labels.
plt.figure(figsize=(15, 10))
# Keyword x/y: positional data vectors are rejected by seaborn 0.12+.
sns.barplot(data=df_maize, x='State_Name', y='Production')
plt.xticks(rotation=90)

In [None]:
# Area vs. production for maize with a fitted regression line.
# Column names are passed as x=/y= keywords: seaborn 0.12+ treats the first
# positional argument as `data` and raises TypeError for the old call form.
sns.jointplot(data=df_maize, x="Area", y="Production", kind="reg")

In [None]:
# Total maize production per district, largest first.
dist_maize_df = (
    df_maize
    .groupby('District_Name')['Production']
    .sum()
    .reset_index()
    .sort_values(by='Production', ascending=False)
)

In [None]:
# First five rows of the per-district maize totals (sorted by production, descending).
dist_maize_df.head()

In [None]:
# Total maize production per district, in the descending order of dist_maize_df.
plt.figure(figsize=(15, 10))
# Keyword x/y: positional data vectors are rejected by seaborn 0.12+.
sns.barplot(data=dist_maize_df, x='District_Name', y='Production')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Keep only the sugarcane records.
df_sugar = df[df['Crop'] == 'Sugarcane']
# Preview the sugarcane subset; the cell previously displayed df_rice here,
# a copy-paste slip from the rice section.
df_sugar.head()

In [None]:
# Sugarcane production per season.
# Keyword x/y: positional data vectors are rejected by seaborn 0.12+.
sns.barplot(data=df_sugar, x='Season', y='Production')

In [None]:
# Sugarcane production per state on a wide figure with rotated state labels.
plt.figure(figsize=(15, 10))
# Keyword x/y: positional data vectors are rejected by seaborn 0.12+.
sns.barplot(data=df_sugar, x='State_Name', y='Production')
plt.xticks(rotation=90)

In [None]:
# Area vs. production for sugarcane with a fitted regression line.
# Column names are passed as x=/y= keywords: seaborn 0.12+ treats the first
# positional argument as `data` and raises TypeError for the old call form.
sns.jointplot(data=df_sugar, x="Area", y="Production", kind="reg")

In [None]:
# Total sugarcane production per district, largest first.
dist_sugar_df = (
    df_sugar
    .groupby('District_Name')['Production']
    .sum()
    .reset_index()
    .sort_values(by='Production', ascending=False)
)

In [None]:
# First five rows of the per-district sugarcane totals (sorted by production, descending).
dist_sugar_df.head()

In [None]:
# Total sugarcane production per district, in the descending order of dist_sugar_df.
plt.figure(figsize=(15, 10))
# Keyword x/y: positional data vectors are rejected by seaborn 0.12+.
sns.barplot(data=dist_sugar_df, x='District_Name', y='Production')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Preview the cleaned frame; a bounded head() keeps the notebook output small
# instead of rendering the whole DataFrame.
df.head()

# Feature Selection

In [None]:
# Remove the columns that are not used as model features.
# Rebinding `df` (rather than inplace=True) together with errors='ignore'
# makes the cell safe to re-run: a second execution no longer raises KeyError
# because the columns are already gone.
df = df.drop(columns=['District_Name', 'Crop_Year'], errors='ignore')

In [None]:
# Confirm the remaining columns with a bounded preview rather than rendering
# the whole DataFrame.
df.head()

In [None]:
# Number of distinct crop labels in the Crop column.
df['Crop'].nunique()

In [None]:
# One-hot encode the frame with pandas' default column selection.
df_final = pd.get_dummies(data=df)

In [None]:
# First five rows of the one-hot encoded frame.
df_final.head()

In [None]:
# Column count, dtypes and memory usage of the encoded frame.
df_final.info()

In [None]:
# Strip every space character from the encoded column names.
df_final.columns = df_final.columns.str.replace(' ', '', regex=False)

## model creation

In [None]:
# Target is the Production column; every other column is a feature.
y = df_final['Production']
x = df_final.drop(columns='Production')

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=5,
)

In [None]:
# Column count, dtypes and memory usage of the feature matrix.
x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 242361 entries, 0 to 246090
Columns: 164 entries, Area to Crop_other oilseeds
dtypes: float64(1), uint8(163)
memory usage: 49.4 MB


In [None]:
# Fit an ordinary least-squares baseline on the training split.
# fit() returns the estimator itself, so the name is bound to the fitted model.
linear_model = LinearRegression().fit(x_train, y_train)
linear_model

LinearRegression()

In [None]:
# Linear model: predictions on the training split (first five shown).
y_train_pred = linear_model.predict(x_train)
y_train_pred[:5]

array([-1410913.07177431, -1244100.60088973, -1319187.78030207,
       -3359383.66669143,   188646.88779966])

In [None]:
# First five true training targets, for comparison with the predictions.
y_train.iloc[:5]

78624      71.0
238776     43.0
245620    160.0
85958     118.0
150919     68.0
Name: Production, dtype: float64

In [None]:
# R^2 score of the linear model on the training split.
linear_accuracy = r2_score(y_true=y_train, y_pred=y_train_pred)
linear_accuracy

0.1480018007454399

In [None]:
# Linear model: predictions on the held-out test split (first five shown).
y_test_pred = linear_model.predict(x_test)
y_test_pred[:5]

array([  637538.5650393 ,   417617.07353149, -1393046.88513298,
        3228131.3252092 ,   294546.88354067])

In [None]:
# First five true test targets, for comparison with the predictions.
y_test.iloc[:5]

188194       4760.0
159133        780.0
86095        3383.0
163014    1242000.0
182360        131.0
Name: Production, dtype: float64

In [None]:
# R^2 score of the linear model on the test split.
linear_accuracy_test = r2_score(y_true=y_test, y_pred=y_test_pred)
linear_accuracy_test

0.1626576570980276

# Decision Trees

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit a decision-tree regressor with a fixed seed.
# fit() returns the estimator itself, so the name is bound to the fitted model.
regressor = DecisionTreeRegressor(random_state=42).fit(x_train, y_train)
regressor

DecisionTreeRegressor(random_state=42)

In [None]:
# Decision tree: predictions on the training split (first five shown).
y_train_pred_dt = regressor.predict(x_train)
y_train_pred_dt[:5]

array([ 59.  ,  64.5 , 160.  , 115.55,  68.  ])

In [None]:
# First five true training targets, for comparison with the predictions.
y_train.iloc[:5]

78624      71.0
238776     43.0
245620    160.0
85958     118.0
150919     68.0
Name: Production, dtype: float64

In [None]:
# R^2 score of the decision tree on the training split.
regressor_accuracy_train_dt = r2_score(y_true=y_train, y_pred=y_train_pred_dt)
regressor_accuracy_train_dt

0.9986160020825702

In [None]:
# Decision tree: predictions on the held-out test split (first five shown).
y_test_pred_dt = regressor.predict(x_test)
y_test_pred_dt[:5]

array([4.4720e+03, 6.7000e+02, 2.9120e+03, 1.2015e+06, 1.1000e+02])

In [None]:
# First five true test targets, for comparison with the predictions.
y_test.iloc[:5]

188194       4760.0
159133        780.0
86095        3383.0
163014    1242000.0
182360        131.0
Name: Production, dtype: float64

In [None]:
# R^2 score of the decision tree on the test split.
regressor_accuracy_test_dt = r2_score(y_true=y_test, y_pred=y_test_pred_dt)
regressor_accuracy_test_dt

0.8205537219341198

## Random Forest

In [47]:
# Fit a random-forest regressor with a fixed seed.
# fit() returns the estimator itself, so the name is bound to the fitted model.
rf_model = RandomForestRegressor(random_state=35).fit(x_train, y_train)
rf_model

RandomForestRegressor(random_state=35)

In [48]:
# Random forest: predictions on the training split (first five shown).
y_train_pred_rf = rf_model.predict(x_train)
y_train_pred_rf[:5]

array([ 62.169    ,  71.8435   , 255.49     , 121.4219381,  73.2437   ])

In [49]:
# First five true training targets, for comparison with the predictions.
y_train.iloc[:5]

78624      71.0
238776     43.0
245620    160.0
85958     118.0
150919     68.0
Name: Production, dtype: float64

In [50]:
# R^2 score of the random forest on the training split.
rf_accuracy_train = r2_score(y_true=y_train, y_pred=y_train_pred_rf)
rf_accuracy_train

0.9745260230569325

In [51]:
# Random forest: predictions on the held-out test split (first five shown).
y_test_pred_rf = rf_model.predict(x_test)
y_test_pred_rf[:5]

array([4.90569000e+03, 6.82250000e+02, 2.63952417e+03, 1.19347218e+06,
       1.11498333e+02])

In [52]:
# First five true test targets, for comparison with the predictions.
y_test.iloc[:5]

188194       4760.0
159133        780.0
86095        3383.0
163014    1242000.0
182360        131.0
Name: Production, dtype: float64

In [53]:
# R^2 score of the random forest on the test split.
rf_accuracy_test = r2_score(y_true=y_test, y_pred=y_test_pred_rf)
rf_accuracy_test

0.889318027938828

In [68]:
# Feature names at positions 30-49 (transposing a 1-D Index is a no-op, so it is omitted).
x.columns[30:50]



Index(['State_Name_Tripura', 'State_Name_UttarPradesh',
       'State_Name_Uttarakhand', 'State_Name_WestBengal', 'Season_Autumn',
       'Season_Kharif', 'Season_Rabi', 'Season_Summer', 'Season_WholeYear',
       'Season_Winter', 'Crop_Apple', 'Crop_Arcanut(Processed)',
       'Crop_Arecanut', 'Crop_Arhar/Tur', 'Crop_AshGourd', 'Crop_Atcanut(Raw)',
       'Crop_Bajra', 'Crop_Banana', 'Crop_Barley', 'Crop_Bean'],
      dtype='object')

In [69]:
# Interactive single-record prediction with the fitted random forest.
# Prompts for area, state, season and crop, one-hot encodes the answers against
# the training columns in `x`, and displays the predicted production.
Area = float(input("enter the area"))
name = str(input("enter the state:"))
season = str(input("enter the season:"))
Crop_name = str(input("enter the crop name:"))

column_list = x.columns
# Size the row from the training columns instead of a hard-coded 164, and look
# up the Area position by name, so the cell keeps working if the encoded
# feature set changes.
array = np.zeros(len(column_list))
array[column_list.get_loc('Area')] = Area

for prefix, value in (('State_Name_', name), ('Season_', season), ('Crop_', Crop_name)):
    # The encoded column names had their spaces removed, so strip spaces from
    # the typed value too; otherwise "West Bengal" could never match.
    feature = prefix + value.replace(' ', '')
    if feature not in column_list:
        # Same exception type as the bare get_loc() lookup, with a clearer message.
        raise KeyError(f"no feature column named {feature!r}; check the spelling of {value!r}")
    array[column_list.get_loc(feature)] = 1

# Predict from a one-row DataFrame carrying the training column names; a bare
# array triggers scikit-learn's "X does not have valid feature names" warning.
pred = rf_model.predict(pd.DataFrame([array], columns=column_list))
pred[0]

enter the area720
enter the state:AndamanandNicobarIslands
enter the season:WholeYear
enter the crop name:Cashewnut


  "X does not have valid feature names, but"


204.43400000000003

In [70]:
import pickle

# Serialise the fitted random forest to disk in binary mode.
with open('model_yeild1.pkl', mode='wb') as model_file:
    pickle.dump(rf_model, model_file)