## Imports

In [None]:
import warnings
 
warnings.simplefilter('ignore')
 
import pandas as pd
 
import numpy as np
 
import matplotlib.pyplot as plt
 
import seaborn as sns
 
from sklearn.metrics import r2_score
 
from sklearn.metrics import mean_absolute_error
 
from sklearn.metrics import mean_squared_error
 
from sklearn.model_selection import train_test_split
 
from sklearn.ensemble import RandomForestRegressor
 
from sklearn.linear_model import LinearRegression
 
from sklearn.ensemble import GradientBoostingRegressor
 
from sklearn.ensemble import ExtraTreesRegressor
 
from sklearn.linear_model import BayesianRidge
 
from sklearn.svm import SVR
 
from xgboost import XGBRegressor
 
from sklearn.multioutput import MultiOutputRegressor
 
from sklearn.tree import DecisionTreeRegressor
 
from sklearn.neighbors import KNeighborsRegressor
 
from sklearn.cluster import KMeans
 
from sklearn.preprocessing import LabelEncoder
 
from sklearn.experimental import enable_iterative_imputer
 
from sklearn.impute import IterativeImputer
 
from sklearn.feature_selection import SelectKBest
 
from sklearn.feature_selection import f_regression

## Data Preprocessing

In [None]:
# Read the 'Data' sheet; header = 3 makes the fourth spreadsheet row the column header.
df = pd.read_excel('Master data file.xlsx', sheet_name = 'Data', header = 3)

# Discard the first column and the last row in one step.
# NOTE(review): presumably an index column and a totals row - confirm against the workbook.
df = df.drop(columns = df.columns[0]).iloc[:-1]

In [None]:
# Display the freshly loaded frame for a visual sanity check.
df

In [None]:
# Fill the two derived-yield columns through the frame itself.  Calling
# fillna(inplace = True) on a column selection is chained assignment: under
# pandas Copy-on-Write it updates a temporary Series and leaves df untouched.
# NOTE: the next cell runs df.replace(0 , np.nan), which turns these zeros back
# into NaN, so this fill only affects the null count displayed below.
df = df.fillna({'Yield/acre \n(Col M/Col G) (kilos)': 0 , 'Yield/seed used \n(Col M/ Col H) (kilos)': 0.0})

# Give the derived columns short names without the embedded newline.
df = df.rename(columns={"Yield/acre \n(Col M/Col G) (kilos)": "Yield per Acre (kilos)" , "Yield/seed used \n(Col M/ Col H) (kilos)" : "Yield per Seed (kilos)"})

# Remaining missing values per column.
df.isnull().sum()

In [None]:
# Every zero in the frame is treated as a missing value from here on.
df = df.replace(0 , np.nan)

# One encoder is refitted for each categorical column in turn, so after the
# loop it only remembers the classes of the last column ('Month of sowing').
data_label_encoder = LabelEncoder()

for raw_column , encoded_column in (
    ('Farmer ID' , 'EncodedFarmerID') ,
    ('Village' , 'EncodedVillage') ,
    ('Seed type' , 'EncodedSeedType') ,
    ('Month of sowing' , 'EncodedMonth') ,
) :

  encoded_list = data_label_encoder.fit_transform(df[raw_column])

  df[encoded_column] = encoded_list

# Selecting the final column order also discards the four raw categorical
# columns, which are not part of the list.
df = df[[
    'S.No' ,
    'EncodedFarmerID' ,
    'EncodedVillage' ,
    'EncodedSeedType' ,
    'EncodedMonth' ,
    'Acres cultivated' ,
    'Seed Used (kilos)' ,
    'Pesticides used (kilos)' ,
    'Fungicides (kilos)' ,
    'Herbicides (kilos)' ,
    'Fertilizers used (kilos)' ,
    'Gross cob quantity (kilos)' ,
    'Yield per Acre (kilos)' ,
    'Yield per Seed (kilos)' ,
]]

df

## Iterative Imputer

In [None]:
def iterative_imputer(df , est = BayesianRidge()) :
  """Impute missing values in every column except the two identifier columns.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame holding 'S.No' and 'EncodedFarmerID' plus the columns to impute.
  est : estimator, default BayesianRidge()
      Regressor passed to IterativeImputer as its `estimator`.

  Returns
  -------
  pandas.DataFrame
      'S.No', 'EncodedFarmerID', then the imputed columns in their original
      order, indexed like `df`.
  """

  id_columns = ['S.No' , 'EncodedFarmerID']

  # The identifiers are excluded from imputation and re-attached afterwards.
  features = df.drop(id_columns , axis = 1)

  # min_value = 0 keeps imputed values from going negative; `est` is now
  # actually forwarded instead of being silently ignored.
  imp = IterativeImputer(estimator = est , missing_values = np.nan , max_iter = 10 , min_value = 0)

  # Reusing df's index keeps the id columns aligned row-for-row when they are
  # inserted below, even if df does not carry a default 0..n-1 index.
  dataframe = pd.DataFrame(imp.fit_transform(features) , columns = features.columns , index = df.index)

  dataframe.insert(0 , 'S.No' , df['S.No'])

  dataframe.insert(1 , 'EncodedFarmerID' , df['EncodedFarmerID'])

  return dataframe

## Regression Functions

In [None]:
def error_rate(y_test , y_pred , algo = ' Linear Regression') :
  """Print MAE, MSE, RMSE and R2 (each rounded to 3 decimals) under the heading `algo`.

  Parameters
  ----------
  y_test : array-like
      True target values.
  y_pred : array-like
      Predicted target values.
  algo : str
      Heading printed before the metrics.
  """

  # (label, metric function) pairs, evaluated and printed in this order.
  report = (
    (" Mean Absolute Error : " , mean_absolute_error) ,
    (" Mean Squared Error : " , mean_squared_error) ,
    (" Root Mean Squared Error : " , lambda truth , guess : np.sqrt(mean_squared_error(truth , guess))) ,
    (" R2 Score : " , r2_score) ,
  )

  print(algo + '\n')

  for label , metric in report :

    print(label , round(metric(y_test , y_pred) , 3))

  print('\n')

In [None]:
def regression_module(X_train, y_train, X_test, y_test, random_state = None) :
  """Fit six regressors on the training split and print test-set metrics for each.

  Parameters
  ----------
  X_train, y_train : training features and targets.
  X_test, y_test : held-out features and targets used for the report.
  random_state : int or None, default None
      Seed forwarded to the estimators that accept one (random forest,
      gradient boosting, extra trees, XGBoost).  None keeps the previous
      unseeded behaviour.

  Returns
  -------
  None.  Results are printed through error_rate.
  """

  # (report heading, estimator, wrap in MultiOutputRegressor?)
  # Gradient boosting, XGBoost and SVR are fitted one target column at a time
  # through MultiOutputRegressor; the other models are fitted directly.
  # The last heading used to read 'LinearSVR Regression', but the model is an
  # SVR() with its default kernel, not a LinearSVR.
  candidates = (
    (' Linear Regression' , LinearRegression() , False) ,
    (' Random Forest Regression' , RandomForestRegressor(random_state = random_state) , False) ,
    (' Gradient Boosting Regression' , GradientBoostingRegressor(random_state = random_state) , True) ,
    (' Extra Trees Regressor' , ExtraTreesRegressor(random_state = random_state) , False) ,
    (' XGBoost Regressor' , XGBRegressor(objective = 'reg:squarederror' , random_state = random_state) , True) ,
    (' Support Vector Regression' , SVR() , True) ,
  )

  for heading , estimator , per_target in candidates :

    reg = MultiOutputRegressor(estimator, n_jobs = -1) if per_target else estimator

    reg.fit(X_train , y_train)

    error_rate(y_test , reg.predict(X_test) , heading)

## Selecting Top 5 Features Considering Yield per Acre (kilos) as Output Variable

In [None]:
def top_5_features_variable1(dataframe) :
  """Return the five input columns with the highest f_regression score against 'Yield per Acre (kilos)'.

  The identifier columns and all three output columns are excluded from the
  candidate inputs.

  Returns
  -------
  pandas.DataFrame
      Columns 'Features' and 'Importance', sorted by descending score, at most 5 rows.
  """

  print(' Top 5 Features With Yield per Acre (kilos) as Output Variable \n')

  X = dataframe.drop(['S.No', 'EncodedFarmerID','Gross cob quantity (kilos)' , 'Yield per Acre (kilos)','Yield per Seed (kilos)'] , axis = 1)

  # A 1-D target: f_regression scores against a single output, and a
  # one-column DataFrame only worked through an implicit (warned) conversion.
  y = dataframe['Yield per Acre (kilos)']

  # Only scores_ is used, so k = 'all' avoids a failure on frames with fewer
  # than 5 input columns; the top 5 are taken with head(5) below.
  select_k_best = SelectKBest(f_regression , k = 'all').fit(X , y)

  data = pd.DataFrame(dict(Features = X.columns , Importance = select_k_best.scores_))

  return data.sort_values('Importance', ascending = False).head(5)

## Selecting Top 5 Features Considering Yield per Seed (kilos) as Output Variable

In [None]:
def top_5_features_variable2(dataframe) :
  """Return the five input columns with the highest f_regression score against 'Yield per Seed (kilos)'.

  The identifier columns and all three output columns are excluded from the
  candidate inputs.

  Returns
  -------
  pandas.DataFrame
      Columns 'Features' and 'Importance', sorted by descending score, at most 5 rows.
  """

  print(' Top 5 Features With Yield per Seed (kilos) as Output Variable \n')

  X = dataframe.drop(['S.No', 'EncodedFarmerID','Gross cob quantity (kilos)' , 'Yield per Acre (kilos)','Yield per Seed (kilos)'], axis = 1)

  # A 1-D target: f_regression scores against a single output, and a
  # one-column DataFrame only worked through an implicit (warned) conversion.
  y = dataframe['Yield per Seed (kilos)']

  # Only scores_ is used, so k = 'all' avoids a failure on frames with fewer
  # than 5 input columns; the top 5 are taken with head(5) below.
  select_k_best = SelectKBest(f_regression , k = 'all').fit(X , y)

  data = pd.DataFrame(dict(Features = X.columns , Importance = select_k_best.scores_))

  return data.sort_values('Importance', ascending = False).head(5)

## Selecting Top 5 Features Considering Gross cob quantity (kilos) as the Output Variable

In [None]:
def top_5_features_variable3(dataframe) :
  """Return the five input columns with the highest f_regression score against 'Gross cob quantity (kilos)'.

  The identifier columns and all three output columns are excluded from the
  candidate inputs.

  Returns
  -------
  pandas.DataFrame
      Columns 'Features' and 'Importance', sorted by descending score, at most 5 rows.
  """

  print(' Top 5 Features With Gross cob quantity (kilos) as Output Variable \n')

  X = dataframe.drop(['S.No', 'EncodedFarmerID','Gross cob quantity (kilos)' , 'Yield per Acre (kilos)','Yield per Seed (kilos)'], axis = 1)

  # A 1-D target: f_regression scores against a single output, and a
  # one-column DataFrame only worked through an implicit (warned) conversion.
  y = dataframe['Gross cob quantity (kilos)']

  # Only scores_ is used, so k = 'all' avoids a failure on frames with fewer
  # than 5 input columns; the top 5 are taken with head(5) below.
  select_k_best = SelectKBest(f_regression , k = 'all').fit(X , y)

  data = pd.DataFrame(dict(Features = X.columns , Importance = select_k_best.scores_))

  return data.sort_values('Importance', ascending = False).head(5)

## Selecting Features for all the three Output Variables in Descending Order


In [None]:
def top_features(dataframe, random_state = None) :
  """Print the input columns ranked from most to least important for all three outputs.

  Importance is measured by ablation: each input column is left out in turn,
  a gradient-boosting model (one per output via MultiOutputRegressor) is
  trained on the remaining columns, and the test-set R2 is recorded.  The
  column whose removal gives the lowest R2 is listed first.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Imputed frame containing the identifier, input and output columns.
  random_state : int or None, default None
      Seed for the train/test split and the boosting models.

  Returns
  -------
  None.  The ranking is printed.
  """

  X = dataframe.drop(['S.No', 'EncodedFarmerID','Gross cob quantity (kilos)' , 'Yield per Acre (kilos)','Yield per Seed (kilos)'] , axis = 1)

  y = dataframe[['Gross cob quantity (kilos)' , 'Yield per Acre (kilos)','Yield per Seed (kilos)']]

  # Split once: every ablation must be trained and scored on the same rows,
  # otherwise differences in R2 reflect the random split rather than the
  # column that was removed.
  X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.15 , random_state = random_state)

  scores = {}

  for column in X.columns :

    reggb = MultiOutputRegressor(GradientBoostingRegressor(random_state = random_state), n_jobs = -1)

    reggb.fit(X_train.drop([column], axis = 1) , y_train)

    y_pred = reggb.predict(X_test.drop([column], axis = 1))

    scores[column] = r2_score(y_test, y_pred)

  # Ascending R2: the biggest drop in fit quality comes first.
  ranking = sorted(scores, key = scores.get)

  print(' Most to least important ')

  print('\n')

  for count , column in enumerate(ranking , start = 1) :

    print(' ' + str(count) + '. ' + column)

    print('\n')

## Iterative Imputer With Bayesian Ridge

In [None]:
dataframe = iterative_imputer(df)

# Inputs: everything except the identifiers and the three output columns.
X = dataframe.drop(['S.No', 'EncodedFarmerID','Gross cob quantity (kilos)' , 'Yield per Acre (kilos)','Yield per Seed (kilos)'] , axis = 1)

y = dataframe[['Gross cob quantity (kilos)' , 'Yield per Acre (kilos)','Yield per Seed (kilos)']]

# A fixed seed makes the 85/15 split identical on every Restart & Run All, so
# the model comparison reported below refers to the same held-out rows.
# The tree-based models inside regression_module are still unseeded.
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.15 , random_state = 42)

regression_module(X_train , y_train , X_test , y_test)

##### Gradient Boosting Regressor gives the lowest RMSE and the highest R2 score

## Selecting Top Features

#### For the Output Variable Yield per Acre (kilos)

In [None]:
# Rebuild the imputed frame, then score the input columns against 'Yield per Acre (kilos)'.
dataframe = iterative_imputer(df)

top_5_features_variable1(dataframe)

#### For the Output Variable Yield per Seed (kilos)

In [None]:
# Rebuild the imputed frame, then score the input columns against 'Yield per Seed (kilos)'.
dataframe = iterative_imputer(df)

top_5_features_variable2(dataframe)

#### For The Output Variable Gross cob quantity (kilos)

In [None]:
# Rebuild the imputed frame, then score the input columns against 'Gross cob quantity (kilos)'.
dataframe = iterative_imputer(df)

top_5_features_variable3(dataframe)

#### For all three Output Variables

In [None]:
# Rebuild the imputed frame, then print the ablation-based ranking over all three outputs.
dataframe = iterative_imputer(df)

top_features(dataframe)

## Pearson Distribution

In [None]:
gaussian_data = iterative_imputer(df)

# Rebuild the frame as: each chemical input divided by acres cultivated, then
# each chemical input divided by seed used, followed by the two yield columns.
# The identifiers, encoded categoricals, raw quantities and gross cob quantity
# are left out.
gaussian_data = pd.DataFrame({
    **{
        chemical + ' (kilos) per ' + unit : gaussian_data[amount_column] / gaussian_data[base_column]
        for unit , base_column in (
            ('Acre' , 'Acres cultivated') ,
            ('Seeds (kilos)' , 'Seed Used (kilos)') ,
        )
        for chemical , amount_column in (
            ('Pesticides' , 'Pesticides used (kilos)') ,
            ('Fungicides' , 'Fungicides (kilos)') ,
            ('Herbicides' , 'Herbicides (kilos)') ,
            ('Fertilizers' , 'Fertilizers used (kilos)') ,
        )
    } ,
    'Yield per Acre (kilos)' : gaussian_data['Yield per Acre (kilos)'] ,
    'Yield per Seed (kilos)' : gaussian_data['Yield per Seed (kilos)'] ,
})

gaussian_data

#### Plotting histograms of the input variables to check whether they follow a Gaussian distribution

In [None]:
def plot_histogram_input(dataframe) :
  """Draw one histogram figure for every column except the two yield columns."""

  inputs = dataframe.drop(['Yield per Acre (kilos)','Yield per Seed (kilos)'] , axis = 1)

  # Figures are numbered 0, 1, 2, ... in column order and titled with the column name.
  for figure_number , column in enumerate(inputs.columns) :

    plt.figure(figure_number)

    plt.hist(inputs[column])

    plt.title(column, color = 'black')

  plt.show()

In [None]:
plot_histogram_input(gaussian_data)

#### Plotting histograms of the output variables to check whether they follow a Gaussian distribution

In [None]:
def plot_histogram_output(dataframe) :
  """Draw one histogram figure for each of the two yield columns."""

  outputs = dataframe[['Yield per Acre (kilos)','Yield per Seed (kilos)']]

  # Figures are numbered 0 and 1 and titled with the column name.
  for figure_number , column in enumerate(outputs.columns) :

    plt.figure(figure_number)

    plt.hist(outputs[column])

    plt.title(column, color = 'black')

  plt.show()

In [None]:
plot_histogram_output(gaussian_data)

#### Plotting scatter plots with a fitted line to verify that the relationship between each input and Yield per Acre (kilos) is not linear

In [None]:
def plot_data(dataframe) :
  """Scatter every input column against 'Yield per Acre (kilos)' with a least-squares line.

  One figure is created per input column (every column except the two yield
  columns) and titled with the column name.  The figures are shown once all
  of them have been drawn.
  """

  X = dataframe.drop(['Yield per Acre (kilos)','Yield per Seed (kilos)'] , axis = 1)

  # The target is the same for every figure, so it is extracted once.
  y1 = np.array(dataframe['Yield per Acre (kilos)'])

  for i in X.columns :

    plt.figure(i)

    x = np.array(X[i])

    plt.plot(x, y1, 'o')

    # np.polyfit cannot handle NaN or inf, so the line is fitted on the
    # finite pairs only and skipped when fewer than two remain.
    finite = np.isfinite(x) & np.isfinite(y1)

    if finite.sum() >= 2 :

      m, b = np.polyfit(x[finite], y1[finite], 1)

      plt.plot(x, m*x + b)

    plt.title(i, color = 'black')

  # This call used to sit at module level, so it ran when the cell was
  # defined instead of when the function was called.
  plt.show()

In [None]:
plot_data(gaussian_data)

#### Correlation Heatmap

In [None]:
# One 10 x 8 inch figure for the correlation matrix.
plt.figure(figsize = (10, 8))

plt.title('\n Heatmap showing correlation between different features \n')

# Pairwise Pearson correlations, annotated cell by cell and labelled with the
# column names on both axes.
sns.heatmap(
    gaussian_data.corr(method = 'pearson') ,
    xticklabels = gaussian_data.columns ,
    yticklabels = gaussian_data.columns ,
    annot = True ,
    cmap = 'RdBu' ,
    linewidths = 0.5 ,
)

# Tilt the row labels.
plt.yticks(rotation = 45)

plt.show()