## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error


## Imports for model conversion to onnx, to be compatible for the front-end

In [None]:
!pip install onnxruntime
!pip install skl2onnx

In [3]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
import onnxruntime as rt
import onnx
import onnxruntime

In [4]:
# load the dataset
file_path = "crop_yield_data.txt"

In [5]:
df = pd.read_csv(file_path)
df

Unnamed: 0,HealthScoreCrop,CropType,Location,PlantedMonth,CurrentMonth,HarvestMonth,PhosphorBudget,PhosphorInput,PhosphorOutput,YieldPerHectare,AverageWeather
0,85,Wheat,Munich,3,6,6,50,10,80,6000,6
1,72,Barley,Munich,4,7,7,55,12,85,5800,5
2,89,Oats,Munich,3,6,7,48,10,75,6200,6
3,78,Potato,Munich,4,7,9,40,8,70,7000,6
4,92,Carrot,Munich,5,8,10,42,9,68,5500,5
...,...,...,...,...,...,...,...,...,...,...,...
202,77,Barley,Munich,6,9,9,54,11,85,5100,8
203,83,Oats,Munich,5,8,9,49,9,72,5700,6
204,69,Potato,Munich,6,9,11,41,8,70,5900,5
205,85,Carrot,Munich,3,6,8,43,9,70,4900,8


In [6]:
df.describe()

Unnamed: 0,HealthScoreCrop,PlantedMonth,CurrentMonth,HarvestMonth,PhosphorBudget,PhosphorInput,PhosphorOutput,YieldPerHectare,AverageWeather
count,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0,207.0
mean,77.536232,4.777778,7.777778,8.763285,52.391304,11.995169,83.816425,7176.328502,4.913043
std,7.924853,0.964863,0.964863,1.28358,8.010704,3.067996,11.737217,1542.683971,1.771449
min,60.0,3.0,6.0,6.0,35.0,7.0,60.0,4700.0,2.0
25%,72.0,4.0,7.0,8.0,46.0,9.0,72.0,5800.0,4.0
50%,77.0,5.0,8.0,9.0,55.0,13.0,87.0,7000.0,5.0
75%,82.0,6.0,9.0,10.0,59.0,14.0,92.0,8500.0,6.0
max,96.0,6.0,9.0,11.0,64.0,18.0,105.0,9900.0,9.0


In [7]:
df.columns

Index(['HealthScoreCrop', 'CropType', 'Location', 'PlantedMonth',
       'CurrentMonth', 'HarvestMonth', 'PhosphorBudget', 'PhosphorInput',
       'PhosphorOutput', 'YieldPerHectare', 'AverageWeather'],
      dtype='object')

In [9]:
# selecting the relevant columns for the features
features = df.drop(columns=["YieldPerHectare","Location"])
y = df.YieldPerHectare

In [10]:
# check unique crop type present in the dataset
features.CropType.unique()

array(['Wheat', 'Barley', 'Oats', 'Potato', 'Carrot', 'Corn'],
      dtype=object)

In [11]:
# one-hot encoding of the categorical variables of the type of crops
crop_dummies = pd.get_dummies(features.CropType)
crop_dummies.sample()

Unnamed: 0,Barley,Carrot,Corn,Oats,Potato,Wheat
108,1,0,0,0,0,0


In [12]:
# combining features with the newly created one-hot encoding
features = pd.concat([features,crop_dummies], axis=1)
features.sample()

Unnamed: 0,HealthScoreCrop,CropType,PlantedMonth,CurrentMonth,HarvestMonth,PhosphorBudget,PhosphorInput,PhosphorOutput,AverageWeather,Barley,Carrot,Corn,Oats,Potato,Wheat
54,68,Corn,5,8,9,56,13,89,4,0,0,1,0,0,0


In [13]:
# dropping cropType columns due to redundancy
features = features.drop(columns=["CropType"])
features.sample()

Unnamed: 0,HealthScoreCrop,PlantedMonth,CurrentMonth,HarvestMonth,PhosphorBudget,PhosphorInput,PhosphorOutput,AverageWeather,Barley,Carrot,Corn,Oats,Potato,Wheat
27,70,4,7,8,59,14,94,4,0,0,1,0,0,0


In [14]:
features.columns

Index(['HealthScoreCrop', 'PlantedMonth', 'CurrentMonth', 'HarvestMonth',
       'PhosphorBudget', 'PhosphorInput', 'PhosphorOutput', 'AverageWeather',
       'Barley', 'Carrot', 'Corn', 'Oats', 'Potato', 'Wheat'],
      dtype='object')

In [15]:
features

Unnamed: 0,HealthScoreCrop,PlantedMonth,CurrentMonth,HarvestMonth,PhosphorBudget,PhosphorInput,PhosphorOutput,AverageWeather,Barley,Carrot,Corn,Oats,Potato,Wheat
0,85,3,6,6,50,10,80,6,0,0,0,0,0,1
1,72,4,7,7,55,12,85,5,1,0,0,0,0,0
2,89,3,6,7,48,10,75,6,0,0,0,1,0,0
3,78,4,7,9,40,8,70,6,0,0,0,0,1,0
4,92,5,8,10,42,9,68,5,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202,77,6,9,9,54,11,85,8,1,0,0,0,0,0
203,83,5,8,9,49,9,72,6,0,0,0,1,0,0
204,69,6,9,11,41,8,70,5,0,0,0,0,1,0
205,85,3,6,8,43,9,70,8,0,1,0,0,0,0


In [17]:
# Splitting the data in train and test
X_train, X_test, y_train, y_test = train_test_split( features, y, test_size=0.33, random_state=42)

In [18]:
# Converting to numpy array for the model
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

In [None]:
# fitting the data to the model
clf = LogisticRegression(random_state=0).fit(X_train, y_train)

In [20]:
# predicting the model on the test dataset
predictions = clf.predict(X_test)

In [21]:
predictions

array([9700, 8200, 6200, 8500, 8200, 7000, 6700, 5500, 8000, 6200, 9400,
       8500, 8500, 5400, 4800, 5500, 9900, 8400, 8200, 6200, 4800, 8900,
       5500, 5200, 6200, 8400, 5200, 8800, 5400, 9400, 8800, 5200, 6000,
       6700, 6000, 5800, 6200, 8800, 8900, 5400, 6200, 9900, 6000, 5500,
       6200, 6700, 6700, 5400, 8400, 8100, 5800, 4800, 8800, 8900, 8800,
       6200, 8800, 9400, 8500, 5400, 8500, 5600, 5200, 6000, 5600, 8400,
       8000, 8400, 8900])

In [22]:
actual_values = y_test

In [23]:
mean_absolute_error(actual_values, predictions)

272.463768115942

In [24]:
# mean accuracy score of the model on the train data
clf.score(X_train,y_train)

0.6304347826086957

In [106]:
# mean accuracy score of the model on the test data

clf.score(X_test, y_test)

0.4927536231884058

In [115]:
test_datapoint1 = [86,3,6,6,45,10,80,6,1,0,0,0,0,0]
test_datapoint2 = [77,6,9,9,52,11,85,5200,8,0,1,0,0,0]
test_datapoint3 = [67, 3, 8, 9, 45, 13, 34, 4, 0, 0, 0, 0, 0, 1]

In [108]:
print('The yeild of the crop as per the model is ',clf.predict([test_datapoint1])[0], 'kilogarms per hectare')

The yeild of the crop as per the model is  6200 kilogarms per hectare


In [109]:
print('The yeild of the crop as per the model is ',clf.predict([test_datapoint2])[0], 'kilogarms per hectare')

The yeild of the crop as per the model is  4800 kilogarms per hectare


In [116]:
print('The yeild of the crop as per the model is ',clf.predict([test_datapoint3])[0], 'kilogarms per hectare')

The yeild of the crop as per the model is  5700 kilogarms per hectare


# Confidence interval

In [117]:
# Scale and predict probabilities

predicted_probabilities = clf.predict_proba(np.array(test_datapoint3).reshape(1,-1))

# Retrieve the class labels
classes = clf.classes_

# Print the class labels
print("Class Labels:", classes)

# Print the predicted probabilities
print(predicted_probabilities)

Class Labels: [4700 4800 5000 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200
 6300 6400 6600 6700 6800 7000 7100 7200 7500 8000 8100 8200 8300 8400
 8500 8800 8900 9100 9400 9700 9900]
[[3.12751484e-12 1.87926219e-14 3.48753777e-12 5.18696762e-08
  1.10588626e-08 4.28955412e-10 4.99476475e-02 1.18441094e-14
  7.85417854e-01 1.64616709e-01 1.37176683e-05 1.53944263e-10
  1.84604649e-09 4.79566743e-10 3.99764930e-06 2.05845751e-13
  3.28736430e-09 5.25366461e-16 2.43055924e-10 3.28145100e-16
  4.59558869e-09 2.06568097e-12 6.25378958e-10 2.70218136e-47
  3.63062625e-34 1.64817038e-42 5.87927204e-57 8.64377236e-22
  8.92723117e-38 5.95954467e-25 1.20888511e-30 1.02228085e-29
  1.94849021e-28 7.48469862e-31 5.12167177e-44]]


In [118]:

# Find the index of the maximum predicted probability
max_prob_index = np.argmax(predicted_probabilities)

# Apply threshold of 0.5
predicted_probabilities_thresholded = np.where(predicted_probabilities >= 0.5, predicted_probabilities, 0)

# Print the thresholded predicted probabilities
print(predicted_probabilities_thresholded)

# Get the class label associated with the maximum probability
max_prob_class = classes[max_prob_index]

# Calculate the confidence interval
confidence_interval = (max_prob_class - 100, max_prob_class + 100)  # Adjust the interval range as needed

# Print the confidence interval
print("Confidence Interval:", confidence_interval)

[[0.         0.         0.         0.         0.         0.
  0.         0.         0.78541785 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.        ]]
Confidence Interval: (5600, 5800)


The confidence interval range is in between Confidence Interval: (5600, 5800) kilogarms per hectare and the model predicted it to be 5700 kilogarms per hectare

In [122]:
confidence_percent = np.max(predicted_probabilities_thresholded) * 100

In [123]:
print(f"CropVest predicts this crop's yeild with {confidence_percent:.2f}% confidence")

CropVest predicts this crop's yeild with 78.54% confidence


# Onnx model for the blockchain frontend

In [90]:
# Intial type to be entered for the model. 14 FloatTensorType data as input features
initial_type = [('float_input', FloatTensorType([None, 14]))]
onx = convert_sklearn(clf, initial_types=initial_type)

#save the model in onnx form
with open("yield_predictor.onnx", "wb") as f:
    f.write(onx.SerializeToString())

In [125]:
# Testing the onnx model's prediction
sess = rt.InferenceSession("yield_predictor.onnx", providers=["CPUExecutionProvider"])
input_name = sess.get_inputs()[0].name
label_name = sess.get_outputs()[0].name
input = np.array([77,6,9,9,52,11,85,5200,8,1,0,0,0,0])
pred_onx = sess.run([label_name], {input_name: input.reshape(1,-1).astype(np.float32)})[0]
print('The yeild of the crop as per the model is ',pred_onx[0], 'kilogarms per hectare')

The yeild of the crop as per the model is  4800 kilogarms per hectare
