# Yield Prediction and Profitability Analysis

This notebook covers the following tasks:
1. Data Loading and Preprocessing
2. Model Training (Regression and Classification)
3. Model Evaluation
4. Incorporating Initial Costs
5. Testing and Accuracy Check


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score, mean_absolute_error
import joblib
import sys
sys.path.append('../../src')

In [2]:
import utils

In [3]:
df = utils.load_dataset('../../data/Processed/Data_for_model.csv')

In [4]:
df.head()

Unnamed: 0,Acre,Land_Planting,Strate_Fertilizer,Liquid_Fertilizer,Fungicide,Insecticide,Others,Area,KG,Total_Expenses,Investment,Estimated_Yield,Revenue,Profit,Is_Profitable,Cost_Per_Acre,Yield_Per_Acre
0,1,48000,156640,9872,47372,25538,9050,ampara,4971,296472,60000,838.359103,83835.910305,23835.910305,True,296472.0,4971.0
1,1,44000,158900,9932,47600,21638,9950,bakamuna,2313,292020,60000,396.034518,39603.451818,-20396.548182,False,292020.0,2313.0
2,1,46000,157250,9870,49902,23638,9050,dehiaththakandiya,1793,295710,60000,303.168645,30316.864496,-29683.135504,False,295710.0,1793.0
3,1,47000,130200,11630,48030,24600,11000,girithale,4600,272460,60000,844.160611,84416.061073,24416.061073,True,272460.0,4600.0
4,1,49200,145806,8950,49560,27650,9530,Kandakatiya,1238,290696,60000,212.937227,21293.722652,-38706.277348,False,290696.0,1238.0


In [5]:
df.tail(50)

Unnamed: 0,Acre,Land_Planting,Strate_Fertilizer,Liquid_Fertilizer,Fungicide,Insecticide,Others,Area,KG,Total_Expenses,Investment,Estimated_Yield,Revenue,Profit,Is_Profitable,Cost_Per_Acre,Yield_Per_Acre
1150,4,171147,664593,32894,198937,91086,40040,mahiyanganaya,11280,1198697,60000,470.510896,47051.089642,-12948.910358,False,299674.25,2820.0
1151,4,166196,569476,47627,187993,105159,45085,bakamuna,11850,1121536,60000,528.293341,52829.334056,-7170.665944,False,280384.0,2962.5
1152,4,173903,589015,34782,199428,86094,40085,girithale,8950,1123307,60000,398.377291,39837.729134,-20162.270866,False,280826.75,2237.5
1153,4,195477,667300,43723,202104,80748,43037,Kandakatiya,9700,1232389,60000,393.544571,39354.457075,-20645.542925,False,308097.25,2425.0
1154,4,176433,583956,56311,202947,96960,42085,medirigiriya,22050,1158692,60000,951.503937,95150.39372,35150.39372,True,289673.0,5512.5
1155,4,185812,622983,51921,196026,89181,36744,monaragala,9900,1182667,60000,418.545542,41854.554156,-18145.445844,False,295666.75,2475.0
1156,4,176940,661718,38899,192919,97721,41887,morawewa,18320,1210084,60000,756.972243,75697.224325,15697.224325,True,302521.0,4580.0
1157,4,176396,638480,43940,189453,93826,42417,nikaweratiya,15080,1184512,60000,636.54906,63654.905987,3654.905987,True,296128.0,3770.0
1158,4,193626,655831,58113,194327,82411,38850,kanthale,14150,1223158,60000,578.420776,57842.077638,-2157.922362,False,305789.5,3537.5
1159,4,195141,667371,51029,194246,108341,44465,ampara,15890,1260593,60000,630.258934,63025.893369,3025.893369,True,315148.25,3972.5


In [6]:
# One-hot encode categorical features
df_encoded = pd.get_dummies(df, columns=['Area'])

In [7]:
# Train-test split
X = df_encoded.drop(columns=['Profit', 'Is_Profitable'])
y_class = df_encoded['Is_Profitable']
y_reg = df_encoded['Profit']
X_train, X_test, y_train_class, y_test_class, y_train_reg, y_test_reg = train_test_split(X, y_class, y_reg, test_size=0.3, random_state=42)

In [8]:
# Train Regression Model
reg_model = LinearRegression()
reg_model.fit(X_train, y_train_reg)
joblib.dump(reg_model, '../../models/regression_model.pkl')

# Train Classification Model
clf_model = RandomForestClassifier()
clf_model.fit(X_train, y_train_class)
joblib.dump(clf_model, '../../models/classification_model.pkl')

['../../models/classification_model.pkl']

In [9]:
loaded_reg_model = joblib.load('../../models/regression_model.pkl')
loaded_clf_model = joblib.load('../../models/classification_model.pkl')

In [10]:
# Evaluate Regression Model
y_pred_reg = loaded_reg_model.predict(X_test)
reg_metrics = {
    'MAE': mean_absolute_error(y_test_reg, y_pred_reg),
    'MSE': mean_squared_error(y_test_reg, y_pred_reg),
    'R-squared': r2_score(y_test_reg, y_pred_reg)
}

print("Regression Model Metrics:")
print(reg_metrics)

Regression Model Metrics:
{'MAE': np.float64(5.102975819681888e-10), 'MSE': np.float64(3.352456417262843e-19), 'R-squared': 1.0}


In [11]:
# Evaluate Classification Model
y_pred_class = loaded_clf_model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test_class, y_pred_class))
accuracy = accuracy_score(y_test_class, y_pred_class)
print(f"Classification Model Accuracy: {accuracy:.2f}")

Classification Report:
              precision    recall  f1-score   support

       False       1.00      1.00      1.00       230
        True       1.00      1.00      1.00       130

    accuracy                           1.00       360
   macro avg       1.00      1.00      1.00       360
weighted avg       1.00      1.00      1.00       360

Classification Model Accuracy: 1.00


In [12]:
initial_cost = 60000  # Example initial cost

# Predict profitability with initial costs
def predict_profitability_with_initial_cost(new_data, model, initial_cost):
    new_data['Investment'] = initial_cost
    new_data_encoded = pd.get_dummies(new_data, columns=['Area'])
    missing_cols = set(X.columns) - set(new_data_encoded.columns)
    for c in missing_cols:
        new_data_encoded[c] = 0
    new_data_encoded = new_data_encoded[X.columns]
    
    y_pred = model.predict(new_data_encoded)
    new_data['Predicted_Profitability'] = y_pred
    new_data['Adjusted_Profitability'] = new_data['Predicted_Profitability'] - initial_cost
    new_data['Is_Profitable_Adjusted'] = new_data['Adjusted_Profitability'] > 0
    return new_data

#predict profit with initial cost
def predict_profit_with_initial_cost(new_data, model, initial_cost):
    new_data['Investment'] = initial_cost
    new_data_encoded = pd.get_dummies(new_data, columns=['Area'])
    missing_cols = set(X.columns) - set(new_data_encoded.columns)
    for c in missing_cols:
        new_data_encoded[c] = 0
    new_data_encoded = new_data_encoded[X.columns]
    
    y_pred = model.predict(new_data_encoded)
    new_data['Predicted_Profit'] = y_pred
    return new_data


In [16]:
new_data = pd.DataFrame({
    'Acre': [1],
    'Land_Planting': [45000],
    'Strate_Fertilizer': [150000],
    'Liquid_Fertilizer': [10000],
    'Fungicide': [50000],
    'Insecticide': [25000],
    'Others': [8000],
    'Area': ['ampara'],
    'KG': [5000],
    'Total_Expenses': [300000],
    'Investment': [60000],
    'Estimated_Yield': [800],
    'Revenue': [80000]
})
adjusted_results = predict_profitability_with_initial_cost(new_data, loaded_clf_model, initial_cost)
print("Adjusted Profitability Prediction:")
adjusted_results

profit_results = predict_profit_with_initial_cost(new_data, loaded_reg_model, initial_cost)
profit_results

Adjusted Profitability Prediction:


Unnamed: 0,Acre,Land_Planting,Strate_Fertilizer,Liquid_Fertilizer,Fungicide,Insecticide,Others,Area,KG,Total_Expenses,Investment,Estimated_Yield,Revenue,Predicted_Profitability,Adjusted_Profitability,Is_Profitable_Adjusted,Predicted_Profit
0,1,45000,150000,10000,50000,25000,8000,ampara,5000,300000,60000,800,80000,True,-59999,False,20000.0


In [14]:
# Generate random indices for the test set
np.random.seed(42)
random_indices = np.random.choice(X_test.index, size=5, replace=False)

# Select random samples from the test set
random_test_data = X_test.loc[random_indices]
random_test_labels = y_test_class.loc[random_indices]

In [15]:
# Predict on random samples
random_predictions = loaded_clf_model.predict(random_test_data)
random_accuracy = accuracy_score(random_test_labels, random_predictions)
print(f"Random Sample Accuracy: {random_accuracy:.2f}")

Random Sample Accuracy: 1.00
