In [1]:
# Import Libraries:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import GridSearchCV



# Ignore Futurewarnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("categorized_ty.csv")
df

Unnamed: 0,Area,Item,Year,Area_harvested,Yield,temperature,precipitation,humidity,Soil_PH,Soil_Nitrogen,Soil_OM,VHI,ASI,Category
0,Thailand,Areca nuts,1961,0.0,0.0,26.23,1740.10,78.50,4.33,0.920,3.69,0.36,19.17,Nuts and Seeds
1,Thailand,Areca nuts,1962,0.0,0.0,26.08,1586.14,76.30,5.95,4.050,2.66,0.55,3.96,Nuts and Seeds
2,Thailand,Areca nuts,1963,0.0,0.0,26.07,1664.81,77.30,4.72,1.490,3.91,0.40,8.93,Nuts and Seeds
3,Thailand,Areca nuts,1964,0.0,0.0,26.14,1689.46,73.19,4.89,3.020,3.65,0.43,7.68,Nuts and Seeds
4,Thailand,Areca nuts,1965,0.0,0.0,26.28,1600.65,73.50,4.50,2.430,2.92,0.41,18.26,Nuts and Seeds
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7888,Yemen,Wheat,2018,64339.0,14332.0,25.50,187.61,40.40,6.88,0.021,0.74,0.54,6.74,Grains and Pulses
7889,Yemen,Wheat,2019,57466.0,17459.0,25.94,185.96,41.70,6.93,0.059,0.71,0.65,4.16,Grains and Pulses
7890,Yemen,Wheat,2020,59190.0,21485.0,25.62,188.15,42.40,7.51,0.039,0.74,0.57,3.56,Grains and Pulses
7891,Yemen,Wheat,2021,60955.0,22644.0,25.64,186.96,41.00,7.96,0.029,1.20,0.64,3.85,Grains and Pulses


In [3]:
df_encoded = pd.get_dummies(df, columns=['Category'])

# Display the first few rows of the encoded DataFrame
df_encoded.head()

Unnamed: 0,Area,Item,Year,Area_harvested,Yield,temperature,precipitation,humidity,Soil_PH,Soil_Nitrogen,Soil_OM,VHI,ASI,Category_Fruits,Category_Grains and Pulses,Category_Nuts and Seeds,Category_Other,Category_Other Crops,Category_Stimulant and Spice Crops,Category_Vegetables
0,Thailand,Areca nuts,1961,0.0,0.0,26.23,1740.1,78.5,4.33,0.92,3.69,0.36,19.17,False,False,True,False,False,False,False
1,Thailand,Areca nuts,1962,0.0,0.0,26.08,1586.14,76.3,5.95,4.05,2.66,0.55,3.96,False,False,True,False,False,False,False
2,Thailand,Areca nuts,1963,0.0,0.0,26.07,1664.81,77.3,4.72,1.49,3.91,0.4,8.93,False,False,True,False,False,False,False
3,Thailand,Areca nuts,1964,0.0,0.0,26.14,1689.46,73.19,4.89,3.02,3.65,0.43,7.68,False,False,True,False,False,False,False
4,Thailand,Areca nuts,1965,0.0,0.0,26.28,1600.65,73.5,4.5,2.43,2.92,0.41,18.26,False,False,True,False,False,False,False


In [4]:
# Dropping rows where 'area_harvested' column is equal to 0
df_encoded = df_encoded[df_encoded['Area_harvested'] != 0]
df_encoded = df_encoded[df_encoded['Yield'] != 0]

In [5]:
# Counting instances where 'area_harvested' column is equal to 0
df_count_a = df_encoded[df_encoded['Area_harvested'] == 0].shape[0]
df_count_y = df_encoded[df_encoded['Yield'] == 0].shape[0]

print("Number of instances where Area_harvested is equal to 0:", df_count_a)
print("Number of instances where Yield is equal to 0:", df_count_y)

Number of instances where Area_harvested is equal to 0: 0
Number of instances where Yield is equal to 0: 0


In [9]:
df_encoded.to_csv('df_encoded.csv', index=False)

# RandomForest

In [6]:
# Split data into features (X) and target variable (y)
X = df_encoded.drop(columns=['Yield', 'Area', 'Item'])
y = df_encoded['Yield'] 

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model selection and training
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Model evaluation
y_pred = model.predict(X_test)
mse = round(mean_squared_error(y_test, y_pred),2)
r2 = r2_score(y_test, y_pred)
print('Mean Squared Error:', mse)
print('R-squared Score:', r2) # Indicates that my features account for X% of predictability

from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(model.get_params())

Mean Squared Error: 1736389872.98
R-squared Score: 0.7036232240392789
Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [7]:
# Define the parameter grid to search
param_grid = {
    'n_estimators': [100, 200, 300],  # You can adjust these values based on your computational resources
    'max_depth': [None, 10, 20],  # You can adjust these values based on the complexity of your data
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
    # Add other parameters if you want to tune them as well
}

# Initialize the Random Forest Regressor
rf = RandomForestRegressor(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='r2')

# Perform the grid search
grid_search.fit(X_train, y_train)  # X_train and y_train are your training data

# Get the best parameters
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

Best Parameters: {'bootstrap': True, 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 300}


In [8]:
# Best parameters
best_params = {'bootstrap': True, 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 300}

# Initialize the Random Forest Regressor with best parameters
opt_rf = RandomForestRegressor(**best_params, random_state=42)

# Train the model with your data
opt_rf.fit(X_train, y_train)

# Model evaluation
y_pred = opt_rf.predict(X_test)
mse = round(mean_squared_error(y_test, y_pred),2)
r2 = r2_score(y_test, y_pred)
print('Mean Squared Error:', mse)
print('R-squared Score:', r2) # Indicates that my features account for X% of predictability

Mean Squared Error: 1744578505.95
R-squared Score: 0.7022255421719237


In [7]:
# This model is capable of predicting the crop yield for a crop type based on various factors.

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

# Define the features (X) and target variable (y)
X = df_encoded[['Area_harvested', 'temperature', 'precipitation', 'humidity', 'Soil_PH', 'Soil_Nitrogen', 'Soil_OM', 'VHI', 'ASI', 
        'Category_Fruits', 'Category_Grains and Pulses', 'Category_Nuts and Seeds', 'Category_Other', 'Category_Other Crops', 
        'Category_Stimulant and Spice Crops', 'Category_Vegetables']]
y = df_encoded['Yield']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
model = RandomForestRegressor(random_state=42)

# Create pipeline
pipeline = Pipeline(steps=[
    ('model', model)
])

# Train the pipeline
pipeline.fit(X_train, y_train)

# Make predictions on new input data
def predict_yield(area_harvested, temperature, precipitation, humidity, soil_ph, soil_nitrogen, soil_om, vhi, asi, 
                  category_fruits, category_grains, category_nuts, category_other, category_other_crops, 
                  category_spice_crops, category_vegetables):
    input_data = pd.DataFrame({
        'Area_harvested': [area_harvested],
        'temperature': [temperature],
        'precipitation': [precipitation],
        'humidity': [humidity],
        'Soil_PH': [soil_ph],
        'Soil_Nitrogen': [soil_nitrogen],
        'Soil_OM': [soil_om],
        'VHI': [vhi],
        'ASI': [asi],
        'Category_Fruits': [category_fruits],
        'Category_Grains and Pulses': [category_grains],
        'Category_Nuts and Seeds': [category_nuts],
        'Category_Other': [category_other],
        'Category_Other Crops': [category_other_crops],
        'Category_Stimulant and Spice Crops': [category_spice_crops],
        'Category_Vegetables': [category_vegetables]
    })
    
    return pipeline.predict(input_data)

# Example usage:
predicted_yield = predict_yield(113221.44, 26.62, 1602.57, 73.07, 5.03, 2.34, 2.91, 0.49, 8.9, True, False, False, False, False, False, False)
print("Predicted crop yield:", predicted_yield)

Predicted crop yield: [189787.79]


In [51]:
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score

def objective(n_estimators, max_depth, min_samples_split, max_features):
    # Round max_depth to the nearest integer
    max_depth = int(round(max_depth))
    
    # Initialize RandomForestRegressor with specified hyperparameters
    model = RandomForestRegressor(n_estimators=int(n_estimators),
                                  max_depth=max_depth,
                                  min_samples_split=int(min_samples_split),
                                  max_features=max_features,
                                  bootstrap=True,
                                  ccp_alpha=0.0,
                                  criterion='squared_error',
                                  max_leaf_nodes=None,
                                  max_samples=None,
                                  min_impurity_decrease=0.0,
                                  min_samples_leaf=1,
                                  min_weight_fraction_leaf=0.0,
                                  monotonic_cst=None,
                                  n_jobs=None,
                                  oob_score=False,
                                  random_state=42,
                                  verbose=0,
                                  warm_start=False)
    
    # Calculate R-squared scores using cross-validation
    r2_scores = cross_val_score(model, X_train, y_train, cv=3, scoring="r2")
    
    # Return the negative mean of R-squared scores
    return -r2_scores.mean()

# Define the parameter grid to search
param_grid = {
    'n_estimators': (100, 250),  # Number of trees in the forest
    'max_depth': (5, 50),  # Maximum depth of the trees (ensure it's within a reasonable range)
    'min_samples_split': (2, 25),  # Minimum number of samples required to split an internal node
    'max_features': (0.1, 0.999)  # Fraction of features to consider when looking for the best split
}

optimizer = BayesianOptimization(f=objective, pbounds=param_grid, random_state=42)
optimizer.maximize(init_points=5, n_iter=15)

|   iter    |  target   | max_depth | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [0m1        [0m | [0m-0.4671  [0m | [0m21.85    [0m | [0m0.9547   [0m | [0m18.84    [0m | [0m189.8    [0m |
| [95m2        [0m | [95m-0.3215  [0m | [95m12.02    [0m | [95m0.2402   [0m | [95m3.336    [0m | [95m229.9    [0m |
| [0m3        [0m | [0m-0.4981  [0m | [0m32.05    [0m | [0m0.7366   [0m | [0m2.473    [0m | [0m245.5    [0m |
| [0m4        [0m | [0m-0.3283  [0m | [0m42.46    [0m | [0m0.2909   [0m | [0m6.182    [0m | [0m127.5    [0m |
| [0m5        [0m | [0m-0.4447  [0m | [0m18.69    [0m | [0m0.5718   [0m | [0m11.93    [0m | [0m143.7    [0m |
| [0m6        [0m | [0m-0.3644  [0m | [0m12.01    [0m | [0m0.3423   [0m | [0m4.297    [0m | [0m229.8    [0m |
| [0m7        [0m | [0m-0.393   [0m | [0m13.21    [0m | [0m0.3664   [0m | [0m2.099    [0m | [0m228.5   

In [52]:
best_params = optimizer.max['params']
best_params

{'max_depth': 12.04089631008678,
 'max_features': 0.1691314892477125,
 'min_samples_split': 2.9540827687756757,
 'n_estimators': 230.23084997120145}

In [53]:
final_model = RandomForestRegressor(n_estimators=int(best_params['n_estimators']),
                                   max_depth=int(best_params['max_depth']),
                                   min_samples_split=int(best_params['min_samples_split']),
                                   max_features=best_params['max_features'],
                                   random_state=42)
final_model.fit(X_train, y_train)
score = final_model.score(X_test, y_test)
print(f"Test R^2 Score: {score}")

Test R^2 Score: 0.2800134659887725


# Gradient BOOST

In [34]:
# GradientBoosting

# Split data into features (X) and target variable (y)
X = df_encoded.drop(columns=['Yield', 'Area', 'Item'])
y = df_encoded['Yield'] 

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model selection and training
gb = GradientBoostingRegressor(random_state=42)
gb.fit(X_train, y_train)

# Model evaluation
y_predg = gb.predict(X_test)
mseg = round(mean_squared_error(y_test, y_predg),2)
r2g = r2_score(y_test, y_predg)
print('Mean Squared Error:', mseg)
print('R-squared Score:', r2g) # Indicates that my features account for X% of predictability

from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(gb.get_params())

Mean Squared Error: 2859736823.67
R-squared Score: 0.5118840572132037
Parameters currently in use:

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'squared_error',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': 42,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}


In [26]:
# This model is capable of predicting the crop yield for a crop type based on various factors.
# Define the features (X) and target variable (y)
X = df_encoded[['Area_harvested', 'temperature', 'precipitation', 'humidity', 'Soil_PH', 'Soil_Nitrogen', 'Soil_OM', 'VHI', 'ASI', 
        'Category_Fruits', 'Category_Grains and Pulses', 'Category_Nuts and Seeds', 'Category_Other', 'Category_Other Crops', 
        'Category_Stimulant and Spice Crops', 'Category_Vegetables']]
y = df_encoded['Yield']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
gb = GradientBoostingRegressor(random_state=42)

# Create pipeline
pipeline = Pipeline(steps=[
    ('model', gb)
])

# Train the pipeline
pipeline.fit(X_train, y_train)

# Make predictions on new input data
def predict_yield(area_harvested, temperature, precipitation, humidity, soil_ph, soil_nitrogen, soil_om, vhi, asi, 
                  category_fruits, category_grains, category_nuts, category_other, category_other_crops, 
                  category_spice_crops, category_vegetables):
    input_data = pd.DataFrame({
        'Area_harvested': [area_harvested],
        'temperature': [temperature],
        'precipitation': [precipitation],
        'humidity': [humidity],
        'Soil_PH': [soil_ph],
        'Soil_Nitrogen': [soil_nitrogen],
        'Soil_OM': [soil_om],
        'VHI': [vhi],
        'ASI': [asi],
        'Category_Fruits': [category_fruits],
        'Category_Grains and Pulses': [category_grains],
        'Category_Nuts and Seeds': [category_nuts],
        'Category_Other': [category_other],
        'Category_Other Crops': [category_other_crops],
        'Category_Stimulant and Spice Crops': [category_spice_crops],
        'Category_Vegetables': [category_vegetables]
    })
    
    return pipeline.predict(input_data)

# Example usage:
predicted_yield = predict_yield(113221.44, 26.62, 1602.57, 73.07, 5.03, 2.34, 2.91, 0.49, 8.9, True, False, False, False, False, False, False)
print("Predicted crop yield:", predicted_yield)

Predicted crop yield: [160806.25323333]


In [None]:
# Item, avg_area_harvested, avg_yield, avg_temperature, avg_precipitation, avg_humidity, avg_soil_ph, avg_soil_nitrogen, avg_soil_om, avg_vhi, avg_asi, Category
'Bananas', '113221.44', '137430.05', '26.62', '1602.57', '73.07', '5.03', '2.34', '2.91', '0.49', '8.9', 'Fruits'


In [27]:
PV = 160806.25
AV = 137430.05
percent_difference = ((PV - AV) / PV) * 100
print(f"Percent difference between the two values is: {percent_difference:.2f}%")

Percent difference between the two values is: 14.54%


# Optimization

In [56]:
# Create "placeholders" for all three steps
estimators = [
    ('scaler', StandardScaler()),
    ('dim_reducer', PCA()),
    ('model', RandomForestRegressor())
]

my_pipe = Pipeline(estimators)

In [58]:
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.decomposition import PCA, KernelPCA
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.pipeline import Pipeline

# Define the parameter grid for Gradient Boosting
param_grid_gb = {
    'scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
    'dim_reducer': [PCA(), KernelPCA()],
    'model': [GradientBoostingRegressor()], 
    'model__n_estimators': [50, 100, 150],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [3, 4, 5],
    'dim_reducer__n_components': [3, 4, 5]
}

# Define the parameter grid for Random Forest
param_grid_rf = {
    'scaler': [StandardScaler(), None],
    'dim_reducer': [PCA(), KernelPCA()],
    'model': [RandomForestRegressor()], 
    'model__n_estimators': [50, 100, 150],
    'model__max_depth': [None, 10, 20],
    'model__min_samples_split': [2, 5, 10],
    'dim_reducer__n_components': [2, 3, 4]
}

# Create pipelines for Gradient Boosting and Random Forest
pipe_gb = Pipeline([
    ('scaler', None),
    ('dim_reducer', None),
    ('model', None)
])

pipe_rf = Pipeline([
    ('scaler', None),
    ('dim_reducer', None),
    ('model', None)
])

# Initialize GridSearchCV objects
grid_gb = GridSearchCV(pipe_gb, param_grid_gb, cv=5, scoring='neg_mean_squared_error')
grid_rf = GridSearchCV(pipe_rf, param_grid_rf, cv=5, scoring='neg_mean_squared_error')

# Fit the GridSearchCV objects
fittedgrid_gb = grid_gb.fit(X_train, y_train)
fittedgrid_rf = grid_rf.fit(X_train, y_train)

In [59]:
# For Gradient Boosting
best_params_gb = fittedgrid_gb.best_params_
print("Best parameters for Gradient Boosting:", best_params_gb)

# For Random Forest
best_params_rf = fittedgrid_rf.best_params_
print("Best parameters for Random Forest:", best_params_rf)

Best parameters for Gradient Boosting: {'dim_reducer': PCA(), 'dim_reducer__n_components': 4, 'model': GradientBoostingRegressor(), 'model__learning_rate': 0.1, 'model__max_depth': 4, 'model__n_estimators': 50, 'scaler': StandardScaler()}
Best parameters for Random Forest: {'dim_reducer': PCA(), 'dim_reducer__n_components': 4, 'model': RandomForestRegressor(), 'model__max_depth': 10, 'model__min_samples_split': 10, 'model__n_estimators': 50, 'scaler': StandardScaler()}


In [60]:
# Gradient Boosting model with best parameters
gb_model = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=4, random_state=42)

# Define the pipeline with StandardScaler and PCA
gb_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('dim_reducer', PCA(n_components=4)),
    ('model', gb_model)
])

# Fit the model
gb_pipeline.fit(X_train, y_train)

# Evaluate the model
y_pred_gb = gb_pipeline.predict(X_test)
mse_gb = mean_squared_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)

print("Gradient Boosting Mean Squared Error:", mse_gb)
print("Gradient Boosting R-squared Score:", r2_gb)

Gradient Boosting Mean Squared Error: 3612963345.8222075
Gradient Boosting R-squared Score: 0.3833191239129011


In [61]:
# Random Forest model with best parameters
rf_model = RandomForestRegressor(n_estimators=50, max_depth=10, min_samples_split=10, random_state=42)

# Define the pipeline with StandardScaler and PCA
rf_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('dim_reducer', PCA(n_components=4)),
    ('model', rf_model)
])

# Fit the model
rf_pipeline.fit(X_train, y_train)

# Evaluate the model
y_pred_rf = rf_pipeline.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("Random Forest Mean Squared Error:", mse_rf)
print("Random Forest R-squared Score:", r2_rf)

Random Forest Mean Squared Error: 3367742416.1169314
Random Forest R-squared Score: 0.4251748094792679
