In [3]:
!pip install xgboost --no-cache-dir


^C


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# Load main crop recommendation data
main_df = pd.read_csv("../datasets/Crop_recommendation.csv")

# List to store all subcrop data
all_subcrop_data = []
for crop in main_df["label"].unique():
    file_path = f"../datasets/sub_crop_data/{crop}_subcrop_data.csv"
    try:
        df = pd.read_csv(file_path)
        df["main-crop"] = crop  # Add main crop column
        all_subcrop_data.append(df)
    except FileNotFoundError:
        print(f"Warning: File {file_path} not found. Skipping.")

# Merge all subcrop data
df = pd.concat(all_subcrop_data, ignore_index=True)

# Save merged dataset
df.to_csv("../datasets/merged_subcrop_data.csv", index=False)
print("Merged dataset saved as 'merged_subcrop_data.csv'.")

# Ensure 'sub-crop' column exists
if 'sub-crop' not in df.columns:
    raise ValueError("Missing 'sub-crop' column in sub_crop_data.csv files!")

# Feature Selection
X = df[['main-crop', 'N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']]
y = df['sub-crop']

# Encode categorical variable 'main-crop'
label_encoder = LabelEncoder()
X['main-crop'] = label_encoder.fit_transform(X['main-crop'])

# Normalize numerical features
scaler = StandardScaler()
X[['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']] = scaler.fit_transform(X[['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']])

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

# Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Hyperparameter tuning for Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees
    'max_depth': [10, 20, 30, None],  # Depth of trees
    'min_samples_split': [2, 5, 10],  # Minimum samples to split a node
    'min_samples_leaf': [1, 2, 4],  # Minimum samples per leaf
    'bootstrap': [True, False]  # Whether to use bootstrap sampling
}

clf = RandomizedSearchCV(
    RandomForestClassifier(random_state=42), 
    param_grid, n_iter=20, cv=5, verbose=1, n_jobs=-1, scoring='accuracy'
)
clf.fit(X_train, y_train)

# Best model from hyperparameter tuning
best_model = clf.best_estimator_

# Predictions
y_pred = best_model.predict(X_test)

# Evaluation
print("Best Model Parameters:", clf.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Merged dataset saved as 'merged_subcrop_data.csv'.
Fitting 5 folds for each of 20 candidates, totalling 100 fits


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['main-crop'] = label_encoder.fit_transform(X['main-crop'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']] = scaler.fit_transform(X[['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']])
  _data = np.array(data, dtype=dtype, copy=copy,


Best Model Parameters: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 30, 'bootstrap': False}
Accuracy: 0.864406779661017
                 precision    recall  f1-score   support

        Apricot       0.94      0.93      0.93       120
          Bajra       0.94      0.94      0.94       120
         Banana       0.89      0.72      0.79       120
   Basmati Rice       0.88      0.94      0.91       120
    Bengal Gram       0.88      0.92      0.90       120
            Ber       0.85      0.97      0.90       120
 Black Chickpea       0.93      0.88      0.91       120
     Black Gram       0.52      0.33      0.40       120
   Black Pepper       0.89      0.93      0.91       120
     Black Rice       0.93      0.93      0.93       120
   Bottle Gourd       0.93      0.94      0.93       120
     Brown Rice       0.88      0.88      0.88       120
      Cardamoms       0.94      0.94      0.94       120
       Chakotha       0.92      0.91      0.

In [7]:
##Boosted Accuracy

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load main crop recommendation data
main_df = pd.read_csv("../datasets/Crop_recommendation.csv")

# List to store all subcrop data
all_subcrop_data = []
for crop in main_df["label"].unique():
    file_path = f"../datasets/sub_crop_data/{crop}_subcrop_data.csv"
    try:
        df = pd.read_csv(file_path)
        df["main-crop"] = crop  # Add main crop column
        all_subcrop_data.append(df)
    except FileNotFoundError:
        print(f"Warning: File {file_path} not found. Skipping.")

# Merge all subcrop data
df = pd.concat(all_subcrop_data, ignore_index=True)

# Save merged dataset
df.to_csv("../datasets/merged_subcrop_data.csv", index=False)

# Ensure 'sub-crop' column exists
if 'sub-crop' not in df.columns:
    raise ValueError("Missing 'sub-crop' column in sub_crop_data.csv files!")

# Feature Selection
X = df[['main-crop', 'N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']]
y = df['sub-crop']

# Encode categorical variable 'main-crop'
label_encoder_X = LabelEncoder()
X['main-crop'] = label_encoder_X.fit_transform(X['main-crop'])

# Encode target variable 'sub-crop'
label_encoder_y = LabelEncoder()
y_encoded = label_encoder_y.fit_transform(y)

# Normalize numerical features
scaler = StandardScaler()
X[['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']] = scaler.fit_transform(
    X[['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']]
)

# Handle class imbalance using SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X, y_encoded = smote.fit_resample(X, y_encoded)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Hyperparameter tuning for XGBoost
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [6, 10, 15],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0]
}

xgb = XGBClassifier(objective='multi:softmax', num_class=len(label_encoder_y.classes_), random_state=42)
clf = GridSearchCV(xgb, param_grid, cv=5, verbose=1, n_jobs=-1, scoring='accuracy')
clf.fit(X_train, y_train)

# Best model from hyperparameter tuning
best_model = clf.best_estimator_

# Predictions
y_pred = best_model.predict(X_test)

# Convert predictions back to original labels
y_pred_labels = label_encoder_y.inverse_transform(y_pred)
y_test_labels = label_encoder_y.inverse_transform(y_test)

# Evaluation
print("Best Model Parameters:", clf.best_params_)
print("XGBoost Accuracy:", accuracy_score(y_test_labels, y_pred_labels))
print(classification_report(y_test_labels, y_pred_labels))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['main-crop'] = label_encoder_X.fit_transform(X['main-crop'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']] = scaler.fit_transform(


Fitting 5 folds for each of 243 candidates, totalling 1215 fits


In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load main crop recommendation data
main_df = pd.read_csv("../datasets/Crop_recommendation.csv")

# List to store all subcrop data
all_subcrop_data = []
for crop in main_df["label"].unique():  # Ensure "label" is the correct column name in Crop_recommendation.csv
    file_path = f"../datasets/sub_crop_data/{crop}_subcrop_data.csv"
    try:
        df = pd.read_csv(file_path)
        df["main-crop"] = crop  # Add main crop column
        all_subcrop_data.append(df)
    except FileNotFoundError:
        print(f"Warning: File {file_path} not found. Skipping.")

# Merge all subcrop data
df = pd.concat(all_subcrop_data, ignore_index=True)

# Save merged dataset
df.to_csv("../datasets/merged_subcrop_data.csv", index=False)
print("Merged dataset saved as 'merged_subcrop_data.csv'.")

# Ensure 'sub_crop' column exists
if 'sub-crop' not in df.columns:
    raise ValueError("Missing 'sub_crop' column in sub_crop_data.csv files!")

# Feature Selection
X = df[['main-crop', 'N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']]
y = df['sub-crop']

# Encode categorical variable 'main_crop'
label_encoder = LabelEncoder()
X['main-crop'] = label_encoder.fit_transform(X['main-crop'])

# Normalize numerical features
scaler = StandardScaler()
X[['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']] = scaler.fit_transform(X[['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train Random Forest Model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Merged dataset saved as 'merged_subcrop_data.csv'.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['main-crop'] = label_encoder.fit_transform(X['main-crop'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']] = scaler.fit_transform(X[['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']])


Accuracy: 0.3161290322580645
                 precision    recall  f1-score   support

        Apricot       0.45      0.65      0.53        20
          Bajra       0.42      0.40      0.41        20
         Banana       0.48      0.50      0.49        60
   Basmati Rice       0.22      0.25      0.23        20
    Bengal Gram       0.13      0.10      0.11        20
            Ber       0.00      0.00      0.00        20
 Black Chickpea       0.22      0.30      0.26        20
     Black Gram       0.23      0.23      0.23       100
   Black Pepper       0.52      0.62      0.57        40
     Black Rice       0.33      0.20      0.25        20
   Bottle Gourd       0.24      0.20      0.22        20
     Brown Rice       0.22      0.30      0.26        20
      Cardamoms       0.85      0.55      0.67        20
       Chakotha       0.23      0.25      0.24        20
Charleston Gray       0.35      0.35      0.35        20
         Cherry       0.58      0.55      0.56        20
 

In [17]:
# KNN 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load main crop recommendation data
main_df = pd.read_csv("../datasets/Crop_recommendation.csv")

# List to store all subcrop data
all_subcrop_data = []
for crop in main_df["label"].unique():  # Ensure "label" is the correct column name in Crop_recommendation.csv
    file_path = f"../datasets/sub_crop_data/{crop}_subcrop_data.csv"
    try:
        df = pd.read_csv(file_path)
        df["main-crop"] = crop  # Add main crop column
        all_subcrop_data.append(df)
    except FileNotFoundError:
        print(f"Warning: File {file_path} not found. Skipping.")

# Merge all subcrop data
df = pd.concat(all_subcrop_data, ignore_index=True)

# Save merged dataset
df.to_csv("../datasets/merged_subcrop_data.csv", index=False)
print("Merged dataset saved as 'merged_subcrop_data.csv'.")

# Ensure 'sub_crop' column exists
if 'sub-crop' not in df.columns:
    raise ValueError("Missing 'sub_crop' column in sub_crop_data.csv files!")

# Feature Selection
X = df[['main-crop', 'N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']]
y = df['sub-crop']

# Encode categorical variable 'main_crop'
label_encoder = LabelEncoder()
X['main-crop'] = label_encoder.fit_transform(X['main-crop'])

# Normalize numerical features
scaler = StandardScaler()
X[['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']] = scaler.fit_transform(X[['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train KNN Model
k = 5  # You can try different values of K (3, 5, 7, etc.)
knn = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
knn.fit(X_train, y_train)

# Predictions
y_pred = knn.predict(X_test)

# Evaluation
print(f"KNN Accuracy (k={k}):", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Merged dataset saved as 'merged_subcrop_data.csv'.
KNN Accuracy (k=5): 0.2543010752688172


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['main-crop'] = label_encoder.fit_transform(X['main-crop'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']] = scaler.fit_transform(X[['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']])


                 precision    recall  f1-score   support

        Apricot       0.38      0.50      0.43        20
          Bajra       0.41      0.55      0.47        20
         Banana       0.38      0.45      0.41        60
   Basmati Rice       0.31      0.45      0.37        20
    Bengal Gram       0.04      0.05      0.04        20
            Ber       0.04      0.05      0.04        20
 Black Chickpea       0.15      0.20      0.17        20
     Black Gram       0.18      0.24      0.20       100
   Black Pepper       0.33      0.50      0.40        40
     Black Rice       0.47      0.35      0.40        20
   Bottle Gourd       0.32      0.45      0.38        20
     Brown Rice       0.21      0.35      0.26        20
      Cardamoms       0.64      0.70      0.67        20
       Chakotha       0.33      0.45      0.38        20
Charleston Gray       0.21      0.30      0.24        20
         Cherry       0.28      0.40      0.33        20
       Chickoos       0.12    

In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
import pickle
import os

# Step 1: Load Main Crop Model
def load_main_crop_model():
    with open('main_crop_model.pkl', 'rb') as file:
        return pickle.load(file)

# Step 2: Define Realistic Ranges
realistic_ranges = {
    'N': (0, 200), 'P': (0, 200), 'K': (0, 250), 'temperature': (5, 50),
    'humidity': (0, 100), 'ph': (3, 11), 'rainfall': (0, 500)
}

# Step 3: Input Validation and Preprocessing
def validate_and_preprocess_input(N, P, K, temperature, humidity, ph, rainfall):
    inputs = {'N': N, 'P': P, 'K': K, 'temperature': temperature, 
              'humidity': humidity, 'ph': ph, 'rainfall': rainfall}
    
    for param, val in inputs.items():
        try:
            inputs[param] = float(val)
        except (ValueError, TypeError):
            return False, f"Invalid input: {param} must be a number", []
    
    capped_inputs = {}
    warnings_list = []
    for param, val in inputs.items():
        min_val, max_val = realistic_ranges[param]
        if val < min_val or val > max_val:
            warnings_list.append(f"{param} ({val}) outside realistic range ({min_val}-{max_val}), capped")
            capped_inputs[param] = max(min_val, min(val, max_val))
        else:
            capped_inputs[param] = val
    
    return True, capped_inputs, warnings_list

# Step 4: Main Crop to Sub-Crop File Mapping
crop_name_mapping = {
    'rice': 'rice_subcrop_data.csv',
    'wheat': 'wheat_subcrop_data.csv',  # Missing, add if needed
    'maize': 'maize_subcrop_data.csv',
    'chickpea': 'Bengal Gram_subcrop_data.csv',
    'kidneybeans': 'kidneybeans_subcrop_data.csv',
    'pigeonpeas': 'Pegeon Pea_subcrop_data.csv',
    'mothbeans': 'Moath Dal_subcrop_data.csv',
    'mungbean': 'Green Gram_subcrop_data.csv',
    'blackgram': 'Black Gram_subcrop_data.csv',
    'lentil': 'lentil_subcrop_data.csv',
    'pomegranate': 'pomegranate_subcrop_data.csv',
    'banana': 'banana_subcrop_data.csv',
    'mango': 'mango_subcrop_data.csv',
    'grapes': 'grapes_subcrop_data.csv',
    'watermelon': 'Water Melon_subcrop_data.csv',
    'muskmelon': 'Karbuja_subcrop_data.csv',
    'apple': 'apple_subcrop_data.csv',
    'orange': 'orange_subcrop_data.csv',
    'papaya': 'papaya_subcrop_data.csv',
    'coconut': 'coconut_subcrop_data.csv',
    'cotton': 'cotton_subcrop_data.csv',
    'jute': 'jute_subcrop_data.csv'
}

# Step 5: Sub-Crop Recommendation Function
def recommend_sub_crops(N, P, K, temperature, humidity, ph, rainfall, 
                        subcrop_dir='../datasets/sub_crop_data/', num_recommendations=3):
    try:
        # Load main crop model
        main_model = load_main_crop_model()
        
        # Validate and preprocess input
        is_valid, capped_inputs_or_error, warnings = validate_and_preprocess_input(
            N, P, K, temperature, humidity, ph, rainfall
        )
        if not is_valid:
            return {"error": capped_inputs_or_error, "main_crop": None, "sub_crops": [], "warnings": warnings}
        
        # Predict main crop
        input_df = pd.DataFrame([[capped_inputs_or_error['N'], capped_inputs_or_error['P'], 
                                  capped_inputs_or_error['K'], capped_inputs_or_error['temperature'], 
                                  capped_inputs_or_error['humidity'], capped_inputs_or_error['ph'], 
                                  capped_inputs_or_error['rainfall']]],
                                columns=['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall'])
        main_crop = main_model.predict(input_df)[0]
        main_confidence = float(max(main_model.predict_proba(input_df)[0]))
        
        # Get sub-crop file name from mapping
        if main_crop not in crop_name_mapping:
            return {"error": f"No sub-crop mapping for {main_crop}", "main_crop": main_crop, 
                    "sub_crops": [], "warnings": warnings}
        
        subcrop_filename = crop_name_mapping[main_crop]
        subcrop_file = os.path.join(subcrop_dir, subcrop_filename)
        
        # Check if file exists
        if not os.path.exists(subcrop_file):
            return {"error": f"Sub-crop file {subcrop_filename} not found at {subcrop_file}", 
                    "main_crop": main_crop, "sub_crops": [], "warnings": warnings}
        
        # Load sub-crop data
        sub_crop_df = pd.read_csv(subcrop_file)
        
        # Define expected columns (adjusted to your data)
        required_cols = ['sub-crop', 'N', 'P', 'K', 'rainfall', 'ph', 'humidity']
        
        # Check columns
        missing_cols = [col for col in required_cols if col not in sub_crop_df.columns]
        if missing_cols:
            print(f"Debug: Columns in {subcrop_filename}: {sub_crop_df.columns.tolist()}")
            print(f"Debug: Missing columns: {missing_cols}")
            return {"error": f"Sub-crop file {subcrop_filename} missing required columns: {missing_cols}", 
                    "main_crop": main_crop, "sub_crops": [], "warnings": warnings}
        
        # Input features for distance calculation
        input_vector = np.array([[capped_inputs_or_error['N'], capped_inputs_or_error['P'], 
                                  capped_inputs_or_error['K'], capped_inputs_or_error['rainfall'], 
                                  capped_inputs_or_error['ph'], capped_inputs_or_error['humidity']]])
        sub_crop_features = sub_crop_df[['N', 'P', 'K', 'rainfall', 'ph', 'humidity']].values
        
        # Calculate Euclidean distances
        distances = euclidean_distances(input_vector, sub_crop_features)[0]
        
        # Sort sub-crops by distance and select top N
        sub_crops_with_distances = list(zip(sub_crop_df['sub-crop'], distances))
        sorted_sub_crops = sorted(sub_crops_with_distances, key=lambda x: x[1])[:num_recommendations]
        recommended_sub_crops = [{"sub_crop": crop, "distance": float(dist)} for crop, dist in sorted_sub_crops]
        
        # Result
        result = {
            "main_crop": main_crop,
            "main_confidence": main_confidence,
            "sub_crops": recommended_sub_crops,
            "warnings": warnings if warnings else None
        }
        return result
    
    except FileNotFoundError as e:
        return {"error": f"File error: {str(e)}", "main_crop": None, "sub_crops": [], "warnings": None}
    except Exception as e:
        return {"error": f"Sub-crop recommendation failed: {str(e)}", "main_crop": None, 
                "sub_crops": [], "warnings": None}

# Step 6: Test the Sub-Crop Model
if __name__ == "__main__":
    # Test cases
    test_cases = [
        [90, 42, 43, 20.8, 82.0, 6.5, 202.9],  # Normal input
        [500, 200, 300, 50.0, 10.0, 14.0, 1000.0],  # Extreme input
        [-10, 50, 60, 25.0, 75.0, 7.0, 150.0],  # Negative value
        [100, "invalid", 60, 25.0, 75.0, 7.0, 150.0]  # Invalid type
    ]
    
    for i, test_input in enumerate(test_cases, 1):
        print(f"\nTest Case {i}: {test_input}")
        result = recommend_sub_crops(*test_input, subcrop_dir='../datasets/sub_crop_data/')
        print(f"Result: {result}")


Test Case 1: [90, 42, 43, 20.8, 82.0, 6.5, 202.9]

Test Case 2: [500, 200, 300, 50.0, 10.0, 14.0, 1000.0]

Test Case 3: [-10, 50, 60, 25.0, 75.0, 7.0, 150.0]

Test Case 4: [100, 'invalid', 60, 25.0, 75.0, 7.0, 150.0]


In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
import pickle
import os

# Load Main Crop Model
def load_main_crop_model():
    with open('main_crop_model.pkl', 'rb') as file:
        return pickle.load(file)

# Realistic Ranges
realistic_ranges = {
    'N': (0, 200), 'P': (0, 200), 'K': (0, 250), 'temperature': (5, 50),
    'humidity': (0, 100), 'ph': (3, 11), 'rainfall': (0, 500)
}

# Input Validation
def validate_and_preprocess_input(N, P, K, temperature, humidity, ph, rainfall):
    inputs = {'N': N, 'P': P, 'K': K, 'temperature': temperature, 
              'humidity': humidity, 'ph': ph, 'rainfall': rainfall}
    for param, val in inputs.items():
        try:
            inputs[param] = float(val)
        except (ValueError, TypeError):
            return False, f"Invalid input: {param} must be a number", []
    capped_inputs = {}
    warnings_list = []
    for param, val in inputs.items():
        min_val, max_val = realistic_ranges[param]
        if val < min_val or val > max_val:
            warnings_list.append(f"{param} ({val}) outside realistic range ({min_val}-{max_val}), capped")
            capped_inputs[param] = max(min_val, min(val, max_val))
        else:
            capped_inputs[param] = val
    return True, capped_inputs, warnings_list

# Updated Crop Mapping
crop_name_mapping = {
    'Rice': 'Rice_subcrop_data.csv',
    'Maize': 'Maize_subcrop_data.csv',
    'Bengal Gram (Gram)(Whole)': 'Bengal Gram (Gram)(Whole)_subcrop_data.csv',
    'Pegeon Pea (Arhar Fali)': 'Pegeon Pea (Arhar Fali)_subcrop_data.csv',
    'Moath Dal': 'Moath Dal_subcrop_data.csv',
    'Green Gram (Moong)(Whole)': 'Green Gram (Moong)(Whole)_subcrop_data.csv',
    'Black Gram (Urd Beans)(Whole)': 'Black Gram (Urd Beans)(Whole)_subcrop_data.csv',
    'Lentil (Masur)(Whole)': 'Lentil (Masur)(Whole)_subcrop_data.csv',
    'Pomegranate': 'Pomegranate_subcrop_data.csv',
    'Banana': 'Banana_subcrop_data.csv',
    'Mango': 'Mango_subcrop_data.csv',
    'Grapes': 'Grapes_subcrop_data.csv',
    'Water Melon': 'Water Melon_subcrop_data.csv',
    'Karbuja (Musk Melon)': 'Karbuja (Musk Melon)_subcrop_data.csv',
    'Apple': 'Apple_subcrop_data.csv',
    'Orange': 'Orange_subcrop_data.csv',
    'Papaya': 'Papaya_subcrop_data.csv',
    'Coconut': 'Coconut_subcrop_data.csv',
    'Cotton': 'Cotton_subcrop_data.csv',
    'Jute': 'Jute_subcrop_data.csv',
    'Coffee': 'Coffee_subcrop_data.csv'
}

# Sub-Crop Recommendation
def recommend_sub_crops(N, P, K, temperature, humidity, ph, rainfall, 
                        subcrop_dir='../datasets/sub_crop_data/', num_recommendations=3):
    try:
        main_model = load_main_crop_model()
        is_valid, capped_inputs, warnings = validate_and_preprocess_input(
            N, P, K, temperature, humidity, ph, rainfall
        )
        if not is_valid:
            return {"error": capped_inputs, "main_crop": None, "sub_crops": [], "warnings": warnings}
        
        input_df = pd.DataFrame([[capped_inputs['N'], capped_inputs['P'], 
                                  capped_inputs['K'], capped_inputs['temperature'], 
                                  capped_inputs['humidity'], capped_inputs['ph'], 
                                  capped_inputs['rainfall']]],
                                columns=['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall'])
        main_crop = main_model.predict(input_df)[0]
        main_confidence = float(max(main_model.predict_proba(input_df)[0]))
        
        if main_crop not in crop_name_mapping:
            return {"error": f"No sub-crop mapping for {main_crop}", "main_crop": main_crop, 
                    "sub_crops": [], "warnings": warnings}
        
        subcrop_filename = crop_name_mapping[main_crop]
        subcrop_file = os.path.join(subcrop_dir, subcrop_filename)
        
        if not os.path.exists(subcrop_file):
            return {"error": f"Sub-crop file {subcrop_filename} not found", 
                    "main_crop": main_crop, "sub_crops": [], "warnings": warnings}
        
        sub_crop_df = pd.read_csv(subcrop_file)
        required_cols = ['sub-crop', 'N', 'P', 'K', 'temperature', 'rainfall', 'ph', 'humidity']
        missing_cols = [col for col in required_cols if col not in sub_crop_df.columns]
        if missing_cols:
            return {"error": f"Missing columns: {missing_cols}", "main_crop": main_crop, 
                    "sub_crops": [], "warnings": warnings}
        
        # Include temperature in distance calculation
        input_vector = np.array([[capped_inputs['N'], capped_inputs['P'], capped_inputs['K'], 
                                  capped_inputs['temperature'], capped_inputs['rainfall'], 
                                  capped_inputs['ph'], capped_inputs['humidity']]])
        sub_crop_features = sub_crop_df[['N', 'P', 'K', 'temperature', 'rainfall', 'ph', 'humidity']].values
        
        distances = euclidean_distances(input_vector, sub_crop_features)[0]
        sub_crops_with_distances = list(zip(sub_crop_df['sub-crop'], distances))
        sorted_sub_crops = sorted(sub_crops_with_distances, key=lambda x: x[1])[:num_recommendations]
        recommended_sub_crops = [{"sub_crop": crop, "distance": float(dist)} for crop, dist in sorted_sub_crops]
        
        return {
            "main_crop": main_crop,
            "main_confidence": main_confidence,
            "sub_crops": recommended_sub_crops,
            "warnings": warnings if warnings else None
        }
    except Exception as e:
        return {"error": str(e), "main_crop": None, "sub_crops": [], "warnings": None}

# Accuracy Calculation with File Output
def calculate_subcrop_accuracy(subcrop_dir='../datasets/sub_crop_data/', num_recommendations=3):
    total_tests = 0
    correct_matches = 0
    
    # Open debug file for writing
    with open('subcrop_accuracy_debug.txt', 'w') as debug_file:
        for main_crop, filename in crop_name_mapping.items():
            file_path = os.path.join(subcrop_dir, filename)
            if not os.path.exists(file_path):
                debug_file.write(f"Skipping {main_crop}: {filename} not found\n")
                continue
            
            sub_crop_df = pd.read_csv(file_path)
            required_cols = ['sub-crop', 'N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']
            if not all(col in sub_crop_df.columns for col in required_cols):
                debug_file.write(f"Skipping {main_crop}: {filename} missing required columns\n")
                continue
            
            for _, row in sub_crop_df.iterrows():
                expected_sub_crop = row['sub-crop']
                test_input = [row['N'], row['P'], row['K'], row['temperature'], 
                              row['humidity'], row['ph'], row['rainfall']]
                
                result = recommend_sub_crops(*test_input, subcrop_dir=subcrop_dir, 
                                            num_recommendations=num_recommendations)
                
                if "error" in result:
                    debug_file.write(f"Error for {main_crop}: {result['error']}\n")
                    continue
                
                predicted_sub_crops = [item['sub_crop'] for item in result['sub_crops']]
                total_tests += 1
                
                if expected_sub_crop in predicted_sub_crops:
                    correct_matches += 1
                else:
                    debug_file.write(f"Mismatch for {main_crop}: Expected {expected_sub_crop}, Got {predicted_sub_crops}\n")
        
        accuracy = (correct_matches / total_tests) * 100 if total_tests > 0 else 0.0
        accuracy_message = f"Accuracy: {accuracy:.2f}% (Correct: {correct_matches}/{total_tests})"
        debug_file.write(f"\n{accuracy_message}\n")
    
    return accuracy, accuracy_message

# Run Test
if __name__ == "__main__":
    accuracy, message = calculate_subcrop_accuracy(subcrop_dir='../datasets/sub_crop_data/')
    print(message)  # Print only the final accuracy to console

Accuracy: 98.52% (Correct: 8470/8597)


In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
import pickle
import os

# Realistic Ranges
realistic_ranges = {
    'N': (0, 200), 'P': (0, 200), 'K': (0, 250), 'temperature': (5, 50),
    'humidity': (0, 100), 'ph': (3, 11), 'rainfall': (0, 500)
}

# Updated Crop Mapping
crop_name_mapping = {
    'Rice': 'Rice_subcrop_data.csv',
    'Maize': 'Maize_subcrop_data.csv',
    'Bengal Gram (Gram)(Whole)': 'Bengal Gram (Gram)(Whole)_subcrop_data.csv',
    'Pegeon Pea (Arhar Fali)': 'Pegeon Pea (Arhar Fali)_subcrop_data.csv',
    'Moath Dal': 'Moath Dal_subcrop_data.csv',
    'Green Gram (Moong)(Whole)': 'Green Gram (Moong)(Whole)_subcrop_data.csv',
    'Black Gram (Urd Beans)(Whole)': 'Black Gram (Urd Beans)(Whole)_subcrop_data.csv',
    'Lentil (Masur)(Whole)': 'Lentil (Masur)(Whole)_subcrop_data.csv',
    'Pomegranate': 'Pomegranate_subcrop_data.csv',
    'Banana': 'Banana_subcrop_data.csv',
    'Mango': 'Mango_subcrop_data.csv',
    'Grapes': 'Grapes_subcrop_data.csv',
    'Water Melon': 'Water Melon_subcrop_data.csv',
    'Karbuja (Musk Melon)': 'Karbuja (Musk Melon)_subcrop_data.csv',
    'Apple': 'Apple_subcrop_data.csv',
    'Orange': 'Orange_subcrop_data.csv',
    'Papaya': 'Papaya_subcrop_data.csv',
    'Coconut': 'Coconut_subcrop_data.csv',
    'Cotton': 'Cotton_subcrop_data.csv',
    'Jute': 'Jute_subcrop_data.csv',
    'Coffee': 'Coffee_subcrop_data.csv'
}

# SubCropRecommender Class
class SubCropRecommender:
    def __init__(self, main_model_path='main_crop_model.pkl', subcrop_dir='../datasets/sub_crop_data/'):
        self.main_model = self.load_main_crop_model(main_model_path)
        self.subcrop_dir = subcrop_dir
        self.crop_name_mapping = crop_name_mapping
        self.realistic_ranges = realistic_ranges

    def load_main_crop_model(self, path):
        with open(path, 'rb') as file:
            return pickle.load(file)

    def validate_and_preprocess_input(self, N, P, K, temperature, humidity, ph, rainfall):
        inputs = {'N': N, 'P': P, 'K': K, 'temperature': temperature, 
                  'humidity': humidity, 'ph': ph, 'rainfall': rainfall}
        for param, val in inputs.items():
            try:
                inputs[param] = float(val)
            except (ValueError, TypeError):
                return False, f"Invalid input: {param} must be a number", []
        capped_inputs = {}
        warnings_list = []
        for param, val in inputs.items():
            min_val, max_val = self.realistic_ranges[param]
            if val < min_val or val > max_val:
                warnings_list.append(f"{param} ({val}) outside realistic range ({min_val}-{max_val}), capped")
                capped_inputs[param] = max(min_val, min(val, max_val))
            else:
                capped_inputs[param] = val
        return True, capped_inputs, warnings_list

    def recommend_sub_crops(self, N, P, K, temperature, humidity, ph, rainfall, num_recommendations=3):
        try:
            is_valid, capped_inputs, warnings = self.validate_and_preprocess_input(
                N, P, K, temperature, humidity, ph, rainfall
            )
            if not is_valid:
                return {"error": capped_inputs, "main_crop": None, "sub_crops": [], "warnings": warnings}
            
            input_df = pd.DataFrame([[capped_inputs['N'], capped_inputs['P'], 
                                      capped_inputs['K'], capped_inputs['temperature'], 
                                      capped_inputs['humidity'], capped_inputs['ph'], 
                                      capped_inputs['rainfall']]],
                                    columns=['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall'])
            main_crop = self.main_model.predict(input_df)[0]
            main_confidence = float(max(self.main_model.predict_proba(input_df)[0]))
            
            if main_crop not in self.crop_name_mapping:
                return {"error": f"No sub-crop mapping for {main_crop}", "main_crop": main_crop, 
                        "sub_crops": [], "warnings": warnings}
            
            subcrop_filename = self.crop_name_mapping[main_crop]
            subcrop_file = os.path.join(self.subcrop_dir, subcrop_filename)
            
            if not os.path.exists(subcrop_file):
                return {"error": f"Sub-crop file {subcrop_filename} not found", 
                        "main_crop": main_crop, "sub_crops": [], "warnings": warnings}
            
            sub_crop_df = pd.read_csv(subcrop_file)
            required_cols = ['sub-crop', 'N', 'P', 'K', 'temperature', 'rainfall', 'ph', 'humidity']
            missing_cols = [col for col in required_cols if col not in sub_crop_df.columns]
            if missing_cols:
                return {"error": f"Missing columns: {missing_cols}", "main_crop": main_crop, 
                        "sub_crops": [], "warnings": warnings}
            
            input_vector = np.array([[capped_inputs['N'], capped_inputs['P'], capped_inputs['K'], 
                                      capped_inputs['temperature'], capped_inputs['rainfall'], 
                                      capped_inputs['ph'], capped_inputs['humidity']]])
            sub_crop_features = sub_crop_df[['N', 'P', 'K', 'temperature', 'rainfall', 'ph', 'humidity']].values
            
            distances = euclidean_distances(input_vector, sub_crop_features)[0]
            sub_crops_with_distances = list(zip(sub_crop_df['sub-crop'], distances))
            sorted_sub_crops = sorted(sub_crops_with_distances, key=lambda x: x[1])[:num_recommendations]
            recommended_sub_crops = [{"sub_crop": crop, "distance": float(dist)} for crop, dist in sorted_sub_crops]
            
            return {
                "main_crop": main_crop,
                "main_confidence": main_confidence,
                "sub_crops": recommended_sub_crops,
                "warnings": warnings if warnings else None
            }
        except Exception as e:
            return {"error": str(e), "main_crop": None, "sub_crops": [], "warnings": None}

    def calculate_subcrop_accuracy(self, num_recommendations=3):
        total_tests = 0
        correct_matches = 0
        
        with open('subcrop_accuracy_debug.txt', 'w') as debug_file:
            for main_crop, filename in self.crop_name_mapping.items():
                file_path = os.path.join(self.subcrop_dir, filename)
                if not os.path.exists(file_path):
                    debug_file.write(f"Skipping {main_crop}: {filename} not found\n")
                    continue
                
                sub_crop_df = pd.read_csv(file_path)
                required_cols = ['sub-crop', 'N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']
                if not all(col in sub_crop_df.columns for col in required_cols):
                    debug_file.write(f"Skipping {main_crop}: {filename} missing required columns\n")
                    continue
                
                for _, row in sub_crop_df.iterrows():
                    expected_sub_crop = row['sub-crop']
                    test_input = [row['N'], row['P'], row['K'], row['temperature'], 
                                  row['humidity'], row['ph'], row['rainfall']]
                    
                    result = self.recommend_sub_crops(*test_input, num_recommendations=num_recommendations)
                    
                    if "error" in result:
                        debug_file.write(f"Error for {main_crop}: {result['error']}\n")
                        continue
                    
                    predicted_sub_crops = [item['sub_crop'] for item in result['sub_crops']]
                    total_tests += 1
                    
                    if expected_sub_crop in predicted_sub_crops:
                        correct_matches += 1
                    else:
                        debug_file.write(f"Mismatch for {main_crop}: Expected {expected_sub_crop}, Got {predicted_sub_crops}\n")
            
            accuracy = (correct_matches / total_tests) * 100 if total_tests > 0 else 0.0
            accuracy_message = f"Accuracy: {accuracy:.2f}% (Correct: {correct_matches}/{total_tests})"
            debug_file.write(f"\n{accuracy_message}\n")
        
        return accuracy, accuracy_message

# Save the Model as a .pkl File
def save_subcrop_model(filename='subcrop_recommender.pkl'):
    recommender = SubCropRecommender(main_model_path='main_crop_model.pkl', 
                                     subcrop_dir='../datasets/sub_crop_data/')
    with open(filename, 'wb') as file:
        pickle.dump(recommender, file)
    print(f"Sub-crop recommender model saved as '{filename}'")

# Load and Use the Saved Model
def load_subcrop_model(filename='subcrop_recommender.pkl'):
    with open(filename, 'rb') as file:
        return pickle.load(file)

# Test the Model
if __name__ == "__main__":
    # Save the model
    save_subcrop_model('subcrop_recommender.pkl')
    
    # Load and test the saved model
    recommender = load_subcrop_model('subcrop_recommender.pkl')
    
    # Example prediction
    result = recommender.recommend_sub_crops(20, 30, 40, 25, 80, 6.0, 150)
    print("Prediction Result:", result)
    
    # Calculate accuracy
    accuracy, message = recommender.calculate_subcrop_accuracy()
    print(message)

Sub-crop recommender model saved as 'subcrop_recommender.pkl'
Accuracy: 98.52% (Correct: 8470/8597)


In [5]:
# Load the trained SubCropRecommender model
recommender = load_subcrop_model('subcrop_recommender.pkl')

# Define the test input values
N = 107
P = 11
K = 54
temperature = 28.59052369
humidity = 91.33617236
ph = 6.094016338
rainfall = 29.44008034

# Get recommendations
result = recommender.recommend_sub_crops(N, P, K, temperature, humidity, ph, rainfall)

# Print results
print("Test API Response:")
print(result)


Test API Response:


# Model Accuarcy Comparision

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
import pickle
import os
from reportlab.lib.pagesizes import letter
from reportlab.lib import colors
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.platypus import PageTemplate, Frame, NextPageTemplate
from reportlab.platypus.flowables import KeepTogether
from reportlab.graphics.shapes import Line
from datetime import datetime

# Import SubCropRecommender from subcrop_rec.py
from subcrop_rec import SubCropRecommender, crop_name_mapping

def load_main_crop_model():
    with open('main_crop_model.pkl', 'rb') as file:
        return pickle.load(file)

def evaluate_subcrop_model(model, subcrop_dir, num_recommendations=3):
    total_tests = 0
    correct_matches = 0
    for main_crop, filename in crop_name_mapping.items():
        file_path = os.path.join(subcrop_dir, filename)
        if not os.path.exists(file_path):
            continue
        sub_crop_df = pd.read_csv(file_path)
        required_cols = ['sub-crop', 'N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']
        if not all(col in sub_crop_df.columns for col in required_cols):
            continue
        X = sub_crop_df[['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']]
        y = sub_crop_df['sub-crop']
        label_encoder = LabelEncoder()
        y_encoded = label_encoder.fit_transform(y)
        X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
        if len(np.unique(y_train)) < 2:
            continue
        model.fit(X_train, y_train)
        for i, test_row in X_test.iterrows():
            input_vector = test_row.values.reshape(1, -1)
            expected_sub_crop = label_encoder.inverse_transform([y_test[i]])[0]
            if isinstance(model, KNeighborsClassifier):
                distances, indices = model.kneighbors(input_vector, n_neighbors=num_recommendations)
                predicted_sub_crops = label_encoder.inverse_transform(model.predict(X_train.iloc[indices[0]]))
            else:  # Random Forest
                probs = model.predict_proba(input_vector)[0]
                top_indices = np.argsort(probs)[-num_recommendations:][::-1]
                predicted_sub_crops = label_encoder.inverse_transform(top_indices)
            total_tests += 1
            if expected_sub_crop in predicted_sub_crops:
                correct_matches += 1
    accuracy = (correct_matches / total_tests) * 100 if total_tests > 0 else 0.0
    return accuracy

def compare_models():
    # Load main crop dataset
    data = pd.read_csv('../datasets/Crop_recommendation.csv')
    X = data[['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']]
    y = data['label']
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

    # Main crop models
    main_crop_models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "KNN (k=5)": KNeighborsClassifier(n_neighbors=5),
        "SVM (Linear)": SVC(kernel='linear', probability=True),
        "SVM (RBF)": SVC(kernel='rbf', probability=True),
        "Decision Tree": DecisionTreeClassifier(),
        "Random Forest": RandomForestClassifier(n_estimators=100),
        "Naive Bayes": GaussianNB(),
        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
        "Main Crop Model": load_main_crop_model()
    }

    # Main crop results
    main_crop_results = []
    for name, model in main_crop_models.items():
        try:
            model.fit(X_train, y_train)
            preds = model.predict(X_test)
            acc = accuracy_score(y_test, preds)
            main_crop_results.append((name, acc * 100))
        except Exception as e:
            main_crop_results.append((name, 0.0))
            print(f"Error evaluating {name}: {str(e)}")

    # Sub-crop models
    subcrop_models = {
        "Euclidean Distance (Your Model)": None,  # Placeholder for SubCropRecommender
        "KNN (k=5)": KNeighborsClassifier(n_neighbors=5),
        "Random Forest": RandomForestClassifier(n_estimators=100)
    }

    # Sub-crop results
    subcrop_results = []
    for name, model in subcrop_models.items():
        if name == "Euclidean Distance (Your Model)":
            try:
                recommender = SubCropRecommender(main_model_path='main_crop_model.pkl', subcrop_dir='../datasets/sub_crop_data/')
                acc, _ = recommender.calculate_subcrop_accuracy()
                subcrop_results.append((name, acc))
            except Exception as e:
                subcrop_results.append((name, 0.0))
                print(f"Error calculating sub-crop accuracy: {str(e)}")
        else:
            try:
                acc = evaluate_subcrop_model(model, subcrop_dir='../datasets/sub_crop_data/')
                subcrop_results.append((name, acc))
            except Exception as e:
                subcrop_results.append((name, 0.0))
                print(f"Error evaluating {name} for sub-crop: {str(e)}")

    return main_crop_results, subcrop_results

def header(canvas, doc):
    canvas.saveState()
    canvas.setFont('Helvetica-Bold', 10)
    canvas.setFillColor(colors.darkgreen)
    canvas.drawString(inch, doc.pagesize[1] - 0.75 * inch, "Crop Combination Recommendation and Price Prediction")
    canvas.setFont('Helvetica', 8)
    canvas.setFillColor(colors.grey)
    canvas.drawRightString(doc.pagesize[0] - inch, doc.pagesize[1] - 0.75 * inch, f"Page {doc.page}")
    canvas.line(inch, doc.pagesize[1] - 0.85 * inch, doc.pagesize[0] - inch, doc.pagesize[1] - 0.85 * inch)
    canvas.restoreState()

def generate_pdf_report(main_crop_results, subcrop_results, output_filename="Subcrop_Model_Comparison_Report.pdf"):
    doc = SimpleDocTemplate(output_filename, pagesize=letter, rightMargin=inch, leftMargin=inch, topMargin=1.5 * inch, bottomMargin=inch)
    styles = getSampleStyleSheet()

    # Custom styles
    cover_title_style = ParagraphStyle(name='CoverTitle', fontName='Helvetica-Bold', fontSize=18, textColor=colors.darkgreen, alignment=1, spaceAfter=12)
    cover_subtitle_style = ParagraphStyle(name='CoverSubtitle', fontName='Helvetica', fontSize=12, textColor=colors.black, alignment=1, spaceAfter=8)
    heading_style = ParagraphStyle(name='Heading2', fontName='Helvetica-Bold', fontSize=14, textColor=colors.darkblue, spaceBefore=12, spaceAfter=6)
    body_style = ParagraphStyle(name='BodyText', fontName='Times-Roman', fontSize=10, leading=12, spaceAfter=8, alignment=4, wordWrap='CJK')

    elements = []

    # Cover page
    elements.append(Spacer(1, 2 * inch))
    elements.append(Paragraph("Crop Combination Recommendation and Price Prediction", cover_title_style))
    elements.append(Paragraph("CS6611 Creative and Innovative Project", cover_subtitle_style))
    elements.append(Paragraph("Submitted by: [Your Name]", cover_subtitle_style))
    elements.append(Paragraph(f"Date: {datetime.now().strftime('%Y-%m-%d')}", cover_subtitle_style))
    elements.append(Spacer(1, 2.5 * inch))
    elements.append(Paragraph("Department of Computer Science", cover_subtitle_style))
    elements.append(Paragraph("[Your University Name]", cover_subtitle_style))
    elements.append(PageBreak())

    # Content page template
    frame = Frame(doc.leftMargin, doc.bottomMargin, doc.width, doc.height - 1 * inch)
    template = PageTemplate(id='content', frames=[frame], onPage=header)
    doc.addPageTemplates([template])
    elements.append(NextPageTemplate('content'))

    # Introduction
    elements.append(Paragraph("Introduction", heading_style))
    elements.append(Paragraph(
        "This report, part of the CS6611 Creative and Innovative Project titled 'Crop Combination Recommendation and Price Prediction,' "
        "compares the sub-crop recommendation accuracy of a custom Euclidean Distance-based model (SubCropRecommender) with other machine learning models. "
        "The custom model predicts a main crop and recommends sub-crops using sub-crop datasets. "
        "Main crop prediction accuracy is also evaluated using the Crop Recommendation dataset (features: Nitrogen, Phosphorus, Potassium, temperature, humidity, pH, rainfall).",
        body_style
    ))
    elements.append(Spacer(1, 12))

    # Methodology
    elements.append(Paragraph("Methodology", heading_style))
    elements.append(Paragraph(
        "For main crop prediction, the Crop Recommendation dataset was preprocessed with LabelEncoder and split into 80% training and 20% testing sets. "
        "Models evaluated include Logistic Regression, KNN, SVM (Linear and RBF), Decision Tree, Random Forest, Naive Bayes, XGBoost, and the Main Crop Model (from a pickle file). "
        "Accuracy was calculated as the percentage of correct main crop predictions. "
        "For sub-crop recommendation, the SubCropRecommender uses Euclidean distance to rank sub-crops. KNN and Random Forest were adapted to predict sub-crops, "
        "with top-3 accuracy calculated as the percentage of cases where the expected sub-crop is among the top 3 recommendations.",
        body_style
    ))
    elements.append(Spacer(1, 12))

    # Results
    elements.append(Paragraph("Results", heading_style))
    elements.append(Paragraph(
        "The tables below present the main crop prediction accuracy and sub-crop recommendation accuracy (top-3) for the evaluated models.",
        body_style
    ))
    elements.append(Spacer(1, 12))

    # Main crop results table
    elements.append(Paragraph("Main Crop Prediction Accuracy", heading_style))
    total_width = doc.width
    colWidths = [total_width * 0.65, total_width * 0.35]
    table_data = [["Model", "Accuracy (%)"]] + [[name, f"{acc:.2f}"] for name, acc in main_crop_results]
    table = Table(table_data, colWidths=colWidths, rowHeights=[0.4 * inch] * len(table_data))
    table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), colors.darkgreen),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
        ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
        ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, -1), 10),
        ('BOTTOMPADDING', (0, 0), (-1, 0), 8),
        ('TOPPADDING', (0, 0), (-1, -1), 8),
        ('BACKGROUND', (0, 1), (-1, -1), colors.white),
        ('BACKGROUND', (0, 2), (-1, 2), colors.lightgrey),
        ('BACKGROUND', (0, 4), (-1, 4), colors.lightgrey),
        ('BACKGROUND', (0, 6), (-1, 6), colors.lightgrey),
        ('BACKGROUND', (0, 8), (-1, 8), colors.lightgrey),
        ('GRID', (0, 0), (-1, -1), 0.5, colors.black),
        ('LEFTPADDING', (0, 0), (-1, -1), 6),
        ('RIGHTPADDING', (0, 0), (-1, -1), 6),
    ]))
    elements.append(KeepTogether(table))
    elements.append(Spacer(1, 12))

    # Sub-crop results table
    elements.append(Paragraph("Sub-Crop Recommendation Accuracy (Top-3)", heading_style))
    table_data = [["Model", "Accuracy (%)"]] + [[name, f"{acc:.2f}"] for name, acc in subcrop_results]
    table = Table(table_data, colWidths=colWidths, rowHeights=[0.4 * inch] * len(table_data))
    table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), colors.darkgreen),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
        ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
        ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, -1), 10),
        ('BOTTOMPADDING', (0, 0), (-1, 0), 8),
        ('TOPPADDING', (0, 0), (-1, -1), 8),
        ('BACKGROUND', (0, 1), (-1, -1), colors.white),
        ('BACKGROUND', (0, 2), (-1, 2), colors.lightgrey),
        ('GRID', (0, 0), (-1, -1), 0.5, colors.black),
        ('LEFTPADDING', (0, 0), (-1, -1), 6),
        ('RIGHTPADDING', (0, 0), (-1, -1), 6),
    ]))
    elements.append(KeepTogether(table))
    elements.append(Spacer(1, 12))

    # Discussion
    elements.append(Paragraph("Discussion", heading_style))
    elements.append(Paragraph(
        "For main crop prediction, ensemble methods like Random Forest and XGBoost achieved the highest accuracies (above 98%), reflecting their ability to model complex patterns. "
        "The Main Crop Model’s performance depends on its underlying algorithm. "
        "For sub-crop recommendation, the SubCropRecommender’s Euclidean Distance approach is compared with KNN and Random Forest, which use probabilistic or distance-based ranking. "
        "Differences in accuracy may stem from data quality, sub-crop dataset size, or algorithmic strengths. The top-3 metric is less stringent than exact-match accuracy, "
        "but it suits the recommendation task’s practical needs.",
        body_style
    ))
    elements.append(Spacer(1, 12))

    # Conclusion
    elements.append(Paragraph("Conclusion", heading_style))
    elements.append(Paragraph(
        "This analysis, part of the CS6611 project, evaluates the SubCropRecommender against other models for sub-crop recommendation, with main crop prediction for context. "
        "The custom model performs competitively, while ensemble methods excel in main crop prediction. "
        "Future work will refine sub-crop recommendations and integrate price prediction, enhancing agricultural decision-making.",
        body_style
    ))

    # Build PDF
    doc.build(elements)
    print(f"Report generated: {output_filename}")

# Run comparison and generate report
main_crop_results, subcrop_results = compare_models()
generate_pdf_report(main_crop_results, subcrop_results)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not used.



Error evaluating KNN (k=5) for sub-crop: index 361 is out of bounds for axis 0 with size 100
Error evaluating Random Forest for sub-crop: index 361 is out of bounds for axis 0 with size 100
Report generated: Subcrop_Model_Comparison_Report.pdf


In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
import pickle
import os
import math

# Realistic Ranges
realistic_ranges = {
    'N': (0, 200), 'P': (0, 200), 'K': (0, 250), 'temperature': (5, 50),
    'humidity': (0, 100), 'ph': (3, 11), 'rainfall': (0, 500)
}

# Updated Crop Mapping
crop_name_mapping = {
    'Rice': 'Rice_subcrop_data.csv',
    'Maize': 'Maize_subcrop_data.csv',
    'Bengal Gram (Gram)(Whole)': 'Bengal Gram (Gram)(Whole)_subcrop_data.csv',
    'Pegeon Pea (Arhar Fali)': 'Pegeon Pea (Arhar Fali)_subcrop_data.csv',
    'Moath Dal': 'Moath Dal_subcrop_data.csv',
    'Green Gram (Moong)(Whole)': 'Green Gram (Moong)(Whole)_subcrop_data.csv',
    'Black Gram Dal (Urd Dal)': 'Black Gram Dal (Urd Dal)_subcrop_data.csv',
    'Lentil (Masur)(Whole)': 'Lentil (Masur)(Whole)_subcrop_data.csv',
    'Pomegranate': 'Pomegranate_subcrop_data.csv',
    'Banana': 'Banana_subcrop_data.csv',
    'Mango': 'Mango_subcrop_data.csv',
    'Grapes': 'Grapes_subcrop_data.csv',
    'Water Melon': 'Water Melon_subcrop_data.csv',
    'Karbuja (Musk Melon)': 'Karbuja (Musk Melon)_subcrop_data.csv',
    'Apple': 'Apple_subcrop_data.csv',
    'Orange': 'Orange_subcrop_data.csv',
    'Papaya': 'Papaya_subcrop_data.csv',
    'Coconut': 'Coconut_subcrop_data.csv',
    'Cotton': 'Cotton_subcrop_data.csv',
    'Jute': 'Jute_subcrop_data.csv',
    'Coffee': 'Coffee_subcrop_data.csv'
}

# SubCropRecommender Class
class SubCropRecommender:
    def __init__(self, main_model_path='main_crop_model.pkl', subcrop_dir='C:/Projects/Creative & Innovative Project/datasets/sub_crop_data/'):
        self.main_model = self.load_main_crop_model(main_model_path)
        self.subcrop_dir = subcrop_dir
        self.crop_name_mapping = crop_name_mapping
        self.realistic_ranges = realistic_ranges

    def load_main_crop_model(self, path):
        try:
            with open(path, 'rb') as file:
                model_data = pickle.load(file)
                return model_data  # Expecting {'model': ..., 'label_encoder': ..., 'scaler': ...}
        except Exception as e:
            print(f"Error loading main crop model: {str(e)}")
            return None

    def validate_and_preprocess_input(self, N, P, K, temperature, humidity, ph, rainfall):
        inputs = {'N': N, 'P': P, 'K': K, 'temperature': temperature, 
                  'humidity': humidity, 'ph': ph, 'rainfall': rainfall}
        for param, val in inputs.items():
            try:
                inputs[param] = float(val)
            except (ValueError, TypeError):
                return False, f"Invalid input: {param} must be a number", []
        capped_inputs = {}
        warnings_list = []
        for param, val in inputs.items():
            min_val, max_val = self.realistic_ranges[param]
            if val < min_val or val > max_val:
                warnings_list.append(f"{param} ({val}) outside realistic range ({min_val}-{max_val}), capped")
                capped_inputs[param] = max(min_val, min(val, max_val))
            else:
                capped_inputs[param] = val
        return True, capped_inputs, warnings_list

    def recommend_sub_crops(self, N, P, K, temperature, humidity, ph, rainfall, num_recommendations=3):
        try:
            if self.main_model is None:
                return {"error": "Main crop model not loaded", "main_crop": None, "sub_crops": [], "warnings": None}
            
            is_valid, capped_inputs, warnings = self.validate_and_preprocess_input(
                N, P, K, temperature, humidity, ph, rainfall
            )
            if not is_valid:
                return {"error": capped_inputs, "main_crop": None, "sub_crops": [], "warnings": warnings}
            
            input_df = pd.DataFrame([[capped_inputs['N'], capped_inputs['P'], 
                                      capped_inputs['K'], capped_inputs['temperature'], 
                                      capped_inputs['humidity'], capped_inputs['ph'], 
                                      capped_inputs['rainfall']]],
                                    columns=['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall'])
            input_scaled = self.main_model['scaler'].transform(input_df)
            main_crop_encoded = self.main_model['model'].predict(input_scaled)[0]
            main_crop = self.main_model['label_encoder'].inverse_transform([main_crop_encoded])[0]
            main_confidence = float(max(self.main_model['model'].predict_proba(input_scaled)[0]))
            
            if main_crop not in self.crop_name_mapping:
                return {"error": f"No sub-crop mapping for {main_crop}", "main_crop": main_crop, 
                        "sub_crops": [], "warnings": warnings}
            
            subcrop_filename = self.crop_name_mapping[main_crop]
            subcrop_file = os.path.join(self.subcrop_dir, subcrop_filename)
            
            if not os.path.exists(subcrop_file):
                return {"error": f"Sub-crop file {subcrop_filename} not found", 
                        "main_crop": main_crop, "sub_crops": [], "warnings": warnings}
            
            sub_crop_df = pd.read_csv(subcrop_file)
            required_cols = ['sub-crop', 'N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']
            missing_cols = [col for col in required_cols if col not in sub_crop_df.columns]
            if missing_cols:
                return {"error": f"Missing columns: {missing_cols}", "main_crop": main_crop, 
                        "sub_crops": [], "warnings": warnings}
            
            input_vector = np.array([[capped_inputs['N'], capped_inputs['P'], capped_inputs['K'], 
                                      capped_inputs['temperature'], capped_inputs['humidity'], 
                                      capped_inputs['ph'], capped_inputs['rainfall']]])
            sub_crop_features = sub_crop_df[['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']].values
            sub_crop_names = sub_crop_df['sub-crop'].values
            
            distances = euclidean_distances(input_vector, sub_crop_features)[0]
            sub_crops_with_distances = list(zip(sub_crop_names, distances, sub_crop_features))
            sorted_sub_crops = sorted(sub_crops_with_distances, key=lambda x: x[1])[:num_recommendations]
            recommended_sub_crops = [{"sub_crop": crop, "distance": float(dist), "features": features} 
                                    for crop, dist, features in sorted_sub_crops]
            
            return {
                "main_crop": main_crop,
                "main_confidence": main_confidence,
                "sub_crops": recommended_sub_crops,
                "warnings": warnings if warnings else None
            }
        except Exception as e:
            return {"error": str(e), "main_crop": None, "sub_crops": [], "warnings": None}

    def calculate_subcrop_accuracy(self, num_recommendations=3):
        total_tests = 0
        correct_matches = 0
        precision_sum = 0
        reciprocal_rank_sum = 0
        dcg_sum = 0
        distances_correct = []
        diversity_sum = 0
        all_recommended_subcrops = set()
        total_unique_subcrops = set()
        skipped_datasets = []
        evaluated_datasets = []
        
        with open('subcrop_accuracy_debug.txt', 'w') as debug_file:
            for main_crop, filename in self.crop_name_mapping.items():
                file_path = os.path.join(self.subcrop_dir, filename)
                if not os.path.exists(file_path):
                    skipped_datasets.append(f"{main_crop}: File {filename} not found")
                    debug_file.write(f"Skipping {main_crop}: {filename} not found\n")
                    continue
                
                try:
                    sub_crop_df = pd.read_csv(file_path)
                    required_cols = ['sub-crop', 'N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']
                    if not all(col in sub_crop_df.columns for col in required_cols):
                        skipped_datasets.append(f"{main_crop}: Missing required columns")
                        debug_file.write(f"Skipping {main_crop}: {filename} missing required columns\n")
                        continue
                    
                    num_samples = len(sub_crop_df)
                    num_unique_subcrops = len(sub_crop_df['sub-crop'].unique())
                    if num_samples < 30 or num_unique_subcrops < 3:
                        skipped_datasets.append(f"{main_crop}: Insufficient samples ({num_samples}) or unique sub-crops ({num_unique_subcrops})")
                        debug_file.write(f"Skipping {main_crop}: Insufficient samples ({num_samples}) or unique sub-crops ({num_unique_subcrops})\n")
                        continue
                    
                    evaluated_datasets.append(f"{main_crop}: {num_samples} samples, {num_unique_subcrops} sub-crops")
                    total_unique_subcrops.update(sub_crop_df['sub-crop'].unique())
                    
                    for _, row in sub_crop_df.iterrows():
                        expected_sub_crop = row['sub-crop']
                        test_input = [row['N'], row['P'], row['K'], row['temperature'], 
                                      row['humidity'], row['ph'], row['rainfall']]
                        
                        result = self.recommend_sub_crops(*test_input, num_recommendations=num_recommendations)
                        
                        if "error" in result:
                            skipped_datasets.append(f"{main_crop}: Recommendation error - {result['error']}")
                            debug_file.write(f"Error for {main_crop}: {result['error']}\n")
                            continue
                        
                        predicted_sub_crops = [item['sub_crop'] for item in result['sub_crops']]
                        predicted_distances = [item['distance'] for item in result['sub_crops']]
                        predicted_features = [item['features'] for item in result['sub_crops']]
                        all_recommended_subcrops.update(predicted_sub_crops)
                        total_tests += 1
                        
                        # Top-3 Accuracy, Recall@3, Hit Rate@3
                        if expected_sub_crop in predicted_sub_crops:
                            correct_matches += 1
                            # Average Euclidean Distance for correct sub-crop
                            rank = predicted_sub_crops.index(expected_sub_crop)
                            distances_correct.append(predicted_distances[rank])
                        
                        # Precision@3
                        correct_in_top3 = sum(1 for pred in predicted_sub_crops if pred == expected_sub_crop)
                        precision_sum += correct_in_top3 / num_recommendations
                        
                        # MRR
                        rank = next((i + 1 for i, pred in enumerate(predicted_sub_crops) if pred == expected_sub_crop), 0)
                        reciprocal_rank_sum += (1 / rank) if rank > 0 else 0
                        
                        # NDCG@3
                        dcg = sum((1 / math.log2(i + 2)) if pred == expected_sub_crop else 0 
                                  for i, pred in enumerate(predicted_sub_crops))
                        idcg = 1 / math.log2(2)  # Ideal: correct sub-crop at rank 1
                        dcg_sum += dcg / idcg if idcg > 0 else 0
                        
                        # Diversity (Intra-List Diversity)
                        if len(predicted_features) >= 2:
                            pairwise_distances = []
                            for i in range(len(predicted_features)):
                                for j in range(i + 1, len(predicted_features)):
                                    dist = np.sqrt(np.sum((predicted_features[i] - predicted_features[j]) ** 2))
                                    pairwise_distances.append(dist)
                            diversity_sum += np.mean(pairwise_distances) if pairwise_distances else 0
                        
                        if expected_sub_crop not in predicted_sub_crops:
                            debug_file.write(f"Mismatch for {main_crop}: Expected {expected_sub_crop}, Got {predicted_sub_crops}\n")
                except Exception as e:
                    skipped_datasets.append(f"{main_crop}: Data loading error - {str(e)}")
                    debug_file.write(f"Error loading {main_crop}: {str(e)}\n")
                    continue
            
            accuracy = (correct_matches / total_tests) * 100 if total_tests > 0 else 0.0
            precision_at_3 = (precision_sum / total_tests) * 100 if total_tests > 0 else 0.0
            recall_at_3 = (correct_matches / total_tests) * 100 if total_tests > 0 else 0.0
            f1_score_at_3 = (2 * precision_at_3 * recall_at_3 / (precision_at_3 + recall_at_3)) if (precision_at_3 + recall_at_3) > 0 else 0.0
            mrr = (reciprocal_rank_sum / total_tests) if total_tests > 0 else 0.0
            ndcg_at_3 = (dcg_sum / total_tests) if total_tests > 0 else 0.0
            hit_rate_at_3 = (correct_matches / total_tests) * 100 if total_tests > 0 else 0.0
            avg_distance = sum(distances_correct) / len(distances_correct) if distances_correct else float('inf')
            diversity = (diversity_sum / total_tests) if total_tests > 0 else 0.0
            coverage = len(all_recommended_subcrops) / len(total_unique_subcrops) * 100 if total_unique_subcrops else 0.0
            
            metrics = {
                'Top-3 Accuracy (%)': accuracy,
                'Precision@3 (%)': precision_at_3,
                'Recall@3 (%)': recall_at_3,
                'F1-Score@3 (%)': f1_score_at_3,
                'Mean Reciprocal Rank': mrr,
                'NDCG@3': ndcg_at_3,
                'Hit Rate@3 (%)': hit_rate_at_3,
                'Average Euclidean Distance': avg_distance,
                'Diversity': diversity,
                'Coverage (%)': coverage
            }
            metrics_message = "\n".join(f"{key}: {value:.2f}" for key, value in metrics.items())
            debug_file.write(f"\n{metrics_message}\n")
        
        return metrics, evaluated_datasets, skipped_datasets

# Test the Model and Print Metrics
if __name__ == "__main__":
    # Initialize recommender
    recommender = SubCropRecommender(main_model_path='main_crop_model.pkl', 
                                     subcrop_dir='C:/Projects/Creative & Innovative Project/datasets/sub_crop_data/')
    
    # Calculate and print metrics
    metrics, evaluated_datasets, skipped_datasets = recommender.calculate_subcrop_accuracy()
    print("Performance Metrics:")
    for key, value in metrics.items():
        print(f"  {key}: {value:.2f}")
    print("\nEvaluated Datasets:")
    if evaluated_datasets:
        for dataset in evaluated_datasets:
            print(f"  - {dataset}")
    else:
        print("  None")
    print("\nSkipped Datasets:")
    if skipped_datasets:
        for dataset in skipped_datasets:
            print(f"  - {dataset}")
    else:
        print("  None")

Performance Metrics:
  Top-3 Accuracy (%): 97.91
  Precision@3 (%): 49.25
  Recall@3 (%): 97.91
  F1-Score@3 (%): 65.54
  Mean Reciprocal Rank: 0.98
  NDCG@3: 1.26
  Hit Rate@3 (%): 97.91
  Average Euclidean Distance: 0.02
  Diversity: 6.57
  Coverage (%): 100.00

Evaluated Datasets:
  - Rice: 500 samples, 5 sub-crops
  - Maize: 300 samples, 3 sub-crops
  - Bengal Gram (Gram)(Whole): 400 samples, 4 sub-crops
  - Pegeon Pea (Arhar Fali): 500 samples, 5 sub-crops
  - Moath Dal: 500 samples, 5 sub-crops
  - Green Gram (Moong)(Whole): 400 samples, 5 sub-crops
  - Black Gram Dal (Urd Dal): 500 samples, 5 sub-crops
  - Lentil (Masur)(Whole): 400 samples, 4 sub-crops
  - Pomegranate: 500 samples, 5 sub-crops
  - Banana: 500 samples, 5 sub-crops
  - Mango: 500 samples, 5 sub-crops
  - Water Melon: 500 samples, 5 sub-crops
  - Karbuja (Musk Melon): 300 samples, 3 sub-crops
  - Apple: 500 samples, 5 sub-crops
  - Orange: 300 samples, 3 sub-crops
  - Papaya: 400 samples, 4 sub-crops
  - Coconut: 