In [1]:
# =========================
# 0. Install & imports
# =========================
!pip install -q scikit-learn pandas numpy joblib

import re
from datetime import datetime

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split


In [4]:
# =========================
# 1. Load dataset
# =========================
# The workbook must sit next to the notebook; adjust the name if your upload differs.
DATASET_PATH = "orchid_growth_dataset.xlsx"
df = pd.read_excel(DATASET_PATH)

# Quick sanity check: first rows, column names, then overall shape.
for preview in (df.head(), df.columns):
    print(preview)
print("Shape:", df.shape)


   sample_id           jar_id planting_date  age_days  plant_height_mm  \
0          1  JAR_20250811_01    2025-08-11        30              4.7   
1         17  JAR_20250811_02    2025-08-11        30              2.5   
2         33  JAR_20250811_03    2025-08-11        30              3.3   
3         49  JAR_20250811_04    2025-08-11        30              2.0   
4         65  JAR_20250811_05    2025-08-11        30              3.8   

  expected_height_range  
0               3–10 mm  
1               3–10 mm  
2               3–10 mm  
3               3–10 mm  
4               3–10 mm  
Index(['sample_id', 'jar_id', 'planting_date', 'age_days', 'plant_height_mm',
       'expected_height_range'],
      dtype='object')
Shape: (992, 6)


In [5]:
# =========================
# 2. Parse expected range & create labels
# =========================

def parse_range_to_min_max(s):
    """
    Convert a height-range string such as '3–10 mm' into numeric bounds (3.0, 10.0).

    Every run of digits (with an optional decimal part) is read as one number and
    the first two numbers found are returned. All other characters — en dash,
    ASCII hyphen, spaces, unit suffixes — act purely as separators, so
    '3–10 mm', '3-10 mm' and '3 - 10mm' all give the same result. A hyphen is
    never interpreted as a minus sign: the values are heights, and treating '-'
    as part of a number is what made '3-10' unparseable before.

    Parameters
    ----------
    s : any
        Value to parse; it is converted with ``str()`` first, so NaN/None are
        accepted and simply yield no numbers.

    Returns
    -------
    tuple of (float, float)
        ``(minimum, maximum)`` in the order they appear in the text, or
        ``(np.nan, np.nan)`` when fewer than two numbers are present.
    """
    # Matches '3', '3.5', '3.' and '.5'; separators of any kind are skipped.
    numbers = re.findall(r"\d+(?:\.\d*)?|\.\d+", str(s))
    if len(numbers) < 2:
        return np.nan, np.nan
    return float(numbers[0]), float(numbers[1])

# Parse each range string once, then split the (min, max) tuples into two columns.
# Building the columns from lists (instead of unpacking zip(*...)) also works
# when the frame is empty.
parsed_ranges = df["expected_height_range"].map(parse_range_to_min_max)
df["h_min"] = [lo for lo, _ in parsed_ranges]
df["h_max"] = [hi for _, hi in parsed_ranges]

# Comparisons against NaN are always False, so a row with a missing height or an
# unparseable range would fall through to the "within_expected" default below and
# be mislabelled. Drop such rows explicitly and report how many were removed.
labelable = df[["plant_height_mm", "h_min", "h_max"]].notna().all(axis=1)
n_unlabelable = int((~labelable).sum())
if n_unlabelable:
    print(f"Dropping {n_unlabelable} row(s) with a missing height or unparseable expected range")
    df = df.loc[labelable].copy()

# 3-class label: below the expected minimum, above the expected maximum,
# otherwise within the expected range (bounds inclusive).
df["growth_label"] = np.select(
    [df["plant_height_mm"] < df["h_min"],
     df["plant_height_mm"] > df["h_max"]],
    ["below_expected", "above_expected"],
    default="within_expected"
)

print(df[["plant_height_mm", "expected_height_range", "h_min", "h_max", "growth_label"]].head())
print(df["growth_label"].value_counts())


   plant_height_mm expected_height_range  h_min  h_max     growth_label
0              4.7               3–10 mm    3.0   10.0  within_expected
1              2.5               3–10 mm    3.0   10.0   below_expected
2              3.3               3–10 mm    3.0   10.0  within_expected
3              2.0               3–10 mm    3.0   10.0   below_expected
4              3.8               3–10 mm    3.0   10.0  within_expected
growth_label
above_expected     716
within_expected    241
below_expected      35
Name: count, dtype: int64


In [6]:
# =========================
# 3. Build feature matrix and target
# =========================

# Model inputs are the plant's age and measured height; the target is the
# 3-class growth label (below / within / above expected).
feature_cols = ["age_days", "plant_height_mm"]
X = df.loc[:, feature_cols].copy()
y = df.loc[:, "growth_label"]

# 80/20 split with a fixed seed; stratifying on y keeps the class proportions
# the same in the train and test sets.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Train size: {X_train.shape} Test size: {X_test.shape}")


Train size: (793, 2) Test size: (199, 2)


In [7]:
# =========================
# 4. Train the RandomForest model
# =========================

# Forest hyper-parameters. class_weight="balanced" compensates for the class
# imbalance (very few 'below_expected' samples).
rf_params = {
    "n_estimators": 300,
    "max_depth": None,
    "random_state": 42,
    "class_weight": "balanced",
}
clf = RandomForestClassifier(**rf_params)
clf.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred = clf.predict(X_test)
for heading, metric in (
    ("Classification report:", classification_report),
    ("Confusion matrix:", confusion_matrix),
):
    print(heading)
    print(metric(y_test, y_pred))


Classification report:
                 precision    recall  f1-score   support

 above_expected       0.99      1.00      1.00       144
 below_expected       1.00      1.00      1.00         7
within_expected       1.00      0.98      0.99        48

       accuracy                           0.99       199
      macro avg       1.00      0.99      1.00       199
   weighted avg       1.00      0.99      0.99       199

Confusion matrix:
[[144   0   0]
 [  0   7   0]
 [  1   0  47]]


In [8]:
# =========================
# 5. Save model & metadata
# =========================

# Output locations for the fitted classifier and its companion metadata.
model_path = "orchid_growth_rf_model.joblib"
metadata_path = "orchid_growth_metadata.joblib"

# The metadata records the feature column order the model was trained with.
metadata = {
    "feature_cols": feature_cols
}

# Persist the model first, then the metadata.
for artifact, destination in ((clf, model_path), (metadata, metadata_path)):
    joblib.dump(artifact, destination)

print("Saved model to:", model_path)
print("Saved metadata to:", metadata_path)


Saved model to: orchid_growth_rf_model.joblib
Saved metadata to: orchid_growth_metadata.joblib


In [9]:
# =========================
# 6. Helper functions for deployment-style usage
# =========================

def compute_age_days(planting_date_str, current_date_str, date_format="%Y-%m-%d"):
    """
    Return the number of whole days between planting and the current date.

    planting_date_str: planting date as text, 'YYYY-MM-DD' by default
    current_date_str:  current date as text, 'YYYY-MM-DD' by default
    date_format:       strptime format applied to both strings

    The result is an int and is negative when the current date precedes the
    planting date.
    """
    # Parse both strings with the same format, planting date first.
    planted_on, measured_on = (
        datetime.strptime(text, date_format)
        for text in (planting_date_str, current_date_str)
    )
    elapsed = measured_on - planted_on
    return elapsed.days


def load_model_and_metadata(model_path="orchid_growth_rf_model.joblib",
                            metadata_path="orchid_growth_metadata.joblib"):
    """
    Load the saved classifier and its metadata from disk.

    model_path:    joblib file holding the fitted model
    metadata_path: joblib file holding the metadata dict

    Returns a (model, metadata) tuple. Both files are read with joblib.load,
    so only point this at files you trust.
    """
    return joblib.load(model_path), joblib.load(metadata_path)


def classify_orchid_growth(planting_date_str, current_date_str, current_height_mm,
                           model=None, metadata=None,
                           date_format="%Y-%m-%d"):
    """
    High-level function you can call from your backend.

    NOTE: a later cell in this notebook defines `classify_orchid_growth` again
    with a dict return value; once that cell has run, this tuple-returning
    version is shadowed.

    Parameters
    ----------
    planting_date_str, current_date_str : str
        Dates formatted according to `date_format`.
    current_height_mm : number or numeric string
        Measured plant height; converted with ``float()``.
    model : fitted classifier, optional
        Used as given when provided; otherwise loaded from disk.
    metadata : dict, optional
        Must contain ``"feature_cols"``; otherwise loaded from disk.
    date_format : str
        strptime format for both date strings.

    Returns
    -------
    (predicted_label, probabilities_dict, age_days)
        `probabilities_dict` maps each class label to its probability, or is
        None when the model has no `predict_proba`.
    """
    # Compute age days from dates
    age_days = compute_age_days(planting_date_str, current_date_str, date_format=date_format)

    # Lazy-load only what the caller did not supply. Previously a caller-provided
    # model (or metadata) was overwritten by the on-disk copy whenever the other
    # argument was omitted.
    if model is None or metadata is None:
        loaded_model, loaded_metadata = load_model_and_metadata()
        if model is None:
            model = loaded_model
        if metadata is None:
            metadata = loaded_metadata

    feature_cols = metadata["feature_cols"]

    # Build a single-row DataFrame whose column order follows the metadata
    X_new = pd.DataFrame([{
        "age_days": age_days,
        "plant_height_mm": float(current_height_mm)
    }], columns=feature_cols)

    # Predict
    pred_label = model.predict(X_new)[0]

    # Optional: per-class probabilities, keyed by class label
    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(X_new)[0]
        class_labels = model.classes_
        proba_dict = {cls: float(p) for cls, p in zip(class_labels, proba)}
    else:
        proba_dict = None

    return pred_label, proba_dict, age_days


In [15]:
def get_expected_height_range(age_days):
    """
    Look up the expected (min_mm, max_mm) height range for a plant age in days.

    Each table entry is (inclusive upper age bound, height range); the first
    entry whose bound is >= age_days wins. Ages beyond the last bound fall
    back to (30, 50).
    """
    stage_table = (
        (40, (3, 10)),     # Early stage (dataset shows 3–10mm)
        (60, (8, 20)),     # Mid stage
        (80, (15, 28)),    # Growing stage
        (100, (20, 35)),   # Late stage
        (120, (25, 40)),   # Mature stage
    )
    for upper_age, height_range in stage_table:
        if age_days <= upper_age:
            return height_range
    return (30, 50)


In [13]:
def classify_orchid_growth(planting_date_str, current_date_str, current_height_mm,
                           model=None, metadata=None, date_format="%Y-%m-%d"):
    """
    Classify an orchid's growth from its planting date, current date and height.

    Parameters
    ----------
    planting_date_str, current_date_str : str
        Dates formatted according to `date_format`.
    current_height_mm : number or numeric string
        Measured plant height; converted with ``float()``.
    model : fitted classifier, optional
        Used as given when provided; otherwise loaded from disk.
    metadata : dict, optional
        Must contain ``"feature_cols"``; otherwise loaded from disk.
    date_format : str
        strptime format for both date strings.

    Returns
    -------
    dict with keys
        ``"predicted_label"``       – label predicted by the model
        ``"probabilities"``         – {class label: probability}, or None when
                                      the model has no `predict_proba`
        ``"age_days"``              – age computed from the two dates
        ``"expected_height_range"`` – result of get_expected_height_range(age_days)
    """
    age_days = compute_age_days(planting_date_str, current_date_str, date_format=date_format)

    # Lazy-load only what the caller did not supply. Previously a caller-provided
    # model (or metadata) was overwritten by the on-disk copy whenever the other
    # argument was omitted.
    if model is None or metadata is None:
        loaded_model, loaded_metadata = load_model_and_metadata()
        if model is None:
            model = loaded_model
        if metadata is None:
            metadata = loaded_metadata

    # Single-row frame whose column order follows the metadata.
    X_new = pd.DataFrame([{
        "age_days": age_days,
        "plant_height_mm": float(current_height_mm)
    }], columns=metadata["feature_cols"])

    pred_label = model.predict(X_new)[0]

    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(X_new)[0]
        proba = {cls: float(p) for cls, p in zip(model.classes_, proba)}
    else:
        proba = None

    expected_range = get_expected_height_range(age_days)

    return {
        "predicted_label": pred_label,
        "probabilities": proba,
        "age_days": age_days,
        "expected_height_range": expected_range
    }


In [16]:
# Example call: a plant measured 40 days after planting at 7.0 mm. No model or
# metadata is passed, so both are loaded from the saved joblib files.
demo_inputs = {
    "planting_date_str": "2025-01-01",
    "current_date_str": "2025-02-10",
    "current_height_mm": 7.0,
}
result = classify_orchid_growth(**demo_inputs)

result


{'predicted_label': 'within_expected',
 'probabilities': {'above_expected': 0.0,
  'below_expected': 0.0,
  'within_expected': 1.0},
 'age_days': 40,
 'expected_height_range': (3, 10)}