In [1]:
import pandas as pd

# Read the plant spreadsheet into a DataFrame
df = pd.read_excel(io="realistic_plant1.xlsx")

# Preview the top five records of the frame
print(df.head(n=5))


   Sunlight (hours/day)  Wind (m/s)        pH Soil Type  Temperature (°C)  \
0              9.968287    2.601858  6.440622      Clay         18.832119   
1              7.005389    2.409634  6.161043     Loamy         26.265529   
2              8.476037    2.963828  6.239556      Clay         10.304735   
3              7.035587    2.050289  6.544074     Loamy         29.041636   
4              7.244446    3.043450  5.823265     Sandy         33.504069   

   Water (mm/month)  Carbon Dioxide (ppm)  Minerals (%) Plant Type  
0         51.669142                   400      2.421354      Wheat  
1         65.449886                   350      1.113701      Maize  
2         75.081722                   400      2.734520      Wheat  
3         97.326862                   350      1.234929      Maize  
4         49.291020                   350      2.843357    Sorghum  


In [2]:
 from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# "Plant Type" is the prediction target; every remaining column is a feature
y = df["Plant Type"]
X = df.drop(columns="Plant Type")

# Turn the class names into integer codes
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Column groups, each routed to its own preprocessing branch
numeric_features = [
    "Sunlight (hours/day)",
    "Wind (m/s)",
    "pH",
    "Temperature (°C)",
    "Water (mm/month)",
    "Carbon Dioxide (ppm)",
    "Minerals (%)",
]
categorical_features = ["Soil Type"]

# Numeric branch: fill gaps with the column mean, then standardise
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill gaps with the literal "missing", then one-hot encode;
# categories not seen while fitting are ignored at transform time
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Combine both branches into a single column-wise preprocessor
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report


# Chain the shared preprocessor with a random forest so that the imputers,
# scaler and one-hot encoder are fitted together with the classifier.
rf_model = make_pipeline(
    preprocessor,
    RandomForestClassifier(n_estimators=100, random_state=42, oob_score=True),
)

# Train the model on the training split only
rf_model.fit(X_train, y_train)

# Predict on the held-out split
y_pred_rf = rf_model.predict(X_test)

# Evaluate. `labels` is passed explicitly (the encoder's integer codes, in the
# same order as `classes_`) so the report rows stay aligned with `target_names`
# even if a class is absent from the test split; without it,
# classification_report raises ValueError when the number of observed classes
# differs from len(target_names).
print(
    "Random Forest Classification Report:\n",
    classification_report(
        y_test,
        y_pred_rf,
        labels=list(range(len(label_encoder.classes_))),
        target_names=label_encoder.classes_,
    ),
)


Random Forest Classification Report:
               precision    recall  f1-score   support

       Maize       1.00      1.00      1.00       167
        Rice       1.00      1.00      1.00       142
     Sorghum       1.00      1.00      1.00       147
       Wheat       1.00      1.00      1.00       144

    accuracy                           1.00       600
   macro avg       1.00      1.00      1.00       600
weighted avg       1.00      1.00      1.00       600



In [24]:
# Print the held-out feature rows produced by the train/test split
# (the printed frame is abbreviated: the "..." row in the output stands for elided rows).
print(X_test)

      Sunlight (hours/day)  Wind (m/s)        pH Soil Type  Temperature (°C)  \
1801              7.370339    4.972775  6.100117     Sandy         25.176141   
1190              7.466210    2.931498  6.016909     Loamy         27.946522   
1817              8.448361    3.091506  6.234323      Clay         14.529289   
251               8.935256    2.849083  5.726475     Sandy         25.667204   
2505              6.983025    2.471128  6.862752     Loamy         22.382993   
...                    ...         ...       ...       ...               ...   
104               7.327050    1.697424  5.539299      Clay         23.994697   
2087              7.795195    2.156474  6.977248     Loamy         27.037781   
599               7.254036    2.690757  6.050313     Sandy         29.450042   
1756              7.272935    2.207498  5.746366     Sandy         25.947737   
1323              7.528889    2.239169  5.907960     Loamy         20.249616   

      Water (mm/month)  Carbon Dioxide 

In [20]:
import joblib

In [21]:
# Capture the installed joblib version string; it is embedded in the saved model's file name below.
version = joblib.__version__

In [23]:
# Serialise the fitted pipeline to disk; the joblib version is spliced into the file name.
joblib.dump(rf_model, f"model{version}.pkl")

['model1.1.1.pkl']