<a href="https://colab.research.google.com/github/Anjani-k-16/MicrosAIl/blob/main/notebooks/microsail_initial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1 - Setup & imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error
import joblib
# set_theme() is seaborn's preferred entry point; sns.set() is only an alias for it.
sns.set_theme(style="whitegrid")
# Seed NumPy's legacy global RNG so np.random.* draws made after this point are repeatable.
np.random.seed(42)


In [None]:
# Cell 2 - Create small synthetic soil dataset (you can replace with real data later)
# Re-seed the global NumPy RNG at the top of this cell so that re-running the cell
# on its own regenerates exactly the same table. The seed value matches the one set
# in the setup cell, so a top-to-bottom run produces the same data as before.
np.random.seed(42)

n = 60  # number of synthetic soil samples
# NOTE: the np.random.* calls below are evaluated in dict order; keep that order
# unchanged if the generated values need to stay the same.
df = pd.DataFrame({
    "sample_id": [f"S{str(i+1).zfill(3)}" for i in range(n)],      # S001 ... S060
    "farm_id": [f"Farm{(i%8)+1}" for i in range(n)],               # cycles Farm1 ... Farm8
    "pH": np.round(np.random.normal(6.5, 0.5, n), 2),
    "organic_matter": np.round(np.abs(np.random.normal(3.5, 1.0, n)), 2),  # abs() keeps it non-negative
    "moisture_pct": np.round(np.clip(np.random.normal(20, 5, n), 5, 40), 1),
    "N": np.round(np.clip(np.random.normal(40, 12, n), 5, 120), 1),
    "P": np.round(np.clip(np.random.normal(25, 8, n), 1, 80), 1),
    "K": np.round(np.clip(np.random.normal(70, 20, n), 5, 200), 1),
    "tillage": np.random.choice(["no-till","reduced","conventional"], n, p=[0.25,0.35,0.4]),
    "cover_crop": np.random.choice(["yes","no"], n, p=[0.35,0.65])
})

# Create an artificial "microbial_diversity_index" (label) dependent on OM, pH and cover crop
cover_crop_flag = df["cover_crop"].map({"yes":1,"no":0})
raw_diversity = (
    0.4 * (df["organic_matter"]) +
    0.25 * (7 - np.abs(df["pH"] - 6.8)) +   # pH close to 6.8 slightly better
    0.3 * cover_crop_flag +
    np.random.normal(0, 0.4, n)             # Gaussian noise, sd 0.4
)

# Min-max rescale the label onto the interval [2, 4]; min and max are computed once.
lowest, highest = raw_diversity.min(), raw_diversity.max()
df["microbial_diversity"] = (raw_diversity - lowest) / (highest - lowest) * 2 + 2
df.head()


In [None]:
# Cell 3 - EDA
# Table overview: dimensions first, then per-column summary statistics.
print("Shape:", df.shape)
display(df.describe())

# Histogram of the synthetic label with a KDE curve drawn over it.
fig_hist, ax_hist = plt.subplots(figsize=(8, 4))
sns.histplot(df["microbial_diversity"], kde=True, ax=ax_hist)
ax_hist.set_title("Microbial diversity (synthetic)")
plt.show()

# Annotated correlation heatmap over the numeric soil measurements and the label.
numeric_columns = ["pH", "organic_matter", "moisture_pct", "N", "P", "K", "microbial_diversity"]
correlations = df[numeric_columns].corr()
fig_corr, ax_corr = plt.subplots(figsize=(10, 8))
sns.heatmap(correlations, annot=True, cmap="vlag", ax=ax_corr)
ax_corr.set_title("Correlation matrix")
plt.show()


In [None]:
# Cell 4 - Preprocess + baseline model
# Features: everything except the two identifier columns and the label.
# Categorical columns are one-hot encoded, dropping the first level of each.
feature_frame = df.drop(columns=["sample_id","farm_id","microbial_diversity"])
X = pd.get_dummies(feature_frame, drop_first=True)
y = df["microbial_diversity"]

# Hold out 20% of the rows for evaluation; the split is fixed by random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Baseline: random forest with 200 trees and a fixed random_state.
model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

test_r2 = r2_score(y_test, y_pred)
test_mae = mean_absolute_error(y_test, y_pred)
print("R2 (test):", round(test_r2, 3))
print("MAE (test):", round(test_mae, 3))

# Keep the held-out features, true label and prediction side by side in one frame.
results = X_test.reset_index(drop=True).assign(
    microbial_diversity=y_test.to_numpy(),
    y_pred=y_pred,
)
results.head()


In [None]:
# Cell 5 - Save model and dataset to downloaded files (optional)
# google.colab is only importable inside a Colab runtime. Guard the import so this
# cell still writes both files on any other Jupyter kernel instead of raising
# ImportError; only the browser-download step is Colab-specific.
try:
    from google.colab import files
except ImportError:
    files = None

# Save CSV of synthetic data for upload if you want
df.to_csv("microsail_synthetic_data.csv", index=False)
print("Saved microsail_synthetic_data.csv (download to upload to GitHub if desired).")

# Save model
joblib.dump(model, "microsail_baseline_model.pkl")
print("Saved microsail_baseline_model.pkl")

# If you want to download the files to your machine (click command output links)
if files is not None:
    files.download("microsail_synthetic_data.csv")
    files.download("microsail_baseline_model.pkl")
else:
    print("Not running in Google Colab - skipping browser download; files are in the working directory.")
