In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

# Read the soil measurements into a DataFrame
crops = pd.read_csv("soil_measures.csv")

# Report how many values are missing in each column
print(crops.isnull().sum())

# List the distinct crop labels that the models will learn to predict
print(crops["crop"].unique())

# The label is the "crop" column; every other column is a feature
y = crops["crop"]
X = crops.drop("crop", axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits so no test information leaks into training
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Step 1: Hyperparameter tuning for Logistic Regression ---
# Only solvers that can fit a multinomial (softmax) model are searched.
# 'liblinear' is deliberately excluded: it does not support the multinomial
# loss, so every grid candidate that used it failed to fit and was recorded
# with a NaN score instead of being a real contender.
param_grid = {
    'C': [0.01, 0.1, 1, 10],  # Inverse of regularization strength (smaller = stronger penalty)
    'solver': ['newton-cg', 'lbfgs'],  # Solvers with multinomial support
}

# `multi_class` is left at its default: for these solvers and a target with
# more than two classes scikit-learn already fits a multinomial model, and
# passing the argument explicitly is deprecated in recent releases.
grid_search = GridSearchCV(
    LogisticRegression(random_state=42),
    param_grid,
    cv=5,
)
grid_search.fit(X_train_scaled, y_train)

# Best parameters for Logistic Regression
print(f"Best parameters for Logistic Regression: {grid_search.best_params_}")

# The best candidate, refit by GridSearchCV on the whole training set
log_reg_best = grid_search.best_estimator_

# Evaluate on the held-out test set
y_pred_log_reg = log_reg_best.predict(X_test_scaled)

# Accuracy plus class-frequency-weighted F1 across all crops
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)
f1_log_reg = f1_score(y_test, y_pred_log_reg, average="weighted")

print(f"Logistic Regression Accuracy: {accuracy_log_reg * 100:.2f}%")
print(f"Logistic Regression F1-score: {f1_log_reg:.2f}")

# --- Step 2: Cross-validation for better performance estimate ---
# Score the tuned logistic regression with 5-fold cross-validation on the
# training data only; the test set is not touched here.
# NOTE: the folds are cut from data that was already scaled using statistics
# of the full training set, so the fold scores may be slightly optimistic.
cross_val_scores = cross_val_score(
    estimator=log_reg_best,
    X=X_train_scaled,
    y=y_train,
    cv=5,
)
print(f"Cross-validation scores for Logistic Regression: {cross_val_scores}")
print(f"Mean cross-validation score: {cross_val_scores.mean():.2f}")

# --- Step 3: Try Random Forest for comparison ---
# Fit a default random forest (fixed seed) on the same scaled training data
rf_model = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)

# Predict the held-out test rows
y_pred_rf = rf_model.predict(X_test_scaled)

# Same two metrics as for logistic regression, so the models are comparable
f1_rf = f1_score(y_test, y_pred_rf, average="weighted")
accuracy_rf = accuracy_score(y_test, y_pred_rf)

print(f"Random Forest Accuracy: {accuracy_rf * 100:.2f}%")
print(f"Random Forest F1-score: {f1_rf:.2f}")

# --- Step 4: Function to predict crop for new input ---
def predict_crop(nitrogen, phosphorus, potassium, ph_value, model=None, feature_scaler=None):
    """Recommend a crop for a single set of soil measurements.

    The measurements are placed in a one-row DataFrame with the columns
    "N", "P", "K" and "ph", scaled with the fitted scaler, and then passed
    to the classifier. Scaling is required because the models in this
    script are trained on scaled features; feeding raw values would place
    the sample in the wrong feature space.

    Args:
        nitrogen: Nitrogen level of the soil.
        phosphorus: Phosphorus level of the soil.
        potassium: Potassium level of the soil.
        ph_value: pH value of the soil.
        model: Fitted classifier exposing ``predict``. Defaults to the
            tuned logistic regression ``log_reg_best``; pass ``rf_model``
            to use the random forest instead.
        feature_scaler: Fitted transformer exposing ``transform``.
            Defaults to the module-level ``scaler`` fitted on the
            training data.

    Returns:
        The predicted crop label for the given measurements.
    """
    # Resolve defaults at call time so the module-level objects are only
    # needed when the caller does not supply their own.
    if model is None:
        model = log_reg_best
    if feature_scaler is None:
        feature_scaler = scaler

    input_data = pd.DataFrame({
        "N": [nitrogen],
        "P": [phosphorus],
        "K": [potassium],
        "ph": [ph_value]
    })

    # Apply the same scaling the model saw during training before predicting
    scaled_input = feature_scaler.transform(input_data)
    predicted_crop = model.predict(scaled_input)
    return predicted_crop[0]

# Example: recommend a crop for one set of soil readings
# (nitrogen, phosphorus, potassium and pH levels, in that order)
nitrogen_input, phosphorus_input, potassium_input, ph_input = 50, 30, 60, 6.5

predicted_crop = predict_crop(
    nitrogen=nitrogen_input,
    phosphorus=phosphorus_input,
    potassium=potassium_input,
    ph_value=ph_input,
)
print(f"The recommended crop for the given soil measures is: {predicted_crop}")


Training set size: (1760, 4), Test set size: (440, 4)
F1-score for N: 0.09149868209906838
F1-score for P: 0.14761942909728204
F1-score for K: 0.23896974566001802
F1-score for ph: 0.04532731061152114
{'N': 0.09149868209906838, 'P': 0.14761942909728204, 'K': 0.23896974566001802, 'ph': 0.04532731061152114}
F1-score for model with all features: 0.5454206655375167
The recommended crop for the given soil measures is: coconut
