# Enhanced Crop Recommendation System
## Combining Decision Tree and KNN with Ensemble Methods

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier, StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load and prepare data
# Reads the crop dataset from the working directory and prints a quick overview.
df = pd.read_csv('Crop_recommendation.csv')
print(f"Dataset shape: {df.shape}")
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\nCrop distribution:")
# Number of rows per crop label (the 'label' column is the prediction target).
print(df['label'].value_counts())

In [None]:
# Data preprocessing
print("Missing values:")
print(df.isnull().sum())

# Encode categorical data: crop names become integer class ids. `le` is kept
# so predictions can be mapped back to crop names with inverse_transform.
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])

# Separate features and target
X = df.drop(['label', 'label_encoded'], axis=1)
y = df['label_encoded']

# Train-test split is done on the RAW features, before any scaling, so that
# the scaler never sees the held-out rows. Fitting the scaler on the full
# dataset would leak test-set mean/variance into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Feature scaling: statistics are learned from the training rows only and
# then applied unchanged to the test rows.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)

# Whole dataset transformed with the train-fitted scaler; kept so the
# `X_scaled` name remains available to any code that refers to it.
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)

print(f"\nTraining set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

In [None]:
# Individual Models
# Each baseline is fitted on the training split and immediately scored on the
# held-out split; the fitted estimators and their predictions are reused later.
print("Training individual models...\n")

# KNN Model (k = 5)
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)

# Decision Tree Model (depth-limited, fixed seed)
dt_model = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=5,
    random_state=42,
).fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

# Random Forest (additional strong model): 100 depth-limited trees, fixed seed
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

In [None]:
# Ensemble Methods
print("Creating ensemble models...\n")


def _base_learners(include_forest):
    """Return fresh, unfitted (name, estimator) pairs for one ensemble.

    KNN and Decision Tree are always included; the Random Forest is appended
    only when `include_forest` is true.
    """
    learners = [
        ('knn', KNeighborsClassifier(n_neighbors=5)),
        ('dt', DecisionTreeClassifier(random_state=42, max_depth=10, min_samples_split=5)),
    ]
    if include_forest:
        learners.append(
            ('rf', RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10))
        )
    return learners


# 1. Voting Classifier (Hard Voting) over KNN + Decision Tree + Random Forest
voting_hard = VotingClassifier(estimators=_base_learners(include_forest=True), voting='hard')
voting_hard.fit(X_train, y_train)
y_pred_voting_hard = voting_hard.predict(X_test)

# 2. Voting Classifier (Soft Voting) over the same three learners
voting_soft = VotingClassifier(estimators=_base_learners(include_forest=True), voting='soft')
voting_soft.fit(X_train, y_train)
y_pred_voting_soft = voting_soft.predict(X_test)

# 3. Stacking Classifier: KNN + Decision Tree combined by logistic regression
stacking_model = StackingClassifier(
    estimators=_base_learners(include_forest=False),
    final_estimator=LogisticRegression(random_state=42, max_iter=1000),
    cv=5,
)
stacking_model.fit(X_train, y_train)
y_pred_stacking = stacking_model.predict(X_test)

In [None]:
# Evaluation function
def evaluate_model(y_true, y_pred, model_name):
    """Print a short metrics report for one model and return the metrics.

    Parameters:
    - y_true: ground-truth labels
    - y_pred: labels predicted by the model
    - model_name: heading used in the printed report

    Returns a tuple (accuracy, precision, recall, f1); precision, recall and
    F1 are computed with average='weighted'.
    """
    accuracy = accuracy_score(y_true, y_pred)
    weighted_scores = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    # The fourth element (support) is not reported.
    precision, recall, f1 = weighted_scores[:3]

    report_lines = [
        f"{model_name} Results:",
        f"  Accuracy: {accuracy:.4f}",
        f"  Precision: {precision:.4f}",
        f"  Recall: {recall:.4f}",
        f"  F1-score: {f1:.4f}",
        "-" * 40,
    ]
    print("\n".join(report_lines))

    return accuracy, precision, recall, f1

# Evaluate all models
print("Model Performance Comparison:\n")

# (results key, test-set predictions, heading printed in the report)
_evaluation_plan = [
    ('KNN', y_pred_knn, "KNN"),
    ('Decision Tree', y_pred_dt, "Decision Tree"),
    ('Random Forest', y_pred_rf, "Random Forest"),
    ('Voting (Hard)', y_pred_voting_hard, "Voting Classifier (Hard)"),
    ('Voting (Soft)', y_pred_voting_soft, "Voting Classifier (Soft)"),
    ('Stacking', y_pred_stacking, "Stacking Classifier"),
]

# Maps each results key to its (accuracy, precision, recall, f1) tuple,
# in the order the models are listed above.
results = {
    key: evaluate_model(y_test, predictions, heading)
    for key, predictions, heading in _evaluation_plan
}

In [None]:
# Create comparison DataFrame
# One row per model: its name followed by the four metrics stored in `results`.
comparison_df = pd.DataFrame(
    [(model_name, *scores) for model_name, scores in results.items()],
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score'],
)

print("\nModel Comparison Summary:")
print(comparison_df.round(4))

# Find best model: the row with the highest accuracy (first one wins on ties)
best_model_idx = comparison_df['Accuracy'].idxmax()
best_model = comparison_df.at[best_model_idx, 'Model']
best_accuracy = comparison_df.at[best_model_idx, 'Accuracy']

print(f"\nBest performing model: {best_model} with accuracy: {best_accuracy:.4f}")

In [None]:
# Visualization
# Three panels on a 2x2 grid: accuracy per model, all four metrics per model,
# and the confusion matrix of the best-performing model.
plt.figure(figsize=(12, 8))

# Plot 1: Model Accuracy Comparison
plt.subplot(2, 2, 1)
plt.bar(comparison_df['Model'], comparison_df['Accuracy'], color='skyblue')
plt.title('Model Accuracy Comparison')
plt.ylabel('Accuracy')
plt.xticks(rotation=45)
plt.ylim(0.9, 1.0)  # zoomed y-axis; scores below 0.9 are not visible

# Plot 2: All Metrics Comparison (grouped bars, one group per model)
plt.subplot(2, 2, 2)
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
x = np.arange(len(comparison_df))
width = 0.2

for i, metric in enumerate(metrics):
    # Shift each metric's bars by one bar width so they sit side by side.
    plt.bar(x + i*width, comparison_df[metric], width, label=metric)

plt.xlabel('Models')
plt.ylabel('Score')
plt.title('All Metrics Comparison')
# Centre the tick labels under each group of four bars.
plt.xticks(x + width*1.5, comparison_df['Model'], rotation=45)
plt.legend()
plt.ylim(0.9, 1.0)

# Plot 3: Confusion Matrix for Best Model
plt.subplot(2, 2, 3)
# Look the predictions up by model name so the matrix always belongs to the
# model named in the title. Every name that can appear in
# comparison_df['Model'] is covered, including KNN and Random Forest.
predictions_by_model = {
    'KNN': y_pred_knn,
    'Decision Tree': y_pred_dt,
    'Random Forest': y_pred_rf,
    'Voting (Hard)': y_pred_voting_hard,
    'Voting (Soft)': y_pred_voting_soft,
    'Stacking': y_pred_stacking,
}
best_predictions = predictions_by_model[best_model]

cm = confusion_matrix(y_test, best_predictions)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title(f'Confusion Matrix - {best_model}')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')

plt.tight_layout()
plt.show()

In [None]:
# Cross-validation on the training split for two base models and two ensembles
print("Cross-validation scores (5-fold):")
print("-" * 40)

models_for_cv = {
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Decision Tree': DecisionTreeClassifier(random_state=42, max_depth=10, min_samples_split=5),
    'Voting (Soft)': voting_soft,
    'Stacking': stacking_model,
}

# cv_results maps each model name to its array of per-fold accuracies.
cv_results = {}
for name in models_for_cv:
    model = models_for_cv[name]
    cv_scores = cross_val_score(
        estimator=model,
        X=X_train,
        y=y_train,
        scoring='accuracy',
        cv=5,
    )
    cv_results[name] = cv_scores

    mean_accuracy = cv_scores.mean()
    two_std = cv_scores.std() * 2  # reported as the "+/-" spread
    print(f"{name}:")
    print(f"  Mean CV Accuracy: {mean_accuracy:.4f} (+/- {two_std:.4f})")
    print(f"  Individual scores: {cv_scores.round(4)}")
    print()

In [None]:
# Feature importance from the best tree-based model
# Only runs when the best model is the Random Forest or the Decision Tree;
# for any other best model this cell prints and plots nothing.
if 'Random Forest' in best_model:
    _importance_source = rf_model
elif 'Decision Tree' in best_model:
    _importance_source = dt_model
else:
    _importance_source = None

if _importance_source is not None:
    feature_importance = _importance_source.feature_importances_
    feature_names = X.columns

    # Most important feature first.
    importance_df = pd.DataFrame(
        {'Feature': feature_names, 'Importance': feature_importance}
    )
    importance_df = importance_df.sort_values('Importance', ascending=False)

    print("Feature Importance:")
    print(importance_df)

    # Plot feature importance
    plt.figure(figsize=(10, 6))
    plt.bar(importance_df['Feature'], importance_df['Importance'])
    plt.title('Feature Importance')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# Prediction function for new data
def predict_crop(N, P, K, temperature, humidity, ph, rainfall, model_choice='best'):
    """
    Predict crop recommendation based on soil and climate parameters

    Prints the recommended crop and, when the chosen model exposes
    predict_proba, the three most probable crops with their probabilities.

    Parameters:
    - N, P, K: Soil nutrient levels
    - temperature: Temperature in Celsius
    - humidity: Humidity percentage
    - ph: Soil pH level
    - rainfall: Rainfall in mm
    - model_choice: 'knn', 'dt', 'rf', 'voting_hard', 'voting_soft',
      'stacking', or 'best'. Any other value is treated as 'best', i.e. the
      model named by the module-level `best_model`.

    Returns:
    - The recommended crop name (decoded with the module-level label encoder).
    """

    # Prepare input data as a one-row DataFrame carrying the training column
    # names, since the scaler and the models were fitted on named DataFrames.
    # NOTE(review): assumes the feature columns of X are ordered
    # N, P, K, temperature, humidity, ph, rainfall - confirm against the CSV.
    feature_names = list(X.columns)
    input_data = pd.DataFrame(
        [[N, P, K, temperature, humidity, ph, rainfall]],
        columns=feature_names,
    )
    input_scaled = pd.DataFrame(scaler.transform(input_data), columns=feature_names)

    # Select model
    models_by_choice = {
        'knn': knn_model,
        'dt': dt_model,
        'rf': rf_model,
        'voting_hard': voting_hard,
        'voting_soft': voting_soft,
        'stacking': stacking_model,
    }
    # Keys mirror the names stored in the comparison table, so 'best' resolves
    # to the model that actually won instead of defaulting to the tree.
    models_by_best_name = {
        'KNN': knn_model,
        'Decision Tree': dt_model,
        'Random Forest': rf_model,
        'Voting (Hard)': voting_hard,
        'Voting (Soft)': voting_soft,
        'Stacking': stacking_model,
    }
    model = models_by_choice.get(model_choice)
    if model is None:  # 'best' or an unrecognised choice
        model = models_by_best_name.get(best_model, dt_model)

    # Make prediction
    prediction = model.predict(input_scaled)[0]
    crop_name = le.inverse_transform([prediction])[0]
    print(f"Recommended crop: {crop_name}")

    # Get prediction probabilities if available (hard voting has none)
    if hasattr(model, 'predict_proba'):
        probabilities = model.predict_proba(input_scaled)[0]
        top_3_indices = np.argsort(probabilities)[-3:][::-1]
        # predict_proba columns follow model.classes_, so translate column
        # positions to encoded labels before decoding them to crop names.
        top_3_crops = le.inverse_transform(model.classes_[top_3_indices])
        top_3_probs = probabilities[top_3_indices]

        print("\nTop 3 recommendations:")
        for rank, (crop, prob) in enumerate(zip(top_3_crops, top_3_probs), start=1):
            print(f"{rank}. {crop}: {prob:.3f}")

    return crop_name

# Example prediction
# Runs one sample set of soil/climate readings through the best model.
print("Example Crop Prediction:")
print("Input: N=90, P=42, K=43, Temperature=20.9, Humidity=82, pH=6.5, Rainfall=202")
_sample_conditions = dict(
    N=90, P=42, K=43, temperature=20.9, humidity=82, ph=6.5, rainfall=202
)
predicted_crop = predict_crop(**_sample_conditions, model_choice='best')
print(f"\nUsing {best_model} model")

## Summary

This enhanced crop recommendation system combines Decision Tree and KNN models (plus a Random Forest in the voting ensembles) using several ensemble techniques:

1. **Voting Classifier (Hard)**: Takes the majority vote of the KNN, Decision Tree, and Random Forest predictions
2. **Voting Classifier (Soft)**: Averages the predicted class probabilities of the same three models
3. **Stacking Classifier**: Uses a meta-learner (Logistic Regression) to combine the KNN and Decision Tree predictions

The ensemble methods typically provide better performance than individual models by:
- Reducing overfitting
- Improving generalization
- Combining strengths of different algorithms
- Providing more robust predictions

Choose the best performing model based on your specific requirements for accuracy, interpretability, and computational efficiency.