In [None]:
!pip install graphviz pydotplus six imblearn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from imblearn.over_sampling import SMOTE
import warnings
import os
warnings.filterwarnings('ignore')

In [None]:
# Load the crop-yield dataset and draw a reproducible working sample.
df = pd.read_csv('/kaggle/input/agriculture-crop-yield/crop_yield.csv')
# Cap the sample size at the number of available rows: DataFrame.sample raises a
# ValueError when asked for more rows than exist (sampling is without replacement).
df_sample = df.sample(n=min(100000, len(df)), random_state=42)

In [None]:
# Columns whose distributions are inspected below
numerical_features = ['Rainfall_mm', 'Temperature_Celsius', 'Days_to_Harvest', 'Yield_tons_per_hectare']

# Histograms with a KDE overlay, one panel per column on a 2x2 grid
hist_fig, hist_axes = plt.subplots(2, 2, figsize=(15, 10))
for panel, feature in zip(hist_axes.flat, numerical_features):
    sns.histplot(df_sample[feature], kde=True, bins=30, ax=panel)
    panel.set_title(f'Distribution of {feature}')
    panel.set_xlabel(feature)
    panel.set_ylabel('Count')
hist_fig.tight_layout()
hist_fig.savefig('numerical_distributions.png')
plt.show()

# Box plots on the same grid layout, used to spot outliers visually
box_fig, box_axes = plt.subplots(2, 2, figsize=(15, 10))
for panel, feature in zip(box_axes.flat, numerical_features):
    sns.boxplot(y=df_sample[feature], ax=panel)
    panel.set_title(f'Box Plot of {feature}')
box_fig.tight_layout()
box_fig.savefig('numerical_boxplots.png')
plt.show()

# Handle outliers using IQR method
def remove_outliers(df, column):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` (both inclusive),
    where Q1/Q3 are the 25th/75th percentiles of ``df[column]``. Rows whose value
    is NaN never satisfy the range check and are therefore dropped as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the numeric column the fences are computed on.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` (original index labels preserved) within the fences.
    """
    values = df[column]
    q_low, q_high = values.quantile(0.25), values.quantile(0.75)
    spread = q_high - q_low
    # Series.between is inclusive on both ends, matching a >= / <= pair.
    within_fences = values.between(q_low - 1.5 * spread, q_high + 1.5 * spread)
    return df[within_fences]

# Filter the sample one numerical column at a time. Each pass works on the rows
# that survived the previous pass, so later fences are computed on trimmed data.
df_sample_cleaned = df_sample.copy()
for column_name in numerical_features:
    df_sample_cleaned = remove_outliers(df_sample_cleaned, column_name)

# Report how many rows the filtering kept
print(f"Original dataset size: {len(df_sample)}")
print(f"Dataset size after outlier removal: {len(df_sample_cleaned)}")

# From here on, df_sample refers to the outlier-free sample
df_sample = df_sample_cleaned

In [None]:
# Preprocessing: integer-encode the categorical columns and derive a binary yield label.
# A separate LabelEncoder is kept per column.
le_region, le_soil, le_crop, le_weather = (LabelEncoder() for _ in range(4))

for column, encoder in (
    ('Region', le_region),
    ('Soil_Type', le_soil),
    ('Crop', le_crop),
    ('Weather_Condition', le_weather),
):
    df_sample[column] = encoder.fit_transform(df_sample[column])

# Cast the usage flags to integers
for flag in ('Fertilizer_Used', 'Irrigation_Used'):
    df_sample[flag] = df_sample[flag].astype(int)

# Target: 1 when the yield is strictly above the sample median, otherwise 0
median_yield = df_sample['Yield_tons_per_hectare'].median()
is_above_median = df_sample['Yield_tons_per_hectare'] > median_yield
df_sample['Yield_Class'] = is_above_median.astype(int)

# Model inputs (the raw yield column is excluded because the target is derived from it)
feature_columns = [
    'Region', 'Soil_Type', 'Crop', 'Rainfall_mm', 'Temperature_Celsius',
    'Fertilizer_Used', 'Irrigation_Used', 'Weather_Condition', 'Days_to_Harvest',
]
X = df_sample[feature_columns]
y = df_sample['Yield_Class']

In [None]:
# Hold out 20% of the rows for testing, then standardise both splits using
# statistics learned from the training split only.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Oversample the training split with SMOTE; the test split is left untouched.
# NOTE(review): Yield_Class comes from a median split, so the classes are presumably
# already close to balanced and SMOTE may add very few rows - confirm.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

In [None]:
# Visualize the correlation matrix of every numeric column in df_sample.
# select_dtypes(include='number') is used instead of listing 'int64'/'float64' so
# that integer columns stored with another width (for example the result of
# .astype(int) on platforms where the default integer is 32-bit) are not silently
# left out of the heatmap.
numeric_df = df_sample.select_dtypes(include='number')
correlation_matrix = numeric_df.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Correlation Heatmap')
plt.savefig('correlation_heatmap.png')
plt.show()

In [None]:
# Train and evaluate Decision Tree with readable visualization
dt = DecisionTreeClassifier(max_depth=3, random_state=42)  # Limit depth for readability
dt.fit(X_train_balanced, y_train_balanced)
y_pred_dt = dt.predict(X_test_scaled)
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt, zero_division=0))

# Visualize Decision Tree.
# feature_names is passed as a plain list: some scikit-learn releases validate this
# parameter as a list and reject a pandas Index such as X.columns.
# The tree was fitted on standardised inputs, so the split thresholds shown in the
# plot are in scaled units rather than the original units.
plt.figure(figsize=(20,10))
plot_tree(dt, feature_names=list(X.columns), class_names=['Low', 'High'], filled=True, rounded=True, impurity=False, fontsize=10)
plt.title("Decision Tree for Yield Classification")
plt.savefig('decision_tree.png')
plt.show()

# Confusion Matrix for Decision Tree (rows: true class, columns: predicted class)
cm_dt = confusion_matrix(y_test, y_pred_dt)
disp_dt = ConfusionMatrixDisplay(confusion_matrix=cm_dt, display_labels=['Low', 'High'])
disp_dt.plot(cmap='Blues')
plt.title('Confusion Matrix - Decision Tree')
plt.savefig('confusion_matrix_dt.png')
plt.show()

In [None]:
# Logistic Regression: fit on the balanced training data, score on the held-out test split
lr = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_balanced, y_train_balanced)
y_pred_lr = lr.predict(X_test_scaled)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Accuracy:", lr_accuracy)
lr_report = classification_report(y_test, y_pred_lr, zero_division=0)
print(lr_report)

# Confusion matrix, drawn on the display's own axes and saved from its own figure
cm_lr = confusion_matrix(y_test, y_pred_lr)
disp_lr = ConfusionMatrixDisplay(confusion_matrix=cm_lr, display_labels=['Low', 'High'])
disp_lr.plot(cmap='Blues')
disp_lr.ax_.set_title('Confusion Matrix - Logistic Regression')
disp_lr.figure_.savefig('confusion_matrix_lr.png')
plt.show()

In [None]:
# Linear SVM: fit on the balanced training data, score on the held-out test split
svm = LinearSVC(max_iter=1000, random_state=42).fit(X_train_balanced, y_train_balanced)
y_pred_svm = svm.predict(X_test_scaled)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print("Linear SVM Accuracy:", svm_accuracy)
svm_report = classification_report(y_test, y_pred_svm, zero_division=0)
print(svm_report)

# Confusion matrix, drawn on the display's own axes and saved from its own figure
cm_svm = confusion_matrix(y_test, y_pred_svm)
disp_svm = ConfusionMatrixDisplay(confusion_matrix=cm_svm, display_labels=['Low', 'High'])
disp_svm.plot(cmap='Blues')
disp_svm.ax_.set_title('Confusion Matrix - Linear SVM')
disp_svm.figure_.savefig('confusion_matrix_svm.png')
plt.show()

In [None]:
# KNN baseline, fitted on a 10% subsample of the balanced training data for speed
X_train_small, _, y_train_small, _ = train_test_split(
    X_train_balanced, y_train_balanced, train_size=0.1, random_state=42
)
knn = KNeighborsClassifier(algorithm='ball_tree', n_neighbors=5, n_jobs=-1)
knn.fit(X_train_small, y_train_small)
y_pred_knn = knn.predict(X_test_scaled)
knn_accuracy = accuracy_score(y_test, y_pred_knn)
print("KNN Accuracy (Small Sample):", knn_accuracy)
knn_report = classification_report(y_test, y_pred_knn, zero_division=0)
print(knn_report)

# Grid search over neighbour count and vote weighting, on the same subsample
param_grid_knn = dict(n_neighbors=[3, 5, 7], weights=['uniform', 'distance'])
grid_search_knn = GridSearchCV(
    estimator=KNeighborsClassifier(algorithm='ball_tree', n_jobs=-1),
    param_grid=param_grid_knn,
    scoring='f1_weighted',
    cv=3,
    n_jobs=-1,
)
grid_search_knn.fit(X_train_small, y_train_small)
best_knn = grid_search_knn.best_estimator_
print("Best KNN Params:", grid_search_knn.best_params_)

# Score the tuned estimator on the held-out test split
y_pred_best_knn = best_knn.predict(X_test_scaled)
best_knn_accuracy = accuracy_score(y_test, y_pred_best_knn)
print("Best KNN Accuracy:", best_knn_accuracy)
best_knn_report = classification_report(y_test, y_pred_best_knn, zero_division=0)
print(best_knn_report)

# Confusion matrix of the tuned estimator
cm_knn = confusion_matrix(y_test, y_pred_best_knn)
disp_knn = ConfusionMatrixDisplay(confusion_matrix=cm_knn, display_labels=['Low', 'High'])
disp_knn.plot(cmap='Blues')
disp_knn.ax_.set_title('Confusion Matrix - Best KNN')
disp_knn.figure_.savefig('confusion_matrix_knn.png')
plt.show()

In [None]:
# Random Forest (100 trees): fit on the balanced training data, score on the test split
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
rf.fit(X_train_balanced, y_train_balanced)
y_pred_rf = rf.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)
rf_report = classification_report(y_test, y_pred_rf, zero_division=0)
print(rf_report)

# Confusion matrix, drawn on the display's own axes and saved from its own figure
cm_rf = confusion_matrix(y_test, y_pred_rf)
disp_rf = ConfusionMatrixDisplay(confusion_matrix=cm_rf, display_labels=['Low', 'High'])
disp_rf.plot(cmap='Blues')
disp_rf.ax_.set_title('Confusion Matrix - Random Forest')
disp_rf.figure_.savefig('confusion_matrix_rf.png')
plt.show()

In [None]:
# Train and evaluate Ensemble (Stacking): the Random Forest and Linear SVM act as
# base learners and a Logistic Regression combines their outputs.
estimators = [('rf', rf), ('svm', svm)]
stacking = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
stacking.fit(X_train_balanced, y_train_balanced)
y_pred_stacking = stacking.predict(X_test_scaled)
print("Stacking Classifier Accuracy:", accuracy_score(y_test, y_pred_stacking))
# classification_report takes (y_true, y_pred). Passing them the other way round
# swaps precision with recall and reports support for predictions, not true labels.
print(classification_report(y_test, y_pred_stacking, zero_division=0))

# Confusion Matrix for Stacking Classifier (rows: true class, columns: predicted class)
cm_stacking = confusion_matrix(y_test, y_pred_stacking)
disp_stacking = ConfusionMatrixDisplay(confusion_matrix=cm_stacking, display_labels=['Low', 'High'])
disp_stacking.plot(cmap='Blues')
plt.title('Confusion Matrix - Stacking Classifier')
plt.savefig('confusion_matrix_stacking.png')
plt.show()

In [None]:
# Function to plot learning curves
def plot_learning_curve(estimator, title, X, y, cv=3, train_sizes=np.linspace(0.1, 1.0, 5)):
    """Plot, save and show the accuracy learning curve of ``estimator``.

    Training and cross-validation accuracy are computed with
    ``sklearn.model_selection.learning_curve`` and drawn as mean lines with a
    one-standard-deviation band each (red: training, green: cross-validation).
    The figure is written to ``plots/learning_curve_<title>.png``, where
    ``<title>`` is lower-cased with spaces replaced by underscores; the ``plots``
    directory is created if it does not exist.

    Parameters
    ----------
    estimator : scikit-learn estimator
        Model handed to ``learning_curve``.
    title : str
        Figure title; also used to build the output file name.
    X, y : array-like
        Data and labels handed to ``learning_curve``.
    cv : int, default 3
        Cross-validation setting forwarded to ``learning_curve``.
    train_sizes : array-like
        Training-set sizes forwarded to ``learning_curve``.
    """
    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=-1, train_sizes=train_sizes, scoring='accuracy'
    )

    # (mean, std, colour, legend label) for each curve; scores are averaged over folds
    curves = (
        (fit_scores.mean(axis=1), fit_scores.std(axis=1), "r", "Training Accuracy"),
        (cv_scores.mean(axis=1), cv_scores.std(axis=1), "g", "Cross-Validation Accuracy"),
    )

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.set_title(title)
    ax.set_xlabel("Training Examples")
    ax.set_ylabel("Accuracy")
    ax.grid()

    # Bands first, then the mean lines on top
    for mean, std, colour, _ in curves:
        ax.fill_between(sizes, mean - std, mean + std, alpha=0.2, color=colour)
    for mean, _, colour, legend_label in curves:
        ax.plot(sizes, mean, 'o-', color=colour, label=legend_label)

    ax.legend(loc="best")
    os.makedirs('plots', exist_ok=True)
    fig.savefig(f'plots/learning_curve_{title.lower().replace(" ", "_")}.png')
    plt.show()

# Learning curves for every fitted model, keyed by the display name used in the
# progress message and the plot title
models = dict([
    ('Decision Tree', dt),
    ('Logistic Regression', lr),
    ('Linear SVM', svm),
    ('KNN', best_knn),
    ('Random Forest', rf),
    ('Stacking Classifier', stacking),
])

# Each curve is computed on the balanced training data
for name, model in models.items():
    print(f"Generating learning curve for {name}...")
    plot_learning_curve(
        model,
        f"Learning Curve - {name}",
        X_train_balanced,
        y_train_balanced,
    )

In [None]:
# Candidate models for the final comparison, keyed by display name
best_models = dict([
    ('Decision Tree', dt),
    ('Logistic Regression', lr),
    ('Linear SVM', svm),
    ('KNN', best_knn),
    ('Random Forest', rf),
    ('Stacking Classifier', stacking),
])

# Test-set accuracy of every candidate
model_accuracies = {}
for name, model in best_models.items():
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    model_accuracies[name] = accuracy
    print(f"{name} Accuracy: {accuracy:.6f}")

# Winner: highest test accuracy (the first entry wins on ties)
best_model_name, best_accuracy = max(model_accuracies.items(), key=lambda entry: entry[1])
print(f"\nBest Model: {best_model_name} with Accuracy: {best_accuracy:.6f}")