In [None]:
!pip install numpy pandas scikit-learn matplotlib
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Load the crop-yield dataset and take a quick look at it.
file_path = "/kaggle/input/crop-yield-pred/Dataset12.csv"
df = pd.read_csv(file_path)
# DataFrame.info() prints its report and returns None, so it is called
# directly; wrapping it in display() would append a stray "None" to the output.
df.info()
# Preview only the first rows instead of dumping a large slice of the table.
display(df.head())

# Turn the crop name into integer codes so the regressors can consume it.
label_encoder = LabelEncoder()
df["Crop"] = label_encoder.fit_transform(df["Crop"])

# Every column except the target is used as a feature.
X = df.drop(columns=["Yield"])
y = df["Yield"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate regressors, keyed by the label used in the results table and chart.
# NOTE(review): Lasso and KNN are sensitive to feature scale and the features
# are passed unscaled here -- consider wrapping them in a scaling pipeline.
models = {
    "Random Forest": RandomForestRegressor(n_estimators=50, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=50, max_depth=3, random_state=42, verbosity=0),
    "Lasso Regression": Lasso(alpha=0.1),
    "Decision Tree": DecisionTreeRegressor(max_depth=5, random_state=42),
    "KNN": KNeighborsRegressor(n_neighbors=5),
}

# Fit each model on the training split and score it on the held-out split.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results[name] = {"MAE": mae, "MSE": mse, "R2 Score": r2}

# One row per model, one column per metric; rendered with the rich table repr.
results_df = pd.DataFrame(results).T
display(results_df)

# Bar chart of R² per model, drawn on an explicit Axes rather than pyplot's
# implicit current figure.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=results_df.index, y=results_df["R2 Score"], ax=ax)
ax.set_title("Model Performance Comparison (R² Score)")
ax.set_ylabel("R² Score")
ax.tick_params(axis="x", labelrotation=45)
# Keep the rotated tick labels inside the figure area.
fig.tight_layout()
plt.show()

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor # Use Regressor instead of Classifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
# Read the crop-yield CSV into a DataFrame.
# NOTE(review): the first cell of this notebook reads
# "/kaggle/input/crop-yield-pred/Dataset12.csv" -- confirm which input
# directory is the intended one.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/crop-yield/Dataset12.csv')
# Wrap the loaded frame in a second DataFrame object and print it as plain text.
de = pd.DataFrame(data=df)
print(de)






In [None]:
# Label-encode every text (object-dtype) feature column.
# The regression target is deliberately left out: if 'Yield' had been read as
# text, encoding it would replace the real yields with arbitrary integer codes
# and the later "Yield should be numerical" check could never fire.
categorical_columns = (
    df.select_dtypes(include=['object']).columns.drop('Yield', errors='ignore')
)

# Fit a separate encoder per column and keep each one, so the integer codes can
# be mapped back with label_encoders[col].inverse_transform(...). Refitting a
# single shared encoder would discard the mapping of every column but the last.
le = LabelEncoder()  # stays defined even when there is nothing to encode
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [None]:
# Feature matrix: the crop column plus the four weather measurements.
X = df[
    [
        'Crop',
        'Precipitation (mm day-1)',
        'Specific Humidity at 2 Meters (g/kg)',
        'Relative Humidity at 2 Meters (%)',
        'Temperature at 2 Meters (C)',
    ]
]
# Regression target.
y = df['Yield']

# The target must be numeric for regression. is_numeric_dtype also rejects
# string and category columns, which a plain `dtype == 'object'` comparison
# lets through.
if not pd.api.types.is_numeric_dtype(df['Yield']):
    raise ValueError("Yield should be a numerical column for regression. Check your dataset.")


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features: the scaler is fitted on the training rows only and
# the same fitted transform is then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Decision-tree regressor configuration; the model is only constructed here.
dt_model = DecisionTreeRegressor(
    random_state=42,
    max_depth=30,
    min_samples_split=3,
    min_samples_leaf=1,
)

In [None]:
# Fit the decision tree on the scaled training rows and predict the scaled test
# rows. fit() returns the estimator itself, so the prediction is chained onto it.
y_pred_dt = dt_model.fit(X_train_scaled, y_train).predict(X_test_scaled)


In [None]:
# Score the decision tree on the held-out rows: mean squared error and R².
mse_dt, r2_dt = (
    mean_squared_error(y_test, y_pred_dt),
    r2_score(y_test, y_pred_dt),
)
# Report both metrics, one per line, rounded to four decimals.
print(
    f"Decision Tree MSE: {mse_dt:.4f}",
    f"Decision Tree R² Score: {r2_dt:.4f}",
    sep="\n",
)

In [None]:
# Second model: a 100-tree random forest, tried here to reduce overfitting.
# It is fitted on the same scaled training rows as the decision tree.
rf_model = RandomForestRegressor(random_state=42, n_estimators=100).fit(
    X_train_scaled, y_train
)
# Predictions for the scaled held-out rows.
y_pred_rf = rf_model.predict(X_test_scaled)


In [None]:
# Score the random forest on the held-out rows: mean squared error and R².
mse_rf, r2_rf = (
    mean_squared_error(y_test, y_pred_rf),
    r2_score(y_test, y_pred_rf),
)
# Report both metrics, one per line, rounded to four decimals.
print(
    f"Random Forest MSE: {mse_rf:.4f}",
    f"Random Forest R² Score: {r2_rf:.4f}",
    sep="\n",
)

In [None]:
# Visualize Actual vs. Predicted Results
def plot_results(y_test, y_pred, model_name):
    """Scatter actual against predicted yields with a perfect-fit reference line.

    Parameters
    ----------
    y_test : array-like
        Observed target values (pandas Series, NumPy array or plain sequence).
    y_pred : array-like
        Model predictions, one per entry of ``y_test``.
    model_name : str
        Label inserted into the figure title.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.

    Raises
    ------
    ValueError
        If ``y_test`` or ``y_pred`` is empty.
    """
    # Coerce both inputs to flat arrays so Series, ndarrays and plain lists are
    # all handled the same way (lists have no .min()/.max()).
    actual = np.asarray(y_test).ravel()
    predicted = np.asarray(y_pred).ravel()
    if actual.size == 0 or predicted.size == 0:
        raise ValueError("plot_results needs non-empty y_test and y_pred.")

    # Common range of both series, used as the end points of the diagonal.
    # The nan-aware reductions ignore missing values in either input.
    combined = np.concatenate([actual, predicted])
    min_val = np.nanmin(combined)
    max_val = np.nanmax(combined)

    fig, ax = plt.subplots(figsize=(10, 6))

    # Scatter plot of actual vs predicted values.
    ax.scatter(actual, predicted, color='blue', alpha=0.5, label="Predictions")

    # Perfect-prediction reference line (y == x).
    ax.plot([min_val, max_val], [min_val, max_val], 'r--', lw=2, label="Perfect Fit")

    # Labels and title.
    ax.set_xlabel('Actual Yield')
    ax.set_ylabel('Predicted Yield')
    ax.set_title(f'Actual vs Predicted Crop Yield ({model_name})')

    # Same data scale on both axes so the reference line sits at 45 degrees.
    ax.axis("equal")
    ax.legend()

    fig.tight_layout()
    plt.show()


In [None]:
# Draw the actual-vs-predicted scatter for each fitted model in turn.
for model_label, model_predictions in (
    ("Decision Tree", y_pred_dt),
    ("Random Forest", y_pred_rf),
):
    plot_results(y_test, model_predictions, model_label)

In [None]:
# Large, high-resolution canvas for the tree diagram.
fig, ax = plt.subplots(figsize=(25, 20), dpi=300)

# Draw the fitted decision tree on that Axes.
# feature_names is passed as a plain list: some scikit-learn releases validate
# this parameter as a list and reject a pandas Index with InvalidParameterError.
# NOTE(review): dt_model is built with max_depth=30, so the full diagram may be
# too dense to read -- consider plot_tree's max_depth argument to show only the
# top levels.
plot_tree(
    dt_model,
    feature_names=list(X.columns),
    filled=True,
    rounded=True,
    fontsize=12,
    ax=ax,
)

# Title for the diagram.
ax.set_title("Decision Tree Structure for Crop Yield Prediction", color='blue', fontsize=25)

# Write the figure to disk before showing it.
fig.savefig("decision_tree.png", dpi=300, bbox_inches='tight')
plt.show()