In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found,
# one path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    # Directories without files are skipped so that no blank line is emitted.
    if filenames:
        print(*(os.path.join(dirname, filename) for filename in filenames), sep="\n")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor, plot_tree, export_graphviz
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import graphviz
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

# --- Load and inspect the data ---------------------------------------------
df = pd.read_csv("/kaggle/input/crop-yield/crop yield prediction.csv")
display(df.head())  # short preview instead of dumping 157 rows
df.info()           # prints its own report and returns None, so it is not passed to display()

print("\nMissing values:")
print(df.isnull().sum())

# --- Quick visual checks (last column is used as the target) ----------------
plt.figure(figsize=(10, 6))
sns.histplot(df.iloc[:, -1], kde=True, bins=30, color="blue")
plt.title("Crop Yield Distribution")
plt.show()

sns.pairplot(df.iloc[:, :5])
plt.show()

# --- Encode categorical columns ---------------------------------------------
# The fitted encoders are kept so the integer codes can be mapped back to labels.
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

# --- Features / target -------------------------------------------------------
target_col = df.columns[-1]
X = df.drop(columns=[target_col])
y = df[target_col]

# A plain list is accepted by plot_tree/export_graphviz on every scikit-learn
# version, whereas a pandas Index is rejected by some releases.
feature_names = list(X.columns)

# --- Split first, then scale -------------------------------------------------
# The scalers are fitted on the training rows only so that no statistic of the
# held-out rows leaks into training. The split itself depends only on the row
# count and random_state, so the same rows end up in each partition as before.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

y_scaler = MinMaxScaler()
y_train = y_scaler.fit_transform(y_train_raw.values.reshape(-1, 1)).flatten()
y_test = y_scaler.transform(y_test_raw.values.reshape(-1, 1)).flatten()

# --- Candidate models --------------------------------------------------------
models = {
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(objective='reg:squarederror', random_state=42),
    "Lasso": Lasso(alpha=1e-5, max_iter=50000),
    "KNN": KNeighborsRegressor(n_neighbors=5),
    "Decision Tree": DecisionTreeRegressor(max_depth=4, random_state=42)
}

# --- Train and evaluate (metrics are in the scaled target space) -------------
model_scores = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    model_scores[name] = {
        "MAE": mean_absolute_error(y_test, y_pred),
        "MSE": mean_squared_error(y_test, y_pred),
        "R2 Score": r2_score(y_test, y_pred)
    }

for name, scores in model_scores.items():
    print(f"{name} Performance:")
    for metric, value in scores.items():
        print(f"{metric}: {value:.4f}")
    print("-" * 30)

# --- Decision tree visualisation ---------------------------------------------
dt_model = models["Decision Tree"]
plt.figure(figsize=(30, 15), dpi=150)
plot_tree(dt_model, filled=True, feature_names=feature_names, fontsize=10)
plt.title("Decision Tree Visualization (max_depth=4)")
plt.show()

dot_data = export_graphviz(dt_model, out_file=None, feature_names=feature_names, filled=True, rounded=True, special_characters=True)
graph = graphviz.Source(dot_data)
graph.render("decision_tree")

# --- Persist the model with the highest test R2 ------------------------------
# NOTE: the pickled estimator was trained on features transformed by `scaler`
# and on a target transformed by `y_scaler`; it must be used with both.
best_model_name = max(model_scores, key=lambda x: model_scores[x]['R2 Score'])
best_model = models[best_model_name]
joblib.dump(best_model, "crop_yield_model.pkl")
print(f"Best model '{best_model_name}' saved as crop_yield_model.pkl")


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# --- Load and inspect the data ---------------------------------------------
df = pd.read_csv('/kaggle/input/crop-yield/crop yield prediction.csv')
print("First 5 rows of the dataset:")
display(df.head())
print("\nDataset Info:")
df.info()
print("\nMissing Values:")
print(df.isnull().sum())

# --- Encode every object column as integer codes ----------------------------
# The fitted encoders are kept so the codes can be mapped back to labels.
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# --- Features / target -------------------------------------------------------
# Built once, after all columns are encoded. This assignment used to sit inside
# the loop above, so X was rebuilt on every iteration and was never defined
# when the frame had no object columns.
X = df.drop('Crop', axis=1)
y = df['Crop']

# --- Scale, split, train -----------------------------------------------------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# --- Evaluate on the held-out rows -------------------------------------------
y_pred = model.predict(X_test)
print("\nAccuracy Score:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
# Read the crop-yield CSV and print a quick overview of it:
# a five-row preview, the column/dtype summary and the per-column null counts.
df = pd.read_csv('/kaggle/input/crop-yield/crop yield prediction.csv')

print("First 5 rows of the dataset:")
display(df.head(n=5))

print("\nDataset Info:")
df.info()

missing_per_column = df.isna().sum()
print("\nMissing Values:", missing_per_column, sep="\n")

/**
 * Immutable value object describing one crop observation: the crop name,
 * four weather measurements and the recorded yield.
 */
public class CropData {
    private final String crop;
    private final double precipitation;
    private final double humidity;
    private final double specificHumidity;
    private final double temperature;
    // Stored as yieldValue to avoid the `yield` contextual keyword used by
    // switch expressions (Java 14+); the public getter keeps its name.
    private final int yieldValue;

    /**
     * Creates an observation from its six values.
     *
     * @param crop             crop name
     * @param precipitation    precipitation measurement
     * @param humidity         humidity measurement
     * @param specificHumidity specific-humidity measurement
     * @param temperature      temperature measurement
     * @param yieldValue       recorded yield
     */
    public CropData(String crop, double precipitation, double humidity, double specificHumidity, double temperature, int yieldValue) {
        this.crop = crop;
        this.precipitation = precipitation;
        this.humidity = humidity;
        this.specificHumidity = specificHumidity;
        this.temperature = temperature;
        this.yieldValue = yieldValue;
    }

    /** @return the crop name */
    public String getCrop() {
        return crop;
    }

    /** @return the precipitation measurement */
    public double getPrecipitation() {
        return precipitation;
    }

    /** @return the humidity measurement */
    public double getHumidity() {
        return humidity;
    }

    /** @return the specific-humidity measurement */
    public double getSpecificHumidity() {
        return specificHumidity;
    }

    /** @return the temperature measurement */
    public double getTemperature() {
        return temperature;
    }

    /** @return the recorded yield */
    public int getYield() {
        return yieldValue;
    }

    /** Builds one sample observation and prints every field. */
    public static void main(String[] args) {
        CropData crop1 = new CropData("Cocoa, beans", 2248.92, 83.4, 17.72, 26.01, 11560);
        System.out.println("Crop: " + crop1.getCrop());
        System.out.println("Precipitation: " + crop1.getPrecipitation());
        System.out.println("Humidity: " + crop1.getHumidity());
        System.out.println("Specific Humidity: " + crop1.getSpecificHumidity());
        System.out.println("Temperature: " + crop1.getTemperature());
        System.out.println("Yield: " + crop1.getYield());
    }
}


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Read the crop-yield data
df = pd.read_csv('/kaggle/input/crop-yield/crop yield prediction.csv')

print("First 5 rows of the dataset:")
display(df.head(n=5))

print("\nDataset Info:")
df.info()

print("\nMissing Values:", df.isna().sum(), sep="\n")

# Discard any incomplete rows
df = df.dropna()

# Replace the crop name with its integer category code when the column exists
if 'Crop' in df.columns:
    df = df.assign(Crop=df['Crop'].astype('category').cat.codes)

# Target is the Yield column; every other column is a feature
target = df['Yield']
features = df.drop(columns=['Yield'])

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Fit the forest on the training rows
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Score the held-out rows
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"\nMean Squared Error: {mse:.2f}")
/**
 * Holds one crop observation (crop name, four weather measurements and the
 * recorded yield) and exposes each value through a getter.
 */
public class CropData {
    private final String crop;
    private final double precipitation;
    private final double humidity;
    private final double specificHumidity;
    private final double temperature;
    private final int yieldValue;

    /** Stores the six supplied values unchanged. */
    public CropData(String crop, double precipitation, double humidity, double specificHumidity, double temperature, int yieldValue) {
        this.crop = crop;
        this.precipitation = precipitation;
        this.humidity = humidity;
        this.specificHumidity = specificHumidity;
        this.temperature = temperature;
        this.yieldValue = yieldValue;
    }

    /** @return the crop name */
    public String getCrop() { return this.crop; }

    /** @return the precipitation value */
    public double getPrecipitation() { return this.precipitation; }

    /** @return the humidity value */
    public double getHumidity() { return this.humidity; }

    /** @return the specific-humidity value */
    public double getSpecificHumidity() { return this.specificHumidity; }

    /** @return the temperature value */
    public double getTemperature() { return this.temperature; }

    /** @return the recorded yield */
    public int getYieldValue() { return this.yieldValue; }

    /** Prints a single "label: value" line to standard output. */
    private static void printField(String label, Object value) {
        System.out.println(label + ": " + value);
    }

    /** Creates one sample observation and prints all of its fields. */
    public static void main(String[] args) {
        CropData crop1 = new CropData("Cocoa, beans", 2248.92, 83.4, 17.72, 26.01, 11560);
        printField("Crop", crop1.getCrop());
        printField("Precipitation", crop1.getPrecipitation());
        printField("Humidity", crop1.getHumidity());
        printField("Specific Humidity", crop1.getSpecificHumidity());
        printField("Temperature", crop1.getTemperature());
        printField("Yield", crop1.getYieldValue());
    }
}


In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score

# Load dataset. The original placeholder "your_dataset.csv" does not exist;
# this is the Kaggle input file the other cells of this notebook read.
df = pd.read_csv("/kaggle/input/crop-yield/crop yield prediction.csv")

# Target is the Yield column; every other column is a feature.
X = df.drop("Yield", axis=1)
y = df["Yield"]

categorical = ["Crop"]
numerical = X.columns.difference(categorical)  # informational; these columns pass through below

# One-hot encode Crop and pass every other column through unchanged.
# handle_unknown="ignore" keeps transform from raising when a crop appears
# that was not present in the rows the encoder was fitted on.
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical)
], remainder="passthrough")

# Preprocessing lives inside the pipeline, so it is fitted on training rows only
# and is saved together with the regressor.
model = Pipeline([
    ("pre", preprocessor),
    ("rf", RandomForestRegressor(random_state=42))
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = model.predict(X_test)
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R²:", r2_score(y_test, y_pred))

# Persist the whole pipeline (encoder + forest).
joblib.dump(model, "crop_yield_model.pkl")


In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset from the Kaggle input directory
df = pd.read_csv("/kaggle/input/crop-yield/crop yield prediction.csv")

# Separate features and target
X = df.drop("Yield", axis=1)
y = df["Yield"]

# Preprocessing: one-hot encode the 'Crop' column, pass the rest through
categorical = ["Crop"]
numerical = X.columns.difference(categorical)  # informational; these columns pass through below

# handle_unknown="ignore" keeps transform from raising when a crop appears
# that was not present in the rows the encoder was fitted on.
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical)
], remainder="passthrough")

# Build a machine learning pipeline so the encoder is fitted on training rows
# only and is saved together with the regressor
model = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(n_estimators=100, random_state=42))
])

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Evaluate the model on the held-out rows
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("✅ Model trained.")
print(f"📉 RMSE: {rmse:.2f}")
print(f"📈 R² Score: {r2:.3f}")

# Save the whole pipeline (encoder + forest)
joblib.dump(model, "crop_yield_model.pkl")
print("💾 Model saved to 'crop_yield_model.pkl'")


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings
import graphviz

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor, plot_tree, export_graphviz
from xgboost import XGBRegressor

warnings.simplefilter(action='ignore', category=FutureWarning)

# === Load Dataset ===
df = pd.read_csv("/kaggle/input/crop-yield/crop yield prediction.csv")

# === Inspect Data ===
print(df.head())
df.info()  # prints its own report and returns None, so it is not wrapped in print()
print("\nMissing values:\n", df.isnull().sum())

# === Visualize Target Variable (last column) ===
plt.figure(figsize=(10, 6))
sns.histplot(df.iloc[:, -1], kde=True, bins=30, color="skyblue")
plt.title("Crop Yield Distribution")
plt.show()

# === Pair Plot ===
sns.pairplot(df.iloc[:, :5])
plt.show()

# === Encode Categorical Features ===
# The fitted encoders are kept so the integer codes can be mapped back to labels.
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

# === Split Features and Target ===
target_col = df.columns[-1]
X = df.drop(columns=[target_col])
y = df[target_col]

# A plain list is accepted by plot_tree/export_graphviz on every scikit-learn
# version, whereas a pandas Index is rejected by some releases.
feature_names = list(X.columns)

# === Split Train/Test, Then Scale ===
# The scalers are fitted on the training rows only so that no statistic of the
# held-out rows leaks into training. The split depends only on the row count
# and random_state, so the same rows land in each partition as before.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

y_scaler = MinMaxScaler()
y_train = y_scaler.fit_transform(y_train_raw.values.reshape(-1, 1)).flatten()
y_test = y_scaler.transform(y_test_raw.values.reshape(-1, 1)).flatten()

# Whole-dataset arrays under their previous names, now expressed with the
# training-set statistics.
X_scaled = scaler.transform(X)
y_scaled = y_scaler.transform(y.values.reshape(-1, 1)).flatten()

# === Models ===
models = {
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(objective='reg:squarederror', random_state=42),
    "Lasso": Lasso(alpha=1e-5, max_iter=50000),
    "KNN": KNeighborsRegressor(n_neighbors=5),
    "Decision Tree": DecisionTreeRegressor(max_depth=4, random_state=42)
}

model_scores = {}

# === Train and Evaluate Models (metrics are in the scaled target space) ===
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    model_scores[name] = {
        "MAE": mean_absolute_error(y_test, y_pred),
        "MSE": mean_squared_error(y_test, y_pred),
        "R2 Score": r2_score(y_test, y_pred)
    }

# === Show Results ===
for name, scores in model_scores.items():
    print(f"\n{name} Performance:")
    for metric, value in scores.items():
        print(f"{metric}: {value:.4f}")
    print("-" * 30)

# === Visualize Decision Tree ===
dt_model = models["Decision Tree"]
plt.figure(figsize=(30, 15), dpi=150)
plot_tree(dt_model, filled=True, feature_names=feature_names, fontsize=10)
plt.title("Decision Tree Visualization (max_depth=4)")
plt.show()

# Export to Graphviz
dot_data = export_graphviz(
    dt_model, out_file=None,
    feature_names=feature_names,
    filled=True, rounded=True,
    special_characters=True
)
graph = graphviz.Source(dot_data)
graph.render("decision_tree")  # Saves as decision_tree.pdf

# === Save the Best Model (highest test R2) ===
# NOTE: the pickled estimator was trained on features transformed by `scaler`
# and on a target transformed by `y_scaler`; it must be used with both.
best_model_name = max(model_scores, key=lambda x: model_scores[x]['R2 Score'])
best_model = models[best_model_name]
joblib.dump(best_model, "crop_yield_model.pkl")
print(f"\n✅ Best model '{best_model_name}' saved as crop_yield_model.pkl")
# Sample rows showing the CSV layout (kept as a comment so the cell parses):
# Crop,Precipitation (mm day-1),Specific Humidity at 2 Meters (g/kg),Relative Humidity at 2 Meters (%),Temperature at 2 Meters (C),Yield
# Maize,4.1,11.3,80.2,25.0,3150
# Rice,5.6,12.5,82.1,26.2,3400
# ...

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings
import graphviz

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor, plot_tree, export_graphviz
from xgboost import XGBRegressor

warnings.simplefilter(action='ignore', category=FutureWarning)

# === Load Dataset ===
df = pd.read_csv("/kaggle/input/crop-yield/crop yield prediction.csv")

# === Inspect Data ===
print(df.head())
df.info()  # prints its own report and returns None, so it is not wrapped in print()
print("\nMissing values:\n", df.isnull().sum())

# === Visualize Target Variable (last column) ===
plt.figure(figsize=(10, 6))
sns.histplot(df.iloc[:, -1], kde=True, bins=30, color="skyblue")
plt.title("Crop Yield Distribution")
plt.show()

# === Pair Plot ===
sns.pairplot(df.iloc[:, :5])
plt.show()

# === Encode Categorical Features ===
# The fitted encoders are kept so the integer codes can be mapped back to labels.
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

# === Split Features and Target ===
target_col = df.columns[-1]
X = df.drop(columns=[target_col])
y = df[target_col]

# A plain list is accepted by plot_tree/export_graphviz on every scikit-learn
# version, whereas a pandas Index is rejected by some releases.
feature_names = list(X.columns)

# === Split Train/Test, Then Scale ===
# The scalers are fitted on the training rows only so that no statistic of the
# held-out rows leaks into training. The split depends only on the row count
# and random_state, so the same rows land in each partition as before.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

y_scaler = MinMaxScaler()
y_train = y_scaler.fit_transform(y_train_raw.values.reshape(-1, 1)).flatten()
y_test = y_scaler.transform(y_test_raw.values.reshape(-1, 1)).flatten()

# Whole-dataset arrays under their previous names, now expressed with the
# training-set statistics.
X_scaled = scaler.transform(X)
y_scaled = y_scaler.transform(y.values.reshape(-1, 1)).flatten()

# === Models ===
models = {
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(objective='reg:squarederror', random_state=42),
    "Lasso": Lasso(alpha=1e-5, max_iter=50000),
    "KNN": KNeighborsRegressor(n_neighbors=5),
    "Decision Tree": DecisionTreeRegressor(max_depth=4, random_state=42)
}

model_scores = {}

# === Train and Evaluate Models (metrics are in the scaled target space) ===
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    model_scores[name] = {
        "MAE": mean_absolute_error(y_test, y_pred),
        "MSE": mean_squared_error(y_test, y_pred),
        "R2 Score": r2_score(y_test, y_pred)
    }

# === Show Results ===
for name, scores in model_scores.items():
    print(f"\n{name} Performance:")
    for metric, value in scores.items():
        print(f"{metric}: {value:.4f}")
    print("-" * 30)

# === Visualize Decision Tree ===
dt_model = models["Decision Tree"]
plt.figure(figsize=(30, 15), dpi=150)
plot_tree(dt_model, filled=True, feature_names=feature_names, fontsize=10)
plt.title("Decision Tree Visualization (max_depth=4)")
plt.show()

# Export to Graphviz
dot_data = export_graphviz(
    dt_model, out_file=None,
    feature_names=feature_names,
    filled=True, rounded=True,
    special_characters=True
)
graph = graphviz.Source(dot_data)
graph.render("decision_tree")  # Saves as decision_tree.pdf

# === Save the Best Model (highest test R2) ===
# NOTE: the pickled estimator was trained on features transformed by `scaler`
# and on a target transformed by `y_scaler`; it must be used with both.
best_model_name = max(model_scores, key=lambda x: model_scores[x]['R2 Score'])
best_model = models[best_model_name]
joblib.dump(best_model, "crop_yield_model.pkl")
print(f"\n✅ Best model '{best_model_name}' saved as crop_yield_model.pkl")
# Sample rows showing the CSV layout (kept as a comment so the cell parses):
# Crop,Precipitation (mm day-1),Specific Humidity at 2 Meters (g/kg),Relative Humidity at 2 Meters (%),Temperature at 2 Meters (C),Yield
# Maize,4.1,11.3,80.2,25.0,3150
# Rice,5.6,12.5,82.1,26.2,3400
# ...