<h2><b> Step: 1</b>- Install Required Packages  </h2>

In [29]:
%pip install pandas scikit-learn matplotlib seaborn ipywidgets
%pip install pandas numpy scikit-learn matplotlib seaborn plotly ipywidgets








Note: you may need to restart the kernel to use updated packages.




<h2><b> Step: 2</b>-  Import All Required Libraries
</h2>

In [30]:
# --- Standard library ---
import warnings

# --- Data handling and modelling ---
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

# --- Plotting ---
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns
from plotly.subplots import make_subplots

# --- Notebook widgets and rich display ---
# Each module is imported exactly once; the names made available are the same
# as before (duplicate import statements were removed).
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output
from ipywidgets import interact, interactive, fixed, interact_manual

# Configure Plotly for VS Code
pio.renderers.default = "plotly_mimetype+notebook"

# Silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

print("All libraries imported successfully!")



All libraries imported successfully!


<h2><b> Step: 3</b>-  Load and Prepare the Data
</h2>

In [31]:
# Build a synthetic dataset. The global NumPy seed is fixed and the random
# draws below always happen in the same order, so the values are reproducible.
np.random.seed(42)
n_samples = 1000

# (column, low, high) for the uniformly distributed inputs that are drawn
# before the irrigation column; the list order is the draw order.
leading_uniform_ranges = [
    ('temperature', 15, 35),
    ('rainfall', 0, 100),
    ('humidity', 30, 90),
    ('soil_ph', 4.5, 8.5),
    ('nitrogen', 5, 50),
    ('phosphorus', 0, 40),
    ('potassium', 100, 300),
    ('organic_matter', 1, 5),
]
data = {
    column: np.random.uniform(low, high, n_samples)
    for column, low, high in leading_uniform_ranges
}
data['irrigation_frequency'] = np.random.randint(1, 10, n_samples)
data['fertilizer_usage'] = np.random.uniform(10, 50, n_samples)
data['crop_type'] = np.random.choice(['Wheat', 'Corn', 'Soybeans', 'Rice', 'Barley'], n_samples)

# The target is a weighted sum of the numeric inputs (terms added in list
# order) plus Gaussian noise with mean 0 and standard deviation 5.
yield_weights = [
    ('temperature', 0.5),
    ('rainfall', 0.3),
    ('humidity', 0.2),
    ('soil_ph', 0.4),
    ('nitrogen', 0.6),
    ('phosphorus', 0.5),
    ('potassium', 0.4),
    ('organic_matter', 0.7),
    ('irrigation_frequency', 0.3),
    ('fertilizer_usage', 0.4),
]
yield_signal = sum(weight * data[column] for column, weight in yield_weights)
data['crop_yield'] = yield_signal + np.random.normal(0, 5, n_samples)

df = pd.DataFrame(data)

print("Sample data created successfully")
print(f"Dataset shape: {df.shape}")
display(df.head())

Sample data created successfully
Dataset shape: (1000, 12)


Unnamed: 0,temperature,rainfall,humidity,soil_ph,nitrogen,phosphorus,potassium,organic_matter,irrigation_frequency,fertilizer_usage,crop_type,crop_yield
0,22.490802,18.513293,45.702341,7.190812,30.739815,15.745421,229.651391,1.155198,2,14.938543,Wheat,159.253455
1,34.014286,54.190095,44.818728,7.686726,41.244455,18.937426,134.477272,1.74709,1,49.202078,Rice,150.357872
2,29.639879,87.294584,84.375275,5.501872,39.207242,34.181896,274.478913,4.324983,8,25.748767,Rice,223.144923
3,26.97317,73.222489,44.972772,6.999496,11.925496,13.600175,222.623248,4.067073,9,19.096869,Wheat,163.428837
4,18.120373,80.656115,46.316984,6.786984,11.716226,34.785987,131.440777,2.402571,7,16.852982,Wheat,128.51761


<h2><b> Step: 4</b>-  Data Preprocessing
</h2>

In [32]:
# Floor phosphorus at zero. Series.clip is vectorised, so it replaces the
# per-row Python lambda; unlike max(0, x) it leaves missing values as NaN
# instead of silently turning them into 0.
df['phosphorus'] = df['phosphorus'].clip(lower=0)

# Encode crop_type as integer codes; `le` is kept so that the codes can be
# mapped back to (and from) the crop names later in the notebook.
le = LabelEncoder()
df['crop_type_encoded'] = le.fit_transform(df['crop_type'])

print("Data preprocessing completed")
print(f"Crop types: {list(le.classes_)}")

Data preprocessing completed
Crop types: ['Barley', 'Corn', 'Rice', 'Soybeans', 'Wheat']


<h2><b> Step: 5</b>-  Prepare Features and Target
</h2>

In [33]:
# Safety net for out-of-order execution: create the encoded crop column if
# the preprocessing cell has not produced it yet. LabelEncoder is already
# imported in the imports cell, so no local import is needed here.
if 'crop_type_encoded' not in df.columns:
    le = LabelEncoder()
    df['crop_type_encoded'] = le.fit_transform(df['crop_type'])
    print("Created crop_type_encoded column")

# Model inputs (ten numeric measurements plus the encoded crop type) and the
# regression target. This is defined once; the cell previously repeated the
# same assignments and message a second time.
features = ['temperature', 'rainfall', 'humidity', 'soil_ph', 'nitrogen',
            'phosphorus', 'potassium', 'organic_matter', 'irrigation_frequency',
            'fertilizer_usage', 'crop_type_encoded']
X = df[features]
y = df['crop_yield']

print("Features and target prepared")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"Features used: {features}")

Features and target prepared
X shape: (1000, 11)
y shape: (1000,)
Features used: ['temperature', 'rainfall', 'humidity', 'soil_ph', 'nitrogen', 'phosphorus', 'potassium', 'organic_matter', 'irrigation_frequency', 'fertilizer_usage', 'crop_type_encoded']
Features and target prepared


<h2><b> Step: 6</b>-   Split Data and Scale Features
</h2>

In [34]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# fitted scaler to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Data split and scaled")

Data split and scaled


<h2><b> Step: 7</b>-  Train the Model
</h2>

In [35]:
# Candidate regressors; the fixed random_state values keep training reproducible.
models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42)
}

# name -> fitted model plus its hold-out and cross-validation metrics
model_performance = {}

for name, model in models.items():
    # Fit on the training split and score once on the held-out test split
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # 5-fold cross-validated R² computed on the training split only
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='r2')

    model_performance[name] = {
        'model': model,
        'mse': mse,
        'rmse': rmse,
        'mae': mae,
        'r2': r2,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std()
    }

    print(f"{name} Performance:")
    print(f"  MSE: {mse:.4f}, RMSE: {rmse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")
    print(f"  Cross-validation R²: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")
    print()

# Select the best model by its cross-validated R² rather than by test-set R²:
# choosing on the test split would let the held-out data influence model
# selection and make the reported test metrics optimistic.
best_model_name = max(model_performance, key=lambda x: model_performance[x]['cv_mean'])
best_model = model_performance[best_model_name]['model']
print(f"Best model: {best_model_name}")

# Feature importance of the selected model, most important first
feature_importance = pd.DataFrame({
    'feature': features,
    'importance': best_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nFeature Importance:")
display(feature_importance)

Random Forest Performance:
  MSE: 75.2019, RMSE: 8.6719, MAE: 6.9862, R²: 0.9087
  Cross-validation R²: 0.9049 (±0.0129)

Gradient Boosting Performance:
  MSE: 47.8013, RMSE: 6.9138, MAE: 5.5993, R²: 0.9420
  Cross-validation R²: 0.9413 (±0.0055)

Best model: Gradient Boosting

Feature Importance:


Unnamed: 0,feature,importance
6,potassium,0.728636
1,rainfall,0.104886
4,nitrogen,0.082664
5,phosphorus,0.035511
9,fertilizer_usage,0.022335
2,humidity,0.014319
0,temperature,0.008337
3,soil_ph,0.001118
7,organic_matter,0.001081
8,irrigation_frequency,0.000939


<h2><b> Step: 8</b>-  Create the Tab Interface
</h2>

In [36]:
import ipywidgets as widgets
from IPython.display import display

# One Output widget per tab; later cells render their content into these.
output_prediction = widgets.Output()
output_analysis = widgets.Output()
output_visualization = widgets.Output()
output_about = widgets.Output()

# Build the tab container around the four output areas, then label each tab
# in the same left-to-right order.
tab = widgets.Tab(
    children=[output_prediction, output_analysis, output_visualization, output_about]
)
tab_titles = ['Yield Prediction', 'Data Analysis', 'Visualizations', 'About']
for tab_index, tab_title in enumerate(tab_titles):
    tab.set_title(tab_index, tab_title)

# Show the (still empty) tab container
display(tab)

Tab(children=(Output(), Output(), Output(), Output()), selected_index=0, titles=('Yield Prediction', 'Data Ana…

<h2><b> Step: 9</b>-   Create Prediction Tab Content
</h2>

In [37]:
def _float_field(label, default):
    """Return a FloatText input with the given label and starting value."""
    return widgets.FloatText(
        value=default,
        description=label,
        style={'description_width': 'initial'},
    )


with output_prediction:
    # Heading and short instruction shown above the form
    form_title = widgets.HTML("<h1 style='text-align: center; color: #2E86AB; margin-bottom: 20px;'>🌱 Agricultural Yield Predictor</h1>")
    form_description = widgets.HTML("<p style='text-align: center; color: #6c757d;'>Enter the agricultural parameters to predict crop yield</p>")

    # Left column. The prediction handler reads these widgets by position,
    # so the order of the children matters.
    left_col = widgets.VBox([
        _float_field("Temperature (°C):", 25.0),
        _float_field("Rainfall (mm):", 50.0),
        _float_field("Humidity (%):", 60.0),
        _float_field("Soil pH:", 6.5),
        _float_field("Nitrogen (kg/ha):", 20.0),
    ])

    # Right column (also read by position); irrigation is the only integer input.
    right_col = widgets.VBox([
        _float_field("Phosphorus (kg/ha):", 15.0),
        _float_field("Potassium (kg/ha):", 200.0),
        _float_field("Organic Matter (%):", 2.5),
        widgets.IntText(value=4, description="Irrigation Frequency:", style={'description_width': 'initial'}),
        _float_field("Fertilizer Usage (kg/ha):", 20.0),
    ])

    # Crop selector, populated from the classes seen by the label encoder
    crop_type = widgets.Dropdown(
        options=le.classes_.tolist(),
        value='Wheat',
        description='Crop Type:',
        style={'description_width': 'initial'},
    )

    # Button that triggers the prediction; its click handler is attached in a later cell
    button_layout = widgets.Layout(width='200px', height='40px')
    predict_btn = widgets.Button(
        description="Predict Yield",
        button_style='success',
        icon='calculator',
        layout=button_layout,
    )

    # Area the prediction result is rendered into
    result_display = widgets.Output()

    # Assemble the form: two input columns side by side, then the crop
    # selector, the centred button and the result area.
    form_columns = widgets.HBox([left_col, right_col])
    centred_button = widgets.HBox([predict_btn], layout=widgets.Layout(justify_content='center'))
    form = widgets.VBox([
        form_title,
        form_description,
        form_columns,
        crop_type,
        centred_button,
        result_display,
    ])

    display(form)

<h2><b> Step: 10</b>-   Create Analysis Tab Content
</h2>

In [38]:
with output_analysis:
    analysis_title = widgets.HTML("<h2 style='color: #2E86AB;'>Dataset Analysis</h2>")

    # One row per trained model with its headline metrics
    perf_columns = ['Model', 'R² Score', 'RMSE', 'CV Score (Mean)']
    perf_df = pd.DataFrame(
        [
            {
                'Model': model_name,
                'R² Score': stats['r2'],
                'RMSE': stats['rmse'],
                'CV Score (Mean)': stats['cv_mean'],
            }
            for model_name, stats in model_performance.items()
        ],
        columns=perf_columns,
    )

    # Horizontal bar chart of the selected model's feature importances
    importance_bar = go.Bar(
        x=feature_importance['importance'],
        y=feature_importance['feature'],
        orientation='h',
        marker_color='#2E86AB',
    )
    feature_fig = go.Figure(importance_bar)
    layout_options = {
        'title': 'Feature Importance',
        'xaxis_title': 'Importance',
        'yaxis_title': 'Features',
        'height': 400,
    }
    feature_fig.update_layout(**layout_options)

    # Render the pieces in reading order: title, metrics table, importance chart
    analysis_items = [
        analysis_title,
        widgets.HTML("<h3>Model Performance Comparison</h3>"),
        perf_df,
        widgets.HTML("<h3>Feature Importance</h3>"),
        feature_fig,
    ]
    for item in analysis_items:
        display(item)

<h2><b> Step: 11</b>-   Create Visualization Tab Content
</h2>

In [39]:
with output_visualization:
    viz_title = widgets.HTML("<h2 style='color: #2E86AB;'>Data Visualizations</h2>")
    display(viz_title)

    # Keep only the numeric columns for the correlation matrix.
    # NOTE(review): this still includes the label-encoded crop_type_encoded
    # column, whose correlation values are not meaningful — confirm whether it
    # should be dropped.
    numerical_df = df.select_dtypes(include=[np.number])

    def show_correlation_heatmap():
        """Render the correlation matrix of the numeric columns as a heatmap."""
        corr_matrix = numerical_df.corr()
        fig = px.imshow(corr_matrix,
                       text_auto=True,
                       aspect="auto",
                       color_continuous_scale='RdBu_r',
                       title='Correlation Matrix (Numerical Features Only)',
                       height=600)
        fig.show()

    # The heatmap has no parameters, so it is called directly instead of being
    # wrapped in @interact (which would only add an empty control container).
    show_correlation_heatmap()

    # Distribution of crop yield by crop type
    display(widgets.HTML("<h3>Crop Yield Distribution</h3>"))

    yield_fig = px.box(df, x='crop_type', y='crop_yield',
                     title='Crop Yield Distribution by Crop Type',
                     color='crop_type')
    yield_fig.show()

    # Interactive scatter plot with only numerical features
    display(widgets.HTML("<h3>Interactive Scatter Plot</h3>"))

    # The encoded crop type is a category code, so it is not offered as an axis
    numerical_features = [f for f in features if f != 'crop_type_encoded']

    @interact
    def show_scatter_plot(
        x_feature=widgets.Dropdown(options=numerical_features, value='temperature', description='X-axis:'),
        y_feature=widgets.Dropdown(options=numerical_features, value='rainfall', description='Y-axis:'),
        color_by=widgets.Dropdown(options=['crop_type', 'crop_yield'], value='crop_type', description='Color by:')
    ):
        """Redraw the scatter plot whenever one of the three dropdowns changes."""
        fig = px.scatter(df, x=x_feature, y=y_feature, color=color_by,
                         title=f'{y_feature} vs {x_feature}',
                         hover_data=['crop_type', 'crop_yield'],
                         height=500)
        fig.show()

    # Additional visualization: Pair plot for numerical features
    display(widgets.HTML("<h3>Additional Visualizations</h3>"))

    pair_plot_btn = widgets.Button(description="Show Pair Plot", button_style='info')
    pair_plot_output = widgets.Output()

    def on_pair_plot_click(b):
        """Draw a scatter matrix of a fixed 100-row sample into pair_plot_output."""
        with pair_plot_output:
            clear_output()
            # Sample at most 100 rows to keep the plot readable; the fixed
            # random_state makes every click (and every run) show the same rows.
            sample_df = df.sample(min(100, len(df)), random_state=42)
            fig = px.scatter_matrix(sample_df,
                                   dimensions=numerical_features[:5],  # Limit to first 5 features for clarity
                                   color="crop_type",
                                   title="Pair Plot of Numerical Features (Sampled)",
                                   height=600)
            fig.show()

    pair_plot_btn.on_click(on_pair_plot_click)

    display(widgets.VBox([pair_plot_btn, pair_plot_output]))

<h2><b> Step: 12</b>-   Create About Tab Content
</h2>

In [40]:
with output_about:
    # Static description of the application: inputs, models and metrics
    about_html = """
    <div style="line-height: 1.6;">
        <h3>Agricultural Yield Prediction Model</h3>
        <p>This application uses machine learning to predict crop yields based on various agricultural and environmental factors.</p>
        
        <h4>Features Used:</h4>
        <ul>
            <li>Temperature (°C)</li>
            <li>Rainfall (mm)</li>
            <li>Humidity (%)</li>
            <li>Soil pH</li>
            <li>Nitrogen content (kg/ha)</li>
            <li>Phosphorus content (kg/ha)</li>
            <li>Potassium content (kg/ha)</li>
            <li>Organic matter (%)</li>
            <li>Irrigation frequency</li>
            <li>Fertilizer usage (kg/ha)</li>
            <li>Crop type</li>
        </ul>
        
        <h4>Models Implemented:</h4>
        <ul>
            <li>Random Forest Regressor</li>
            <li>Gradient Boosting Regressor</li>
        </ul>
        
        <p>The best performing model is automatically selected for predictions.</p>
        
        <h4>Evaluation Metrics:</h4>
        <ul>
            <li>R² Score: Measures how well the model explains the variance in the data</li>
            <li>Root Mean Squared Error (RMSE): Measures the average prediction error</li>
            <li>Mean Absolute Error (MAE): Measures the average absolute prediction error</li>
        </ul>
    </div>
    """

    about_title = widgets.HTML("<h2 style='color: #2E86AB;'>About This Application</h2>")
    about_content = widgets.HTML(about_html)

    # Heading first, then the body text
    display(about_title, about_content)

<h2><b> Step: 13</b>-   Prediction Function
</h2>

In [41]:
# Prediction function
def on_predict_click(b):
    """Predict the yield for the values currently in the form and render it.

    Reads the ten numeric inputs from ``left_col`` / ``right_col`` and the crop
    from ``crop_type``, scales them with ``scaler``, predicts with
    ``best_model`` and writes two HTML panels into ``result_display``: the
    prediction with an approximate interval, and a comparison with the mean
    yield of the selected crop in ``df``.

    Parameters
    ----------
    b : the argument ipywidgets passes to click handlers; not used.
    """
    with result_display:
        clear_output()

        # Read every widget once, in the order the two columns were built
        temperature, rainfall, humidity, soil_ph, nitrogen = (
            field.value for field in left_col.children
        )
        phosphorus, potassium, organic_matter, irrigation_frequency, fertilizer_usage = (
            field.value for field in right_col.children
        )
        selected_crop = crop_type.value

        # Build a one-row frame with the training column names so the scaler
        # receives the same feature layout it was fitted on.
        input_frame = pd.DataFrame(
            [[
                temperature,
                rainfall,
                humidity,
                soil_ph,
                nitrogen,
                phosphorus,
                potassium,
                organic_matter,
                irrigation_frequency,
                fertilizer_usage,
                le.transform([selected_crop])[0],  # crop_type_encoded
            ]],
            columns=features,
        )

        # Scale and predict
        input_scaled = scaler.transform(input_frame)
        prediction = best_model.predict(input_scaled)[0]

        # Approximate 95% prediction interval: +/- 1.96 x the selected model's
        # hold-out RMSE (assumes roughly normal residuals). The previous
        # formula, 1.96 * std(y_test) / sqrt(n), is the standard error of the
        # mean of the test targets and says nothing about the uncertainty of a
        # single prediction.
        residual_rmse = model_performance[best_model_name]['rmse']
        margin_of_error = 1.96 * residual_rmse
        lower_bound = prediction - margin_of_error
        upper_bound = prediction + margin_of_error

        # Display results with styling
        display(HTML(f"""
        <div style="border: 2px solid #2E86AB; padding: 20px; border-radius: 10px; background-color: #f8f9fa; margin-top: 20px;">
            <h2 style="color: #2E86AB; text-align: center;">Prediction Result</h2>
            <div style="text-align: center; margin: 20px 0;">
                <p style="font-size: 16px; margin-bottom: 5px;">Predicted Yield for {selected_crop}</p>
                <h1 style="color: #2E86AB; font-size: 36px; margin: 10px 0;">{prediction:.2f} tons/hectare</h1>
                <p style="color: #6c757d; font-size: 14px;">Approx. 95% prediction interval: {lower_bound:.2f} - {upper_bound:.2f} tons/hectare</p>
            </div>
            
            <div style="display: flex; justify-content: space-between; margin-top: 20px;">
                <div style="flex: 1; padding: 10px; background-color: #e9f5ff; border-radius: 5px; margin-right: 10px;">
                    <h4 style="color: #2E86AB; margin-top: 0;">Environmental Factors</h4>
                    <p>Temperature: {temperature} °C</p>
                    <p>Rainfall: {rainfall} mm</p>
                    <p>Humidity: {humidity}%</p>
                    <p>Soil pH: {soil_ph}</p>
                </div>
                
                <div style="flex: 1; padding: 10px; background-color: #e9f5ff; border-radius: 5px;">
                    <h4 style="color: #2E86AB; margin-top: 0;">Soil Nutrients & Practices</h4>
                    <p>Nitrogen: {nitrogen} kg/ha</p>
                    <p>Phosphorus: {phosphorus} kg/ha</p>
                    <p>Potassium: {potassium} kg/ha</p>
                    <p>Organic Matter: {organic_matter}%</p>
                    <p>Irrigation: {irrigation_frequency}</p>
                    <p>Fertilizer: {fertilizer_usage} kg/ha</p>
                </div>
            </div>
        </div>
        """))

        # Show comparison with average yield for this crop type
        avg_yield = df[df['crop_type'] == selected_crop]['crop_yield'].mean()
        comparison = "above" if prediction > avg_yield else "below"
        absolute_diff = abs(prediction - avg_yield)
        percent_diff = absolute_diff / avg_yield * 100

        display(HTML(f"""
        <div style="margin-top: 20px; padding: 15px; background-color: #fff3cd; border-radius: 5px; border-left: 4px solid #ffc107;">
            <h4 style="color: #856404; margin-top: 0;">Comparison with Average</h4>
            <p>The predicted yield is <strong>{comparison} average</strong> for {selected_crop} crops.</p>
            <p>Average yield for {selected_crop}: {avg_yield:.2f} tons/hectare</p>
            <p>Difference: {absolute_diff:.2f} tons/hectare ({percent_diff:.1f}%)</p>
        </div>
        """))

# Link button to function
# NOTE(review): re-running this cell presumably attaches an additional handler
# to the same button — confirm and re-create the form first if that matters.
predict_btn.on_click(on_predict_click)

<h2><b> Step: 14</b>-  Final Application Display
</h2>


In [42]:
# Banner markup shown above the tabs (three adjacent literals form one string)
header_markup = (
    "<div style='background: linear-gradient(135deg, #2E86AB, #A23B72); padding: 20px; border-radius: 10px; margin-bottom: 20px;'>"
    "<h1 style='color: white; text-align: center; margin: 0;'>🌾 Agricultural Yield Predictor</h1>"
    "<p style='color: white; text-align: center; margin: 10px 0 0 0;'>Predict crop yields using machine learning</p></div>"
)
app_header = widgets.HTML(header_markup)

# Stack the banner on top of the tab container and show the whole application
app_view = widgets.VBox([app_header, tab])
display(app_view)

print("Application loaded successfully! Select the tabs to explore different features.")

VBox(children=(HTML(value="<div style='background: linear-gradient(135deg, #2E86AB, #A23B72); padding: 20px; b…

Application loaded successfully! Select the tabs to explore different features.
