# 📈 Notebook 02: Linear Regression Analysis
## Intelligent Agriculture - Crop Recommendation System

**Objectives:**
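1. Implement simple linear regression (N ~ temperature)
2. Implement multivariate linear regression (N ~ all other features)
3. Analyze feature importance from the model coefficients
4. Evaluate the models (MSE, R², MAE) and visualize the results
5. Apply regression to continuous target prediction (N, P, and K)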

In [None]:
# Import libraries
# Standard library
import warnings

# Third-party: persistence, plotting, numerics, data frames, modelling
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Installed after the imports: silences every warning raised from here on so
# the cell outputs stay uncluttered.
warnings.filterwarnings('ignore')

print("✅ Libraries imported successfully!")

In [None]:
# Load preprocessed data
# The three splits are read in a fixed order: train, validation, test.
train_data, val_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/train.csv',
        '../data/processed/validation.csv',
        '../data/processed/test.csv',
    )
)

# Load preprocessing objects persisted alongside the data.
# NOTE(review): label_encoder is not referenced by the cells visible in this
# notebook — confirm whether it is still needed here.
scaler = joblib.load('../data/processed/scaler.pkl')
label_encoder = joblib.load('../data/processed/label_encoder.pkl')

print(f"✅ Data loaded: {len(train_data)} train, {len(val_data)} val, {len(test_data)} test")

# Prepare features: the seven input columns selected from every split.
feature_cols = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']
X_train, X_val, X_test = (
    split[feature_cols] for split in (train_data, val_data, test_data)
)

# Scale features with the loaded scaler (transform only, no re-fitting).
# NOTE(review): the models in the visible cells are trained on X_train / X_val,
# not on these scaled arrays — confirm that is intentional.
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(frame) for frame in (X_train, X_val, X_test)
)

In [None]:
# Create regression targets (predict individual nutrients)
print("🎯 LINEAR REGRESSION FOR NUTRIENT PREDICTION")
print("=" * 50)

# Metrics of each fitted model, keyed by model name.
regression_results = {}

# Simple Linear Regression: one predictor (temperature), one target (N).
print("\n1️⃣ SIMPLE LINEAR REGRESSION: N ~ Temperature")

# Double brackets keep the predictor as a one-column DataFrame; the target is
# taken from the same feature frame as a Series.
X_temp_train, y_N_train = X_train[['temperature']], X_train['N']
X_temp_val, y_N_val = X_val[['temperature']], X_val['N']

# fit() returns the estimator, so construction and training can be chained.
lr_simple = LinearRegression().fit(X_temp_train, y_N_train)
y_N_pred = lr_simple.predict(X_temp_val)

# Metrics, computed on the validation split and stored first ...
regression_results['Simple_LR'] = {
    'MSE': mean_squared_error(y_N_val, y_N_pred),
    'R2': r2_score(y_N_val, y_N_pred),
    'MAE': mean_absolute_error(y_N_val, y_N_pred),
}
# ... then exposed under the short names that later cells reuse.
mse, r2, mae = (
    regression_results['Simple_LR'][metric] for metric in ('MSE', 'R2', 'MAE')
)

print(f"MSE: {mse:.4f}")
print(f"R²: {r2:.4f}")
print(f"MAE: {mae:.4f}")
print(f"Coefficient: {lr_simple.coef_[0]:.4f}")
print(f"Intercept: {lr_simple.intercept_:.4f}")

In [None]:
# Visualize Simple Linear Regression
# One figure, two side-by-side panels: the fit (left) and its residuals (right).
fig_simple, (ax_fit, ax_resid) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: observed values, predicted values and the fitted line.
ax_fit.scatter(X_temp_val, y_N_val, alpha=0.6, label='Actual')
ax_fit.scatter(X_temp_val, y_N_pred, alpha=0.6, label='Predicted', color='red')
ax_fit.plot(X_temp_val, y_N_pred, color='red', linewidth=2)
ax_fit.set_xlabel('Temperature')
ax_fit.set_ylabel('Nitrogen (N)')
ax_fit.set_title('Simple Linear Regression: N ~ Temperature')
ax_fit.legend()

# Right panel: residuals (actual minus predicted) against the prediction,
# with a dashed zero line as the reference for an unbiased fit.
residuals = y_N_val - y_N_pred
ax_resid.scatter(y_N_pred, residuals, alpha=0.6)
ax_resid.axhline(y=0, color='red', linestyle='--')
ax_resid.set_xlabel('Predicted N')
ax_resid.set_ylabel('Residuals')
ax_resid.set_title('Residual Plot')

fig_simple.tight_layout()
plt.show()

In [None]:
# Multivariate Linear Regression: Predict N based on all other features
print("\n2️⃣ MULTIVARIATE LINEAR REGRESSION: N ~ All Features")

# Predictors are every feature column except the target itself. Deriving the
# list from feature_cols keeps the train and validation frames in sync instead
# of repeating the column names twice.
multi_feature_cols = [col for col in feature_cols if col != 'N']
X_multi_train = X_train[multi_feature_cols]
X_multi_val = X_val[multi_feature_cols]

lr_multi = LinearRegression()
lr_multi.fit(X_multi_train, y_N_train)
y_N_multi_pred = lr_multi.predict(X_multi_val)

# Metrics (validation split)
mse_multi = mean_squared_error(y_N_val, y_N_multi_pred)
r2_multi = r2_score(y_N_val, y_N_multi_pred)
mae_multi = mean_absolute_error(y_N_val, y_N_multi_pred)

print(f"MSE: {mse_multi:.4f}")
print(f"R²: {r2_multi:.4f}")
print(f"MAE: {mae_multi:.4f}")

regression_results['Multivariate_LR'] = {'MSE': mse_multi, 'R2': r2_multi, 'MAE': mae_multi}

# Feature importance (coefficients)
# A raw coefficient is "change in N per one unit of the feature", so its
# magnitude depends on the feature's units and cannot be compared across
# features on its own. Std_Coefficient rescales each coefficient by the
# feature's training standard deviation ("change in N per one standard
# deviation"), which is comparable across features. The original columns and
# the original row order (by Abs_Coefficient) are kept for downstream cells.
feature_importance = pd.DataFrame({
    'Feature': X_multi_train.columns,
    'Coefficient': lr_multi.coef_,
    'Abs_Coefficient': np.abs(lr_multi.coef_),
    'Std_Coefficient': lr_multi.coef_ * X_multi_train.std().to_numpy(),
}).sort_values('Abs_Coefficient', ascending=False)

print("\n📊 Feature Importance (Coefficients):")
print(feature_importance)

In [None]:
# Visualize Multivariate Linear Regression
# 2x2 dashboard: fit quality, residuals, coefficient magnitudes, model comparison.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Actual vs Predicted; the dashed diagonal is the y = x line of perfect predictions
axes[0,0].scatter(y_N_val, y_N_multi_pred, alpha=0.6)
axes[0,0].plot([y_N_val.min(), y_N_val.max()], [y_N_val.min(), y_N_val.max()], 'r--', lw=2)
axes[0,0].set_xlabel('Actual N')
axes[0,0].set_ylabel('Predicted N')
axes[0,0].set_title(f'Actual vs Predicted (R² = {r2_multi:.3f})')

# Residuals (actual minus predicted) against the predicted value
residuals_multi = y_N_val - y_N_multi_pred
axes[0,1].scatter(y_N_multi_pred, residuals_multi, alpha=0.6)
axes[0,1].axhline(y=0, color='red', linestyle='--')
axes[0,1].set_xlabel('Predicted N')
axes[0,1].set_ylabel('Residuals')
axes[0,1].set_title('Residual Plot')

# Feature importance: absolute coefficient per predictor
axes[1,0].barh(feature_importance['Feature'], feature_importance['Abs_Coefficient'])
axes[1,0].set_xlabel('Absolute Coefficient Value')
axes[1,0].set_title('Feature Importance')

# Model comparison
models = ['Simple LR', 'Multivariate LR']
r2_scores = [r2, r2_multi]
axes[1,1].bar(models, r2_scores, color=['skyblue', 'lightcoral'])
axes[1,1].set_ylabel('R² Score')
axes[1,1].set_title('Model Comparison')
# R² on held-out data can be negative (worse than predicting the mean). A fixed
# (0, 1) range would hide such a bar entirely, so extend the axis downwards
# when needed and draw a zero baseline for orientation.
r2_floor = min(r2_scores)
if r2_floor < 0:
    axes[1,1].set_ylim(1.1 * r2_floor, 1)
    axes[1,1].axhline(y=0, color='black', linewidth=0.8)
else:
    axes[1,1].set_ylim(0, 1)

plt.tight_layout()
plt.show()

In [None]:
# Multiple target regression (predict P and K as well)
print("\n3️⃣ MULTIPLE TARGET REGRESSION")
print("=" * 50)

targets = ['N', 'P', 'K']


def _fit_and_score(target):
    """Fit a linear model for one nutrient and score it on the validation split.

    The predictors are all columns of ``feature_cols`` except ``target``.
    Prints MSE, R² and MAE, then returns them together with the fitted
    estimator under the keys 'MSE', 'R2', 'MAE' and 'model'.
    """
    # Features (all except the target)
    predictors = [col for col in feature_cols if col != target]

    # Train model
    model = LinearRegression().fit(X_train[predictors], X_train[target])
    actual = X_val[target]
    predicted = model.predict(X_val[predictors])

    # Metrics
    mse_target = mean_squared_error(actual, predicted)
    r2_target = r2_score(actual, predicted)
    mae_target = mean_absolute_error(actual, predicted)

    print(f"MSE: {mse_target:.4f}")
    print(f"R²: {r2_target:.4f}")
    print(f"MAE: {mae_target:.4f}")

    return {
        'MSE': mse_target, 'R2': r2_target, 'MAE': mae_target,
        'model': model
    }


# One entry per target, filled in the order given by `targets`.
multi_target_results = {}
for target in targets:
    print(f"\n🎯 Predicting {target}:")
    multi_target_results[target] = _fit_and_score(target)

In [None]:
# Visualize multiple target results
# One actual-vs-predicted panel per target, laid out in a single row.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, target in zip(axes, targets):
    stored = multi_target_results[target]
    r2_score_target = stored['R2']

    # Recompute the validation predictions from the stored model, using the
    # same "all columns except the target" predictor set it was trained on.
    feature_cols_target = [col for col in feature_cols if col != target]
    y_target_val = X_val[target]
    y_target_pred = stored['model'].predict(X_val[feature_cols_target])

    # The dashed diagonal spans the observed range and marks perfect predictions.
    lowest, highest = y_target_val.min(), y_target_val.max()
    ax.scatter(y_target_val, y_target_pred, alpha=0.6)
    ax.plot([lowest, highest], [lowest, highest], 'r--', lw=2)
    ax.set_xlabel(f'Actual {target}')
    ax.set_ylabel(f'Predicted {target}')
    ax.set_title(f'{target} Prediction (R² = {r2_score_target:.3f})')

plt.tight_layout()
plt.show()

In [None]:
# Summary and save results
from pathlib import Path  # only this cell needs it, so it is imported locally

print("📋 LINEAR REGRESSION SUMMARY")
print("=" * 50)

# Create summary DataFrame: one row per fitted model.
# NOTE: 'LR (N)' uses the same predictors and target as 'Multivariate LR (N)',
# so those two rows are expected to show the same metrics.
summary_data = [
    ['Simple LR (N~Temp)', r2, mse, mae],
    ['Multivariate LR (N)', r2_multi, mse_multi, mae_multi],
]
for target in targets:
    results = multi_target_results[target]
    summary_data.append([f'LR ({target})', results['R2'], results['MSE'], results['MAE']])

summary_df = pd.DataFrame(summary_data,
                         columns=['Model', 'R²', 'MSE', 'MAE'])

print(summary_df)

# Save results. The output folders are created first: neither to_csv nor
# joblib.dump creates missing parent directories, so a fresh checkout without
# ../models/saved_models would otherwise fail at the very last step.
results_path = '../data/processed/linear_regression_results.csv'
model_path = '../models/saved_models/linear_regression_model.pkl'
for output_path in (results_path, model_path):
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

summary_df.to_csv(results_path, index=False)

# Save the multivariate N model.
# NOTE(review): the message below calls it the "best" model, but no selection
# by metric happens here — lr_multi is always the one persisted.
joblib.dump(lr_multi, model_path)

print("\n✅ Results saved to: data/processed/linear_regression_results.csv")
print("✅ Best model saved to: models/saved_models/linear_regression_model.pkl")

print("\n🎯 KEY INSIGHTS:")
print(f"• Best R² score: {summary_df['R²'].max():.3f}")
# Report the comparison that the metrics actually support instead of assuming
# the multivariate model always wins.
if r2_multi > r2:
    print("• Multivariate regression outperforms simple regression")
else:
    print("• Multivariate regression does not outperform simple regression (validation R²)")
# Qualitative remark; it is not computed from the fitted models in this cell.
print("• Feature importance varies across different targets")

print("\n🚀 Next: Open notebook 03_Logistic_Regression.ipynb")