In [4]:
# Crop Yield Prediction Analysis

**Group Members:**
- Alice Smith (ID: "001")
- Bob Johnson (ID: "002")
- Carol Lee (ID: "003")
- David Kim (ID: "004")

This notebook demonstrates:

- Loading and exploring a large agricultural dataset (~24k rows)
- Polynomial feature engineering
- Using a pre-trained model (`theta_model.npy`)
- Visualizing performance (charts included)
- Making predictions on new samples
- Applying Big Data and Machine Learning concepts


SyntaxError: invalid decimal literal (3383821592.py, line 11)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from IPython.display import Image, display


In [None]:
# Correct relative path to raw_dataset.csv
DATA_CSV = "../data/raw_dataset.csv"
df = pd.read_csv(DATA_CSV)

print("Dataset loaded successfully!")
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
df.head()


In [None]:
# Check missing values
print("Missing values per column:\n", df.isna().sum())

# Basic statistics
df.describe()


In [None]:
plt.figure(figsize=(8,4))
plt.hist(df['Yield'], bins=50, color='skyblue')
plt.title("Distribution of Crop Yield")
plt.xlabel("Yield")
plt.ylabel("Count")
plt.show()

plt.figure(figsize=(8,4))
plt.scatter(df['Rainfall_mm'], df['Yield'], alpha=0.5)
plt.xlabel("Rainfall (mm)")
plt.ylabel("Yield")
plt.title("Yield vs Rainfall")
plt.show()


In [None]:
X = df[["Rainfall_mm", "Temperature_C", "Pesticide_Use", "Area", "Production"]].values
y = df["Yield"].values.reshape(-1,1)

poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

print("Original features:", X.shape[1])
print("Polynomial features:", X_poly.shape[1])


In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X_poly, y, test_size=0.3, random_state=42
)

scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# Add bias column
X_train_b = np.hstack([np.ones((X_train_s.shape[0],1)), X_train_s])
X_test_b = np.hstack([np.ones((X_test_s.shape[0],1)), X_test_s])

print("Train shape:", X_train_b.shape)
print("Test shape:", X_test_b.shape)


In [None]:
# Load theta_model.npy
theta = np.load("../models/theta_model.npy")
print("Theta shape:", theta.shape)
print("First 5 weights:\n", theta[:5])


In [None]:
y_pred = X_test_b.dot(theta)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Test MSE:", mse)
print("RÂ² Score:", r2)

# Plot predicted vs actual
plt.figure(figsize=(6,6))
plt.scatter(y_test, y_pred, alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.xlabel("Actual Yield")
plt.ylabel("Predicted Yield")
plt.title("Predicted vs Actual Yield")
plt.grid(True)
plt.show()


In [3]:
from IPython.display import Image, display

display(Image(filename="../images/chart1.png"))
display(Image(filename="../images/chart2.png"))



FileNotFoundError: [Errno 2] No such file or directory: '../images/chart1.png'

In [None]:
# Example new input
x_new = np.array([[850, 21, 160, 40000, 3700]])

# Preprocess
x_poly = poly.transform(x_new)
x_scaled = scaler.transform(x_poly)
x_b = np.hstack([np.ones((x_scaled.shape[0],1)), x_scaled])

y_pred_new = x_b.dot(theta)
print("Predicted Yield for new input:", y_pred_new[0][0])
