In [None]:
# Banana Seed Count Prediction Model

This notebook implements a machine learning model to predict the number of seeds in a banana based on various features. Until a labeled dataset is available (to be added later), the model is trained and evaluated on synthetic placeholder data.

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Data Preparation

The model will use the following features to predict the number of seeds:
1. Length (cm)
2. Width (cm)
3. Weight (g)
4. Ripeness level (1-5)
5. Color (encoded: 1 = green, 2 = yellow, 3 = brown)
6. Curvature (degrees)

In [None]:
# Create sample data structure (to be replaced with real data later)
def create_sample_data(n_samples=100, seed=42):
    """Generate a synthetic banana dataset used as a stand-in for real data.

    Parameters
    ----------
    n_samples : int, default 100
        Number of synthetic bananas (rows) to generate.
    seed : int or None, default 42
        Seed for the function's private random generator. The default
        reproduces the values this function has always produced; pass
        ``None`` to get non-deterministic data.

    Returns
    -------
    df : pandas.DataFrame
        One row per banana with the columns ``length``, ``width``,
        ``weight``, ``ripeness``, ``color`` and ``curvature``. All columns
        are floats because the features are stacked into one array first.
    y : numpy.ndarray
        Integer seed counts, clipped to the range 0-20.
    """
    # A private RandomState produces the same stream as np.random.seed(seed)
    # followed by np.random.* calls, but leaves the global NumPy RNG untouched
    # so other stochastic cells are not silently reseeded.
    rng = np.random.RandomState(seed)

    # Generate synthetic features (draw order matters for reproducibility)
    length = rng.normal(15, 2, n_samples)       # Average banana length 15cm
    width = rng.normal(3, 0.5, n_samples)       # Average banana width 3cm
    weight = rng.normal(120, 20, n_samples)     # Average banana weight 120g
    ripeness = rng.randint(1, 6, n_samples)     # Ripeness level 1-5
    color = rng.randint(1, 4, n_samples)        # Color encoded as 1=green, 2=yellow, 3=brown
    curvature = rng.normal(45, 10, n_samples)   # Average curvature 45 degrees

    # Create feature matrix
    X = np.column_stack([length, width, weight, ripeness, color, curvature])

    # Generate target (number of seeds)
    # This is a simplified model where larger, riper bananas tend to have more seeds
    y = (0.5 * length + 0.3 * width + 0.2 * weight/100 + 0.1 * ripeness +
         0.1 * color + 0.1 * curvature/45).astype(int)
    y = np.clip(y, 0, 20)  # Limit number of seeds between 0 and 20

    # Create DataFrame
    columns = ['length', 'width', 'weight', 'ripeness', 'color', 'curvature']
    df = pd.DataFrame(X, columns=columns)

    return df, y

# Build the placeholder dataset: a feature frame and its seed-count targets
X_data, y_data = create_sample_data(n_samples=100)

In [None]:
# Hold out 20% of the samples for evaluation; the fixed random_state keeps
# the split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=42)

# Standardise the features using statistics learned from the training split
# only, then apply the same transform to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random forest with 100 trees, each limited to depth 10
model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
model.fit(X_train_scaled, y_train)

In [None]:
# Score the model on the held-out split
y_pred = model.predict(X_test_scaled)

# Mean squared error and coefficient of determination on the test set
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}", f"R² Score: {r2:.2f}", sep="\n")

In [None]:
# Function to predict seeds for new bananas
def predict_seeds(length, width, weight, ripeness, color, curvature):
    """Predict the seed count of a single banana.

    Relies on the notebook-level ``scaler`` and ``model`` objects, which must
    already be fitted when this function is called.

    Parameters
    ----------
    length, width : float
        Banana length and width in cm.
    weight : float
        Banana weight in g.
    ripeness : int
        Ripeness level on the 1-5 scale.
    color : int
        Encoded color (1=green, 2=yellow, 3=brown).
    curvature : float
        Curvature in degrees.

    Returns
    -------
    int
        Predicted number of seeds, rounded to the nearest integer.
    """
    # Build a one-row frame with named columns, in the argument order above
    features = pd.DataFrame(
        [[length, width, weight, ripeness, color, curvature]],
        columns=['length', 'width', 'weight', 'ripeness', 'color', 'curvature'],
    )

    # scikit-learn warns when the presence of feature names differs between
    # fit and transform. Keep the named frame only if the scaler was fitted
    # with names; otherwise fall back to a bare array.
    if getattr(scaler, 'feature_names_in_', None) is None:
        features = features.to_numpy()

    # Scale the features
    features_scaled = scaler.transform(features)

    # Make prediction
    prediction = model.predict(features_scaled)[0]

    # Round to the nearest whole seed; plain int() would truncate toward zero
    # and systematically under-report (e.g. 8.9 -> 8)
    return int(round(float(prediction)))

# Example prediction for a single banana
example_banana = dict(
    length=16,      # cm
    width=3.2,      # cm
    weight=130,     # g
    ripeness=4,     # scale 1-5
    color=2,        # yellow
    curvature=40,   # degrees
)

# The dict keys match predict_seeds' parameter names, so it unpacks directly
predicted_seeds = predict_seeds(**example_banana)

print(f"Predicted number of seeds: {predicted_seeds}")