# Logistic Regression with US Pumpkins Dataset

This notebook demonstrates a simple logistic regression model that predicts whether a pumpkin is large (`Item Size == 'lge'`) from its low and high price data.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

# Load the dataset
# The path is relative, so 'US-pumpkins.csv' must sit in the notebook's
# working directory.
df = pd.read_csv('US-pumpkins.csv')

# Report the table dimensions as (rows, columns), then let the preview of the
# first records be the cell's displayed output.
n_rows, n_cols = df.shape
print("Dataset shape:", (n_rows, n_cols))
print("\nFirst few rows:")
df.head()

In [None]:
# Explore the data
# Columns inspected for missing values below.
key_columns = ['Item Size', 'Low Price', 'High Price']

# Every column present in the raw table.
print("Column names:")
print(list(df.columns))

# How often each size label occurs.
size_counts = df['Item Size'].value_counts()
print("\nItem Size values:")
print(size_counts)

# Number of missing entries per key column.
missing_per_column = df[key_columns].isna().sum()
print("\nMissing values in key columns:")
print(missing_per_column)

In [None]:
# Clean and prepare the data
# Drop rows missing the size label or either price, then derive the target and
# the engineered features in one chain. `.assign` returns a new DataFrame, so
# nothing is written onto the frame returned by `dropna` (the pattern that can
# raise pandas' SettingWithCopyWarning), `df` is left untouched, and re-running
# this cell always yields the same `clean_df`.
clean_df = (
    df
    .dropna(subset=['Item Size', 'Low Price', 'High Price'])
    .assign(
        # Target: 1 for 'lge' (large), 0 for every other size label
        is_large=lambda d: (d['Item Size'] == 'lge').astype(int),
        # Midpoint of the low/high price pair
        avg_price=lambda d: (d['Low Price'] + d['High Price']) / 2,
        # Spread between the high and low price
        price_range=lambda d: d['High Price'] - d['Low Price'],
    )
)

# Select features and target
# NOTE: avg_price and price_range are exact linear combinations of
# 'Low Price' and 'High Price', so the four features are collinear and the
# individual coefficients of the fitted model should be read with care.
features = ['Low Price', 'High Price', 'avg_price', 'price_range']
X = clean_df[features]
y = clean_df['is_large']

print(f"Number of samples after cleaning: {len(clean_df)}")
print(f"Target distribution:\n{y.value_counts()}")
print("\nFeatures summary:")
print(X.describe())

In [None]:
# Split the data into training and testing sets
# `stratify=y` keeps the large / not-large proportions the same in both
# splits, so the held-out metrics are not skewed by an unlucky draw of the
# binary target. The fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

# Create and train the logistic regression model
# The price features are not scaled, so the solver may need more than the
# default 100 iterations to converge; raise the cap to avoid stopping early.
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# coef_[0] holds one coefficient per entry of `features`, in the same order.
print("\nModel training completed!")
print(f"Model coefficients: {model.coef_[0]}")
print(f"Model intercept: {model.intercept_[0]}")
print(f"Feature names: {features}")

In [None]:
# Make predictions
y_pred = model.predict(X_test)
# Second column of predict_proba, used here as the probability of being large
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)

print(f"Model Accuracy: {accuracy:.3f}")
print(f"\nClassification Report:")
print(classification_report(y_test, y_pred))

# Show some example predictions
# Slicing (instead of indexing 0..4) caps the preview at the number of test
# rows actually available, so a test split smaller than five rows no longer
# raises IndexError.
print(f"\nExample Predictions:")
example_rows = zip(y_test.iloc[:5], y_pred[:5], y_pred_proba[:5])
for i, (actual_label, predicted_label, probability) in enumerate(example_rows, start=1):
    actual = "Large" if actual_label == 1 else "Not Large"
    predicted = "Large" if predicted_label == 1 else "Not Large"
    print(f"Sample {i}: Actual={actual}, Predicted={predicted}, Probability={probability:.3f}")

In [None]:
# Simple visualization
# Build both panels up front with the explicit figure/axes interface:
# left = scatter of the two engineered price features, right = coefficients.
fig, (ax_scatter, ax_coef) = plt.subplots(1, 2, figsize=(10, 6))

# Plot 1: Price vs Size
# Split the cleaned rows by the binary target so each class gets its own colour.
is_large_mask = clean_df['is_large'] == 1
is_not_large_mask = clean_df['is_large'] == 0
large_pumpkins = clean_df[is_large_mask]
not_large_pumpkins = clean_df[is_not_large_mask]

# "Not Large" is drawn first, so the "Large" points are layered on top of it.
ax_scatter.scatter(
    not_large_pumpkins['avg_price'],
    not_large_pumpkins['price_range'],
    alpha=0.6,
    label='Not Large',
    color='orange',
)
ax_scatter.scatter(
    large_pumpkins['avg_price'],
    large_pumpkins['price_range'],
    alpha=0.6,
    label='Large',
    color='green',
)
ax_scatter.set_xlabel('Average Price')
ax_scatter.set_ylabel('Price Range')
ax_scatter.set_title('Pumpkin Size by Price Features')
ax_scatter.legend()

# Plot 2: Feature importance
# Display labels for the bars; they must stay in the same order as the
# columns the model was fitted on.
feature_names = ['Low Price', 'High Price', 'Avg Price', 'Price Range']
coefficients = model.coef_[0]
ax_coef.barh(feature_names, coefficients)
ax_coef.set_xlabel('Coefficient Value')
ax_coef.set_title('Feature Importance in Logistic Regression')

fig.tight_layout()
plt.show()

print("Simple Logistic Regression Model Complete!")