In [1]:
# -*- coding: utf-8 -*-
"""
美国南瓜市场价格分析与预测模型
修正版本：解决Unicode路径问题和样式兼容性
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import shap
import joblib
import os
import tempfile

  from .autonotebook import tqdm as notebook_tqdm


## 环境设置

In [3]:
# 解决中文路径问题：设置纯ASCII临时文件夹
temp_dir = tempfile.mkdtemp(prefix='joblib_')
os.environ['JOBLIB_TEMP_FOLDER'] = temp_dir

# 设置样式
try:
    plt.style.use('seaborn-v0_8')  # Matplotlib >= 3.5
except:
    plt.style.use('seaborn')       # 旧版本回退

sns.set_palette("husl")
pd.set_option('display.max_columns', 30)
plt.rcParams['font.sans-serif'] = ['SimHei']  # 解决中文显示
plt.rcParams['axes.unicode_minus'] = False    # 解决负号显示

## 1. 数据加载与初步分析

In [4]:
print("1. 数据加载与初步分析...")
df = pd.read_csv('US-pumpkins.csv')

# 初步检查
print(f"数据集形状: {df.shape}")
print("\n前5行数据:")
print(df.head())
print("\n数据概览:")
print(df.info())
print("\n描述性统计:")
print(df.describe(include='all'))

1. 数据加载与初步分析...
数据集形状: (1757, 26)

前5行数据:
   City Name Type       Package      Variety Sub Variety  Grade     Date  \
0  BALTIMORE  NaN  24 inch bins          NaN         NaN    NaN  4/29/17   
1  BALTIMORE  NaN  24 inch bins          NaN         NaN    NaN   5/6/17   
2  BALTIMORE  NaN  24 inch bins  HOWDEN TYPE         NaN    NaN  9/24/16   
3  BALTIMORE  NaN  24 inch bins  HOWDEN TYPE         NaN    NaN  9/24/16   
4  BALTIMORE  NaN  24 inch bins  HOWDEN TYPE         NaN    NaN  11/5/16   

   Low Price  High Price  Mostly Low  Mostly High    Origin Origin District  \
0      270.0       280.0       270.0        280.0  MARYLAND             NaN   
1      270.0       280.0       270.0        280.0  MARYLAND             NaN   
2      160.0       160.0       160.0        160.0  DELAWARE             NaN   
3      160.0       160.0       160.0        160.0  VIRGINIA             NaN   
4       90.0       100.0        90.0        100.0  MARYLAND             NaN   

  Item Size   Color  Envir

## 2. 问题构建

In [5]:
print("\n2. 问题构建: 南瓜价格预测回归问题")


2. 问题构建: 南瓜价格预测回归问题


## 3. 数据预处理与特征工程

In [7]:
print("\n3. 数据预处理与特征工程...")

# 处理日期特征
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%y')
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Week'] = df['Date'].dt.isocalendar().week

# 创建目标变量
df['Avg_Price'] = (df['Low Price'] + df['High Price']) / 2

# 处理分类变量
categorical_cols = ['City Name', 'Type', 'Package', 'Variety', 'Sub Variety',
                   'Grade', 'Origin', 'Item Size', 'Color', 'Environment']
for col in categorical_cols:
    if col in df.columns:
        df[col] = df[col].fillna('Unknown')
        df[col] = df[col].astype(str).str.upper().str.strip()

# 特征选择与过滤
features = ['City Name', 'Package', 'Variety', 'Origin', 'Item Size',
            'Color', 'Year', 'Month', 'Week', 'Unit of Sale']
target = 'Avg_Price'

# 过滤有效数据
model_data = df[features + [target]].dropna()

# 类别特征编码
label_encoders = {}
for col in ['City Name', 'Package', 'Variety', 'Origin', 'Item Size', 'Color', 'Unit of Sale']:
    le = LabelEncoder()
    model_data[col] = le.fit_transform(model_data[col])
    label_encoders[col] = le


3. 数据预处理与特征工程...


## 4. 探索性数据分析 (EDA)

In [8]:
print("\n4. 探索性数据分析...")

# 价格分布
plt.figure(figsize=(12, 6))
sns.histplot(model_data['Avg_Price'], kde=True, bins=30)
plt.title('南瓜价格分布')
plt.savefig('price_distribution.png', bbox_inches='tight', dpi=300)
plt.close()

# 品种与价格关系
plt.figure(figsize=(14, 7))
top_varieties = model_data['Variety'].value_counts().nlargest(5).index
sns.boxplot(x='Variety', y='Avg_Price',
            data=model_data[model_data['Variety'].isin(top_varieties)])
plt.title('前5品种的价格分布')
plt.savefig('price_by_variety.png', bbox_inches='tight', dpi=300)
plt.close()


4. 探索性数据分析...


## 5. 模型构建与优化

In [9]:
print("\n5. 模型构建与优化...")

# 划分训练测试集
X = model_data[features]
y = model_data[target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# 构建管道
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestRegressor(random_state=42, n_jobs=1))  # 禁用并行
])

# 参数网格
param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [None, 10, 20],
    'model__min_samples_split': [2, 5],
    'model__min_samples_leaf': [1, 2]
}

# 网格搜索（禁用并行）
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=1  # 关键修改：禁用并行处理
)
grid_search.fit(X_train, y_train)


5. 模型构建与优化...


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'model__max_depth': [None, 10, ...], 'model__min_samples_leaf': [1, 2], 'model__min_samples_split': [2, 5], 'model__n_estimators': [100, 200]}"
,scoring,'neg_mean_squared_error'
,n_jobs,1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


## 6. 模型评估

In [10]:
print("\n6. 模型评估...")

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"\n最佳模型性能:")
print(f"- MSE: {mse:.2f}")
print(f"- RMSE: {rmse:.2f}")
print(f"- R²: {r2:.2f}")
print(f"\n最佳参数: {grid_search.best_params_}")


6. 模型评估...

最佳模型性能:
- MSE: 33.83
- RMSE: 5.82
- R²: 1.00

最佳参数: {'model__max_depth': 10, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100}


## 7. 结果保存

In [11]:
print("\n7. 保存结果...")

# 保存到当前目录（避免中文路径）
joblib.dump(best_model, 'pumpkin_model.pkl')
joblib.dump(label_encoders, 'label_encoders.pkl')

# 实际vs预测图
plt.figure(figsize=(10, 6))
sns.scatterplot(x=y_test, y=y_pred)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--')
plt.xlabel('实际价格')
plt.ylabel('预测价格')
plt.title('实际 vs 预测价格')
plt.savefig('prediction_results.png', bbox_inches='tight', dpi=300)
plt.close()

# 清理临时文件夹
try:
    import shutil
    shutil.rmtree(temp_dir)
except:
    pass

print("\n分析完成！结果已保存到当前目录。")


7. 保存结果...

分析完成！结果已保存到当前目录。
