# This is a sample Jupyter Notebook

Below is an example of a code cell. 
Put your cursor into the cell and press Shift+Enter to execute it and select the next one, or click the 'Run Cell' button.

Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.

To learn more about Jupyter Notebooks in PyCharm, see [help](https://www.jetbrains.com/help/pycharm/ipython-notebook-support.html).
For an overview of PyCharm, go to Help -> Learn IDE features or refer to [our documentation](https://www.jetbrains.com/help/pycharm/getting-started.html).

In [None]:
# 某车企汽车年销量预测分析程序
# 学号：0234972
# 姓名：王雅婷
# 专业：计算机科学与技术

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score, mean_squared_error
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')

# --------------------------
# 1. Data import and preprocessing (data-source quality audit)
# --------------------------
# Load the data set. The primary source is an SPSS .sav file at a fixed
# absolute path; if that route fails for any reason (pyreadstat not installed,
# file missing, unreadable file) a CSV copy in the working directory is used.
try:
    # Imported here rather than at the top of the file so that a missing
    # pyreadstat package triggers the CSV fallback instead of aborting.
    import pyreadstat
    # Raw string: the Windows path is full of backslashes that would otherwise
    # be parsed as (invalid) escape sequences.
    df_0234972, meta_0234972 = pyreadstat.read_sav(r"E:\Demo\python\回归分析实践—某车企汽车年销量预测\data\数据-汽车销量预测.sav")
    print("数据导入成功，共{}行{}列".format(df_0234972.shape[0], df_0234972.shape[1]))
except Exception as e:
    # Deliberately broad: any failure of the SPSS route falls through to the
    # CSV backup, but the reason is reported instead of being dropped.
    print("SPSS文件读取失败，尝试读取CSV格式备份数据...")
    print(f"失败原因：{type(e).__name__}: {e}")
    df_0234972 = pd.read_csv("汽车销量预测.csv")
    meta_0234972 = None  # no SPSS metadata exists on the CSV path

# Basic inspection of the loaded data (quality audit).
# Every figure in this script uses Chinese titles and labels, and Matplotlib's
# default sans-serif font has no CJK glyphs, so CJK-capable families are put
# at the front of the lookup list; the previous entries stay as a fallback.
cjk_fonts_0234972 = ['SimHei', 'Microsoft YaHei', 'Noto Sans CJK SC',
                     'WenQuanYi Micro Hei', 'Arial Unicode MS']
plt.rcParams['font.sans-serif'] = cjk_fonts_0234972 + [
    name for name in plt.rcParams['font.sans-serif']
    if name not in cjk_fonts_0234972
]
# Draw negative tick labels with an ASCII hyphen; several CJK fonts do not
# ship the Unicode minus sign.
plt.rcParams['axes.unicode_minus'] = False

print("\n【数据源质量审核结果】")
print("1. 缺失值检查：")
# Per-column count of missing cells; only columns with at least one are shown.
missing_0234972 = df_0234972.isnull().sum()
print(missing_0234972[missing_0234972 > 0] if (missing_0234972 > 0).any() else "无缺失值")

print("\n2. 异常值检查（数值型变量）：")
num_cols_0234972 = df_0234972.select_dtypes(include=[np.number]).columns
for col in num_cols_0234972:
    # One box plot per numeric column, saved to its own PNG, as a visual
    # outlier check. Nothing is removed or modified here.
    plt.figure(figsize=(6, 3))
    sns.boxplot(x=df_0234972[col])
    plt.title(f"图1. {col}的异常值检测（学号0234972）")
    plt.savefig(f"异常值检测_{col}_0234972.png")
    plt.close()

# Preprocessing: fill missing values.
# Numeric columns are filled with the column mean, every other column with
# its most frequent value.
for col in df_0234972.columns:
    if not df_0234972[col].isnull().any():
        continue
    if pd.api.types.is_numeric_dtype(df_0234972[col]):
        fill_value_0234972 = df_0234972[col].mean()
    else:
        modes_0234972 = df_0234972[col].mode()
        if modes_0234972.empty:
            # Column is entirely missing: there is no mode to fill with.
            continue
        fill_value_0234972 = modes_0234972.iloc[0]
    # Assign the filled column back. Calling fillna(inplace=True) on
    # df[col] is chained assignment, which operates on a temporary object
    # and does not update the frame under pandas Copy-on-Write.
    df_0234972[col] = df_0234972[col].fillna(fill_value_0234972)
print("\n数据预处理完成：缺失值已填充，异常值已标记（如需处理可手动调整）")

# The rest of the script expects a 'year' column and a 'sales' column.
# Adjust the names here if the data set uses different ones.
if 'year' not in df_0234972.columns:
    # No year column: generate consecutive years starting at 2000.
    df_0234972['year'] = range(2000, 2000 + len(df_0234972))
if 'sales' not in df_0234972.columns:
    # Assume the last numeric column holds the sales figures, but never pick
    # 'year' itself: renaming it would remove the regressor.
    sales_candidates_0234972 = [c for c in num_cols_0234972 if c != 'year']
    if not sales_candidates_0234972:
        raise ValueError("数据中没有可用作销量（sales）的数值型字段")
    sales_col_0234972 = sales_candidates_0234972[-1]
    df_0234972 = df_0234972.rename(columns={sales_col_0234972: 'sales'})

# --------------------------
# 2. Linear regression fit
# --------------------------
print("\n【线性回归拟合】")
# Response: sales. Predictor: year, selected as a one-column DataFrame.
y_lin_0234972 = df_0234972['sales']
X_lin_0234972 = df_0234972[['year']]

# Fit the model and compute its predictions on the same data.
model_lin_0234972 = LinearRegression().fit(X_lin_0234972, y_lin_0234972)
y_pred_lin_0234972 = model_lin_0234972.predict(X_lin_0234972)

# In-sample goodness of fit.
mse_lin_0234972 = mean_squared_error(y_lin_0234972, y_pred_lin_0234972)
r2_lin_0234972 = r2_score(y_lin_0234972, y_pred_lin_0234972)
print(f"线性回归R²值：{r2_lin_0234972:.4f}")
print(f"线性回归MSE值：{mse_lin_0234972:.4f}")

# Scatter of the observations with the fitted line on top, saved as PNG.
fig_lin_0234972, ax_lin_0234972 = plt.subplots(figsize=(10, 6))
ax_lin_0234972.scatter(X_lin_0234972, y_lin_0234972, color='blue', label='实际销量')
ax_lin_0234972.plot(X_lin_0234972, y_pred_lin_0234972, color='red', label=f'线性拟合（R²={r2_lin_0234972:.4f}）')
ax_lin_0234972.set(
    xlabel='年份',
    ylabel='汽车销量',
    title='图2. 线性回归拟合结果（学号0234972）',
)
ax_lin_0234972.legend()
fig_lin_0234972.savefig('线性回归拟合_0234972.png')
plt.close(fig_lin_0234972)

# --------------------------
# 3. Curve fitting (polynomial regression)
# --------------------------
print("\n【曲线拟合（多项式回归）】")
# Fit polynomials of degree 2, 3 and 4 in 'year' and keep the degree with the
# highest in-sample R².
# NOTE(review): lower-degree polynomials are nested in higher-degree ones, so
# in-sample R² tends to favour the largest degree; adjusted R² or a hold-out
# set would give a fairer comparison.
# NOTE(review): raw year values raised to the 4th power are very large; if the
# fits look unstable, centring 'year' before expansion is worth considering.
poly_models_0234972 = {}

for degree in [2, 3, 4]:
    poly_0234972 = PolynomialFeatures(degree=degree)
    X_poly_0234972 = poly_0234972.fit_transform(X_lin_0234972)

    model_poly_0234972 = LinearRegression()
    model_poly_0234972.fit(X_poly_0234972, y_lin_0234972)
    y_pred_poly_0234972 = model_poly_0234972.predict(X_poly_0234972)

    r2_poly_0234972 = r2_score(y_lin_0234972, y_pred_poly_0234972)
    # Keep the fitted model together with its fitted feature transformer so
    # that new years can be expanded and predicted later.
    poly_models_0234972[degree] = (model_poly_0234972, poly_0234972, r2_poly_0234972)

# Pick the winner from the stored results so that the reported R² always
# belongs to the selected model (ties go to the lowest degree). Starting the
# running maximum at 0 would report 0 whenever no fit exceeded it.
best_degree_0234972 = max(poly_models_0234972, key=lambda d: poly_models_0234972[d][2])
best_model_poly_0234972, best_poly_0234972, best_r2_0234972 = poly_models_0234972[best_degree_0234972]

print(f"最优多项式阶数：{best_degree_0234972}阶，R²值：{best_r2_0234972:.4f}")

# Predictions of the selected model, in the original row order.
# The transformer is already fitted, so transform() is enough here.
X_poly_best_0234972 = best_poly_0234972.transform(X_lin_0234972)
y_pred_poly_best_0234972 = best_model_poly_0234972.predict(X_poly_best_0234972)

# Draw the curve in ascending year order; plotting in row order would
# zig-zag if the rows are not already sorted by year.
years_poly_0234972 = X_lin_0234972['year'].to_numpy()
order_poly_0234972 = np.argsort(years_poly_0234972, kind='stable')

plt.figure(figsize=(10, 6))
plt.scatter(X_lin_0234972, y_lin_0234972, color='blue', label='实际销量')
plt.plot(years_poly_0234972[order_poly_0234972],
         np.asarray(y_pred_poly_best_0234972)[order_poly_0234972],
         color='green',
         label=f'{best_degree_0234972}阶多项式拟合（R²={best_r2_0234972:.4f}）')
plt.xlabel('年份')
plt.ylabel('汽车销量')
plt.title('图3. 多项式回归拟合结果（学号0234972）')
plt.legend()
plt.savefig(f'{best_degree_0234972}阶多项式拟合_0234972.png')
plt.close()

# --------------------------
# 4. Non-linear regression fit (exponential growth model)
# --------------------------
print("\n【非线性回归拟合（指数模型）】")
# Exponential model y = a * e^(b*x), fitted as the linear model
# ln(y) = ln(a) + b*x.
# The logarithm is only defined for strictly positive sales, so zero,
# negative and missing values are all excluded (a negative value would
# otherwise become NaN and make the fit fail).
positive_mask_0234972 = y_lin_0234972 > 0
if not positive_mask_0234972.any():
    raise ValueError("销量数据中没有正值，无法拟合指数模型")
y_log_0234972 = np.log(y_lin_0234972[positive_mask_0234972])
X_log_0234972 = X_lin_0234972.loc[positive_mask_0234972]  # rows matching y_log

model_exp_0234972 = LinearRegression()
model_exp_0234972.fit(X_log_0234972, y_log_0234972)
y_pred_log_0234972 = model_exp_0234972.predict(X_log_0234972)
y_pred_exp_0234972 = np.exp(y_pred_log_0234972)  # back to the original scale

# R² is computed on the original scale, over the rows used for the fit.
r2_exp_0234972 = r2_score(y_lin_0234972[positive_mask_0234972], y_pred_exp_0234972)
print(f"指数回归R²值：{r2_exp_0234972:.4f}")

# Draw the curve in ascending year order; plotting in row order would
# zig-zag if the rows are not already sorted by year.
years_exp_0234972 = X_log_0234972['year'].to_numpy()
order_exp_0234972 = np.argsort(years_exp_0234972, kind='stable')

plt.figure(figsize=(10, 6))
plt.scatter(X_lin_0234972, y_lin_0234972, color='blue', label='实际销量')
plt.plot(years_exp_0234972[order_exp_0234972],
         np.asarray(y_pred_exp_0234972)[order_exp_0234972],
         color='purple',
         label=f'指数拟合（R²={r2_exp_0234972:.4f}）')
plt.xlabel('年份')
plt.ylabel('汽车销量')
plt.title('图4. 指数回归拟合结果（学号0234972）')
plt.legend()
plt.savefig('指数回归拟合_0234972.png')
plt.close()

# --------------------------
# 5. Model comparison and sales forecast
# --------------------------
print("\n【模型比较与预测】")
# Put the R² of the three fitted models side by side.
models_compare_0234972 = pd.DataFrame({
    '模型类型': ['线性回归', f'{best_degree_0234972}阶多项式回归', '指数回归'],
    'R²值': [r2_lin_0234972, best_r2_0234972, r2_exp_0234972]
})
print("模型性能比较：")
print(models_compare_0234972)

# The model with the largest R² is used for the forecast.
best_model_type_0234972 = models_compare_0234972.loc[
    models_compare_0234972['R²值'].idxmax(), '模型类型'
]
print(f"\n最优模型：{best_model_type_0234972}")

# Forecast the three years that follow the last year present in the data.
last_year_0234972 = df_0234972['year'].max()
future_years_0234972 = pd.DataFrame({
    'year': [last_year_0234972 + 1, last_year_0234972 + 2, last_year_0234972 + 3]
})

# Predict with the selected model and, in the same branch, pick the in-sample
# fitted values (x, y, colour) that are drawn as the fitted line below.
if best_model_type_0234972 == '线性回归':
    future_pred_0234972 = model_lin_0234972.predict(future_years_0234972)
    fit_x_0234972, fit_y_0234972, fit_color_0234972 = df_0234972['year'], y_pred_lin_0234972, 'red'
elif '多项式' in best_model_type_0234972:
    # New years go through the same fitted feature expansion as the
    # training data before prediction.
    future_pred_0234972 = best_model_poly_0234972.predict(
        best_poly_0234972.transform(future_years_0234972)
    )
    fit_x_0234972, fit_y_0234972, fit_color_0234972 = df_0234972['year'], y_pred_poly_best_0234972, 'green'
else:  # exponential model: predict ln(sales), then map back
    future_pred_log_0234972 = model_exp_0234972.predict(future_years_0234972)
    future_pred_0234972 = np.exp(future_pred_log_0234972)
    fit_x_0234972, fit_y_0234972, fit_color_0234972 = X_log_0234972['year'], y_pred_exp_0234972, 'purple'

# Tabulate the forecast.
pred_results_0234972 = pd.DataFrame({
    '预测年份': future_years_0234972['year'],
    '预测销量': future_pred_0234972.round(2)
})
print("\n未来3年销量预测结果：")
print(pred_results_0234972)

# Plot history, forecast points and the fitted line of the selected model.
plt.figure(figsize=(10, 6))
plt.scatter(df_0234972['year'], df_0234972['sales'], color='blue', label='历史销量')
plt.scatter(future_years_0234972['year'], future_pred_0234972, color='red', marker='*', s=100, label='预测销量')

# Draw the fitted line in ascending year order; plotting in row order would
# zig-zag for a curved model if the rows are not already sorted by year.
fit_x_0234972 = np.asarray(fit_x_0234972)
fit_y_0234972 = np.asarray(fit_y_0234972)
fit_order_0234972 = np.argsort(fit_x_0234972, kind='stable')
plt.plot(fit_x_0234972[fit_order_0234972], fit_y_0234972[fit_order_0234972],
         color=fit_color_0234972, label='拟合线')

plt.xlabel('年份')
plt.ylabel('汽车销量')
plt.title(f'图5. 最优模型预测结果（学号0234972，模型：{best_model_type_0234972}）')
plt.legend()
plt.savefig('销量预测结果_0234972.png')
plt.close()

# Save the forecast table as CSV (for the submission material).
pred_results_0234972.to_csv('销量预测结果_0234972.csv', index=False)
print("\n程序执行完成，预测结果已保存为CSV文件，图表已保存为PNG文件")