In [None]:
import pandas as pd
import plotly.express as px
import statsmodels.api as sm
import numpy as np
import webbrowser

========== 1. 读取数据 ==========

In [None]:
# Machine-specific absolute path of the Excel workbook that holds the source figures.
file_path = "C:\\Users\\bryennt\\Desktop\\2020-2023青年失业率及季度GDP.xlsx"
# Load the worksheet named 'Sheet1' into a DataFrame.
data = pd.read_excel(io=file_path, sheet_name='Sheet1')

In [None]:
# Rename the three columns positionally (time label, GDP, youth unemployment rate),
# then coerce the two numeric columns; values that cannot be parsed become NaN.
data.columns = ['时间点', 'GDP', '青年失业率']
for numeric_column in ('GDP', '青年失业率'):
    data[numeric_column] = pd.to_numeric(data[numeric_column], errors='coerce')

In [None]:
# Put the rows in ascending time order by flipping the row order and renumbering
# the index from 0.
# NOTE(review): this assumes the sheet lists the newest period first — confirm.
# Re-running this cell flips the rows again.
data = data.iloc[::-1].reset_index(drop=True)

In [None]:
# Treat +/- infinity as missing, then drop every row that has any missing value.
# The index is not renumbered afterwards.
data = data.replace([np.inf, -np.inf], np.nan).dropna()

========== 2. 增加时间缩写、年份等辅助信息 ==========

In [None]:
def convert_quarter_label(label):
    """Convert a quarter label such as '2020年第一季度' into a short code such as '2020S1'.

    The first four characters of the label are taken as the year and the last
    four characters as the Chinese quarter name. Leading and trailing
    whitespace is ignored so that padded spreadsheet cells do not lose their
    quarter suffix or shift the year slice.

    Args:
        label: Quarter label string (year in the first four characters,
            quarter name in the last four).

    Returns:
        The year followed by 'S1'..'S4', or the year alone when the trailing
        four characters are not a recognised quarter name.
    """
    quarter_map = {
        '第一季度': 'S1',
        '第二季度': 'S2',
        '第三季度': 'S3',
        '第四季度': 'S4',
    }
    # Only strings are stripped; any other input is sliced as before, so the
    # error behaviour for non-string values is unchanged.
    text = label.strip() if isinstance(label, str) else label
    year = text[:4]
    quarter = text[-4:]
    return year + quarter_map.get(quarter, '')

In [None]:
# Derive two helper columns from the time label: a short quarter code and the
# four-character year prefix (kept as a string).
time_labels = data['时间点']
data['时间缩写'] = time_labels.map(convert_quarter_label)
data['年份'] = time_labels.str.slice(stop=4)

========== 3. 模型改进：使用 GDP 进行 OLS 回归 ==========

In [None]:
# Simple linear model: youth unemployment rate regressed on GDP alone
# (the year interaction term is not used).
y = data['青年失业率']
X = data[['GDP']]
X_const = sm.add_constant(X)  # adds the 'const' intercept column
model = sm.OLS(endog=y, exog=X_const).fit()

In [None]:
# Pull the fitted intercept, the GDP slope and R² out of the results object.
r2 = model.rsquared
fitted_params = model.params
intercept = fitted_params['const']
coef_gdp = fitted_params['GDP']

In [None]:
# Build the English annotation text; "<br>" separates the equation from R².
# The slope's sign is rendered as the operator so that a negative slope reads
# "a - b·x" instead of "a + -b·x". The slope uses four significant digits
# because a fixed two-decimal format would show a small coefficient as 0.00.
slope_sign = '-' if coef_gdp < 0 else '+'
regression_text = (
    f"Regression Equation: y = {intercept:.2f} {slope_sign} {abs(coef_gdp):.4g}·x<br>"
    f"R² = {r2:.3f}"
)

========== 4. 绘制散点图（使用年份分色） ==========

In [None]:
# Fields shown in the hover tooltip; the two numeric ones use two-decimal formatting.
hover_fields = {
    '时间点': True,
    'GDP': ':.2f',
    '青年失业率': ':.2f',
    '年份': True
}
# English display names for the columns used in the figure.
display_names = {
    'GDP': 'GDP',
    '青年失业率': 'Youth Unemployment Rate (%)',
    '年份': 'Year'
}
# Scatter of youth unemployment rate against GDP, coloured by year and
# labelled with the short quarter code.
fig = px.scatter(
    data_frame=data,
    x='GDP',
    y='青年失业率',
    color='年份',
    text='时间缩写',
    hover_data=hover_fields,
    labels=display_names,
    title='2020-2023 Youth Unemployment Rate vs GDP Analysis',
)

========== 5. 手动添加回归直线到图中 ==========

In [None]:
# Evaluate the fitted model on 100 evenly spaced GDP values spanning the
# observed minimum and maximum.
gdp_values = data['GDP']
x_range = np.linspace(gdp_values.min(), gdp_values.max(), num=100)
prediction_grid = pd.DataFrame({'GDP': x_range})
X_pred = sm.add_constant(prediction_grid)
y_pred = model.predict(exog=X_pred)

In [None]:
# Overlay the fitted regression line on the scatter plot as a black line trace.
fig.add_scatter(
    name='Regression Line',
    mode='lines',
    x=x_range,
    y=y_pred,
    line={'color': 'black'},
)

========== 6. 更新布局：英文标签 + 图注美化 + 显示图例 ==========

In [None]:
# Place text labels above their points (no selector is given, so the setting
# is sent to every trace on the figure).
fig.update_traces(textposition='top center')

# Footnote anchored in paper coordinates below the plotting area.
footnote = {
    'text': "Note: Quarter naming such as '2020S1' indicates the first quarter of 2020; Data source: National Bureau of Statistics",
    'xref': "paper",
    'yref': "paper",
    'x': 0,
    'y': -0.15,
    'showarrow': False,
    'font': {'size': 12},
}
# Regression equation and R², centred just above the plotting area.
equation_label = {
    'text': regression_text,
    'xref': "paper",
    'yref': "paper",
    'x': 0.5,
    'y': 1.05,
    'showarrow': False,
    'font': {'size': 13},
    'align': "center",
}
# Extra top and bottom margin leaves room for the two annotations.
fig.update_layout(
    title_font_size=20,
    xaxis_title='GDP',
    yaxis_title='Youth Unemployment Rate (%)',
    showlegend=True,
    annotations=[footnote, equation_label],
    margin={'l': 60, 'r': 60, 't': 100, 'b': 120},
)

========== 7. 导出 HTML 并自动打开 ==========

In [None]:
# Write the figure to a standalone HTML file (machine-specific absolute path)
# and ask the default browser to show it in a new tab.
# NOTE(review): a bare filename, not a file:// URL, is handed to webbrowser;
# the webbrowser documentation describes that as not guaranteed to be portable.
html_file_path = "C:\\Users\\bryennt\\Desktop\\original_norate_noyear4.html"
fig.write_html(file=html_file_path)
webbrowser.open(html_file_path, new=2)

========== 8. 输出修改说明 ==========

In [None]:
# Print a short change log for this revision, one message per line.
change_notes = (
    "Modification Details:",
    "1. Removed the interaction term with year; using only GDP in the regression model.",
    "2. Updated regression equation and visualization accordingly.",
    "3. Exported HTML file is now named original_norate_noyear4.html.",
)
print("\n".join(change_notes))