In [None]:
import webbrowser
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import statsmodels.api as sm

========== 1. 读取数据 ==========

In [None]:
# Read the first worksheet of the youth-unemployment / quarterly-GDP workbook.
# NOTE(review): the path is absolute and user-specific; adjust it before running elsewhere.
file_path = "C:\\Users\\bryennt\\Desktop\\2020-2023青年失业率及季度GDP.xlsx"
data = pd.read_excel(io=file_path, sheet_name="Sheet1")

In [None]:
# Assign fixed Chinese column names (time point, GDP, youth unemployment rate),
# then coerce both measures to numbers; unparseable cells become NaN.
data.columns = ['时间点', 'GDP', '青年失业率']
for numeric_col in ('GDP', '青年失业率'):
    data[numeric_col] = pd.to_numeric(data[numeric_col], errors='coerce')

In [None]:
# Reverse the row order so time runs ascending (assumes the sheet is stored
# newest-first — TODO confirm), then compute the period-over-period GDP growth
# rate in percent. The first row has no predecessor and therefore gets NaN.
data = data.iloc[::-1].reset_index(drop=True)
data['GDP增长率(%)'] = data['GDP'].pct_change().mul(100)

In [None]:
# Log-transform the growth rate after shifting it into the positive range.
# A fixed offset of 10 only works while every growth rate is above -10 %; a
# larger drop makes the log argument non-positive, and the resulting NaN/-inf
# rows are discarded without notice by the NaN clean-up that follows.
# Keep 10 as the default offset and enlarge it only when the data require it,
# so that no observation is lost.
constant = 10  # default offset; raised below when the data contain growth <= -10 %
min_growth = data['GDP增长率(%)'].min()  # min() skips NaN entries such as the first period
if pd.notna(min_growth) and min_growth + constant <= 0:
    # Smallest whole-number offset for which (minimum growth + offset) >= 1.
    constant = float(np.ceil(1 - min_growth))
    print(f"Log offset raised to {constant:g} because the minimum GDP growth rate is {min_growth:.2f}%.")
data['log_GDP增长率'] = np.log(data['GDP增长率(%)'] + constant)

In [None]:
# Treat infinities as missing values and drop every row that contains any NaN.
data = data.replace([np.inf, -np.inf], np.nan).dropna()
# ========== 2. Add helper columns: short time label and year ==========

In [None]:
def convert_quarter_label(label):
    """Convert a Chinese quarter label into a compact ``<year>S<n>`` code.

    The first four characters of the label are taken as the year and the
    trailing four characters are looked up as a quarter name
    (第一季度 … 第四季度 -> ``S1`` … ``S4``). Leading and trailing whitespace,
    which spreadsheet cells frequently carry, is stripped first so it cannot
    break the fixed-width slicing.

    Args:
        label: Time-point label such as ``'2020年第一季度'``. Non-string values
            are converted with ``str()`` before processing.

    Returns:
        The year followed by the quarter code, e.g. ``'2020S1'``. If the label
        does not end in a known quarter name, only the year part is returned.
    """
    text = str(label).strip()
    quarter_map = {
        '第一季度': 'S1',
        '第二季度': 'S2',
        '第三季度': 'S3',
        '第四季度': 'S4',
    }
    year = text[:4]
    quarter = text[-4:]
    return year + quarter_map.get(quarter, '')

In [None]:
# Per row: the compact quarter code and the first four characters of the time
# label (the year, kept as a string).
data['时间缩写'] = data['时间点'].map(convert_quarter_label)
data['年份'] = data['时间点'].str.slice(stop=4)

========== 3. 模型改进：使用对数 GDP 增长率进行 OLS 回归 ==========

In [None]:
# 1) Interaction regressor: the log growth rate multiplied by the year cast to int.
data['log_GDP_Year_Interaction'] = data['log_GDP增长率'].mul(data['年份'].astype(int))

In [None]:
# 2) Multiple OLS regression with statsmodels: youth unemployment rate on the
#    log growth rate and the interaction term, with an added constant column.
y = data['青年失业率']
X = data[['log_GDP增长率', 'log_GDP_Year_Interaction']]
X_const = sm.add_constant(X)
model = sm.OLS(endog=y, exog=X_const).fit()

In [None]:
# 3) Pull the fitted coefficients and R² out of the results object.
intercept, coef_log_gdp, coef_interaction = model.params.loc[
    ['const', 'log_GDP增长率', 'log_GDP_Year_Interaction']
]
r2 = model.rsquared

In [None]:
# 4) Annotation text in English: the equation, an HTML line break, then R².
equation_line = f"Regression Equation: y = {intercept:.2f} + {coef_log_gdp:.2f}·log(x) + {coef_interaction:.2f}·(log(x)·year)"
r2_line = f"R² = {r2:.3f}"
regression_text = "<br>".join([equation_line, r2_line])

========== 4. 绘制散点图（使用年份分色） ==========

In [None]:
# Scatter plot via plotly.express: youth unemployment against the log growth
# rate, coloured by year and labelled with the short quarter code.
hover_columns = {
    '时间点': True,
    'log_GDP增长率': ':.2f',
    '青年失业率': ':.2f',
    '年份': True,
}
axis_labels = {
    'log_GDP增长率': 'Log of GDP Growth Rate',
    '青年失业率': 'Youth Unemployment Rate (%)',
    '年份': 'Year',
}
fig = px.scatter(
    data_frame=data,
    x='log_GDP增长率',
    y='青年失业率',
    color='年份',  # one colour per year
    text='时间缩写',
    hover_data=hover_columns,
    labels=axis_labels,
    title='2020-2023 Youth Unemployment Rate vs Log GDP Growth Rate Analysis',
)

========== 5. 手动添加回归直线到图中 ==========

In [None]:
# 1) Evenly spaced grid of 100 x values spanning the observed log growth rates.
log_growth = data['log_GDP增长率']
x_range = np.linspace(log_growth.min(), log_growth.max(), num=100)
# 2) Fitted values along the grid. The interaction column uses the mean of the
#    sample years so that a single line can be drawn for all years.
mean_year = data['年份'].astype(int).mean()
pred_frame = pd.DataFrame({
    'log_GDP增长率': x_range,
    'log_GDP_Year_Interaction': x_range * mean_year,
})
X_pred = sm.add_constant(pred_frame)
y_pred = model.predict(X_pred)

In [None]:
# 3) Overlay the fitted values on the figure as a black line trace.
fig.add_scatter(
    x=x_range,
    y=y_pred,
    mode='lines',
    name='Regression Line',
    line={'color': 'black'},
)

========== 6. 更新布局：英文标签 + 图注美化 + 显示图例 ==========

In [None]:
# Draw each point's text label above its marker.
fig.update_traces(textposition='top center')

# Footnote explaining the quarter codes and the data source, positioned in
# paper coordinates at the lower left.
footnote = {
    'text': "Note: Quarter naming such as '2020S1' indicates the first quarter of 2020; Data source: National Bureau of Statistics",
    'xref': "paper",
    'yref': "paper",
    'x': 0,
    'y': -0.15,
    'showarrow': False,
    'font': {'size': 12},
}
# Regression equation and R², centred in paper coordinates near the top.
equation_note = {
    'text': regression_text,
    'xref': "paper",
    'yref': "paper",
    'x': 0.5,
    'y': 1.05,
    'showarrow': False,
    'font': {'size': 13},
    'align': "center",
}
fig.update_layout(
    title_font_size=20,
    xaxis_title='Log of GDP Growth Rate',
    yaxis_title='Youth Unemployment Rate (%)',
    showlegend=True,  # keep the legend so the years can be told apart
    annotations=[footnote, equation_note],
    # Larger top and bottom margins leave room for the two annotations.
    margin={'l': 60, 'r': 60, 't': 100, 'b': 120},
)

========== 7. 导出 HTML 并自动打开 ==========

In [None]:
# Write the figure to a standalone HTML file and open it in a new browser tab.
# webbrowser expects a URL; passing a bare filesystem path is documented as
# neither supported nor portable, so the path is converted to a file:// URI.
# NOTE(review): the output path is absolute and user-specific.
html_file_path = "C:\\Users\\bryennt\\Desktop\\original.html"
fig.write_html(html_file_path)
webbrowser.open_new_tab(Path(html_file_path).resolve().as_uri())

========== 8. 输出修改说明 ==========

In [None]:
# Print a short summary of what changed in this version of the analysis.
modification_notes = (
    "Modification Details:",
    "1. Replaced GDP growth rate with log-transformed GDP growth rate in the regression model.",
    "2. Updated regression equation and visualization accordingly.",
)
for note in modification_notes:
    print(note)