In [None]:
import webbrowser
from pathlib import Path

import pandas as pd
import plotly.express as px
import statsmodels.api as sm

In [None]:
# Location of the source Excel workbook.
# NOTE(review): this is an absolute, machine-specific Windows path; it must be
# adjusted before the notebook can run anywhere else.
file_path = "C:\\Users\\38670\\Documents\\Documents\\2020-2023 青年失业率及季度 GDP_转置.xlsx"
# Load the worksheet named 'Sheet1' into a DataFrame.
data = pd.read_excel(io=file_path, sheet_name='Sheet1')

In [None]:
# Rename the columns positionally (the sheet must have exactly four columns).
data.columns = ['Time', 'GDP', 'Youth Unemployment Rate', 'Source']

# Drop the 'Source' column and the row labelled 0, coerce both measures to
# numbers (unparseable cells become NaN), then discard any row with a NaN.
data = (
    data
    .drop(columns=['Source'])
    .drop(index=0)
    .assign(**{
        'GDP': lambda frame: pd.to_numeric(frame['GDP'], errors='coerce'),
        'Youth Unemployment Rate': lambda frame: pd.to_numeric(
            frame['Youth Unemployment Rate'], errors='coerce'
        ),
    })
    .dropna()
)

In [None]:
# Put the rows in ascending time order and compute the GDP growth rate.
# NOTE(review): reversing only yields ascending order if the sheet lists
# periods newest-first — confirm against the workbook.
data = (
    data.iloc[::-1]
    .reset_index(drop=True)
    # Period-over-period percentage change of GDP; the first row has no
    # predecessor, so it is NaN and is removed by the dropna() below.
    .assign(**{'GDP Growth Rate (%)': lambda frame: frame['GDP'].pct_change() * 100})
    .dropna()
)

In [None]:
# Chinese quarter name (the last four characters of a label) -> quarter code.
_QUARTER_MAP = {
    '第一季度': 'Q1',
    '第二季度': 'Q2',
    '第三季度': 'Q3',
    '第四季度': 'Q4',
}


def convert_quarter_label(label):
    """Abbreviate a quarter label to the form ``<year><Qn>``, e.g. ``2020Q1``.

    The year is taken from the first four characters of ``label`` and the
    quarter name from its last four characters. Surrounding whitespace on a
    string label is removed first, because both slices use fixed offsets and
    a stray leading/trailing space would otherwise shift them and silently
    lose the quarter.

    Parameters
    ----------
    label : str
        Label whose first four characters are the year and whose last four
        characters are one of '第一季度', '第二季度', '第三季度', '第四季度'.

    Returns
    -------
    str
        The year followed by 'Q1'..'Q4'. If the trailing four characters are
        not a recognised quarter name, only the year part is returned.
    """
    if isinstance(label, str):
        label = label.strip()
    year = label[:4]
    quarter = label[-4:]
    return year + _QUARTER_MAP.get(quarter, '')

In [None]:
# Derive two helper columns from 'Time': the abbreviated quarter label used
# as point text, and the four-character year prefix used for colouring.
data = data.assign(**{
    'Abbr Time': lambda frame: frame['Time'].map(convert_quarter_label),
    'Year': lambda frame: frame['Time'].str.slice(stop=4),
})

In [None]:
# Marker colour for each year; keys are the year strings held in 'Year'.
color_map = dict(zip(
    ('2020', '2021', '2022', '2023'),
    ('red', 'green', 'blue', 'orange'),
))
# Years missing from color_map map to NaN.
data = data.assign(Color=data['Year'].map(color_map))

In [None]:
# Fit the overall regression line: youth unemployment rate on GDP growth rate.
X = data['GDP Growth Rate (%)']
y = data['Youth Unemployment Rate']
X_const = sm.add_constant(X)  # adds the 'const' column read back below
model = sm.OLS(y, X_const).fit()
slope = model.params['GDP Growth Rate (%)']
intercept = model.params['const']
r2 = model.rsquared

# Pick the operator from the slope's sign so a negative slope is rendered as
# "a - bx" instead of "a + -bx"; non-negative slopes render exactly as before.
slope_sign = '-' if slope < 0 else '+'
regression_text = (
    f"Regression: y = {intercept:.2f} {slope_sign} {abs(slope):.2f}x"
    f"<br>R² = {r2:.3f}"
)

In [None]:
# Hover box: show the original 'Time' label and both measures to 2 decimals.
hover_spec = {
    'Time': True,
    'GDP Growth Rate (%)': ':.2f',
    'Youth Unemployment Rate': ':.2f',
}

# Display names for the plotted columns.
axis_labels = {
    'GDP Growth Rate (%)': 'GDP Growth Rate (%)',
    'Youth Unemployment Rate': 'Youth Unemployment Rate (%)',
}

# Scatter of unemployment against growth, one labelled point per row of
# `data`, with a black OLS trendline overlaid.
scatter_options = dict(
    x='GDP Growth Rate (%)',
    y='Youth Unemployment Rate',
    text='Abbr Time',
    hover_data=hover_spec,
    trendline="ols",
    trendline_color_override="black",
    labels=axis_labels,
    title='Youth Unemployment Rate vs GDP Growth Rate (2020-2023)',
)
fig = px.scatter(data, **scatter_options)

In [None]:
def _color_markers_by_year(trace):
    """Apply the per-row colours in data['Color'] to traces that draw markers.

    A scatter trace's ``mode`` is a '+'-joined combination of flags (this
    figure also draws text labels on its points, e.g. 'markers+text'), so the
    check is for membership of 'markers' rather than equality with it; an
    equality test would skip the point trace and leave the colours unapplied.
    Traces without markers (such as a 'lines' trendline) are left unchanged.
    """
    if 'markers' in (trace.mode or ''):
        trace.update(marker=dict(color=data['Color']))


fig.for_each_trace(_color_markers_by_year)

In [None]:
# Add the footnote and the regression formula to the figure.
fig.update_traces(textposition='top center')

# Footnote placed below the plot area, in paper coordinates.
footnote = {
    'text': "Note: '2020Q1' means Q1 of 2020; Data Source: National Bureau of Statistics of China",
    'xref': "paper",
    'yref': "paper",
    'x': 0,
    'y': -0.27,
    'showarrow': False,
    'font': {'size': 12},
}

# Regression equation and R², centred just above the plot area.
formula_label = {
    'text': regression_text,
    'xref': "paper",
    'yref': "paper",
    'x': 0.5,
    'y': 1.05,
    'showarrow': False,
    'font': {'size': 13},
    'align': "center",
}

fig.update_layout(
    title_font_size=20,
    xaxis_title='GDP Growth Rate (%)',
    yaxis_title='Youth Unemployment Rate (%)',
    showlegend=False,
    annotations=[footnote, formula_label],
    # Enlarged bottom margin so the footnote at y=-0.27 has room.
    margin={'b': 130},
)

In [None]:
# Export the figure as a standalone HTML file in the working directory.
html_file_path = 'scatter_plot.html'
fig.write_html(html_file_path)
# webbrowser works on URLs; handing it a bare relative filename is not
# portable, so open the file through its absolute file:// URI instead.
webbrowser.open_new_tab(Path(html_file_path).resolve().as_uri())