In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Log in to Kaggle first; the data-import cell that follows calls
# kagglehub.competition_download() and relies on the `kagglehub` name bound here.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Download the 'try-calculate-exchange-rate-rub-rmb' competition files and keep
# the value kagglehub returns (presumably the local download directory —
# confirm against the kagglehub documentation).
try_calculate_exchange_rate_rub_rmb_path = kagglehub.competition_download('try-calculate-exchange-rate-rub-rmb')

# Progress marker so the reader can see that the download call returned
print('Data source import complete.')


In [None]:
# Install the required libraries: first the `uv` installer, then
# autogluon.timeseries through `uv pip` into the system environment (--system).
# NOTE(review): neither install pins a version, so results may differ between runs.
!pip install uv
!uv pip install -q autogluon.timeseries --system

import os

import matplotlib.dates as mdate
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

# Location of the competition CSV inside the Kaggle runtime
data_path = "/kaggle/input/try-calculate-exchange-rate-rub-rmb/modified_data.csv"

# Outside Kaggle that directory may not exist. In that case fall back to the
# directory returned by kagglehub.competition_download() in the data-import
# cell. The lookup goes through globals() because that cell may have been
# deleted, in which case the original path is kept and read_csv reports the
# missing file as before.
if not os.path.exists(data_path):
    downloaded_dir = globals().get("try_calculate_exchange_rate_rub_rmb_path")
    if downloaded_dir:
        # assumes the download keeps modified_data.csv at its top level — TODO confirm
        data_path = os.path.join(downloaded_dir, "modified_data.csv")

# Read the raw data
data = pd.read_csv(data_path)

# Parse the raw time column into a datetime column named 'timestamp',
# then drop the original string column
data['timestamp'] = pd.to_datetime(data['ftimestamp'])
data = data.drop(columns=['ftimestamp'])

print('Total amount of data. Here is a', len(data), 'time steps in this timeseries.')
# Show the first 5 rows
print(data.head(5))

# Default figure size; set globally, so it also applies to figures created later
plt.rcParams["figure.figsize"] = (15, 3)

# Draw the target values over time on an explicitly created figure/axes pair
overview_fig, overview_ax = plt.subplots()
overview_ax.plot(data['timestamp'], data['target'])
overview_ax.grid()

# One major tick per year on the x axis
overview_ax.xaxis.set_major_locator(mdate.YearLocator())

# Let matplotlib rotate and align the date labels
overview_fig.autofmt_xdate()

# Render the figure
plt.show()

# The dataset is split into 5 groups, distinguished by item_id.
# The last 50 values of every group are NaN; they have to be predicted and
# submitted, and their accuracy is used to evaluate the model.

# Create the figure and its single axes
fig, ax = plt.subplots()

# Height of the shaded bands: the largest target value (Series.max skips NaN)
max_val = data["target"].max()

# Shade rows with an even item_id in red and all other rows in blue, so that
# neighbouring groups can be told apart
even_group = (data["item_id"] % 2 == 0)
for band_mask, band_color in ((even_group, 'r'), (~even_group, 'b')):
    ax.fill_between(
        data["timestamp"],
        np.where(band_mask, max_val, 0),
        color=band_color,
        alpha=0.1,
        step='post',
    )

# Draw the series on top of the bands, with no padding around the data
ax.plot(data["timestamp"], data['target'])
ax.margins(x=0, y=0)

# Render the figure
plt.show()

# Forecast horizon: number of trailing time steps predicted for each series.
# It is identical for every item_id, so it is defined once outside the loop.
prediction_length = 50

# Collects one prediction DataFrame per item_id
all_predictions = []

# Fit and forecast each item_id separately
for item_id in data['item_id'].unique():
    # Rows that belong to the current series
    data_filtered = data[data['item_id'] == item_id]
    print(f"Processing item_id: {item_id}, Length of data: {len(data_filtered)}")

    # Convert to a TimeSeriesDataFrame and fill missing values by interpolation
    data_tsdf = TimeSeriesDataFrame.from_data_frame(data_filtered)
    data_tsdf = data_tsdf.fill_missing_values(method = 'interpolate')

    # Train on everything except the last `prediction_length` rows; those rows
    # are the steps to forecast, so they are excluded from training
    train_data = data_tsdf.iloc[:-prediction_length]

    # Fit a predictor restricted to the NaiveModel baseline
    predictor = TimeSeriesPredictor(prediction_length=prediction_length, freq='D').fit(
        train_data,
        verbosity=0,
        hyperparameters={
            "NaiveModel": {},
        },
    )

    # Forecast the steps that follow the training window and keep the mean forecast
    predictions = predictor.predict(train_data)
    forecast_ts = predictions.loc[item_id]['mean']

    # One row per forecast step, tagged with the series it belongs to
    pred_df = pd.DataFrame({
        'item_id': [item_id] * prediction_length,
        'target': forecast_ts.values
    })
    all_predictions.append(pred_df)

# Stack the per-item forecasts into one frame with a fresh 0..n-1 index
final_prediction_df = pd.concat(all_predictions, ignore_index=True)

# Submission layout: 'item_id' is a running row number, 'target' the forecast value
forecast_values = final_prediction_df['target'].to_numpy()
df = pd.DataFrame({
    'item_id': range(len(forecast_values)),
    'target': forecast_values,
})

# Write the CSV without the DataFrame index
df.to_csv('submission.csv', index=False)

print("预测完成，结果已保存到 submission.csv")