In [2]:
pip install yfinance pandas pandas_ta xgboost scikit-learn matplotlib

Collecting yfinance
  Downloading yfinance-0.2.65-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting pandas_ta
  Downloading pandas_ta-0.3.14b.tar.gz (115 kB)
     ---------------------------------------- 0.0/115.1 kB ? eta -:--:--
     --- ------------------------------------ 10.2/115.1 kB ? eta -:--:--
     -------------------------------------  112.6/115.1 kB 1.7 MB/s eta 0:00:01
     -------------------------------------- 115.1/115.1 kB 1.3 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting multitasking>=0.0.7 (from yfinance)
  Downloading multitasking-0.0.11-py3-none-any.whl.metadata (5.5 kB)
Collecting frozendict>=2.3.4 (from yfinance)
  Downloading frozendict-2.4.6-py311-none-an


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: C:\Users\HW\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [1]:
import yfinance as yf
import pandas as pd
import warnings
# Silence all warnings for the rest of the session so they do not clutter
# the cell outputs.
warnings.filterwarnings("ignore")

# Candidate stocks.
tickers = ["TSLA", "AAPL", "GOOGL", "META", "MSFT"]

start_date = "2020-01-01"
end_date = "2025-06-07"  # A future date, to make sure the latest data is retrieved.

# Download the price history for all tickers.
# group_by="ticker" makes the ticker symbol the first level of the returned
# column MultiIndex, so data['TSLA'] selects that ticker's sub-frame.
try:
    data = yf.download(tickers, start=start_date, end=end_date, group_by='ticker')

    # NOTE: yf.download typically reports failed tickers through its own log
    # output and returns an empty / all-NaN frame rather than raising, so the
    # result must be validated before success is announced.
    if data.empty:
        raise ValueError("未获取到任何行情数据")

    # For simplicity, continue with a single stock (TSLA); a real strategy
    # could loop over every ticker.
    # dropna() returns a new frame without the rows that have no trading data.
    df = data['TSLA'].dropna()
    if df.empty:
        # df is left bound to the empty frame on purpose: the later cells
        # check `df.empty` and will ask for this cell to be re-run.
        raise ValueError("TSLA 没有可用的行情数据")

    print("数据下载成功！")
    print("以特斯拉(TSLA)为例进行后续分析：")
    print(df.head())
except Exception as e:
    print(f"数据下载失败: {e}")

ModuleNotFoundError: No module named 'yfinance'

In [None]:
import pandas_ta as ta

# Guard clause: without a non-empty price frame there is nothing to compute.
if 'df' not in locals() or df.empty:
    print("DataFrame 'df' 不存在或为空，请先运行第一部分代码。")
else:
    print("开始计算技术指标...")

    # RSI, MACD and Bollinger Bands; append=True writes the results back
    # onto df as additional columns.
    df.ta.rsi(length=14, append=True)
    df.ta.macd(fast=12, slow=26, signal=9, append=True)
    df.ta.bbands(length=20, std=2, append=True)

    # Simple moving averages of the close over 50 and 200 periods.
    close_prices = df['Close']
    for window in (50, 200):
        df[f'SMA_{window}'] = ta.sma(close_prices, length=window)

    # Drop the rows that contain NaN values produced by the indicator
    # calculations.
    df.dropna(inplace=True)

    print("技术指标计算完成：")
    # Show the last rows of the frame, including the new indicator columns.
    print(df.tail())

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
import numpy as np

if 'df' in locals() and not df.empty and 'RSI_14' in df.columns:
    print("开始构建和训练预测模型...")

    # 1. Define the features (X) and the target (y).
    # Features are every column except the raw OHLCV columns and the target.
    features = [col for col in df.columns if col not in ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'target']]

    # The target is the direction of the next day's close:
    # 1 (up) if it is higher than today's close, otherwise 0 (down).
    # shift(-1) leaves NaN on the last row, and `NaN > x` evaluates to False,
    # so a plain .astype(int) would silently label that row 0 ("down") even
    # though its next close is unknown. Mark it as missing instead.
    next_close = df['Close'].shift(-1)
    df['target'] = (next_close > df['Close']).astype(float).where(next_close.notna())

    # Model only the rows without missing values (this removes the unlabeled
    # last row). df itself is not trimmed, so re-running this cell gives the
    # same result every time.
    labeled = df.dropna()
    X = labeled[features]
    y = labeled['target'].astype(int)

    # 2. Split chronologically (no shuffling): the first 80% of the rows are
    # the training set, the remaining 20% the test set.
    split_index = int(len(X) * 0.8)

    if split_index == 0:
        # Fewer than two labeled rows: the training set would be empty.
        print("错误：样本数量不足，无法划分训练集和测试集！")
    else:
        X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
        y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

        print(f"训练集大小: {len(X_train)}")
        print(f"测试集大小: {len(X_test)}")

        # 3. Initialise and train the XGBoost model.
        # objective='binary:logistic' states that this is a binary
        # classification problem; eval_metric='logloss' is the evaluation metric.
        # use_label_encoder is no longer passed: it is deprecated and unused in
        # current XGBoost releases, and the labels are already 0/1 integers.
        model = xgb.XGBClassifier(
            objective='binary:logistic',
            eval_metric='logloss',
            n_estimators=100,  # number of trees
            learning_rate=0.1,
            max_depth=3,
            random_state=42
        )

        model.fit(X_train, y_train)

        # 4. Predict on the test set.
        y_pred = model.predict(X_test)

        # 5. Evaluate the model.
        accuracy = accuracy_score(y_test, y_pred)
        print(f"\n模型在测试集上的准确率: {accuracy:.4f}")

        print("\n分类报告:")
        # labels=[0, 1] pins both classes so the report does not fail when the
        # test window happens to contain (or predict) only one of them.
        print(classification_report(
            y_test,
            y_pred,
            labels=[0, 1],
            target_names=['下跌 (0)', '上涨 (1)'],
            zero_division=0,
        ))

        # Feature importances reported by the trained model.
        feature_importance = pd.DataFrame({'feature': features, 'importance': model.feature_importances_})
        print("\n特征重要性:")
        print(feature_importance.sort_values(by='importance', ascending=False))

else:
    print("DataFrame 'df' 或所需指标不存在，请先运行第一和第二部分代码。")