与原来的流程一样，只是多加了一个特征 Artist_Score。

排名 Rank 分成 Top10、11-50、51-100、101-150、151-200 五种类别；上榜时间 DaysOnChart 分成 <10、10-30、30-50、50-100、>100 五种类别（区间右闭：恰好 30 天归入 10-30，50 天归入 30-50，100 天归入 50-100）。

先用逻辑回归

In [1]:
# -*- coding: utf-8 -*-
"""
多分类训练（加入 Artist_Score 后的版本）：
- 输入特征：7个音频特征（0~1） + Artist_Score（0~1）  → 共8个特征
- 任务A：Rank 五分类：Top10 / 11-50 / 51-100 / 101-150 / 151-200
- 任务B：DaysOnChart 五分类：<10 / 10-30 / 30-50 / 50-100 / >100
- 模型：逻辑回归（multinomial, class_weight='balanced'）
- 评估：Accuracy、Macro-F1、分类报告、混淆矩阵
"""

import os
import joblib
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# ============== Path configuration ==============
CSV = Path(r"C:\Users\Akari\OneDrive\Desktop\SEM 1\AML\Final CW\Datas\spotify_preprocess_Artist.csv")
ART_DIR = Path(r"C:\Users\Akari\OneDrive\Desktop\SEM 1\AML\Final CW\PredictRankAndDay")
# parents=True so a missing intermediate folder does not abort the run.
ART_DIR.mkdir(parents=True, exist_ok=True)

# Seven audio features plus the newly added Artist_Score (8 columns in total).
FEATURES = [
    'Danceability', 'Energy', 'Valence', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness',
    'Artist_Score',
]

# ============== Load data ==============
df = pd.read_csv(CSV)

# Fail early if any column used below is absent from the CSV.
need_cols = ['id', 'Title', 'Artists', 'Nationality'] + FEATURES + ['Rank', 'DaysOnChart']
missing = [c for c in need_cols if c not in df.columns]
if missing:
    raise ValueError(f"输入文件缺少必要列：{missing}\n请确认预处理已输出 Artist_Score 并包含所有字段。")

# Fallback for missing Artist_Score: coerce unparsable values to NaN (instead of
# crashing in astype) and fill every NaN with the column median.
# NOTE(review): the median is computed on the full table, i.e. before the
# train/test split, so the fill value also sees test rows.
df['Artist_Score'] = pd.to_numeric(df['Artist_Score'], errors='coerce').astype(float)
df['Artist_Score'] = df['Artist_Score'].fillna(df['Artist_Score'].median())

# Keep only the needed columns and drop rows with a missing feature or target.
df = df[need_cols].dropna(subset=FEATURES + ['Rank', 'DaysOnChart']).copy()

# Clamp Rank into 1..200 so every row falls inside the rank bins below.
df['Rank'] = df['Rank'].clip(lower=1, upper=200).astype(int)
df['DaysOnChart'] = df['DaysOnChart'].astype(int)

# ============== Bucketing / labelling (five classes) ==============
# Right-closed intervals; include_lowest keeps the left edge of the first bin.
rank_bins   = [0, 10, 50, 100, 150, 200]
rank_labels = ["Top10", "11-50", "51-100", "101-150", "151-200"]
df['RankBucket'] = pd.cut(df['Rank'], bins=rank_bins, labels=rank_labels, right=True, include_lowest=True)

# Right-closed intervals: (-inf, 9], (9, 30], (30, 50], (50, 100], (100, inf).
days_bins   = [-np.inf, 9, 30, 50, 100, np.inf]
days_labels = ["<10", "10-30", "30-50", "50-100", ">100"]
df['DaysBucket'] = pd.cut(df['DaysOnChart'], bins=days_bins, labels=days_labels, right=True)

# Drop any row that could not be assigned to a bucket.
df = df.dropna(subset=['RankBucket', 'DaysBucket']).copy()

# ============== Features and labels ==============
X = df[FEATURES].values
y_rank = df['RankBucket'].astype(str).values
y_days = df['DaysBucket'].astype(str).values

# ============== Stratified split (keeps the class distribution) ==============
# Each task is stratified on its own target, so the two tasks get different
# train/test partitions of the same feature matrix.
X_tr, X_te, y_rank_tr, y_rank_te = train_test_split(
    X, y_rank, test_size=0.2, random_state=42, stratify=y_rank
)
X_tr_d, X_te_d, y_days_tr, y_days_te = train_test_split(
    X, y_days, test_size=0.2, random_state=42, stratify=y_days
)

# ============== Label encoding ==============
le_rank = LabelEncoder().fit(y_rank_tr)
le_days = LabelEncoder().fit(y_days_tr)

y_rank_tr_enc = le_rank.transform(y_rank_tr)
y_rank_te_enc = le_rank.transform(y_rank_te)
y_days_tr_enc = le_days.transform(y_days_tr)
y_days_te_enc = le_days.transform(y_days_te)

# ============== Define and train the models (logistic regression, balanced weights) ==============
# `multi_class` is deliberately not passed: scikit-learn deprecated it in 1.5,
# and with the lbfgs solver a multiclass target is already fitted as a
# multinomial model by default (scikit-learn >= 0.22).
lr_params = dict(
    class_weight="balanced",
    solver="lbfgs",
    max_iter=2000,
    random_state=42,
)

rank_clf = LogisticRegression(**lr_params)
rank_clf.fit(X_tr, y_rank_tr_enc)

days_clf = LogisticRegression(**lr_params)
days_clf.fit(X_tr_d, y_days_tr_enc)

# ============== 评估（Accuracy、Macro-F1、报告、混淆矩阵） ==============
def evaluate(name, clf, Xte, yte_enc, label_encoder, label_order=None):
    """Print test-set metrics for one classifier and return them.

    Args:
        name: Task name shown in the printed header.
        clf: Fitted classifier exposing ``predict``.
        Xte: Test feature matrix passed to ``clf.predict``.
        yte_enc: Encoded true labels for ``Xte``.
        label_encoder: Encoder used to map encoded labels back to names.
        label_order: Label names in display order. Defaults to the encoder's
            classes, sorted.

    Returns:
        dict with ``accuracy``, ``macro_f1`` and ``confusion_matrix`` (a
        DataFrame whose rows are true labels and columns predicted labels).
    """
    # Resolve the display order once, up front, so the classification report
    # and the confusion matrix always cover the same labels in the same order.
    if label_order is None:
        label_order = sorted(label_encoder.classes_.tolist())

    pred_enc = clf.predict(Xte)
    acc = accuracy_score(yte_enc, pred_enc)
    f1m = f1_score(yte_enc, pred_enc, average="macro")

    # Work with readable label names for the report and the matrix.
    y_true = label_encoder.inverse_transform(yte_enc)
    y_pred = label_encoder.inverse_transform(pred_enc)
    print(f"\n=== {name} - Test (with Artist_Score) ===")
    print(f"Accuracy = {acc:.4f}   Macro-F1 = {f1m:.4f}")
    print("\nClassification report:")
    print(classification_report(y_true, y_pred, labels=label_order))
    cm = confusion_matrix(y_true, y_pred, labels=label_order)
    cm_df = pd.DataFrame(cm, index=[f"T:{l}" for l in label_order], columns=[f"P:{l}" for l in label_order])
    print("\nConfusion matrix:")
    print(cm_df)

    return {"accuracy": acc, "macro_f1": f1m, "confusion_matrix": cm_df}

# Score each classifier on the held-out split that belongs to its own task.
evaluate(
    name="RankBucket", clf=rank_clf, Xte=X_te, yte_enc=y_rank_te_enc,
    label_encoder=le_rank, label_order=rank_labels,
)
evaluate(
    name="DaysBucket", clf=days_clf, Xte=X_te_d, yte_enc=y_days_te_enc,
    label_encoder=le_days, label_order=days_labels,
)

# ============== Persist the models together with their metadata ==============
# Everything needed to reproduce a prediction travels in one file: feature
# order, both models, their label encoders, and the bucket definitions.
bundle = {"features": FEATURES}
bundle["rank"] = dict(model=rank_clf, label_encoder=le_rank, labels=rank_labels, bins=rank_bins)
bundle["days"] = dict(model=days_clf, label_encoder=le_days, labels=days_labels, bins=days_bins)

model_path = ART_DIR / "chart_cls_models_with_artist.pkl"
joblib.dump(bundle, model_path)
print(f"\n✅ 分类模型已保存到: {model_path}")

# ============== 推理函数（把概率也返回） ==============
def predict_buckets_from_features(feat_dict: dict, bundle=None):
    """Predict the rank bucket and days-on-chart bucket for one song.

    Args:
        feat_dict: Mapping from feature name to value; must contain every
            feature the models were trained on (including Artist_Score), e.g.
            {'Danceability':0.6,'Energy':0.8,'Valence':0.4,'Loudness':0.7,
             'Speechiness':0.05,'Acousticness':0.2,'Instrumentalness':0.0,
             'Artist_Score':0.73}
        bundle: Optional dict with keys ``features``, ``rank`` and ``days``
            (each task holding ``model``, ``label_encoder``, ``labels``), i.e.
            the layout saved with ``joblib.dump`` in this cell. When omitted,
            the module-level models are used; pass a bundle to stay
            independent of globals that other cells may rebind.

    Returns:
        {"RankBucket": {"pred": label, "probs": {label: prob}},
         "DaysBucket": {"pred": label, "probs": {label: prob}}}

    Raises:
        KeyError: If ``feat_dict`` lacks one or more required features.
    """
    if bundle is None:
        features = FEATURES
        tasks = (
            ("RankBucket", rank_clf, le_rank, rank_labels),
            ("DaysBucket", days_clf, le_days, days_labels),
        )
    else:
        features = bundle["features"]
        tasks = tuple(
            (out_key, bundle[key]["model"], bundle[key]["label_encoder"], bundle[key]["labels"])
            for out_key, key in (("RankBucket", "rank"), ("DaysBucket", "days"))
        )

    # Report every missing feature at once instead of failing on the first.
    absent = [f for f in features if f not in feat_dict]
    if absent:
        raise KeyError(f"feat_dict is missing required features: {absent}")
    x = np.array([[feat_dict[f] for f in features]], dtype=float)

    result = {}
    for out_key, clf, encoder, labels in tasks:
        proba = clf.predict_proba(x)[0]
        # predict_proba columns follow clf.classes_; decode those explicitly
        # rather than assuming column i is encoded label i.
        class_names = encoder.inverse_transform(clf.classes_)
        prob_by_label = {str(cls): float(p) for cls, p in zip(class_names, proba)}
        result[out_key] = {
            "pred": class_names[int(np.argmax(proba))],
            # A label the model never saw during training gets probability 0.
            "probs": {lab: prob_by_label.get(lab, 0.0) for lab in labels},
        }
    return result

# 示例：
# print(predict_buckets_from_features({
#   'Danceability':0.6,'Energy':0.8,'Valence':0.4,'Loudness':0.7,
#   'Speechiness':0.05,'Acousticness':0.2,'Instrumentalness':0.0,
#   'Artist_Score':0.73
# }))





=== RankBucket - Test (with Artist_Score) ===
Accuracy = 0.2717   Macro-F1 = 0.2615

Classification report:
              precision    recall  f1-score   support

       Top10       0.20      0.56      0.30       168
       11-50       0.29      0.19      0.23       431
      51-100       0.28      0.18      0.22       489
     101-150       0.25      0.12      0.16       393
     151-200       0.33      0.53      0.41       352

    accuracy                           0.27      1833
   macro avg       0.27      0.32      0.26      1833
weighted avg       0.28      0.27      0.25      1833


Confusion matrix:
           P:Top10  P:11-50  P:51-100  P:101-150  P:151-200
T:Top10         94       44        19          7          4
T:11-50        180       81        90         45         35
T:51-100        91       84        88         53        173
T:101-150       67       37        66         47        176
T:151-200       37       37        53         37        188

=== DaysBucket - Test 

分三类 排名：Top50 / 51–100 / >100

上榜天数：<30 / 30–100 / >100

In [5]:
# -*- coding: utf-8 -*-
"""
多分类训练（三分类版本，加入 Artist_Score）：
- 输入特征：7个音频特征（0~1） + Artist_Score（0~1） → 共8个特征
- 任务A：最高排名 Rank → 三分类：Top50 / 51–100 / >100
- 任务B：上榜天数 DaysOnChart → 三分类：<30 / 30–100 / >100
- 模型：逻辑回归（multinomial），class_weight='balanced'
- 评估：Accuracy、Macro-F1、分类报告、混淆矩阵
"""

import os
import joblib
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# ============== Path configuration ==============
CSV = Path(r"C:\Users\Akari\OneDrive\Desktop\SEM 1\AML\Final CW\Datas\spotify_preprocess_Artist.csv")
ART_DIR = Path(r"C:\Users\Akari\OneDrive\Desktop\SEM 1\AML\Final CW\PredictRankAndDay")
# parents=True so a missing intermediate folder does not abort the run.
ART_DIR.mkdir(parents=True, exist_ok=True)

# Seven audio features plus Artist_Score (8 columns in total).
FEATURES = [
    'Danceability', 'Energy', 'Valence', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness',
    'Artist_Score',
]

# ============== Load data ==============
df = pd.read_csv(CSV)

# Fail early if any column used below is absent from the CSV.
need_cols = ['id', 'Title', 'Artists', 'Nationality'] + FEATURES + ['Rank', 'DaysOnChart']
missing = [c for c in need_cols if c not in df.columns]
if missing:
    raise ValueError(f"输入文件缺少必要列：{missing}\n请确认预处理文件已包含 Artist_Score。")

# Fallback for missing Artist_Score: unparsable values become NaN and every
# NaN is filled with the column median.
# NOTE(review): the median is computed on the full table, i.e. before the
# train/test split, so the fill value also sees test rows.
df['Artist_Score'] = pd.to_numeric(df['Artist_Score'], errors='coerce')
df['Artist_Score'] = df['Artist_Score'].fillna(df['Artist_Score'].median())

# Keep only the needed columns and drop rows with a missing feature or target.
df = df[need_cols].dropna(subset=FEATURES + ['Rank', 'DaysOnChart']).copy()

# Clamp Rank into 1..200 so every row falls inside the rank bins below.
df['Rank'] = df['Rank'].clip(lower=1, upper=200).astype(int)
df['DaysOnChart'] = df['DaysOnChart'].astype(int)

# ============== Bucketing / labelling (three classes) ==============
# Task A: Rank -> Top50 / 51–100 / >100 (right-closed intervals).
rank_bins   = [0, 50, 100, 200]
rank_labels = ["Top50", "51–100", ">100"]
df['RankBucket'] = pd.cut(df['Rank'], bins=rank_bins, labels=rank_labels, right=True, include_lowest=True)

# Task B: DaysOnChart -> (-inf, 29], (29, 100], (100, inf).
days_bins   = [-np.inf, 29, 100, np.inf]
days_labels = ["<30", "30–100", ">100"]
df['DaysBucket'] = pd.cut(df['DaysOnChart'], bins=days_bins, labels=days_labels, right=True)

# Drop any row that could not be assigned to a bucket.
df = df.dropna(subset=['RankBucket', 'DaysBucket']).copy()

# ============== Features and labels ==============
X = df[FEATURES].values
y_rank = df['RankBucket'].astype(str).values
y_days = df['DaysBucket'].astype(str).values

# ============== Stratified split (keeps the class distribution) ==============
# Each task is stratified on its own target, so the two tasks get different
# train/test partitions of the same feature matrix.
X_tr, X_te, y_rank_tr, y_rank_te = train_test_split(
    X, y_rank, test_size=0.2, random_state=42, stratify=y_rank
)
X_tr_d, X_te_d, y_days_tr, y_days_te = train_test_split(
    X, y_days, test_size=0.2, random_state=42, stratify=y_days
)

# ============== Label encoding (kept for saving and inference) ==============
le_rank = LabelEncoder().fit(y_rank_tr)
le_days = LabelEncoder().fit(y_days_tr)
y_rank_tr_enc = le_rank.transform(y_rank_tr)
y_rank_te_enc = le_rank.transform(y_rank_te)
y_days_tr_enc = le_days.transform(y_days_tr)
y_days_te_enc = le_days.transform(y_days_te)

# ============== Define and train the models (logistic regression, balanced weights) ==============
# `multi_class` is deliberately not passed: scikit-learn deprecated it in 1.5,
# and with the lbfgs solver a multiclass target is already fitted as a
# multinomial model by default (scikit-learn >= 0.22).
lr_params = dict(
    class_weight="balanced",
    solver="lbfgs",
    max_iter=2000,
    random_state=42,
)

rank_clf = LogisticRegression(**lr_params)
rank_clf.fit(X_tr, y_rank_tr_enc)

days_clf = LogisticRegression(**lr_params)
days_clf.fit(X_tr_d, y_days_tr_enc)

# ============== 评估函数（Accuracy、Macro-F1、报告、混淆矩阵） ==============
def evaluate(name, clf, Xte, yte_enc, label_encoder, label_order=None):
    """Print test-set metrics for one 3-class classifier and return them.

    Args:
        name: Task name shown in the printed header.
        clf: Fitted classifier exposing ``predict``.
        Xte: Test feature matrix passed to ``clf.predict``.
        yte_enc: Encoded true labels for ``Xte``.
        label_encoder: Encoder used to map encoded labels back to names.
        label_order: Label names in display order. Defaults to the encoder's
            classes, sorted.

    Returns:
        dict with ``accuracy``, ``macro_f1`` and ``confusion_matrix`` (a
        DataFrame whose rows are true labels and columns predicted labels).
    """
    # Resolve the display order once, up front, so the classification report
    # and the confusion matrix always cover the same labels in the same order.
    if label_order is None:
        label_order = sorted(label_encoder.classes_.tolist())

    pred_enc = clf.predict(Xte)
    acc = accuracy_score(yte_enc, pred_enc)
    f1m = f1_score(yte_enc, pred_enc, average="macro")

    # Work with readable label names for the report and the matrix.
    y_true = label_encoder.inverse_transform(yte_enc)
    y_pred = label_encoder.inverse_transform(pred_enc)
    print(f"\n=== {name} - Test (3-class, with Artist_Score) ===")
    print(f"Accuracy = {acc:.4f}   Macro-F1 = {f1m:.4f}")
    print("\nClassification report:")
    print(classification_report(y_true, y_pred, labels=label_order))
    cm = confusion_matrix(y_true, y_pred, labels=label_order)
    cm_df = pd.DataFrame(cm, index=[f"T:{l}" for l in label_order], columns=[f"P:{l}" for l in label_order])
    print("\nConfusion matrix:")
    print(cm_df)

    return {"accuracy": acc, "macro_f1": f1m, "confusion_matrix": cm_df}

# Score each classifier on the held-out split that belongs to its own task.
evaluate(
    name="RankBucket", clf=rank_clf, Xte=X_te, yte_enc=y_rank_te_enc,
    label_encoder=le_rank, label_order=rank_labels,
)
evaluate(
    name="DaysBucket", clf=days_clf, Xte=X_te_d, yte_enc=y_days_te_enc,
    label_encoder=le_days, label_order=days_labels,
)

# ============== Persist the models together with their metadata ==============
# Feature order, both models, their label encoders and the bucket definitions
# are stored in a single file.
bundle = {"features": FEATURES}
bundle["rank"] = dict(model=rank_clf, label_encoder=le_rank, labels=rank_labels, bins=rank_bins)
bundle["days"] = dict(model=days_clf, label_encoder=le_days, labels=days_labels, bins=days_bins)

model_path = ART_DIR / "chart_cls_3class_models_with_artist.pkl"
joblib.dump(bundle, model_path)
print(f"\n✅ 三分类逻辑回归模型已保存到: {model_path}")

# ============== 推理函数（带概率输出） ==============
def predict_buckets_from_features(feat_dict: dict, bundle=None):
    """Predict the 3-class rank bucket and days-on-chart bucket for one song.

    Args:
        feat_dict: Mapping from feature name to value; must contain every
            feature the models were trained on (including Artist_Score), e.g.
            {'Danceability':0.6,'Energy':0.8,'Valence':0.4,'Loudness':0.7,
             'Speechiness':0.05,'Acousticness':0.2,'Instrumentalness':0.0,
             'Artist_Score':0.73}
        bundle: Optional dict with keys ``features``, ``rank`` and ``days``
            (each task holding ``model``, ``label_encoder``, ``labels``), i.e.
            the layout saved with ``joblib.dump`` in this cell. When omitted,
            the module-level models are used; pass a bundle to stay
            independent of globals that other cells may rebind.

    Returns:
        {"RankBucket": {"pred": label, "probs": {label: prob}},
         "DaysBucket": {"pred": label, "probs": {label: prob}}}

    Raises:
        KeyError: If ``feat_dict`` lacks one or more required features.
    """
    if bundle is None:
        features = FEATURES
        tasks = (
            ("RankBucket", rank_clf, le_rank, rank_labels),
            ("DaysBucket", days_clf, le_days, days_labels),
        )
    else:
        features = bundle["features"]
        tasks = tuple(
            (out_key, bundle[key]["model"], bundle[key]["label_encoder"], bundle[key]["labels"])
            for out_key, key in (("RankBucket", "rank"), ("DaysBucket", "days"))
        )

    # Report every missing feature at once instead of failing on the first.
    absent = [f for f in features if f not in feat_dict]
    if absent:
        raise KeyError(f"feat_dict is missing required features: {absent}")
    x = np.array([[feat_dict[f] for f in features]], dtype=float)

    result = {}
    for out_key, clf, encoder, labels in tasks:
        proba = clf.predict_proba(x)[0]
        # predict_proba columns follow clf.classes_; decode those explicitly
        # rather than assuming column i is encoded label i.
        class_names = encoder.inverse_transform(clf.classes_)
        prob_by_label = {str(cls): float(p) for cls, p in zip(class_names, proba)}
        result[out_key] = {
            "pred": class_names[int(np.argmax(proba))],
            # A label the model never saw during training gets probability 0.
            "probs": {lab: prob_by_label.get(lab, 0.0) for lab in labels},
        }
    return result

# 示例：
# print(predict_buckets_from_features({
#   'Danceability':0.6,'Energy':0.8,'Valence':0.4,'Loudness':0.7,
#   'Speechiness':0.05,'Acousticness':0.2,'Instrumentalness':0.0,
#   'Artist_Score':0.73
# }))





=== RankBucket - Test (3-class, with Artist_Score) ===
Accuracy = 0.5177   Macro-F1 = 0.4830

Classification report:
              precision    recall  f1-score   support

       Top50       0.53      0.66      0.59       599
      51–100       0.30      0.22      0.26       489
        >100       0.61      0.60      0.60       745

    accuracy                           0.52      1833
   macro avg       0.48      0.49      0.48      1833
weighted avg       0.50      0.52      0.51      1833


Confusion matrix:
          P:Top50  P:51–100  P:>100
T:Top50       393       140      66
T:51–100      158       109     222
T:>100        185       113     447

=== DaysBucket - Test (3-class, with Artist_Score) ===
Accuracy = 0.5074   Macro-F1 = 0.4082

Classification report:
              precision    recall  f1-score   support

         <30       0.82      0.55      0.66      1277
      30–100       0.21      0.21      0.21       294
        >100       0.25      0.65      0.36       262

  

随机森林

In [21]:
# -*- coding: utf-8 -*-
"""
非线性树模型对照实验（加入 Artist_Score）：
- 模型：RandomForestClassifier（两套：RankBucket / DaysBucket）
- 特征：7个音频特征 + Artist_Score（共 8 个特征）
- 标签：五分类（与逻辑回归一致）
- 评估：Accuracy、Macro-F1、分类报告、混淆矩阵
- 输出：PredictRankAndDay/chart_cls_rf_models_with_artist.pkl
"""

import os
import joblib
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# ============== Path configuration ==============
CSV = Path(r"C:\Users\Akari\OneDrive\Desktop\SEM 1\AML\Final CW\Datas\spotify_preprocess_Artist.csv")
ART_DIR = Path(r"C:\Users\Akari\OneDrive\Desktop\SEM 1\AML\Final CW\PredictRankAndDay")
# parents=True so a missing intermediate folder does not abort the run.
ART_DIR.mkdir(parents=True, exist_ok=True)

# Seven audio features plus Artist_Score (8 columns in total).
FEATURES = [
    'Danceability', 'Energy', 'Valence', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness',
    'Artist_Score',
]

# ============== Load data ==============
df = pd.read_csv(CSV)
# Fail early if any column used below is absent from the CSV.
need_cols = ['id', 'Title', 'Artists', 'Nationality'] + FEATURES + ['Rank', 'DaysOnChart']
missing = [c for c in need_cols if c not in df.columns]
if missing:
    raise ValueError(f"输入文件缺少必要列：{missing}\n请确认预处理已包含 Artist_Score。")

# Fallback for missing Artist_Score: unparsable values become NaN and every
# NaN is filled with the column median.
# NOTE(review): the median is computed on the full table, i.e. before the
# train/test split, so the fill value also sees test rows.
df['Artist_Score'] = pd.to_numeric(df['Artist_Score'], errors='coerce')
df['Artist_Score'] = df['Artist_Score'].fillna(df['Artist_Score'].median())

# Keep only the needed columns and drop rows with a missing feature or target.
df = df[need_cols].dropna(subset=FEATURES + ['Rank', 'DaysOnChart']).copy()
# Clamp Rank into 1..200 so every row falls inside the rank bins below.
df['Rank'] = df['Rank'].clip(lower=1, upper=200).astype(int)
df['DaysOnChart'] = df['DaysOnChart'].astype(int)

# ============== Bucketing / labelling (five classes) ==============
# Right-closed intervals; include_lowest keeps the left edge of the first bin.
rank_bins   = [0, 10, 50, 100, 150, 200]
rank_labels = ["Top10", "11-50", "51-100", "101-150", "151-200"]
df['RankBucket'] = pd.cut(df['Rank'], bins=rank_bins, labels=rank_labels, right=True, include_lowest=True)

# Right-closed intervals: (-inf, 9], (9, 30], (30, 50], (50, 100], (100, inf).
days_bins   = [-np.inf, 9, 30, 50, 100, np.inf]
days_labels = ["<10", "10-30", "30-50", "50-100", ">100"]
df['DaysBucket'] = pd.cut(df['DaysOnChart'], bins=days_bins, labels=days_labels, right=True)

# Drop any row that could not be assigned to a bucket.
df = df.dropna(subset=['RankBucket', 'DaysBucket']).copy()

# ============== Features and labels ==============
X = df[FEATURES].to_numpy()
y_rank = df['RankBucket'].astype(str).to_numpy()
y_days = df['DaysBucket'].astype(str).to_numpy()

# ============== Stratified split (keeps the class distribution) ==============
# Each task is stratified on its own target, so the two calls produce
# different partitions of the same feature matrix.
split_kwargs = dict(test_size=0.2, random_state=42)
X_tr, X_te, y_rank_tr, y_rank_te = train_test_split(X, y_rank, stratify=y_rank, **split_kwargs)
X_tr_d, X_te_d, y_days_tr, y_days_te = train_test_split(X, y_days, stratify=y_days, **split_kwargs)

# ============== Label encoding (encoders are kept to decode predictions) ==============
le_rank = LabelEncoder()
y_rank_tr_enc = le_rank.fit_transform(y_rank_tr)
y_rank_te_enc = le_rank.transform(y_rank_te)

le_days = LabelEncoder()
y_days_tr_enc = le_days.fit_transform(y_days_tr)
y_days_te_enc = le_days.transform(y_days_te)

# ============== Random forest settings (tune as needed) ==============
rf_params = {
    "n_estimators": 400,
    "max_depth": None,
    "min_samples_leaf": 2,
    "class_weight": "balanced_subsample",
    "random_state": 42,
    "n_jobs": -1,
}

# One forest per task, each trained on its own split.
rank_clf = RandomForestClassifier(**rf_params).fit(X_tr, y_rank_tr_enc)
days_clf = RandomForestClassifier(**rf_params).fit(X_tr_d, y_days_tr_enc)

# ============== 评估函数 ==============
def evaluate(name, clf, Xte, yte_enc, label_encoder, label_order=None):
    """Print test-set metrics for one random-forest classifier and return them.

    Args:
        name: Task name shown in the printed header.
        clf: Fitted classifier exposing ``predict``.
        Xte: Test feature matrix passed to ``clf.predict``.
        yte_enc: Encoded true labels for ``Xte``.
        label_encoder: Encoder used to map encoded labels back to names.
        label_order: Label names in display order. Defaults to the encoder's
            classes, sorted.

    Returns:
        dict with ``accuracy``, ``macro_f1`` and ``confusion_matrix`` (a
        DataFrame whose rows are true labels and columns predicted labels).
    """
    # Resolve the display order once, up front, so the classification report
    # and the confusion matrix always cover the same labels in the same order.
    if label_order is None:
        label_order = sorted(label_encoder.classes_.tolist())

    pred_enc = clf.predict(Xte)
    acc = accuracy_score(yte_enc, pred_enc)
    f1m = f1_score(yte_enc, pred_enc, average="macro")
    # Work with readable label names for the report and the matrix.
    y_true = label_encoder.inverse_transform(yte_enc)
    y_pred = label_encoder.inverse_transform(pred_enc)

    print(f"\n=== {name} - Test (RandomForest, with Artist_Score) ===")
    print(f"Accuracy = {acc:.4f}   Macro-F1 = {f1m:.4f}")
    print("\nClassification report:")
    print(classification_report(y_true, y_pred, labels=label_order))
    cm = confusion_matrix(y_true, y_pred, labels=label_order)
    cm_df = pd.DataFrame(cm, index=[f"T:{l}" for l in label_order], columns=[f"P:{l}" for l in label_order])
    print("\nConfusion matrix:")
    print(cm_df)

    return {"accuracy": acc, "macro_f1": f1m, "confusion_matrix": cm_df}

# Score each forest on the held-out split that belongs to its own task.
evaluate(
    name="RankBucket", clf=rank_clf, Xte=X_te, yte_enc=y_rank_te_enc,
    label_encoder=le_rank, label_order=rank_labels,
)
evaluate(
    name="DaysBucket", clf=days_clf, Xte=X_te_d, yte_enc=y_days_te_enc,
    label_encoder=le_days, label_order=days_labels,
)

# ============== 特征重要性（可解释） ==============
def print_feature_importance(name, clf):
    """Print the fitted model's feature importances, largest first.

    Args:
        name: Task name shown in the printed header.
        clf: Fitted estimator exposing ``feature_importances_``, with one
            entry per name in the module-level ``FEATURES`` list.
    """
    importances = clf.feature_importances_
    # argsort is ascending; flip it so the most important feature comes first.
    ranking = np.flip(np.argsort(importances))
    print(f"\n{name} - Feature Importance:")
    for idx in ranking:
        print(f"  {FEATURES[idx]:16s}  {importances[idx]:.4f}")

# Show which features each forest relied on most.
print_feature_importance(name="RankBucket", clf=rank_clf)
print_feature_importance(name="DaysBucket", clf=days_clf)

# ============== Persist the models together with their metadata ==============
# Feature order, both models, their label encoders and the bucket definitions
# are stored in a single file.
bundle = {"features": FEATURES}
bundle["rank"] = dict(model=rank_clf, label_encoder=le_rank, labels=rank_labels, bins=rank_bins)
bundle["days"] = dict(model=days_clf, label_encoder=le_days, labels=days_labels, bins=days_bins)

out_path = ART_DIR / "chart_cls_rf_models_with_artist.pkl"
joblib.dump(bundle, out_path)
print(f"\n✅ 随机森林分类模型已保存到: {out_path}")

# ============== 推理函数（含概率） ==============
def predict_buckets_from_features_rf(feat_dict: dict, bundle=None):
    """Predict the rank bucket and days-on-chart bucket with the random forests.

    Args:
        feat_dict: Mapping from feature name to value; must contain every
            feature the models were trained on (including Artist_Score).
        bundle: Optional dict with keys ``features``, ``rank`` and ``days``
            (each task holding ``model``, ``label_encoder``, ``labels``), i.e.
            the layout saved with ``joblib.dump`` in this cell. When omitted,
            the module-level models are used; pass a bundle to stay
            independent of globals that other cells may rebind.

    Returns:
        {"RankBucket": {"pred": label, "probs": {label: prob}},
         "DaysBucket": {"pred": label, "probs": {label: prob}}}

    Raises:
        KeyError: If ``feat_dict`` lacks one or more required features.
    """
    if bundle is None:
        features = FEATURES
        tasks = (
            ("RankBucket", rank_clf, le_rank, rank_labels),
            ("DaysBucket", days_clf, le_days, days_labels),
        )
    else:
        features = bundle["features"]
        tasks = tuple(
            (out_key, bundle[key]["model"], bundle[key]["label_encoder"], bundle[key]["labels"])
            for out_key, key in (("RankBucket", "rank"), ("DaysBucket", "days"))
        )

    # Report every missing feature at once instead of failing on the first.
    absent = [f for f in features if f not in feat_dict]
    if absent:
        raise KeyError(f"feat_dict is missing required features: {absent}")
    x = np.array([[feat_dict[f] for f in features]], dtype=float)

    result = {}
    for out_key, clf, encoder, labels in tasks:
        proba = clf.predict_proba(x)[0]
        # predict_proba columns follow clf.classes_; decode those explicitly
        # rather than assuming column i is encoded label i.
        class_names = encoder.inverse_transform(clf.classes_)
        prob_by_label = {str(cls): float(p) for cls, p in zip(class_names, proba)}
        result[out_key] = {
            "pred": class_names[int(np.argmax(proba))],
            # A label the model never saw during training gets probability 0.
            "probs": {lab: prob_by_label.get(lab, 0.0) for lab in labels},
        }
    return result

# # 示例：
# print(predict_buckets_from_features_rf({
#   'Danceability':0.6,'Energy':0.8,'Valence':0.4,'Loudness':0.7,
#   'Speechiness':0.05,'Acousticness':0.2,'Instrumentalness':0.0,
#   'Artist_Score':0.73
# }))



=== RankBucket - Test (RandomForest, with Artist_Score) ===
Accuracy = 0.3573   Macro-F1 = 0.3510

Classification report:
              precision    recall  f1-score   support

       Top10       0.40      0.32      0.35       168
       11-50       0.38      0.52      0.44       431
      51-100       0.33      0.36      0.34       489
     101-150       0.31      0.27      0.29       393
     151-200       0.41      0.28      0.33       352

    accuracy                           0.36      1833
   macro avg       0.37      0.35      0.35      1833
weighted avg       0.36      0.36      0.35      1833


Confusion matrix:
           P:Top10  P:11-50  P:51-100  P:101-150  P:151-200
T:Top10         53       83        19         10          3
T:11-50         48      223       114         31         15
T:51-100        18      135       174        110         52
T:101-150        9       96       109        107         72
T:151-200        3       55       114         82         98

=== Days

三类

In [19]:
# -*- coding: utf-8 -*-
"""
三分类版本（非线性随机森林，加入 Artist_Score）：
- RankBucket:  Top50 / 51–100 / >100
- DaysBucket:  <30 / 30–100 / >100
- 特征：7 个音频特征 + Artist_Score  → 共 8 个
"""

import os
import joblib
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# ============== Path configuration ==============
CSV = Path(r"C:\Users\Akari\OneDrive\Desktop\SEM 1\AML\Final CW\Datas\spotify_preprocess_Artist.csv")
ART_DIR = Path(r"C:\Users\Akari\OneDrive\Desktop\SEM 1\AML\Final CW\PredictRankAndDay")
# parents=True so a missing intermediate folder does not abort the run.
ART_DIR.mkdir(parents=True, exist_ok=True)

# Seven audio features plus Artist_Score (8 columns in total).
FEATURES = [
    'Danceability', 'Energy', 'Valence', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness',
    'Artist_Score',
]

# ============== Load data ==============
df = pd.read_csv(CSV)
# Fail early if any column used below is absent from the CSV.
need_cols = ['id', 'Title', 'Artists', 'Nationality'] + FEATURES + ['Rank', 'DaysOnChart']
missing = [c for c in need_cols if c not in df.columns]
if missing:
    raise ValueError(f"输入文件缺少必要列：{missing}\n请确认预处理已包含 Artist_Score。")

# Fallback for missing Artist_Score: unparsable values become NaN and every
# NaN is filled with the column median.
# NOTE(review): the median is computed on the full table, i.e. before the
# train/test split, so the fill value also sees test rows.
df['Artist_Score'] = pd.to_numeric(df['Artist_Score'], errors='coerce')
df['Artist_Score'] = df['Artist_Score'].fillna(df['Artist_Score'].median())

# Keep only the needed columns and drop rows with a missing feature or target.
df = df[need_cols].dropna(subset=FEATURES + ['Rank', 'DaysOnChart']).copy()
# Clamp Rank into 1..200 so every row falls inside the rank bins below.
df['Rank'] = df['Rank'].clip(lower=1, upper=200).astype(int)
df['DaysOnChart'] = df['DaysOnChart'].astype(int)

# ============== Three-class bucketing ==============
# Rank: Top50 / 51–100 / >100 (right-closed intervals).
rank_bins   = [0, 50, 100, 200]
rank_labels = ["Top50", "51–100", ">100"]
df['RankBucket'] = pd.cut(df['Rank'], bins=rank_bins, labels=rank_labels, right=True, include_lowest=True)

# Days: (-inf, 29], (29, 100], (100, inf).
days_bins   = [-np.inf, 29, 100, np.inf]
days_labels = ["<30", "30–100", ">100"]
df['DaysBucket'] = pd.cut(df['DaysOnChart'], bins=days_bins, labels=days_labels, right=True)

# Drop any row that could not be assigned to a bucket.
df = df.dropna(subset=['RankBucket', 'DaysBucket']).copy()

# ============== Features and labels ==============
X = df[FEATURES].to_numpy()
y_rank = df['RankBucket'].astype(str).to_numpy()
y_days = df['DaysBucket'].astype(str).to_numpy()

# ============== Stratified split (keeps the class distribution) ==============
# Each task is stratified on its own target, so the two calls produce
# different partitions of the same feature matrix.
split_kwargs = dict(test_size=0.2, random_state=42)
Xtr_r, Xte_r, y_rank_tr, y_rank_te = train_test_split(X, y_rank, stratify=y_rank, **split_kwargs)
Xtr_d, Xte_d, y_days_tr, y_days_te = train_test_split(X, y_days, stratify=y_days, **split_kwargs)

# ============== Label encoding ==============
le_rank = LabelEncoder()
y_rank_tr_enc = le_rank.fit_transform(y_rank_tr)
y_rank_te_enc = le_rank.transform(y_rank_te)

le_days = LabelEncoder()
y_days_tr_enc = le_days.fit_transform(y_days_tr)
y_days_te_enc = le_days.transform(y_days_te)

# ============== Random forest settings (same as the 5-class run; tune as needed) ==============
rf_params = {
    "n_estimators": 400,
    "max_depth": None,           # try 12 or 14 if the forest overfits
    "min_samples_leaf": 2,       # raising to 4 may generalise better
    "class_weight": "balanced_subsample",
    "random_state": 42,
    "n_jobs": -1,
}

# One forest per task, each trained on its own split.
rank_clf = RandomForestClassifier(**rf_params).fit(Xtr_r, y_rank_tr_enc)
days_clf = RandomForestClassifier(**rf_params).fit(Xtr_d, y_days_tr_enc)

# ============== 评估函数 ==============
def evaluate(name, clf, Xte, yte_enc, label_encoder, label_order=None):
    """Print test-set metrics for one 3-class random forest and return them.

    Args:
        name: Task name shown in the printed header.
        clf: Fitted classifier exposing ``predict``.
        Xte: Test feature matrix passed to ``clf.predict``.
        yte_enc: Encoded true labels for ``Xte``.
        label_encoder: Encoder used to map encoded labels back to names.
        label_order: Label names in display order. Defaults to the encoder's
            classes, sorted.

    Returns:
        dict with ``accuracy``, ``macro_f1`` and ``confusion_matrix`` (a
        DataFrame whose rows are true labels and columns predicted labels).
    """
    # Resolve the display order once, up front, so the classification report
    # and the confusion matrix always cover the same labels in the same order.
    if label_order is None:
        label_order = sorted(label_encoder.classes_.tolist())

    pred_enc = clf.predict(Xte)
    acc = accuracy_score(yte_enc, pred_enc)
    f1m = f1_score(yte_enc, pred_enc, average="macro")
    # Work with readable label names for the report and the matrix.
    y_true = label_encoder.inverse_transform(yte_enc)
    y_pred = label_encoder.inverse_transform(pred_enc)
    print(f"\n=== {name} - Test (RandomForest, 3-class, with Artist_Score) ===")
    print(f"Accuracy = {acc:.4f}   Macro-F1 = {f1m:.4f}")
    print("\nClassification report:")
    print(classification_report(y_true, y_pred, labels=label_order))
    cm = confusion_matrix(y_true, y_pred, labels=label_order)
    cm_df = pd.DataFrame(cm, index=[f"T:{l}" for l in label_order], columns=[f"P:{l}" for l in label_order])
    print("\nConfusion matrix:")
    print(cm_df)

    return {"accuracy": acc, "macro_f1": f1m, "confusion_matrix": cm_df}

# Score each forest on the held-out split that belongs to its own task.
evaluate(
    name="RankBucket", clf=rank_clf, Xte=Xte_r, yte_enc=y_rank_te_enc,
    label_encoder=le_rank, label_order=rank_labels,
)
evaluate(
    name="DaysBucket", clf=days_clf, Xte=Xte_d, yte_enc=y_days_te_enc,
    label_encoder=le_days, label_order=days_labels,
)

# ============== 特征重要性 ==============
def show_feature_importance(name, clf):
    """Print the fitted model's feature importances, largest first.

    Args:
        name: Task name shown in the printed header.
        clf: Fitted estimator exposing ``feature_importances_``, with one
            entry per name in the module-level ``FEATURES`` list.
    """
    importances = clf.feature_importances_
    # argsort is ascending; flip it so the most important feature comes first.
    ranking = np.flip(np.argsort(importances))
    print(f"\n{name} - Feature Importance:")
    for idx in ranking:
        print(f"  {FEATURES[idx]:16s}  {importances[idx]:.4f}")

# Show which features each forest relied on most.
show_feature_importance(name="RankBucket", clf=rank_clf)
show_feature_importance(name="DaysBucket", clf=days_clf)

# ============== Persist the models together with their metadata ==============
# Feature order, both models, their label encoders and the bucket definitions
# are stored in a single file.
bundle = {"features": FEATURES}
bundle["rank"] = dict(model=rank_clf, label_encoder=le_rank, labels=rank_labels, bins=rank_bins)
bundle["days"] = dict(model=days_clf, label_encoder=le_days, labels=days_labels, bins=days_bins)

outp = ART_DIR / "chart_cls_rf_3class_models_with_artist.pkl"
joblib.dump(bundle, outp)
print(f"\n✅ 三分类随机森林模型已保存到: {outp}")

# ============== 推理函数 ==============
def predict_3class_from_features(feat_dict: dict, bundle=None):
    """Predict the 3-class rank and days-on-chart buckets with the random forests.

    Args:
        feat_dict: Mapping from feature name to value; must contain every
            feature the models were trained on (including Artist_Score).
        bundle: Optional dict with keys ``features``, ``rank`` and ``days``
            (each task holding ``model``, ``label_encoder``, ``labels``), i.e.
            the layout saved with ``joblib.dump`` in this cell. When omitted,
            the module-level models are used; pass a bundle to stay
            independent of globals that other cells may rebind.

    Returns:
        {"RankBucket": {"pred": label, "probs": {label: prob}},
         "DaysBucket": {"pred": label, "probs": {label: prob}}}

    Raises:
        KeyError: If ``feat_dict`` lacks one or more required features.
    """
    if bundle is None:
        features = FEATURES
        tasks = (
            ("RankBucket", rank_clf, le_rank, rank_labels),
            ("DaysBucket", days_clf, le_days, days_labels),
        )
    else:
        features = bundle["features"]
        tasks = tuple(
            (out_key, bundle[key]["model"], bundle[key]["label_encoder"], bundle[key]["labels"])
            for out_key, key in (("RankBucket", "rank"), ("DaysBucket", "days"))
        )

    # Report every missing feature at once instead of failing on the first.
    absent = [f for f in features if f not in feat_dict]
    if absent:
        raise KeyError(f"feat_dict is missing required features: {absent}")
    x = np.array([[feat_dict[f] for f in features]], dtype=float)

    result = {}
    for out_key, clf, encoder, labels in tasks:
        proba = clf.predict_proba(x)[0]
        # predict_proba columns follow clf.classes_; decode those explicitly
        # rather than assuming column i is encoded label i.
        class_names = encoder.inverse_transform(clf.classes_)
        prob_by_label = {str(cls): float(p) for cls, p in zip(class_names, proba)}
        result[out_key] = {
            "pred": class_names[int(np.argmax(proba))],
            # A label the model never saw during training gets probability 0.
            "probs": {lab: prob_by_label.get(lab, 0.0) for lab in labels},
        }
    return result

# # 示例：
# print(predict_3class_from_features({
#   'Danceability':0.6,'Energy':0.8,'Valence':0.4,'Loudness':0.7,
#   'Speechiness':0.05,'Acousticness':0.2,'Instrumentalness':0.0,
#   'Artist_Score':0.73
# }))



=== RankBucket - Test (RandomForest, 3-class, with Artist_Score) ===
Accuracy = 0.5401   Macro-F1 = 0.4926

Classification report:
              precision    recall  f1-score   support

       Top50       0.57      0.69      0.62       599
      51–100       0.34      0.19      0.24       489
        >100       0.58      0.65      0.61       745

    accuracy                           0.54      1833
   macro avg       0.50      0.51      0.49      1833
weighted avg       0.51      0.54      0.52      1833


Confusion matrix:
          P:Top50  P:51–100  P:>100
T:Top50       415        75     109
T:51–100      157        91     241
T:>100        158       103     484

=== DaysBucket - Test (RandomForest, 3-class, with Artist_Score) ===
Accuracy = 0.6727   Macro-F1 = 0.3885

Classification report:
              precision    recall  f1-score   support

         <30       0.74      0.91      0.81      1277
      30–100       0.26      0.08      0.12       294
        >100       0.30      