<a href="https://colab.research.google.com/github/ANNE513/first-group/blob/main/11_16%E6%9C%80%E7%B5%82%E6%A8%A1%E5%9E%8B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ============================================================
# Step 0: transformers version check
# ============================================================
import importlib, subprocess, sys

def get_training_args():
    """Build the ``TrainingArguments`` for this run, adapted to the installed transformers.

    Prints the detected transformers version, then constructs the arguments.
    Within the 4.x series the per-epoch evaluation keyword was renamed
    (``evaluation_strategy`` -> ``eval_strategy``), so a check on the major
    version alone is not enough: the keyword is picked by inspecting which
    parameters the installed ``TrainingArguments`` constructor accepts.

    Returns:
        transformers.TrainingArguments: arguments writing checkpoints to
        ``./results`` and logs to ``./logs`` with reporting disabled.
    """
    import inspect

    import transformers
    version = transformers.__version__
    print(f"üîç transformers ÁâàÊú¨: {version}")
    major = int(version.split('.')[0])

    from transformers import TrainingArguments

    if major >= 4:
        # Choose whichever spelling of "evaluate every epoch" this release
        # understands; passing the wrong one raises TypeError at construction.
        accepted = inspect.signature(TrainingArguments.__init__).parameters
        if "eval_strategy" in accepted:
            eval_kwargs = {"eval_strategy": "epoch"}
        elif "evaluation_strategy" in accepted:
            eval_kwargs = {"evaluation_strategy": "epoch"}
        else:
            # Neither keyword exists: fall back to the plain evaluation switch.
            eval_kwargs = {"do_eval": True}

        return TrainingArguments(
            output_dir="./results",
            save_strategy="epoch",
            learning_rate=3e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=5,
            weight_decay=0.01,
            logging_dir="./logs",
            logging_steps=50,
            report_to="none",
            **eval_kwargs,
        )

    # Legacy (pre-4.x) path, kept as originally written.
    # NOTE(review): this branch trains for 2 epochs while the 4.x branch uses 5.
    return TrainingArguments(
        output_dir="./results",
        do_eval=True,
        save_strategy="epoch",
        learning_rate=3e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=50,
        report_to="none",
    )

# Install the Hugging Face stack only when transformers cannot be imported.
try:
    import transformers
except ImportError:
    # Catch ImportError only: a bare ``except`` would also swallow
    # KeyboardInterrupt/SystemExit and start a pip install on any failure.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-U", "transformers", "accelerate", "datasets"])
    import transformers

# ============================================================
# Step 1: upload the training data
# ============================================================
from google.colab import files
import pandas as pd

# Prompt for the two spreadsheets, then open the Colab upload dialog.
print("üì§ Ë´ã‰∏äÂÇ≥ train_split.xlsx Ëàá test_split.xlsx...")
uploaded = files.upload()

# Both files must be present under exactly these names, because they are
# read back by name just below.
if "train_split.xlsx" not in uploaded or "test_split.xlsx" not in uploaded:
    raise ValueError("‚ùå Ë´ãÂøÖÈ†à‰∏äÂÇ≥ train_split.xlsx + test_split.xlsx ÊâçËÉΩË®ìÁ∑¥ÔºÅ")

# Later steps read the "content_cleaned" and "label" columns of both frames.
train_df = pd.read_excel("train_split.xlsx")
test_df = pd.read_excel("test_split.xlsx")

# Report the number of rows in each split.
print(f"üìä Ë®ìÁ∑¥ÈõÜÁ≠ÜÊï∏: {len(train_df)}, Ê∏¨Ë©¶ÈõÜÁ≠ÜÊï∏: {len(test_df)}")

# ============================================================
# Step 2: Tokenizer / Dataset
# ============================================================
import torch, random, numpy as np
from collections import Counter
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
from torch.nn import CrossEntropyLoss
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Use the GPU when PyTorch reports CUDA as available; otherwise run on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

def set_seed(seed=42):
    """Seed Python's ``random``, NumPy's global RNG and PyTorch with ``seed``.

    Args:
        seed: Integer seed applied to all three generators (default 42).
    """
    # Same seeding order as before: random, then NumPy, then torch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)


# Seed once at start-up with the default value.
set_seed()

# Tokenizer for the same checkpoint name that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext")


def encode_texts(texts):
    """Tokenize ``texts`` with the module-level ``tokenizer``.

    Args:
        texts: List of strings to encode.

    Returns:
        The tokenizer's encoding of ``texts``, with truncation enabled,
        ``max_length=256`` and ``padding=True``.
    """
    tokenizer_options = {"truncation": True, "padding": True, "max_length": 256}
    return tokenizer(texts, **tokenizer_options)


# Encode the "content_cleaned" column of each split (train first, then test).
train_enc, test_enc = (
    encode_texts(frame["content_cleaned"].tolist())
    for frame in (train_df, test_df)
)

class FoodDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    Attributes are stored as given:
        encodings: mapping of field name -> per-example sequences.
        labels: sequence of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples (the number of labels)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including a ``"labels"`` entry."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Pair each split's encodings with its "label" column.
train_dataset = FoodDataset(train_enc, train_df["label"].tolist())
test_dataset  = FoodDataset(test_enc,  test_df["label"].tolist())

# ============================================================
# Step 3: class weights + model
# ============================================================
label_counts = Counter(train_df["label"].tolist())
total = sum(label_counts.values())

# Counter yields 0 for a label it never saw, so a training split that lacks
# one class would otherwise fail below with a bare ZeroDivisionError.
missing_labels = [i for i in range(2) if label_counts[i] == 0]
if missing_labels:
    raise ValueError(
        f"train_split.xlsx has no rows with label(s) {missing_labels}; "
        "both integer labels 0 and 1 are required to compute class weights."
    )

# Inverse-frequency weights: the rarer class gets the larger weight.
weights = [total / label_counts[i] for i in range(2)]
class_weights = torch.tensor(weights, dtype=torch.float).to(device)
print(f"üî∏ Class Weights: {class_weights}")

# Load the pretrained checkpoint configured for 2 output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "hfl/chinese-roberta-wwm-ext",
    num_labels=2
)

def weighted_loss(outputs, labels):
    """Return the class-weighted cross-entropy of ``outputs.logits`` against ``labels``.

    Args:
        outputs: Model output object exposing a ``logits`` attribute.
        labels: Target class indices for the same examples.

    Returns:
        The loss computed by ``CrossEntropyLoss`` using the module-level
        ``class_weights`` as per-class weights.
    """
    criterion = CrossEntropyLoss(weight=class_weights)
    return criterion(outputs.logits, labels)

# Build the training configuration with the version-aware helper defined earlier in this file.
training_args = get_training_args()

class CustomTrainer(Trainer):
    """Trainer that uses ``weighted_loss`` as its training loss."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run ``model`` on ``inputs`` and return the class-weighted loss.

        Args:
            model: The model being trained.
            inputs: Batch mapping; its ``"labels"`` entry supplies the targets.
            return_outputs: When true, return ``(loss, outputs)`` instead of the loss alone.
            num_items_in_batch: Accepted but not used here.
        """
        targets = inputs.get("labels")
        # The full batch, labels included, is still forwarded to the model.
        model_outputs = model(**inputs)
        loss_value = weighted_loss(model_outputs, targets)
        if return_outputs:
            return loss_value, model_outputs
        return loss_value

def compute_metrics(pred):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (per-class scores) and
            ``label_ids`` (reference labels).

    Returns:
        dict with keys ``"accuracy"`` and ``"f1"``; F1 uses ``f1_score``'s
        default settings.
    """
    # The predicted class is the index of the highest score on the last axis.
    predicted = np.argmax(pred.predictions, axis=-1)
    accuracy = accuracy_score(pred.label_ids, predicted)
    f1 = f1_score(pred.label_ids, predicted)
    return {"accuracy": accuracy, "f1": f1}

# Wire the model, arguments, datasets and metric function into the custom trainer.
# NOTE(review): the test split is also used as the evaluation set during training.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# ============================================================
# Step 4: start training
# ============================================================
print("\nüöÄ ÈñãÂßãË®ìÁ∑¥Ê®°Âûã...")
trainer.train()

# Save the fine-tuned model and its tokenizer side by side in ./food_model.
model.save_pretrained("./food_model")
tokenizer.save_pretrained("./food_model")
# NOTE(review): the message below prints "/food_model" while the save path is "./food_model".
print("üíæ Ê®°ÂûãÂ∑≤ÊàêÂäüÂÑ≤Â≠òÂà∞ /food_model")

# ============================================================
# Step 5: training finished -> go straight to prediction mode (with confidence scores)
# ============================================================
print("\nüîÆ Ë®ìÁ∑¥ÂÆåÊàêÔºÅË´ã‰∏äÂÇ≥ predict_sample.csv ÈÄ≤Ë°åÈ†êÊ∏¨...")
from google.colab import files
uploaded2 = files.upload()

# The CSV must be uploaded under exactly this name; it is read back by name below.
if "predict_sample.csv" not in uploaded2:
    raise ValueError("‚ùå Ë´ã‰∏äÂÇ≥ predict_sample.csv Áî®ÊñºÈ†êÊ∏¨ÔºÅ")

import pandas as pd
import torch
import numpy as np

# Read the prediction data; every "content_cleaned" value is cast to str.
pred_df = pd.read_csv("predict_sample.csv")
texts = pred_df["content_cleaned"].astype(str).tolist()

# Á∑®Á¢º
# Score the texts in mini-batches. Tokenizing and running every row in one
# forward pass makes memory use grow with the size of the uploaded file.
model.to(device)
model.eval()

predict_batch_size = 32
prob_chunks = []

# ========= Prediction + confidence scores =========
with torch.no_grad():
    for start in range(0, len(texts), predict_batch_size):
        enc = tokenizer(
            texts[start:start + predict_batch_size],
            truncation=True,
            padding=True,
            max_length=256,
            return_tensors="pt"
        )
        enc = {k: v.to(device) for k, v in enc.items()}
        logits = model(**enc).logits               # [batch_size, num_labels]
        probs = torch.softmax(logits, dim=1)       # convert logits to probabilities
        prob_chunks.append(probs.cpu().numpy())

if prob_chunks:
    probs_np = np.concatenate(prob_chunks, axis=0)
else:
    # Empty input file: keep the two-column shape the following code indexes.
    probs_np = np.empty((0, 2), dtype=np.float32)
preds = np.argmax(probs_np, axis=1)                # class with the highest probability

# ========= Write the results back into the DataFrame =========
pred_df["prediction"] = preds
pred_df["prob_0"] = probs_np[:, 0]             # probability of class 0
pred_df["prob_1"] = probs_np[:, 1]             # probability of class 1

# Confidence score: the larger of the two class probabilities.
pred_df["confidence"] = pred_df[["prob_0", "prob_1"]].max(axis=1)

# "How unsure the model is" = 1 - confidence (larger means less certain).
pred_df["uncertainty"] = 1 - pred_df["confidence"]

# Human-readable Chinese label for each class id (adjust to your own label definition).
pred_df["prediction_label"] = pred_df["prediction"].map({
    0: "‚úÖ Á¨¶ÂêàÊ≥ïË¶è",
    1: "‚ö†Ô∏è ‰∏çÁ¨¶ÂêàÊ≥ïË¶è"
})

# Preview the first rows of the predictions, including the confidence score.
print("\nüìä ÈÉ®ÂàÜÈ†êÊ∏¨ÁµêÊûúÔºàÂê´‰ø°ÂøÉÂàÜÊï∏ÔºâÔºö")
print(pred_df[["content_cleaned", "prediction", "prediction_label", "prob_0", "prob_1", "confidence"]].head())

# ========= Sort by confidence, lowest first (most error-prone rows on top) =========
pred_sorted = pred_df.sort_values(by="confidence", ascending=True)

# ========= Save & download =========
pred_df.to_csv("predict_output_with_confidence.csv", index=False)
pred_sorted.to_csv("predict_output_sorted_by_confidence.csv", index=False)

files.download("predict_output_with_confidence.csv")
files.download("predict_output_sorted_by_confidence.csv")

# List the two files that were offered for download.
print("\nüíæ Â∑≤‰∏ãËºâÔºö")
print(" - predict_output_with_confidence.csvÔºàÂéüÈ†ÜÂ∫è + ‰ø°ÂøÉÂàÜÊï∏Ôºâ")
print(" - predict_output_sorted_by_confidence.csvÔºà‰ø°ÂøÉÁî±‰ΩéÂà∞È´òÊéíÂ∫èÔºâ")


üì§ Ë´ã‰∏äÂÇ≥ train_split.xlsx Ëàá test_split.xlsx...


Saving test_split.xlsx to test_split.xlsx
Saving train_split.xlsx to train_split.xlsx
üìä Ë®ìÁ∑¥ÈõÜÁ≠ÜÊï∏: 1233, Ê∏¨Ë©¶ÈõÜÁ≠ÜÊï∏: 309


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/19.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

üî∏ Class Weights: tensor([1.9665, 2.0347], device='cuda:0')


pytorch_model.bin:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-roberta-wwm-ext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


üîç transformers ÁâàÊú¨: 4.57.1

üöÄ ÈñãÂßãË®ìÁ∑¥Ê®°Âûã...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2433,0.164638,0.954693,0.955975
2,0.0655,0.06591,0.977346,0.978852
3,0.0243,0.08654,0.980583,0.981707
4,0.0167,0.090391,0.977346,0.978593
5,0.0008,0.073997,0.983819,0.984802


üíæ Ê®°ÂûãÂ∑≤ÊàêÂäüÂÑ≤Â≠òÂà∞ /food_model

üîÆ Ë®ìÁ∑¥ÂÆåÊàêÔºÅË´ã‰∏äÂÇ≥ predict_sample.csv ÈÄ≤Ë°åÈ†êÊ∏¨...


Saving predict_sample.csv to predict_sample.csv

üìä ÈÉ®ÂàÜÈ†êÊ∏¨ÁµêÊûúÔºàÂê´‰ø°ÂøÉÂàÜÊï∏ÔºâÔºö
                                     content_cleaned  prediction  \
0  ÊãâËìì„ÄÇÈñÉÁ∫ñËàíÁú†ÁõäÁîüËèåÂáçÊØèÊó•‰∏ÄÊ¢ùËºïÈ¨ÜÂÖ•Âè£Âπ´Âä©Ë™øÁØÄÁîüÁêÜÊ©üËÉΩÁ∂≠ÊåÅËÖ∏ÈÅìÂÅ•Â∫∑„ÄÇÂØåÂê´ËÜ≥È£üÁ∫ñÁ∂≠‰øÉÈÄ≤Ê∂àÂåñÈÅìË†ï...           0   
1     Â§©Â†ÇÊ§íÂèØ‰ª•È®ôÂ§ßËÖ¶ÁöÑ‰∫§ÊÑüÁ•ûÁ∂ìÂÆÉÂèØ‰ª•È®ôÂ§ßËÖ¶Â¢ûÂä†ÊàëÂÄëÁöÑÈùúÊÖãËÉΩÈáèÊ∂àËÄó„ÄÇ0Áò¶000ÂÖ¨Êñ§Áò¶‰∫ÜÂ•∂Â•∂ÈÇÑËÆäÂ§ß„ÄÇ           1   
2  ÊØèÊó•‰∏ÄÂåÖÂ†ÖÊûú‰πæ‰øùÊåÅË∫´È´îÂÅ•Â∫∑ÂÖßÂê´000Á®ÆÁÑ°Ë™øÂë≥Â†ÖÊûúÂèØÂä†ÂÖ•ÁâõÂ•∂ÊàñÂÑ™Ê†ºÁï∂Êó©È§êÈ£üÁî®ÂÖßÁÇ∫Â∞èÂåÖË£ùË®≠Ë®àÊñπ‰æø...           0   
3  ÊÉ†Ê∞èÂïüË≥¶ÂïüË≥¶Ê∞¥Ëß£„ÄÇ0Âê´Ë±êÂØåÁáüÈ§äÁ¥†ÊúâÂä©ÊñºÁ∂≠ÊåÅÊ∂àÂåñÈÅìÊ©üËÉΩ‰øÉÈÄ≤ËÖ∏ÈÅìË†ïÂãïËÆìÊÇ®ÊØèÂ§©ËºïÈ¨ÜË™øÁØÄÁîüÁêÜÊ©üËÉΩ‰∫´Âèó...           0   
4  ÈÄôÊ¨æÊ°îÊ¢óÊ∞¥Ê¢®Ê±ÅÊúâÈéÆÂí≥ÊäóÁÇé‰ª•ÂèäÊäóÊ∞ßÂåñÂäüÊïà„ÄÇÊ∞£ÁÆ°ÊïèÊÑüÊúãÂèãÁöÑÊïëÊòü„ÄÇÂ∞§ÂÖ∂ÁâπÂà•ÈÅ©ÂêàÂÆπÊòìÂñâÂö®ÊïèÊÑü„ÄÅÊ∞£ÁÆ°„ÄÅ...           1   

  prediction_label    prob_0    prob_1  confidence

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


üíæ Â∑≤‰∏ãËºâÔºö
 - predict_output_with_confidence.csvÔºàÂéüÈ†ÜÂ∫è + ‰ø°ÂøÉÂàÜÊï∏Ôºâ
 - predict_output_sorted_by_confidence.csvÔºà‰ø°ÂøÉÁî±‰ΩéÂà∞È´òÊéíÂ∫èÔºâ
