# 科系整理

## 先整理大學的科系與對應

In [None]:
from google.colab import files
import pandas as pd

# Prompt for an Excel file upload through the Colab file picker
uploaded = files.upload()

# Take the name of the first uploaded file
file_name = list(uploaded.keys())[0]

# Read the Excel file into a DataFrame
df = pd.read_excel(file_name)

# Show the first few rows
df.head()

In [None]:
# Expand every source row into one row per alias listed in "discrib".
# Aliases are separated by the ideographic comma "、"; a row without a
# "discrib" value is kept once under its original "lastname".
expanded_data = []
for _, row in df.iterrows():
    if pd.notna(row["discrib"]):
        # One output row per alias, carrying the ids of the source row
        for value in row["discrib"].split("、"):
            expanded_data.append([row["mainid"], row["midid"], row["lastid"], value, row["eduid"]])
    else:
        # No aliases: keep the original name
        expanded_data.append([row["mainid"], row["midid"], row["lastid"], row["lastname"], row["eduid"]])

# Build the new DataFrame. The third label is exactly "lastid": a stray tab
# inside the label would make expanded_df["lastid"] fail and would leak the
# tab into the header of any exported file.
expanded_df = pd.DataFrame(expanded_data, columns=["mainid", "midid", "lastid", "lastname", "eduid"])

In [None]:
# Preview the first 10 expanded rows
expanded_df.head(10)

In [None]:
# Export the expanded DataFrame (expanded_df) to an Excel file
output_file = "output.xlsx"  # output file name

# Save the DataFrame as Excel, without the index column
expanded_df.to_excel(output_file, index=False)

# Trigger a browser download of the Excel file from Colab
from google.colab import files
files.download(output_file)

## CKIP 與 jieba

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
! pip install -U ckip-transformers
from ckip_transformers import __version__
from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker
from google.colab import files
import jieba
import numpy as np

In [None]:
# Text preprocessing: Chinese word segmentation with jieba
def jieba_text(text):
    """Segment *text* with jieba and return the tokens joined by single spaces."""
    return ' '.join(jieba.cut(text))

In [None]:
# Show the installed ckip-transformers version
print(__version__)

# Initialize the three CKIP drivers (word segmentation, POS tagging, NER)
print("Initializing drivers ... WS")
ws_driver = CkipWordSegmenter(model="albert-base", device=-1)
print("Initializing drivers ... POS")
pos_driver = CkipPosTagger(model="albert-base", device=-1)
print("Initializing drivers ... NER")
ner_driver = CkipNerChunker(model="albert-base", device=-1)
print("Initializing drivers ... all done")
print()

# Other models can be selected for `model`, e.g. "bert-base"
# device=0 uses the GPU, device=-1 uses the CPU; the argument may also be omitted.


def clean(sentence_ws, sentence_pos):
    """Reduce a segmented sentence to its content words.

    A token is kept when all of the following hold:
      * its POS tag starts with "V", "N" or "A";
      * its POS tag is not one of "Nep", "Nh", "Nb";
      * the token is not exactly one character long.

    Args:
        sentence_ws: Tokens of one sentence.
        sentence_pos: POS tags, aligned one-to-one with ``sentence_ws``.

    Returns:
        A tuple ``(plain, tagged)``: the kept tokens joined by spaces, and the
        kept tokens rendered as ``token(POS)`` joined by spaces.
    """
    excluded_tags = {'Nep', 'Nh', 'Nb'}
    kept_pairs = [
        (token, tag)
        for token, tag in zip(sentence_ws, sentence_pos)
        if tag.startswith(("V", "N", "A"))
        and tag not in excluded_tags
        and len(token) != 1
    ]
    plain = " ".join(f"{token}" for token, _ in kept_pairs)
    tagged = " ".join(f"{token}({tag})" for token, tag in kept_pairs)
    return (plain, tagged)


In [None]:
uploaded = files.upload() # upload the data file
# Get the filename from the uploaded dictionary
filename = list(uploaded.keys())[0]
data = pd.read_csv(filename)

In [None]:
# Check the uploaded file for bytes that are not valid UTF-8.
# The scan reads the raw file line by line and stops at the first bad line.
with open(filename, "rb") as f:
    for i, line in enumerate(f):
        try:
            line.decode("utf-8")
        except UnicodeDecodeError as e:
            print(f"錯誤發生在第 {i+1} 行，錯誤訊息: {e}")
            break
# Show column dtypes and the first 10 rows of the loaded data
print(data.dtypes)
print(data.head(10))

In [None]:
# Drop rows without a "mainid" label, then store the label as an integer
data.dropna(subset=['mainid'], inplace=True)
data['mainid'] = data['mainid'].astype(int)

In [None]:
# Segment and POS-tag every "lastname" with CKIP, then keep only the content
# words (see `clean`). Results are stored in two columns:
#   "result"     -> kept words rendered as "word(POS)"
#   "n_lastname" -> kept words only
#
# The drivers accept a list of sentences, so the whole column is sent in one
# batched call instead of invoking the transformer once per row.
descriptions = data["lastname"].tolist()

# Word segmentation (skipped entirely when there are no rows)
ws_results = ws_driver(descriptions) if descriptions else []
# POS tagging, aligned with the segmentation output
pos_results = pos_driver(ws_results) if ws_results else []

# Clean every sentence; each item is (cleaned_sentence, cleaned_with_pos)
cleaned_pairs = [clean(ws, pos) for ws, pos in zip(ws_results, pos_results)]

# Store the results (assigned positionally, so any index on `data` is fine)
data["result"] = [with_pos for _, with_pos in cleaned_pairs]
data["n_lastname"] = [sentence for sentence, _ in cleaned_pairs]

In [None]:
# Segment "lastname" with jieba and use it as a fallback wherever the CKIP
# cleaning step produced an empty string.
data['jeiba_lastname'] = data['lastname'].apply(jieba_text)
data['n_lastname'] = data['n_lastname'].replace('', np.nan)
data['n_lastname'] = data['n_lastname'].fillna(data['jeiba_lastname']).str.lstrip()


# Handle rows where the jieba result is NA
print(data['jeiba_lastname'].isnull().sum())
# Select the full rows whose 'jeiba_lastname' is NaN
missing_data = data[data['jeiba_lastname'].isnull()]

# Show the missing rows
#print(missing_data)  # show every missing row
data['jeiba_lastname'] = data['jeiba_lastname'].fillna(data['lastname'])
'''
nan_indices = data[data['n_lastname'].isna()].index
print(nan_indices)
nan_rows_details = data.loc[[425, 2172]]
print(nan_rows_details)

不分系 跟基督教學系會被nan

'''
print(data['n_lastname'].isnull().sum())
data['n_lastname'] = data['n_lastname'].str.lstrip() # jieba-joined text can keep a leading space, so strip it (same strip as in the fill step above)
print(data['n_lastname'].isnull().sum())

In [None]:
# Count rows whose "n_lastname" is still missing
print(data['n_lastname'].isnull().sum())
# Spot-check a single row by position
# NOTE(review): position 676 is hard-coded — presumably a row of interest in one specific dataset; confirm
print(data.iloc[676])

In [None]:
# List the index labels of rows whose "n_lastname" is still NaN
nan_indices = data[data['n_lastname'].isna()].index
print(nan_indices)
# Display two specific rows by index label.
# NOTE(review): labels 425 and 2172 are hard-coded and independent of
# nan_indices; .loc raises KeyError if either label is absent from `data`
# (e.g. after rows were dropped) — confirm they still exist for this dataset.
nan_rows_details = data.loc[[425, 2172]]
nan_rows_details

In [None]:
# Save the segmented data as CSV (utf-8-sig encoding, no index column) and download it
output_filename = "2022科系分類_奕嘉_已經分詞.csv"
data.to_csv(output_filename, index=False, encoding='utf-8-sig')
files.download(output_filename)

##分類測試

In [None]:
import pandas as pd
import jieba
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from google.colab import files

from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.model_selection import RandomizedSearchCV


#### 結果不理想，所以將data2 中部份資料與data 結合

In [None]:
# Upload two CSV files, one after the other
uploaded = files.upload() # upload the first file
file1 = list(uploaded.keys())[0]
df1 = pd.read_csv(file1)


uploaded = files.upload() # upload the second file
file2 = list(uploaded.keys())[0]
df2 = pd.read_csv(file2)

# Concatenate both files row-wise with a fresh index
data = pd.concat([df1, df2], ignore_index=True)


In [None]:
# Accuracy was low, so SMOTE is applied before training

# Inspect the class distribution: text comes from "n_lastname", label from "mainid"
data_df = pd.DataFrame({'lastname': data['n_lastname'], 'label': data['mainid']})
label_counts = data_df['label'].value_counts()

# Flag labels that have fewer than 6 samples
rare_labels = label_counts[label_counts < 6].index
print(f"稀有類別: {rare_labels}")


In [None]:
# Build features and labels, split into train/validation, then balance the
# TRAINING split only with SMOTE.
#
# Oversampling before the split leaks information: synthetic points are
# interpolated from rows that later land in the validation set, and the
# validation set itself ends up containing synthetic rows, so the reported
# scores are optimistic. Here SMOTE only ever sees training rows and every
# model is validated on the same untouched, real validation split.
X = data_df['lastname'].tolist()
y = data_df['label'].tolist()

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Turn the text into TF-IDF features (uni- and bi-grams, at most 5000 features).
# NOTE(review): the vectorizer is still fitted on all rows (train + validation)
# because later cells reuse this fitted object; fitting on the training texts
# only would remove the remaining (unsupervised) leakage.
vectorizer = TfidfVectorizer(max_features=5000, min_df=1, max_df=0.9, ngram_range=(1,2))
X_vectorized = vectorizer.fit_transform(X)


# Plain stratified split (no resampling)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    X_vectorized, y_encoded, test_size=0.2, random_state=69, stratify=y_encoded
)

# SMOTE needs more than k_neighbors samples in every class it resamples.
# Cap k at 3 (the original setting) and lower it when the rarest training
# class is too small; a class with a single training sample still cannot be
# oversampled and will make SMOTE raise.
smallest_train_class = int(pd.Series(train_labels).value_counts().min())
smote_k = max(1, min(3, smallest_train_class - 1))

# Oversample every class except the majority class, on training rows only
smote = SMOTE(sampling_strategy='not majority', random_state=42, k_neighbors=smote_k)
train_texts_smote, train_labels_smote = smote.fit_resample(train_texts, train_labels)

# Names kept for the cells below: the resampled data is the balanced training
# split, and the "_smote" validation set is the real validation split.
X_resampled, y_resampled = train_texts_smote, train_labels_smote
val_texts_smote, val_labels_smote = val_texts, val_labels


### 隨機森林

In [None]:
# Train a random forest on the training features and training labels
rdf_model = RandomForestClassifier(n_estimators=1000, random_state=42,max_depth= None,min_samples_split=2, min_samples_leaf=1,class_weight='balanced')
rdf_model.fit(train_texts, train_labels)

# Predict on the validation features and compare with the validation labels
y_pred = rdf_model.predict(val_texts)
print(classification_report(val_labels, y_pred))

In [None]:
# Train a random forest on the SMOTE training features and labels
rdf_model_smote = RandomForestClassifier(n_estimators=1000, random_state=42,max_depth= None,min_samples_split=2, min_samples_leaf=1,class_weight='balanced')
rdf_model_smote.fit(train_texts_smote, train_labels_smote)
# Predict on the "_smote" validation features and compare with their labels
y_pred_smote = rdf_model_smote.predict(val_texts_smote)
print(classification_report(val_labels_smote, y_pred_smote))

### 朴素貝葉斯 (Naive Bayes)

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Multinomial Naive Bayes trained on the SMOTE split
nb_model_smote = MultinomialNB(alpha=0.1)
nb_model_smote.fit(train_texts_smote, train_labels_smote)

# Predict
y_pred_model_nb_smote = nb_model_smote.predict(val_texts_smote)
# Evaluate
print(classification_report(val_labels_smote, y_pred_model_nb_smote))

In [None]:
# Multinomial Naive Bayes trained on the original (non-SMOTE) split
nb_model = MultinomialNB(alpha=0.1)
nb_model.fit(train_texts, train_labels)

# Predict
y_pred_model_nb = nb_model.predict(val_texts)
# Evaluate
print(classification_report(val_labels, y_pred_model_nb))

###SVM

In [None]:
from sklearn.svm import SVC

In [None]:
# Linear SVM trained on the SMOTE split; probability=True enables predict_proba
svm_model_smote = SVC(kernel='linear', C=1.3, class_weight='balanced', probability=True, random_state=69)
svm_model_smote.fit(train_texts_smote, train_labels_smote)

# Predict
y_pred_svm_smote = svm_model_smote.predict(val_texts_smote)

# Evaluate
print(classification_report(val_labels_smote, y_pred_svm_smote))

In [None]:
# Linear SVM trained on the original (non-SMOTE) split; probability=True enables predict_proba
svm_model = SVC(kernel='linear', C=1.3, class_weight='balanced', probability=True, random_state=69)
svm_model.fit(train_texts, train_labels)

# Predict
y_pred_svm = svm_model.predict(val_texts)

# Evaluate
print(classification_report(val_labels, y_pred_svm))

###LightGBM


In [None]:
import lightgbm as lgb
from sklearn.metrics import classification_report

In [None]:
import numpy as np
# Fraction of entries equal to zero in the SMOTE training matrix.
# .toarray() builds a dense copy of the matrix to do the comparison.
print("Zero feature ratio:", np.mean(train_texts_smote.toarray() == 0))


In [None]:
# LightGBM classifier for the SMOTE split
# NOTE(review): LightGBM applies row subsampling only when subsample_freq > 0;
# it is not set here, so subsample=0.8 may have no effect — confirm.
lgbm_model_smote = lgb.LGBMClassifier(
    n_estimators=1000,
    learning_rate=0.005,
    max_depth=-1,          # no depth limit
    num_leaves=100,         # more leaves per tree
    min_child_samples=2,   # lower the minimum number of samples per leaf
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# Train the LightGBM model
lgbm_model_smote.fit(train_texts_smote, train_labels_smote)

# Predict
y_pred_lgbm_smote = lgbm_model_smote.predict(val_texts_smote)

# Evaluate
print(classification_report(val_labels_smote, y_pred_lgbm_smote))

In [None]:
# LightGBM classifier for the original (non-SMOTE) split
# NOTE(review): LightGBM applies row subsampling only when subsample_freq > 0;
# it is not set here, so subsample=0.8 may have no effect — confirm.
lgbm_model = lgb.LGBMClassifier(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=-1,          # no depth limit
    num_leaves=50,         # more leaves per tree
    min_child_samples=2,   # lower the minimum number of samples per leaf
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# Train the LightGBM model
lgbm_model.fit(train_texts, train_labels)

# Predict
y_pred_lgbm= lgbm_model.predict(val_texts)

# Evaluate
print(classification_report(val_labels, y_pred_lgbm))

## 對答案


In [None]:
uploaded = files.upload() # upload the data file to check predictions against
# Get the filename from the uploaded dictionary
filename = list(uploaded.keys())[0]
data2 = pd.read_csv(filename)

In [None]:
print(data2.dtypes)
# Fill missing "n_lastname" first from the jieba segmentation, then from the raw name
data2['n_lastname'] = data2['n_lastname'].fillna(data2['jeiba_lastname'])
data2['n_lastname'] = data2['n_lastname'].fillna(data2['lastname'])

In [None]:
# Count rows of data2 whose "n_lastname" is still missing
print(data2['n_lastname'].isnull().sum())

In [None]:
def predict_and_evaluate(new_data, model):
    """Predict labels for *new_data* with *model* and score them against "mainid".

    The text in the "n_lastname" column is vectorized with the module-level
    ``vectorizer`` and the predicted ids are mapped back to the original
    labels with the module-level ``label_encoder``.

    Args:
        new_data: DataFrame with at least the columns "n_lastname" (text fed
            to the vectorizer) and "mainid" (true label). It is not modified;
            the result is built on a copy so repeated calls with different
            models cannot overwrite each other's columns on the caller's frame.
        model: Fitted classifier exposing ``predict``. ``predict_proba`` is
            used when available.

    Returns:
        A tuple ``(result, accuracy)`` where ``result`` is a copy of
        *new_data* with three added columns:
          * "predicted_label": predicted label in the original label space;
          * "code": 1 when the prediction equals "mainid", else 0;
          * "prediction_confidence": highest class probability per row, or
            None for models without ``predict_proba``;
        and ``accuracy`` is the mean of "code".
    """
    result = new_data.copy()

    # Turn the text into the numeric features the model was trained on
    X_new = vectorizer.transform(result['n_lastname'])

    predicted_ids = model.predict(X_new)

    # Map encoded ids back to the original labels with the same encoder
    predicted_labels = label_encoder.inverse_transform(predicted_ids)

    # Some models do not expose predict_proba; fall back to None per row
    try:
        probs = model.predict_proba(X_new)
    except AttributeError:
        max_probs = [None] * len(predicted_ids)
    else:
        max_probs = probs.max(axis=1)  # highest class probability per row

    result['predicted_label'] = predicted_labels
    # 1 = prediction matches the true "mainid", 0 = mismatch
    result['code'] = (result['predicted_label'] == result['mainid']).astype(int)
    result['prediction_confidence'] = max_probs

    accuracy = result['code'].mean()
    return result, accuracy

###將各模型結果儲存

In [None]:
# Evaluate a single model (SVM trained on the SMOTE split)
data3, accu = predict_and_evaluate(data2, svm_model_smote)
print(f"準確率: {accu:.4f}")

In [None]:
# All trained models, keyed by a short name
models = {
    "rdf": rdf_model,
    "rdf_smote": rdf_model_smote,
    "nb": nb_model,
    "nb_smote": nb_model_smote,
    "svm": svm_model,
    "svm_smote": svm_model_smote,
    "lgbm": lgbm_model,
    "lgbm_smote": lgbm_model_smote
}

# Per-model results, keyed by model name
results_df = {}  # prediction DataFrames
accuracies = {}  # accuracy values

# Each model gets its own copy of data2
for model_name, model in models.items():
    results_df[model_name], accuracies[model_name] = predict_and_evaluate(data2.copy(), model)
    print(f"{model_name}: Accuracy = {accuracies[model_name]:.4f}")



In [None]:
# Bind each model's result DataFrame to its own name
rdf_df = results_df["rdf"]
rdf_smote_df = results_df["rdf_smote"]
nb_df = results_df["nb"]
nb_smote_df = results_df["nb_smote"]
svm_df = results_df["svm"]
svm_smote_df = results_df["svm_smote"]
lgbm_df = results_df["lgbm"]
lgbm_smote_df = results_df["lgbm_smote"]

In [None]:
# Write every model's predictions to its own CSV and download it
for model_name, df in results_df.items():
    filename = f"{model_name}_predictions.csv"  # build the file name
    df.to_csv(filename, index=False, encoding='utf-8-sig')
    files.download(filename)  # download the file (Google Colab)
    print(f"✅ {filename} 已儲存！")

###檢查模型結果


In [None]:
# DataFrames to combine: the predictions of the four SMOTE-trained models
dfs_to_merge = [rdf_smote_df, nb_smote_df, svm_smote_df, lgbm_smote_df]

# Give each frame's prediction columns a model-specific suffix so they do not
# collide. The rename is done in place on purpose: the renamed columns are
# read from these same frames by later cells.
suffixes = ["_rdf", "_nb", "_svm", "_lgbm"]
for df, suffix in zip(dfs_to_merge, suffixes):
    df.rename(columns={"predicted_label": f"predicted_label{suffix}",
              "code": f"code{suffix}",
              "prediction_confidence" : f"prediction_confidence{suffix}"}, inplace=True)

# Every frame is a copy of data2 with extra prediction columns, so all of them
# share the same row index. Joining on that index keeps exactly one output row
# per input row. An outer merge on the data columns would multiply rows
# whenever the key columns repeat, and would depend on a hard-coded list of
# dataset-specific column names.
merged_smote_df = dfs_to_merge[0]
for df in dfs_to_merge[1:]:
    # Only bring over the columns the merged frame does not have yet
    extra_columns = [col for col in df.columns if col not in merged_smote_df.columns]
    merged_smote_df = merged_smote_df.join(df[extra_columns])


In [None]:
# Display the merged predictions
merged_smote_df

##集成分類

In [None]:
import pandas as pd

# DataFrame that collects the predictions of all models
ensemble_results = pd.DataFrame()

# True label first
ensemble_results["true_label"] = data2["mainid"]
ensemble_results["n_lastname"] = data2["n_lastname"]  # segmented name
# Predicted label of each model (reads the suffixed column names)
ensemble_results["rdf_pred"] = rdf_smote_df['predicted_label_rdf']
ensemble_results["nb_pred"] = nb_smote_df['predicted_label_nb']
ensemble_results["svm_pred"] = svm_smote_df['predicted_label_svm']
ensemble_results["lgbm_pred"] = lgbm_smote_df['predicted_label_lgbm']

# Prediction confidence of each model
ensemble_results["rdf_confidence"] = rdf_smote_df['prediction_confidence_rdf']
ensemble_results["nb_confidence"] = nb_smote_df['prediction_confidence_nb']
ensemble_results["svm_confidence"] = svm_smote_df['prediction_confidence_svm']
ensemble_results["lgbm_confidence"] = lgbm_smote_df['prediction_confidence_lgbm']

# Inspect the result
#print(ensemble_results.head())

# 檢查結果
#print(ensemble_results.head())

In [None]:
from collections import defaultdict

# Voting weights used when the SVM is not confident enough
# (NB highest, random forest second, LightGBM lowest)
model_weights = {
    "nb_pred": 0.5,   # Naive Bayes
    "lgbm_pred": 0.2, # LightGBM
    "rdf_pred": 0.3    # random forest
}

def weighted_voting(row, threshold=0.90):
    """Pick the final label for one row of model predictions.

    The SVM prediction is used directly when its confidence is available and
    at least *threshold*. Otherwise the NB / LightGBM / random-forest
    predictions vote, each contributing its weight from ``model_weights`` to
    the class it predicted, and the class with the largest total wins (ties go
    to the class voted for first in ``model_weights`` order).

    Args:
        row: Mapping-like row providing "svm_pred", "svm_confidence" and one
            entry per key of ``model_weights``.
        threshold: Minimum SVM confidence required to trust the SVM alone.

    Returns:
        The selected class label.
    """
    svm_confidence = row["svm_confidence"]

    # Confident SVM: use its prediction as is
    if pd.notna(svm_confidence) and svm_confidence >= threshold:
        return row["svm_pred"]

    # Weighted vote: each model adds its weight to the class it predicted.
    # The score must not depend on the numeric value of the class label,
    # otherwise larger class ids win and class 0 can never win.
    vote_scores = defaultdict(float)
    for model, weight in model_weights.items():
        vote_scores[row[model]] += weight

    # Class with the highest accumulated weight
    return max(vote_scores, key=vote_scores.get)

# Apply the weighted-voting rule to every row
ensemble_results["final_pred"] = ensemble_results.apply(weighted_voting, axis=1)

# Final accuracy: share of rows whose final prediction equals the true label
final_accuracy = (ensemble_results["final_pred"] == ensemble_results["true_label"]).mean()
print(f"🎯 最終加權投票修正後的準確率：{final_accuracy:.4f}")

In [None]:
# Export the comparison of predictions as CSV and download it
output_filename = "預測結果比對.csv"
ensemble_results.to_csv(output_filename, index=False, encoding='utf-8-sig')
files.download(output_filename)

###儲存模型

In [None]:
# All trained models, keyed by a short name
models = {
    "rdf": rdf_model,
    "rdf_smote": rdf_model_smote,
    "nb": nb_model,
    "nb_smote": nb_model_smote,
    "svm": svm_model,
    "svm_smote": svm_model_smote,
    "lgbm": lgbm_model,
    "lgbm_smote": lgbm_model_smote
}

In [None]:
import joblib
from google.colab import files

# Save the four SMOTE-trained models
joblib.dump(svm_model_smote, 'svm.joblib')
joblib.dump(nb_model_smote, 'naive_bayes.joblib')
joblib.dump(rdf_model_smote, 'random_forest.joblib')
joblib.dump(lgbm_model_smote, 'lightgbm.joblib')

# Save the vectorizer and the label encoder
joblib.dump(vectorizer, 'vectorizer.joblib')
joblib.dump(label_encoder, 'label_encoder.joblib')


# Download every saved file
for file_name in ["svm.joblib", "naive_bayes.joblib", "random_forest.joblib", "lightgbm.joblib",
                  "vectorizer.joblib", "label_encoder.joblib"]:
    files.download(file_name)

print("✅ 所有模型已儲存並下載！")


In [None]:
# Download the random forest file on its own
files.download("random_forest.joblib")

### 上傳模型進行使用

In [None]:
# Save the vectorizer and the label encoder
joblib.dump(vectorizer, 'vectorizer.joblib')
joblib.dump(label_encoder, 'label_encoder.joblib')

In [None]:
import joblib

# NOTE(review): joblib.load unpickles the file contents; only load files from a trusted source.

# Load the model
model = joblib.load('naive_bayes.joblib')

# Load the TF-IDF vectorizer
vectorizer = joblib.load('vectorizer.joblib')

# Load the LabelEncoder
label_encoder = joblib.load('label_encoder.joblib')

In [None]:
# Evaluate the loaded model on data2
data3, accu = predict_and_evaluate(data2, model)
print(f"準確率: {accu:.4f}")

In [None]:
print(len(vectorizer.get_feature_names_out()))  # expected to be 3837
