In [3]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, make_scorer
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
import joblib
import logging
import time
import tempfile



# 加载数据集
df = pd.read_csv('/Users/yaojiaxing/Documents/Uvic /202405/Professional Career Development II/final_df.csv')
print('Data loaded successfully')

# 检查并处理np.nan值
df.dropna(subset=['Text'], inplace=True)


# 计算钓鱼邮件的数量
phishing_count = df[df['Result'] == 1].shape[0]
print(f'Number of phishing emails: {phishing_count}')

# 计算需要的正常邮件数量 (10:1 的比例)
normal_count = phishing_count * 10

# 随机选择需要的正常邮件样本
normal_emails = df[df['Result'] == 0].sample(n=normal_count, random_state=42)

# 提取所有钓鱼邮件
phishing_emails = df[df['Result'] == 1]

# 合并新的数据集
new_df = pd.concat([normal_emails, phishing_emails]).reset_index(drop=True)

# 提取特征和标签
X = new_df['Text']
y = new_df['Result']

# 输出新的数据集信息
# print(f'New dataset size: {new_df.shape[0]}')
# print(f'Number of normal emails: {new_df[new_df['Result'] == 0].shape[0]}')
# print(f'Number of phishing emails: {new_df[new_df['Result'] == 1].shape[0]}')

# 保存新的数据集到CSV文件
new_df.to_csv('/Users/yaojiaxing/Documents/Uvic /202405/Professional Career Development II/new_final_df.csv', index=False)
print('New dataset saved as new_final_df.csv')




Data loaded successfully
Number of phishing emails: 2463
New dataset saved as new_final_df.csv


In [5]:
# 创建临时文件
log_file = tempfile.NamedTemporaryFile(delete=False, suffix='.log')
log_file_path = log_file.name

# 设置日志记录到临时文件
logging.basicConfig(filename=log_file_path, level=logging.INFO, 
                    format='%(asctime)s - %(levelname)s - %(message)s')

# 记录起始时间
start_time = time.time()

logging.info("TFIDF vectorizing...")
# 进行TF-IDF向量化
tfidf_vectorizer = TfidfVectorizer(max_features=100)
X_tfidf = tfidf_vectorizer.fit_transform(X).toarray()
logging.info('TF-IDF vectorization completed')


# 分割数据集，20%作为测试集，80%作为训练集
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, stratify=y, random_state=42)
logging.info('Data split into train and test sets')



# 定义SVM模型和参数网格
svm_model = SVC(class_weight='balanced', random_state=42)
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}


# 定义评估指标
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score)
}

logging.info("Grid search with cross-validation...")

# 使用网格搜索进行超参数优化和交叉验证
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(svm_model, param_grid, scoring='recall', cv=stratified_kfold, return_train_score=True)
grid_search.fit(X_train, y_train)

# 输出最佳参数
logging.info(f'Best parameters: {grid_search.best_params_}')

logging.info('Grid search with cross-validation completed')


# # 获取交叉验证的结果
# cv_results = pd.DataFrame(grid_search.cv_results_)

# # 输出交叉验证结果
# logging.info('[mean_test_accuracy, mean_test_precision, mean_test_recall, mean_test_f1]:')
# logging.info(f'cv_results:',{cv_results})


logging.info("Training model...\n")
# 使用最佳参数在整个训练集上训练最终模型
best_svm_model = grid_search.best_estimator_
best_svm_model.fit(X_train, y_train)

logging.info("Trained model!\n")

# 在测试集上进行预测
y_pred_train = best_svm_model.predict(X_train)
y_pred_test = best_svm_model.predict(X_test)

# 评估最终模型在训练集上的表现
accuracy_train = accuracy_score(y_train, y_pred_train)
precision_train = precision_score(y_train, y_pred_train)
recall_train = recall_score(y_train, y_pred_train)
f1_train = f1_score(y_train, y_pred_train)
conf_matrix_train = confusion_matrix(y_train, y_pred_train)

# 输出最终模型评估结果
logging.info(f'Train Accuracy: {accuracy_train}')
logging.info(f'Train Precision: {precision_train}')
logging.info(f'Train Recall: {recall_train}')
logging.info(f'Train F1 Score: {f1_train}')
logging.info(f'Train Confusion Matrix:\n{conf_matrix_train}')

# 评估最终模型在测试集上的表现
accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_test = f1_score(y_test, y_pred_test)
conf_matrix_test = confusion_matrix(y_test, y_pred_test)


# 输出最终模型评估结果
logging.info(f'Test Accuracy: {accuracy_test}')
logging.info(f'Test Precision: {precision_test}')
logging.info(f'Test Recall: {recall_test}')
logging.info(f'Test F1 Score: {f1_test}')
logging.info(f'Test Confusion Matrix:\n{conf_matrix_test}')

# 记录结束时间并计算总运行时间
end_time = time.time()
total_time = end_time - start_time
logging.info(f'Total training time: {total_time:.2f} seconds')

# 保存最终模型和向量化器以供部署
joblib.dump(best_svm_model, 'svm_phishing_detection_model.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
logging.info('Final model and vectorizer saved')

TypeError: unhashable type: 'DataFrame'