In [4]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-macosx_12_0_arm64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-3.0.0


In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb

In [16]:
df = pd.read_csv("/Users/zhangyuxuan/Desktop/mergedata_spam_sms.csv")
df["Label"] = df["Label"].map({"SPAM": 0, "HAM": 1})

X = df["Message"]   
y = df["Label"]      

# TF-IDF 
#   - stop_words="english" eg: "the", "and" 
vectorizer = TfidfVectorizer(stop_words="english", max_features=3000)
X_tfidf = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# define and train XGBoost classifier
#   - eval_metric='mlogloss' 
#   - use_label_encoder=False avoid warning of label encoder 
model = xgb.XGBClassifier(eval_metric='mlogloss', use_label_encoder=False)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))
# (precision, recall, f1-score)
print(classification_report(y_test, y_pred, target_names=["SPAM", "HAM"]))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 89.74%
              precision    recall  f1-score   support

        SPAM       0.90      0.73      0.81       631
         HAM       0.90      0.97      0.93      1532

    accuracy                           0.90      2163
   macro avg       0.90      0.85      0.87      2163
weighted avg       0.90      0.90      0.89      2163



In [12]:
df = pd.read_csv("/Users/zhangyuxuan/Desktop/mergedata_spam_sms.csv")

df["Label"] = df["Label"].map({"SPAM": 0, "HAM": 1})

X = df["Message"]
y = df["Label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# create Pipeline
#    (1) TF-IDF symbol extract
#    (2) XGBoost classifier
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('xgb', xgb.XGBClassifier(
        eval_metric='mlogloss' 
    ))
])

param_grid = {
    # TF-IDF 
    'tfidf__ngram_range': [(1,1), (1,2)],  
    'tfidf__max_features': [3000, 5000],   
    'tfidf__min_df': [1, 2],              
    'tfidf__max_df': [0.8, 0.9],          
    
    # XGBoost 
    'xgb__n_estimators': [100, 200],       
    'xgb__max_depth': [4, 6],              
    'xgb__learning_rate': [0.1, 0.3],     
    'xgb__subsample': [0.8, 1.0],          
    'xgb__colsample_bytree': [0.8, 1.0],   
}

# GridSearchCV
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',      
    cv=3,                    
    n_jobs=-1,               
    verbose=2                # 显示详细搜索过程
)

grid_search.fit(X_train, y_train)

# best parameter
print("Best Parameters:", grid_search.best_params_)
print("Best CV Accuracy:", grid_search.best_score_)

# using the best model to prodict 
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=["SPAM", "HAM"]))


Fitting 3 folds for each of 512 candidates, totalling 1536 fits
[CV] END tfidf__max_df=0.8, tfidf__max_features=3000, tfidf__min_df=1, tfidf__ngram_range=(1, 1), xgb__colsample_bytree=0.8, xgb__learning_rate=0.1, xgb__max_depth=4, xgb__n_estimators=100, xgb__subsample=1.0; total time=   1.1s
[CV] END tfidf__max_df=0.8, tfidf__max_features=3000, tfidf__min_df=1, tfidf__ngram_range=(1, 1), xgb__colsample_bytree=0.8, xgb__learning_rate=0.1, xgb__max_depth=4, xgb__n_estimators=200, xgb__subsample=1.0; total time=   1.9s
[CV] END tfidf__max_df=0.8, tfidf__max_features=3000, tfidf__min_df=1, tfidf__ngram_range=(1, 1), xgb__colsample_bytree=0.8, xgb__learning_rate=0.1, xgb__max_depth=6, xgb__n_estimators=200, xgb__subsample=1.0; total time=   3.2s
[CV] END tfidf__max_df=0.8, tfidf__max_features=3000, tfidf__min_df=1, tfidf__ngram_range=(1, 1), xgb__colsample_bytree=0.8, xgb__learning_rate=0.3, xgb__max_depth=4, xgb__n_estimators=200, xgb__subsample=0.8; total time=   1.6s
[CV] END tfidf__max_

In [4]:
# final model according to the Best Parameters last step
# Best Parameters: {'tfidf__max_df': 0.8, 'tfidf__max_features': 3000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 1),
# 'xgb__colsample_bytree': 1.0, 'xgb__learning_rate': 0.3, 'xgb__max_depth': 6, 'xgb__n_estimators': 200, 'xgb__subsample': 1.0}
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

df = pd.read_csv("/Users/zhangyuxuan/Desktop/mergedata_spam_sms.csv")
df["Label"] = df["Label"].map({"SPAM": 0, "HAM": 1})

X = df["Message"]
y = df["Label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

final_model = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_df=0.8,
        min_df=2,
        max_features=3000,
        ngram_range=(1,1)
    )),
    ('xgb', xgb.XGBClassifier(
        colsample_bytree=1.0,
        learning_rate=0.3,
        max_depth=6,
        n_estimators=200,
        subsample=1.0,
        eval_metric='mlogloss'
    ))
])


final_model.fit(X_train, y_train)

y_pred = final_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy * 100:.2f}%")
print(classification_report(y_test, y_pred, target_names=["SPAM", "HAM"]))


Test Accuracy: 92.28%
              precision    recall  f1-score   support

        SPAM       0.90      0.83      0.86       631
         HAM       0.93      0.96      0.95      1532

    accuracy                           0.92      2163
   macro avg       0.92      0.89      0.90      2163
weighted avg       0.92      0.92      0.92      2163



In [6]:
!pip install memory_profiler



In [8]:
%load_ext memory_profiler
%memit final_model.predict(["Try out today the LOVE METER to calculate whether you two are the perfect match! Send your name + your partner name to 33550. N50 Ex: Bello + Passion"])

peak memory: 140.23 MiB, increment: 8.53 MiB


In [19]:
new_sms = ["Congrats! You've won a free data plan."]
prediction = final_model.predict(new_sms)
print("预测结果:", "Safe" if prediction[0] == 1 else "Scam")

预测结果: Safe
