In [320]:
import joblib
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [321]:
# Load the pre-engineered feature tables for the training and test splits.
train, test = map(
    pd.read_csv,
    (
        '../exps/feature_Labels/train_final.csv',
        '../exps/feature_Labels/test_final.csv',
    ),
)

In [322]:
# Preview the first five rows of the training feature table.
train.head(n=5)

Unnamed: 0,Age,Fare,FamilySize,Age*Pclass,Fare_Pclass,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,...,Ticket_Prefix_STONO2,Ticket_Prefix_STONOQ,Ticket_Prefix_SWPP,Ticket_Prefix_WC,Ticket_Prefix_WEP,Ticket_Prefix_XXX,IsChild_0,IsChild_1,IsMother_0,IsMother_1
0,-0.214804,-0.897141,0.073352,0.130752,-0.446508,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.632697,1.344578,0.073352,-0.78695,0.844527,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.042531,-0.815913,-0.558346,0.524052,-0.44229,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.504236,1.045295,0.073352,-0.885275,0.503646,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
4,0.504236,-0.801547,-0.558346,1.408979,-0.441509,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0


In [323]:
# Preview the first five rows of the test feature table.
test.head(n=5)

Unnamed: 0,Age,Fare,FamilySize,Age*Pclass,Fare_Pclass,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,...,Ticket_Prefix_STONO2,Ticket_Prefix_STONOQ,Ticket_Prefix_SWPP,Ticket_Prefix_WC,Ticket_Prefix_WEP,Ticket_Prefix_XXX,IsChild_0,IsChild_1,IsMother_0,IsMother_1
0,0.481789,-0.82706,-0.558346,1.359817,-0.442889,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
1,0.96594,-0.928924,0.073352,2.588882,-0.448071,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
2,1.402369,-0.629771,-0.558346,2.031705,-0.401008,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
3,0.100898,-0.733907,-0.558346,0.622378,-0.437682,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
4,-0.214804,-0.404865,0.705051,0.130752,-0.415029,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0


In [324]:
# The labels are read from the raw training CSV; only the `Survived`
# column is used as the prediction target.
# NOTE(review): assumes the rows of ../data/train.csv are in the same order
# as the rows of `train` — TODO confirm, nothing here checks the alignment.
target_ = pd.read_csv('../data/train.csv')
target = target_.loc[:, 'Survived']

In [325]:
# Display the (rows, columns) dimensions of the training feature table.
train.shape

(891, 69)

## Các model với siêu tham số mặc định (baseline)

In [326]:
# Baseline candidates, all with default hyperparameters.
# The two tree ensembles are randomized, so they get a fixed seed (the same
# 42 used for the CV splitter and the tuned SVC) to make their
# cross-validation scores reproducible from run to run.
models = {
    'Logistic Regression' : LogisticRegression(),
    'Random Forest Classifier' : RandomForestClassifier(random_state=42),
    'Support Vector' : SVC(),
    'XGBoost' : XGBClassifier(random_state=42)
}

In [327]:
# Shared stratified 15-fold splitter; the fixed seed keeps the folds identical
# for every model here and for the tuning cells that reuse `cv`.
cv = StratifiedKFold(n_splits=15, shuffle=True, random_state=42)

# Metrics to report for each model, named as scikit-learn scorer strings.
scoring = ['accuracy', 'f1', 'roc_auc']

for name, model in models.items():
    # cross_validate fits each fold once and scores all metrics on that same
    # fit. Three separate cross_val_score calls would triple the training
    # cost and, for randomized models, score each metric on a different fit.
    cv_scores = cross_validate(model, train, target, cv=cv, scoring=scoring)
    acc = cv_scores['test_accuracy'].mean()
    f1 = cv_scores['test_f1'].mean()
    auc = cv_scores['test_roc_auc'].mean()

    print(f"{name} CV Results:")
    print(f"  Accuracy: {acc:.4f}")
    print(f"  F1 Score: {f1:.4f}")
    print(f"  ROC AUC:  {auc:.4f}")
    print("-" * 35)

Logistic Regression CV Results:
  Accuracy: 0.8238
  F1 Score: 0.7620
  ROC AUC:  0.8796
-----------------------------------
Random Forest Classifier CV Results:
  Accuracy: 0.8305
  F1 Score: 0.7760
  ROC AUC:  0.8906
-----------------------------------
Support Vector CV Results:
  Accuracy: 0.8373
  F1 Score: 0.7738
  ROC AUC:  0.8708
-----------------------------------
XGBoost CV Results:
  Accuracy: 0.8260
  F1 Score: 0.7652
  ROC AUC:  0.8708
-----------------------------------


# Hyperparameter Tuning (Tinh chỉnh siêu tham số cho model)

## SVC

Ta thấy SVC đang có độ chính xác (accuracy) cao nhất trong các model ở trên, nên ta sẽ tinh chỉnh siêu tham số cho nó trước.

In [328]:
# Search space for an RBF-kernel SVC: 4 values of C x 4 values of gamma.
param_grid_svm = dict(
    C=[0.01, 0.1, 1, 10],
    gamma=[0.001, 0.01, 0.1, 1],
    kernel=['rbf'],
)

# Base estimator for the search.
# NOTE(review): probability=True makes every fit run an extra internal
# cross-validation, which the accuracy scorer below does not need.
svm_model = SVC(probability=True, random_state=42)

# Exhaustive search over the grid, reusing the notebook's `cv` splitter.
grid_search_svm = GridSearchCV(
    svm_model,
    param_grid_svm,
    scoring='accuracy',  # selection metric; 'roc_auc' is the alternative considered
    cv=cv,
    n_jobs=-1,  # use all CPU cores
    verbose=2,
)

In [329]:
# Run the grid search on the full training set.
grid_search_svm.fit(X=train, y=target)

Fitting 15 folds for each of 16 candidates, totalling 240 fits


0,1,2
,estimator,SVC(probabili...ndom_state=42)
,param_grid,"{'C': [0.01, 0.1, ...], 'gamma': [0.001, 0.01, ...], 'kernel': ['rbf']}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo... shuffle=True)
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,C,1
,kernel,'rbf'
,degree,3
,gamma,0.1
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,


In [330]:
# Keep the best hyperparameter combination found by the grid search and
# show it as the cell output.
best_param_svm = grid_search_svm.best_params_
best_param_svm

{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}

In [331]:
# Build the tuned SVC directly from the grid-search result instead of
# re-typing the winning values, so this cell cannot drift out of sync with
# the search if the grid or the data changes.
# probability=True and random_state=42 match the estimator that was searched.
optimized_svm = SVC(**best_param_svm, probability=True, random_state=42)

In [332]:
# Score the tuned SVC on the same folds as the baselines. A single
# cross_validate call fits each fold once and evaluates all three metrics on
# that fit, instead of refitting the model once per metric.
svm_cv_scores = cross_validate(
    optimized_svm, train, target, cv=cv, scoring=['accuracy', 'f1', 'roc_auc']
)
acc = svm_cv_scores['test_accuracy'].mean()
f1 = svm_cv_scores['test_f1'].mean()
auc = svm_cv_scores['test_roc_auc'].mean()

print("SVM CV Results:")
print(f"  Accuracy: {acc:.4f}")
print(f"  F1 Score: {f1:.4f}")
print(f"  ROC AUC:  {auc:.4f}")
print("-" * 35)

SVM CV Results:
  Accuracy: 0.8384
  F1 Score: 0.7717
  ROC AUC:  0.8732
-----------------------------------


In [333]:
# Train the tuned SVC on the full training set, then predict labels for the
# test features (fit returns the estimator itself, so the calls can be chained).
prediction = optimized_svm.fit(train, target).predict(test)

## Lưu model sau khi train

In [334]:
# Persist the fitted SVC to disk; the cell output is the list of written files.
model_filename = '../exps/saved_dump/optimized_svm_model.pkl'
joblib.dump(value=optimized_svm, filename=model_filename)

['../exps/saved_dump/optimized_svm_model.pkl']

## Submission

In [335]:
# Pair each PassengerId from the raw test file with its predicted label,
# cast the label to int, and write the submission CSV without the index.
test_data = pd.read_csv('../data/test.csv')
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': prediction,
}).astype({'Survived': int})
submission.to_csv("../Submit/tn4/SVM.csv", index=False, header=True)