In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import copy
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import Perceptron
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from Modeling import *
from MyProcess import *

In [2]:
df_train_raw = pd.read_csv('train.csv')
df_test_raw = pd.read_csv('test.csv')
df_all_raw = pd.concat([df_train_raw, df_test_raw], axis=0)

df_train = copy.copy(df_train_raw)
df_test = copy.copy(df_test_raw)
df_all = copy.copy(df_all_raw)

print(df_train.shape)
df_train.head(5)

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df_test_2 = copy.copy(df_test_raw)

In [4]:
df_train_2 = copy.copy(df_train_raw)
df_train_2.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
survived_woman = df_train_2[(df_train_2['Sex'] == 'female') & (df_train_2['Survived'] == 1)]['Pclass'].value_counts().sort_index()
woman_all =  df_train_2[df_train_2['Sex'] == 'female']['Pclass'].value_counts().sort_index()


survived_woman/woman_all

Pclass
1    0.968085
2    0.921053
3    0.500000
Name: count, dtype: float64

In [6]:
survived_woman

Pclass
1    91
2    70
3    72
Name: count, dtype: int64

In [7]:
woman_all

Pclass
1     94
2     76
3    144
Name: count, dtype: int64

In [8]:
survived_man = df_train_2[(df_train_2['Sex'] == 'male') & (df_train_2['Survived'] == 1)]['Pclass'].value_counts().sort_index()
man_all =  df_train_2[df_train_2['Sex'] == 'male']['Pclass'].value_counts().sort_index()

In [9]:
survived_man

Pclass
1    45
2    17
3    47
Name: count, dtype: int64

In [10]:
man_all

Pclass
1    122
2    108
3    347
Name: count, dtype: int64

In [11]:
survived_man / man_all

Pclass
1    0.368852
2    0.157407
3    0.135447
Name: count, dtype: float64

In [12]:
# 생존율 매핑
survival_rate_map = {
    ('female', 1): 0.968085,
    ('female', 2): 0.921053,
    ('female', 3): 0.500000,
    ('male', 1): 0.500000,
    ('male', 2): 0.157407,
    ('male', 3): 0.135447,
}

# 새 열 추가
df_train_2['Sex_Pclass'] = df_train_2.apply(lambda row: survival_rate_map[(row['Sex'], row['Pclass'])], axis=1)
df_test_2['Sex_Pclass'] = df_test_2.apply(lambda row: survival_rate_map[(row['Sex'], row['Pclass'])], axis=1)



In [13]:
df_train_2.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_Pclass
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0.135447
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0.968085
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0.5
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0.968085
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0.135447


In [14]:
df_test_2.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_Pclass
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0.135447
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,0.5
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0.157407
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0.135447
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,0.5


In [15]:
process_basic_df_train = myProcessor(df_train)
process_basic_df_test = myProcessor(df_test)
preprocessed_basic_df_train =  process_basic_df_train.preprocess_df()
preprocessed_basic_df_test =  process_basic_df_test.preprocess_df(_remove=False)

In [16]:
preprocessed_basic_df_train['Sex_Pclass'] = df_train_2['Sex_Pclass']
preprocessed_basic_df_test['Sex_Pclass'] = df_test_2['Sex_Pclass']

In [17]:
preprocessed_basic_df_train

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Fare,C,Q,S,Master,Miss,Mr,Mrs,else,Sex_Pclass
0,0,3,1,1,0,-0.516017,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.135447
1,1,1,0,1,0,0.693558,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.968085
2,1,3,0,0,0,-0.503267,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.500000
3,1,1,0,1,0,0.350080,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.968085
4,0,3,1,0,0,-0.500905,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.135447
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,0,0,5,-0.102803,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.500000
886,0,2,1,0,0,-0.407401,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.157407
887,1,1,0,0,0,-0.086274,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.968085
889,1,1,1,0,0,-0.086274,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.500000


In [18]:
# def embarked_mapping_F(x):
#     if x == 'S':
#         return 1
#     elif x == 'C':
#         return 2
#     else:
#         return 3
    
# preprocessed_basic_df_train['Embarked'] = preprocessed_basic_df_train['Embarked'].map(embarked_mapping_F)
# preprocessed_basic_df_train['Sex'] = preprocessed_basic_df_train['Sex'].map(lambda x : 1 if x == 'male' else 0)


# preprocessed_basic_df_test['Embarked'] = preprocessed_basic_df_test['Embarked'].map(embarked_mapping_F)
# preprocessed_basic_df_test['Sex'] = preprocessed_basic_df_test['Sex'].map(lambda x : 1 if x == 'male' else 0)

In [19]:
basic_y = preprocessed_basic_df_train['Survived']
basic_X = preprocessed_basic_df_train.drop(columns=['Survived'])

In [20]:
basic_X.head(3)

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,C,Q,S,Master,Miss,Mr,Mrs,else,Sex_Pclass
0,3,1,1,0,-0.516017,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.135447
1,1,0,1,0,0.693558,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.968085
2,3,0,0,0,-0.503267,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.5


In [21]:
basic_df_train_modeling = MyModelingProcess(_X = basic_X, _y= basic_y)

In [22]:
basic_df_train_modeling.DecisionTree()
basic_df_train_modeling.RandomForest()
basic_df_train_modeling.LogisticRegression()
basic_df_train_modeling.SVM_kernel()
basic_df_train_modeling.KNN()

모델 DecisionTree이 models/DecisionTree 파일로 저장되었습니다.
DecisionTree Best parameters:
{'classifier__criterion': 'entropy', 'classifier__max_depth': 5, 'classifier__min_samples_split': 5}
0.8216487737614498
모델 RandomForest이 models/RandomForest 파일로 저장되었습니다.
Best parameters with RandomForest:
{'classifier__max_depth': None, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 50}
0.820338816113464


150 fits failed out of a total of 450.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/yujin/anaconda3/envs/machine_learning/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/yujin/anaconda3/envs/machine_learning/lib/python3.9/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/yujin/anaconda3/envs/machine_learning/lib/python3.9/site-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/Users/yujin/anaconda

모델 LogisticRegression이 models/LogisticRegression 파일로 저장되었습니다.
LogisticRegression grid search result: 
{'classifier__C': 10, 'classifier__max_iter': 1000, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
0.7992219048557077

모델 SVM_kernel이 models/SVC_kernel_grid 파일로 저장되었습니다.
SVM_kernel Best parameters:
{'classifier__C': 1, 'classifier__gamma': 0.1, 'classifier__kernel': 'rbf'}
0.8258347286516301
모델 KNN이 models/KNN 파일로 저장되었습니다.
Best parameters for KNN:
{'classifier__n_neighbors': 7, 'classifier__p': 1, 'classifier__weights': 'uniform'}
0.8174628188712697


In [64]:
model_list = []
Dtree = basic_df_train_modeling.load_model("models/DecisionTree")
model_list.append(Dtree)

knn = basic_df_train_modeling.load_model("models/RandomForest")
model_list.append(knn)

SVC_grid = basic_df_train_modeling.load_model("models/SVC_kernel_grid")
model_list.append(SVC_grid)

SVC_kernel = basic_df_train_modeling.load_model("models/KNN")
model_list.append(SVC_kernel)

# LogisticRegression = basic_df_train_modeling.load_model("models/LogisticRegression")
# model_list.append(LogisticRegression)

모델 DecisionTree이 models/DecisionTree 파일에서 불러와졌습니다.
모델 RandomForest이 models/RandomForest 파일에서 불러와졌습니다.
모델 SVM_kernel이 models/SVC_kernel_grid 파일에서 불러와졌습니다.
모델 KNN이 models/KNN 파일에서 불러와졌습니다.


In [65]:
Dtree = basic_df_train_modeling.load_model("models/DecisionTree_sbs")
model_list.append(Dtree)

knn = basic_df_train_modeling.load_model("models/RandomForest_sbs")
model_list.append(knn)

SVC_grid = basic_df_train_modeling.load_model("models/SVC_kernel_grid_sbs")
model_list.append(SVC_grid)

SVC_kernel = basic_df_train_modeling.load_model("models/KNN_sbs")
model_list.append(SVC_kernel)

모델 DecisionTree_sbs이 models/DecisionTree_sbs 파일에서 불러와졌습니다.
모델 RandomForest_sbs이 models/RandomForest_sbs 파일에서 불러와졌습니다.
모델 SVM_kernel_sbs이 models/SVC_kernel_grid_sbs 파일에서 불러와졌습니다.
모델 KNN_sbs이 models/KNN_sbs 파일에서 불러와졌습니다.


In [45]:
basic_df_train_modeling.DecisionTree_sbs()
basic_df_train_modeling.RandomForest_sbs()
basic_df_train_modeling.KNN_sbs()
basic_df_train_modeling.SVM_kernel_sbs()

모델 DecisionTree_sbs이 models/DecisionTree_sbs 파일로 저장되었습니다.
Best parameters with SBS:
{'classifier__criterion': 'gini', 'classifier__max_depth': 15, 'classifier__min_samples_split': 10}
Best cross-validation accuracy with SBS:
0.8174628188712696
모델 RandomForest_sbs이 models/RandomForest_sbs 파일로 저장되었습니다.
Best parameters with SBS:
{'classifier__max_depth': 5, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}
Best cross-validation accuracy with SBS:
0.8118388653599922
모델 KNN_sbs이 models/KNN_sbs 파일로 저장되었습니다.
Best parameters for KNN with SBS:
{'classifier__n_neighbors': 7, 'classifier__p': 1, 'classifier__weights': 'uniform'}
0.8020585048754063
모델 SVM_kernel_sbs이 models/SVC_kernel_grid_sbs 파일로 저장되었습니다.
Best parameters with SBS:
{'classifier__C': 1, 'classifier__gamma': 0.1, 'classifier__kernel': 'rbf'}
Best cross-validation accuracy with SBS:
0.8215995272333301


In [53]:
model_list2 = []
Dtree = basic_df_train_modeling.load_model("models/DecisionTree_sbs")
model_list2.append(Dtree)

knn = basic_df_train_modeling.load_model("models/RandomForest_sbs")
model_list2.append(knn)

SVC_grid = basic_df_train_modeling.load_model("models/SVC_kernel_grid_sbs")
model_list2.append(SVC_grid)

SVC_kernel = basic_df_train_modeling.load_model("models/KNN_sbs")
model_list2.append(SVC_kernel)

모델 DecisionTree_sbs이 models/DecisionTree_sbs 파일에서 불러와졌습니다.
모델 RandomForest_sbs이 models/RandomForest_sbs 파일에서 불러와졌습니다.
모델 SVM_kernel_sbs이 models/SVC_kernel_grid_sbs 파일에서 불러와졌습니다.
모델 KNN_sbs이 models/KNN_sbs 파일에서 불러와졌습니다.


In [24]:
for model in model_list:
    print(model[0], model[1], model[2])

DecisionTree Pipeline(steps=[('classifier',
                 DecisionTreeClassifier(criterion='entropy', max_depth=5,
                                        min_samples_split=5))]) 0.8216487737614498
RandomForest Pipeline(steps=[('classifier',
                 RandomForestClassifier(min_samples_split=10,
                                        n_estimators=50))]) 0.820338816113464
SVM_kernel Pipeline(steps=[('classifier', SVC(C=1, gamma=0.1))]) 0.8258347286516301
KNN Pipeline(steps=[('classifier', KNeighborsClassifier(n_neighbors=7, p=1))]) 0.8174628188712697
LogisticRegression Pipeline(steps=[('classifier',
                 LogisticRegression(C=10, max_iter=1000, penalty='l1',
                                    solver='saga'))]) 0.7992219048557077


In [25]:
# 앙상블 예측 메서드 추가
def ensemble_predict_(models, X_train, y_train, X_test):

    # VotingClassifier를 위한 estimators 리스트 생성
    estimators = [(name, model) for name, model, best_score in models]

    # VotingClassifier 생성
    ensemble_model = VotingClassifier(estimators=estimators, voting='hard')

    # 앙상블 모델 학습 (각 개별 모델은 이미 학습되었으므로, 재학습 없이 fit 필요 없음)
    # 그러나 VotingClassifier의 fit 메서드를 호출해야 합니다.
    ensemble_model.fit(X_train, y_train)

    # 예측 수행
    predictions = ensemble_model.predict(X_test)

    return predictions

In [26]:
# from Modeling import  ensemble_predict
# y_pred = ensemble_predict_(models=model_list, 
#                            X_train=basic_X.values, 
#                            y_train=basic_y.values, 
#                            X_test=preprocessed_basic_df_test.values)

In [67]:
y_pred_ver1 = ensemble_predict_(models=model_list, 
                           X_train=basic_X.values, 
                           y_train=basic_y.values, 
                           X_test=preprocessed_basic_df_test.values)

In [56]:
y_pred_ver2 = ensemble_predict_(models=model_list2, 
                           X_train=basic_X.values, 
                           y_train=basic_y.values, 
                           X_test=preprocessed_basic_df_test.values)

In [57]:
for model in model_list2:
    print(model[2], model[0])

0.8174628188712696 DecisionTree_sbs
0.8118388653599922 RandomForest_sbs
0.8215995272333301 SVM_kernel_sbs
0.8020585048754063 KNN_sbs


In [66]:
for model in model_list:
    print(model[2], model[0])

0.8216487737614498 DecisionTree
0.820338816113464 RandomForest
0.8258347286516301 SVM_kernel
0.8174628188712697 KNN
0.8174628188712696 DecisionTree_sbs
0.8118388653599922 RandomForest_sbs
0.8215995272333301 SVM_kernel_sbs
0.8020585048754063 KNN_sbs


In [68]:
df_test_raw['Survived'] = y_pred_ver1
df_submit = df_test_raw[['PassengerId', 'Survived']]
df_submit.to_csv('submit_3_6_2.csv', index=False)

In [58]:
df_test_raw['Survived'] = y_pred_ver2
df_submit = df_test_raw[['PassengerId', 'Survived']]
df_submit.to_csv('submit_3_7.csv', index=False)

In [31]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.linear_model import LogisticRegression

# # 모델 매핑
# model_mapping = {
#     "DecisionTree": DecisionTreeClassifier,
#     "RandomForest": RandomForestClassifier,
#     "SVM_kernel": SVC,
#     "KNN": KNeighborsClassifier,
#     "LogisticRegression": LogisticRegression
# }

# # 저장된 모델 리스트
# model_list = [
#     ("DecisionTree", DecisionTreeClassifier(max_depth=15, min_samples_split=5), 0.8249778390623461),
#     ("RandomForest", RandomForestClassifier(min_samples_split=10, n_estimators=50), 0.8138087264847828),
#     ("SVM_kernel", SVC(C=100, gamma=0.01), 0.825007386979218),
#     ("KNN", KNeighborsClassifier(n_neighbors=11, p=1, weights='distance'), 0.8053875701763026),
#     ("LogisticRegression", LogisticRegression(C=100, max_iter=1000, penalty='l1', solver='saga'), 0.8124199743918054),
# ]

# # 데이터 준비 (예제용)
# X_train, X_test, y_train, y_test = ...  # 데이터를 미리 준비하세요.

# # 새로 모델 훈련
# trained_models = {}

# for name, model_instance, _ in model_list:
#     # 모델 클래스 가져오기
#     model_class = model_mapping[name]

#     # 모델 매개변수 추출
#     params = model_instance.get_params()

#     # 새로운 모델 생성
#     model = model_class(**params)

#     # 모델 훈련
#     model.fit(X_train, y_train)

#     # 저장된 모델
#     trained_models[name] = model

#     print(f"Model {name} trained with parameters: {params}")


In [32]:
# 필요한 라이브러리 임포트
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
# XGBoost 모델 초기화
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# 모델 학습
xgb_model.fit(basic_X, basic_y)

# 예측
y_pred = xgb_model.predict(preprocessed_basic_df_test)


Parameters: { "use_label_encoder" } are not used.



In [33]:
basic_X

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,C,Q,S,Master,Miss,Mr,Mrs,else,Sex_Pclass
0,3,1,1,0,-0.516017,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.135447
1,1,0,1,0,0.693558,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.968085
2,3,0,0,0,-0.503267,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.500000
3,1,0,1,0,0.350080,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.968085
4,3,1,0,0,-0.500905,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.135447
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,3,0,0,5,-0.102803,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.500000
886,2,1,0,0,-0.407401,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.157407
887,1,0,0,0,-0.086274,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.968085
889,1,1,0,0,-0.086274,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.500000


In [34]:
# 파이프라인 구성
pipe_pca = Pipeline([
    ('pca', PCA()),
    ('classifier', RandomForestClassifier())
])

# 그리드 서치를 위한 파라미터 그리드 설정
param_grid_RF = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 5, 10],
    'classifier__min_samples_split': [2, 5, 10],
}

# 그리드 서치 객체 생성
grid_pca = GridSearchCV(pipe_pca, param_grid_RF, cv=15, scoring='accuracy', n_jobs=-1)

# 모델 학습
grid_pca.fit(basic_X.values, basic_y.values)

# 최적의 파라미터와 점수 출력
print("Best parameters with PCA:")
print(grid_pca.best_params_)
print("Best cross-validation accuracy with PCA:")
print(grid_pca.best_score_)

Best parameters with PCA:
{'classifier__max_depth': 5, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 50}
Best cross-validation accuracy with PCA:
0.8204787234042552


In [39]:
# 예측
y_pred_RF = grid_pca.predict(preprocessed_basic_df_test.values)
df_test_raw['Survived'] = y_pred
df_submit = df_test_raw[['PassengerId', 'Survived']]
df_submit.to_csv('param_grid_RF.csv', index=False)

In [35]:
from sklearn.ensemble import GradientBoostingClassifier
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}
# GradientBoostingClassifier 사용
gb_model = GradientBoostingClassifier()

grid_search = GridSearchCV(estimator=gb_model, param_grid=param_grid, 
                           cv=15, scoring='accuracy')

# 모델 학습
grid_search.fit(basic_X.values, basic_y.values)




In [36]:
grid_search.best_estimator_
print(grid_search.best_score_)

0.8246453900709219


In [37]:
# 예측
y_pred = grid_search.predict(preprocessed_basic_df_test.values)
df_test_raw['Survived'] = y_pred
df_submit = df_test_raw[['PassengerId', 'Survived']]
df_submit.to_csv('submit_grid_xg_boost.csv', index=False)