# Ensemble/Voting Classification in Python with Scikit-Learn
ref：https://www.kaggle.com/c/titanic/submit

In [3]:
import pandas as pd
import numpy as np
import warnings

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, f1_score, log_loss
from sklearn.model_selection import train_test_split, KFold, cross_val_score

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier

In [4]:
training_data = pd.read_csv("data/train.csv")
testing_data = pd.read_csv("data/test.csv")
def get_nulls(training, testing):
    print("Training Data:")
    print(pd.isnull(training).sum())
    print("Testing Data:")
    print(pd.isnull(testing).sum())

In [10]:
# Drop the cabin column, as there are too many missing values
# Drop the ticket numbers too, as there are too many categories
# Drop names as they won't really help predict survivors



# Taking the mean/average value would be impacted by the skew
# so we should use the median value to impute missing values
training_data["Age"].fillna(training_data["Age"].median(),inplace=True)


get_nulls(training_data, testing_data)

Training Data:
PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64
Testing Data:
PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64


In [5]:
# 先查缺漏值情況
get_nulls(training_data, testing_data)

# Drop the cabin column, as there are too many missing values (cabin缺漏太多,拿掉)
# Drop the ticket numbers too, as there are too many categories(ticket太多類,拿掉)
# Drop names as they won't really help predict survivors (names沒用,拿掉)
training_data.drop(labels=["Cabin","Ticket","Name"],axis=1, inplace=True)
testing_data.drop(labels=["Cabin","Ticket","Name"],axis=1, inplace=True)
print(f"training_data: \n{training_data}")
print(f"testing_data: \n{testing_data}")

# 還是有很多缺漏值需要填補(Age用中位數來補值, Embarked填成"S"
# Taking the mean/average value would be impacted by the skew
# so we should use the median value to impute missing values
# training_data["Age"].fillna(training_data["Age"].median(),inplace=True)# 舊方法未來可能有問題
# testing_data["Age"].fillna(training_data["Age"].median(),inplace=True)# 舊方法未來可能有問題
# training_data["Embarked"].fillna("S",inplace=True)# 舊方法未來可能有問題
training_data["Age"] = training_data["Age"].fillna(training_data["Age"].median()) # 直接覆蓋,不用inplace=True
testing_data["Age"] = testing_data["Age"].fillna(training_data["Age"].median())# 直接覆蓋,不用inplace=True
training_data["Embarked"] = training_data["Embarked"].fillna("S")# 直接覆蓋,不用inplace=True
# (testing_data的Embarked沒有空值所以不必填)
# testing_data["Fare"].fillna(testing_data["Fare"].median(),inplace=True) # 舊方法未來可能有問題
testing_data["Fare"] = testing_data["Fare"].fillna(testing_data["Fare"].median())# 直接覆蓋,不用inplace=True
# (training_data的Fare沒有空值所以不必填)
get_nulls(training_data, testing_data)



# Fit the encoder on the data (Feature: Sex)
encoder_Sex = LabelEncoder()
encoder_Sex.fit(training_data["Sex"])

# Transform and replace training data
training_sex_encoded = encoder_Sex.transform(training_data["Sex"])
training_data["Sex"] = training_sex_encoded
test_sex_encoded = encoder_Sex.transform(testing_data["Sex"])
testing_data["Sex"] = test_sex_encoded
# ----------------------------------------------------------
# Fit the encoder on the data (Feature: Embarked)
encoder_Embarked = LabelEncoder()
encoder_Embarked.fit(training_data["Embarked"])

# Transform and replace training data
training_Embarked_encoded = encoder_Embarked.transform(training_data["Embarked"])
training_data["Embarked"] = training_Embarked_encoded
test_Embarked_encoded = encoder_Embarked.transform(testing_data["Embarked"])
testing_data["Embarked"] = test_Embarked_encoded

# Any value we want to reshape needs be turned into array first
Ages_train = np.array(training_data["Age"]).reshape(-1, 1)
Fare_train = np.array(training_data["Fare"]).reshape(-1, 1)
Ages_test = np.array(testing_data["Age"]).reshape(-1, 1)
Fare_test= np.array(testing_data["Fare"]).reshape(-1, 1)


# Scaler takes arrays
scaler = StandardScaler()

training_data["Age"] = scaler.fit_transform(Ages_train)
training_data["Fare"] = scaler.fit_transform(Fare_train)
testing_data["Age"] = scaler.fit_transform(Ages_test)
testing_data["Fare"] = scaler.fit_transform(Fare_test)


Training Data:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
Testing Data:
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64
training_data: 
     PassengerId  Survived  Pclass     Sex   Age  SibSp  Parch     Fare  \
0              1         0       3    male  22.0      1      0   7.2500   
1              2         1       1  female  38.0      1      0  71.2833   
2              3         1       3  female  26.0      0      0   7.9250   
3              4         1       1  female  35.0      1      0  53.1000   
4              5         0       3    male  35.0      0      0   8.0500   
..           ...       ...     ...     ...

In [6]:
# Now to select our training/testing data
X_features = training_data.drop(labels=['PassengerId', 'Survived'], axis=1)
y_labels = training_data['Survived']

print(X_features.head(5))
print(y_labels.head(5))

# Make the train/test data from validation

X_train, X_val, y_train, y_val = train_test_split(X_features, y_labels, test_size=0.1,random_state=12)


#  ===========前處理到這邊完成===============================

   Pclass  Sex       Age  SibSp  Parch      Fare  Embarked
0       3    1 -0.565736      1      0 -0.502445         2
1       1    0  0.663861      1      0  0.786845         0
2       3    0 -0.258337      0      0 -0.488854         2
3       1    0  0.433312      1      0  0.420730         2
4       3    1  0.433312      0      0 -0.486337         2
0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64


## Simple Averaging Approach

In [7]:
# Simple Averaging Approach
# 假設要用3種方法
LogReg_clf = LogisticRegression()
DTree_clf = DecisionTreeClassifier()
SVC_clf = SVC()

LogReg_clf.fit(X_train, y_train)
DTree_clf.fit(X_train, y_train)
SVC_clf.fit(X_train, y_train)

LogReg_pred = LogReg_clf.predict(X_val)
DTree_pred = DTree_clf.predict(X_val)
SVC_pred = SVC_clf.predict(X_val)

averaged_preds = (LogReg_pred + DTree_pred + SVC_pred)//3
acc = accuracy_score(y_val, averaged_preds)
print(acc)


0.8


## Bagging Classification Example

In [8]:
# Bagging Classification Example
# base_estimator已棄用,改為estimator
logreg_bagging_model = BaggingClassifier(estimator  = LogReg_clf,n_estimators=50,random_state=12)
dtree_bagging_model = BaggingClassifier(estimator  = DTree_clf,n_estimators=50,random_state=12)
random_forest = RandomForestClassifier(n_estimators=100,random_state=12)
extra_trees = ExtraTreesClassifier(n_estimators=100,random_state=12)
def bagging_ensemble(ModelName,model):
    k_folds = KFold(n_splits=20, random_state=12,shuffle=True) # 分成20堆
    results = cross_val_score(model, X_train, y_train, cv=k_folds)
    print(f"{ModelName} score: {results.mean()}")


bagging_ensemble("logreg_bagging_model",logreg_bagging_model)
bagging_ensemble("dtree_bagging_model",dtree_bagging_model)
bagging_ensemble("random_forest",random_forest)
bagging_ensemble("extra_trees",extra_trees)



logreg_bagging_model score: 0.7927134146341464
dtree_bagging_model score: 0.8188719512195123
random_forest score: 0.8113719512195123
extra_trees score: 0.7963719512195122


## Boosting Classification Example

In [9]:
# Boosting Classification Example
k_folds = KFold(n_splits=20, random_state=12,shuffle=True) # 切成20堆
num_estimators = [20, 40, 60, 80, 100] # 不一定越大越好

for i in num_estimators:
    ada_boost = AdaBoostClassifier(n_estimators=i,random_state=12)
    results = cross_val_score(ada_boost,X_train,y_train,cv=k_folds)
    print("Results for {} estimators:".format(i))
    print(results.mean())

Results for 20 estimators:
0.8064634146341463
Results for 40 estimators:
0.8089024390243903
Results for 60 estimators:
0.8051829268292684
Results for 80 estimators:
0.8039329268292683
Results for 100 estimators:
0.8051829268292684


## voting\Stacking Classification Example

In [10]:
# voting\Stacking Classification Example
# voting='hard' (多數決)
voting_clf = VotingClassifier(estimators=[('SVC', SVC_clf), ('DTree', DTree_clf), ('LogReg', LogReg_clf)], voting='hard')
voting_clf.fit(X_train, y_train)
preds = voting_clf.predict(X_val)
acc = accuracy_score(y_val, preds)
l_loss = log_loss(y_val, preds)
f1 = f1_score(y_val, preds)

print("Accuracy is: " + str(acc))
print("Log Loss is: " + str(l_loss))
print("F1 Score is: " + str(f1))

print("====================================")

Accuracy is: 0.8333333333333334
Log Loss is: 6.0072755648528595
F1 Score is: 0.7761194029850746


In [11]:
print(f"X_train: \n{X_train}")
print(f"testing_data: \n{testing_data}")

# 先把PassengerId 弄掉
test = testing_data.drop(labels=["PassengerId"], axis=1)
print(f"test.shape: {test.shape}")

submission_df = pd.DataFrame(columns=["PassengerId","Survived"])
submission_df["PassengerId"] = testing_data["PassengerId"]
print(f"submission_df: \n{submission_df}")
Predict_Survived = voting_clf.predict(test)
submission_df["Survived"] = Predict_Survived
print(f"submission_df: \n{submission_df}")
submission_df.to_csv("submissions.csv",header=True, index=False)

X_train: 
     Pclass  Sex       Age  SibSp  Parch      Fare  Embarked
715       3    1 -0.796286      0      0 -0.494391         2
319       1    0  0.817561      1      1  2.059694         0
829       1    0  2.508257      0      0  0.962353         2
79        3    0  0.049062      0      0 -0.397241         2
484       1    1 -0.335187      1      0  1.185430         0
..      ...  ...       ...    ...    ...       ...       ...
241       3    0 -0.104637      1      0 -0.336334         1
253       3    1  0.049062      1      0 -0.324253         2
390       1    1  0.510161      1      2  1.767741         2
667       3    1 -0.104637      0      0 -0.491874         2
843       3    1  0.394887      0      0 -0.518805         0

[801 rows x 7 columns]
testing_data: 
     PassengerId  Pclass  Sex       Age  SibSp  Parch      Fare  Embarked
0            892       3    1  0.371062      0      0 -0.497413         1
1            893       3    0  1.358985      1      0 -0.512278        