## I. Initialize

In [484]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Directory that holds the Titanic CSV files; both input paths are derived from it.
DATA_DIR = "/home/straw/Data_mining/TechnicalTutorial/PythonLib/titanic"
TRAIN_FILE = DATA_DIR + "/train.csv"
TEST_FILE = DATA_DIR + "/test.csv"

In [485]:
# Load both CSV files, then keep the 'Survived' column of the training frame
# aside as the label vector.
titanic_train = pd.read_csv(TRAIN_FILE)
titanic_test = pd.read_csv(TEST_FILE)
titanic_train_y = titanic_train['Survived']
# Plain-text dump of both frames, one after the other.
print(titanic_train, titanic_test, sep="\n")

PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ...    ..

## II.Preprocessing Data

In [486]:
# Stack the training rows on top of the test rows under a fresh 0..n-1 index,
# so the preprocessing below is applied to both sets at once.
frames_to_stack = [titanic_train, titanic_test]
titanic_all = pd.concat(frames_to_stack, ignore_index=True)
titanic_all

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,7.2500,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599
2,26.0,,S,7.9250,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282
3,35.0,C123,S,53.1000,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803
4,35.0,,S,8.0500,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450
...,...,...,...,...,...,...,...,...,...,...,...,...
1304,,,S,8.0500,"Spector, Mr. Woolf",0,1305,3,male,0,,A.5. 3236
1305,39.0,C105,C,108.9000,"Oliva y Ocana, Dona. Fermina",0,1306,1,female,0,,PC 17758
1306,38.5,,S,7.2500,"Saether, Mr. Simon Sivertsen",0,1307,3,male,0,,SOTON/O.Q. 3101262
1307,,,S,8.0500,"Ware, Mr. Frederick",0,1308,3,male,0,,359309


In [487]:
# Report how many null entries each column of the combined frame contains.
for column, n_missing in titanic_all.isnull().sum().items():
    print(f"{column} missing {n_missing} value")

Age missing 263 value
Cabin missing 1014 value
Embarked missing 0 value
Fare missing 0 value
Name missing 0 value
Parch missing 0 value
PassengerId missing 0 value
Pclass missing 0 value
Sex missing 0 value
SibSp missing 0 value
Survived missing 418 value
Ticket missing 0 value


### Handle missing value
1. Embarked, Fare: Embarked chỉ thiếu 2, Fare thiếu 1 nên em điền tay luôn vào file (trên mạng có thông tin về Embarked của 2 người bị thiếu, Fare em điền bừa)
2. Age: Sau khi xem xét mối tương quan giữa Age và các feature khác, nhận thấy Age có mối tương quan lớn nhất với Pclass. Vì vậy các giá trị Age bị thiếu sẽ được điền bằng trung vị (median) của Age trong từng nhóm (Sex, Pclass).  

In [488]:
# Absolute correlation of every numeric column with Age.
# The numeric columns are selected explicitly: pandas >= 2.0 no longer drops
# text columns (Name, Sex, Ticket, ...) silently inside DataFrame.corr() and
# raises instead, while older versions give the same result either way.
titanic_all.select_dtypes(include='number').corr().abs()['Age']

Age            1.000000
Fare           0.177278
Parch          0.150917
PassengerId    0.028814
Pclass         0.408106
SibSp          0.243699
Survived       0.077221
Name: Age, dtype: float64

In [489]:
# Median Age per (Sex, Pclass) group, computed from the rows where Age is known.
median_age_by_sex_pclass = titanic_all.groupby(['Sex', 'Pclass'])['Age'].median()
# Index labels of the rows whose Age is missing (captured before they are filled).
missing_age_row_list = titanic_all.index[titanic_all['Age'].isnull()].to_list()

# Broadcast each group's median back onto its rows and use it only where Age is
# null. This replaces the former row-by-row loop built on DataFrame.ix, which
# was removed in pandas 1.0.
group_median_age = titanic_all.groupby(['Sex', 'Pclass'])['Age'].transform('median')
titanic_all['Age'] = titanic_all['Age'].fillna(group_median_age)
median_age_by_sex_pclass


Sex     Pclass
female  1         36.0
        2         28.0
        3         22.0
male    1         42.0
        2         29.5
        3         25.0
Name: Age, dtype: float64

### Add new feature
1. Title: để tận dụng đc name ta sẽ thu gọn female về dạng Miss/Mrs/Ms, male về dạng Dr/Military/Noble/Clergy.   
2. Family Size: tổng Parch và SibSp, cộng thêm 1 (tính cả chính hành khách đó)  

In [490]:
# The title is the text between the first ", " and the following "." in Name.
raw_title = titanic_all['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]

# Collapse the rarer titles into two coarse groups; titles not listed in
# either group are kept as they are.
title_groups = (
    ('Miss/Mrs/Ms', ['Miss', 'Mrs', 'Ms', 'Mlle', 'Lady', 'Mme', 'the Countess', 'Dona']),
    ('Dr/Military/Noble/Clergy', ['Dr', 'Col', 'Major', 'Jonkheer', 'Capt', 'Sir', 'Don', 'Rev']),
)
for group_label, group_members in title_groups:
    raw_title = raw_title.replace(group_members, group_label)
titanic_all['Title'] = raw_title


In [491]:
# FamilySize = Parch + SibSp + 1.
titanic_all['FamilySize'] = titanic_all['Parch'].add(titanic_all['SibSp']).add(1)

### Convert String data to int
#### Để đưa đc dữ liệu vào model chúng phải có dạng float hoặc int 

In [492]:
# Encode Sex in place: "male" -> 1, "female" -> 0.
for sex_label, sex_code in (("male", 1), ("female", 0)):
    titanic_all.loc[titanic_all['Sex'] == sex_label, "Sex"] = sex_code

### Onehot encode Title and Embarked
#### Title và Embarked cũng phải chuyển về int hoặc float, nhưng nếu chỉ đổi thành 0, 1, 2, ..., n như trên thì ta vô tình tạo ra khoảng cách giữa n với 0. Dùng Onehot encode để các giá trị trong một feature bình đẳng với nhau
#### Đọc thêm về onehot <a href = "https://viblo.asia/p/feature-engineering-phan-3-feature-engineering-voi-du-lieu-dang-phan-loai-categorical-data-GrLZDQx2lk0">Onehot</a>

In [493]:
from sklearn.preprocessing import OneHotEncoder


def _one_hot_frame(encoder, column):
    """Return the one-hot encoding of ``titanic_all[column]`` as a DataFrame.

    Parameters
    ----------
    encoder : OneHotEncoder
        An encoder that has already been fitted on the allowed categories.
    column : str
        Name of the column of ``titanic_all`` to encode; also used as the
        prefix of the generated column names (``<column>_<category>``).

    Returns
    -------
    pandas.DataFrame
        One column per category, indexed like ``titanic_all`` so that a
        column-wise concat lines the rows up.
    """
    encoded = encoder.transform(titanic_all[column].values.reshape(-1, 1)).toarray()
    # get_feature_names was removed in scikit-learn 1.2; prefer its replacement
    # and fall back to the old name on versions that predate it.
    name_getter = getattr(encoder, "get_feature_names_out", None) or encoder.get_feature_names
    return pd.DataFrame(encoded, columns=name_getter([column]), index=titanic_all.index)


# Embarked: fit on the three known codes, then encode the column.
embarked_onehot = OneHotEncoder()
embarked = [['C'], ['Q'], ['S']]
embarked_onehot.fit(embarked)
embarked_df = _one_hot_frame(embarked_onehot, 'Embarked')

# Title: fit on the four title groups, then encode the column.
title_onehot = OneHotEncoder()
title = [['Dr/Military/Noble/Clergy'], ['Miss/Mrs/Ms'], ['Mr'], ['Master']]
title_onehot.fit(title)
title_df = _one_hot_frame(title_onehot, 'Title')

# Append the indicator columns next to the existing features.
titanic_all = pd.concat([titanic_all, embarked_df, title_df], axis=1)
titanic_all

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,...,Ticket,Title,FamilySize,Embarked_C,Embarked_Q,Embarked_S,Title_Dr/Military/Noble/Clergy,Title_Master,Title_Miss/Mrs/Ms,Title_Mr
0,22.0,,S,7.2500,"Braund, Mr. Owen Harris",0,1,3,1,1,...,A/5 21171,Mr,2,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,0,1,...,PC 17599,Miss/Mrs/Ms,2,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,26.0,,S,7.9250,"Heikkinen, Miss. Laina",0,3,3,0,0,...,STON/O2. 3101282,Miss/Mrs/Ms,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,35.0,C123,S,53.1000,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,0,1,...,113803,Miss/Mrs/Ms,2,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,35.0,,S,8.0500,"Allen, Mr. William Henry",0,5,3,1,0,...,373450,Mr,1,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,25.0,,S,8.0500,"Spector, Mr. Woolf",0,1305,3,1,0,...,A.5. 3236,Mr,1,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1305,39.0,C105,C,108.9000,"Oliva y Ocana, Dona. Fermina",0,1306,1,0,0,...,PC 17758,Miss/Mrs/Ms,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1306,38.5,,S,7.2500,"Saether, Mr. Simon Sivertsen",0,1307,3,1,0,...,SOTON/O.Q. 3101262,Mr,1,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1307,25.0,,S,8.0500,"Ware, Mr. Frederick",0,1308,3,1,0,...,359309,Mr,1,0.0,0.0,1.0,0.0,0.0,0.0,1.0


### Drop columns: drop unimportant columns (thực ra là Cabin với Ticket em ko xử lí dc) 

In [494]:
# Remove the columns that are not fed to the models: raw text fields, the
# categorical columns that were one-hot encoded above, and the Survived column
# (already stored separately in titanic_train_y).
columns_to_discard = ['Name', 'Cabin', 'Embarked', 'Title', 'Survived', 'Ticket']
titanic_all = titanic_all.drop(columns=columns_to_discard)



### Update train, test data

In [495]:
# Split the processed frame back into its two parts. The training rows were
# stacked first, so they are the first len(titanic_train_y) rows; deriving the
# boundary from the labels avoids hard-coding the row labels 890/891.
n_train_rows = len(titanic_train_y)
titanic_train = titanic_all.iloc[:n_train_rows, :]
titanic_test = titanic_all.iloc[n_train_rows:, :]
titanic_test.shape

(418, 15)

### Write all data 

In [496]:
# Persist the processed feature table. DataFrame.to_csv writes a well-formed
# CSV (proper quoting, no trailing delimiter on each line) and closes the file
# itself, replacing the previous hand-written cell-by-cell writer.
titanic_all.to_csv("/home/straw/Data_mining/TechnicalTutorial/PythonLib/titanic/all.csv", index=False)

## III. Train Model 

### Define each Model as an object of class Classifier Model 
#### Ps: cái này e tham khảo idol Nguyễn Minh Quang - aka Moi


In [497]:
from sklearn.metrics import classification_report

class ClassifierModel(object):
    """Pair an estimator with a training split and an optional evaluation split.

    Parameters
    ----------
    features_train, labels_train
        Data passed to ``estimator.fit`` by :meth:`train`.
    estimator
        Object exposing ``fit(features, labels)`` and ``predict(features)``.
    features_test, labels_test
        Optional held-out data used by :meth:`get_result`.
    """

    def __init__(self, features_train = None, labels_train = None,  estimator = None, features_test = None, labels_test = None):
        self.features_train = features_train
        self.features_test = features_test
        self.labels_train = labels_train
        self.labels_test = labels_test
        self.estimator = estimator

    def get_estimator(self):
        """Return the wrapped estimator object."""
        return self.estimator

    def train(self):
        """Fit the estimator on the training split.

        When a test split was supplied, the classification report for it is
        printed afterwards; without one the report is skipped instead of
        failing after the fit has already completed.
        """
        self.estimator.fit(self.features_train, self.labels_train)
        if self.features_test is not None and self.labels_test is not None:
            self.get_result()

    def predict(self,input):
        """Return the estimator's predictions for ``input``."""
        return self.estimator.predict(input)

    def get_result(self):
        """Predict on the test split and print its classification report.

        Raises
        ------
        ValueError
            If the model was built without ``features_test`` or ``labels_test``.
        """
        if self.features_test is None or self.labels_test is None:
            raise ValueError("get_result() requires features_test and labels_test")
        y_true, y_pred = self.labels_test, self.estimator.predict(self.features_test)
        print(classification_report(y_true, y_pred))

In [498]:
# Single shuffled hold-out split of the labelled rows: 80% for fitting, the
# remainder for evaluation (one split, not k-fold cross validation). The seed
# is fixed so the split is the same on every run.
from sklearn.model_selection import train_test_split

holdout_parts = train_test_split(
    titanic_train,
    titanic_train_y,
    train_size=0.8,
    shuffle=True,
    random_state=21,
)
train_X, test_X, train_y, test_y = holdout_parts

### Naive Model
#### Naive cho kết quả khá tốt, tốc độ chạy cũng nhanh 

In [499]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes on the hold-out split; train() fits the model and prints
# the classification report for the test part.
naive = ClassifierModel(
    features_train=train_X,
    labels_train=train_y,
    estimator=GaussianNB(),
    features_test=test_X,
    labels_test=test_y,
)
naive.train()

precision    recall  f1-score   support

           0       0.84      0.87      0.85       105
           1       0.80      0.77      0.79        74

    accuracy                           0.83       179
   macro avg       0.82      0.82      0.82       179
weighted avg       0.83      0.83      0.83       179



### RandomForest 
#### Kết quả cải thiện hơn Naive một chút 

In [500]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with a fixed seed; the estimator is built first so its
# hyper-parameters are readable on their own.
forest_estimator = RandomForestClassifier(
    n_estimators=1100,
    criterion='entropy',
    max_depth=5,
    min_samples_split=4,
    min_samples_leaf=5,
    oob_score=True,
    random_state=21,
)
random_forest = ClassifierModel(train_X, train_y, forest_estimator, test_X, test_y)
random_forest.train()

precision    recall  f1-score   support

           0       0.83      0.90      0.87       105
           1       0.85      0.74      0.79        74

    accuracy                           0.84       179
   macro avg       0.84      0.82      0.83       179
weighted avg       0.84      0.84      0.84       179



### SGDClassifier 
#### Em muốn chọn một mô hình nữa để làm VotingClassifier giữa 3 model. SGD trả ra kết quả tốt hơn so với Kmeans và SVM nên em chọn cái này làm model thứ 3 cho VotingClassifier. Nhưng không hiệu quả lắm vì độ chính xác của SGD hơi thấp 

In [501]:
from sklearn.linear_model import SGDClassifier
# SGDClassifier shuffles the training data between epochs, so without a seed
# its report changes from run to run. Fix random_state (same seed as the split
# and the random forest) to make the result reproducible.
sgd = ClassifierModel(train_X, train_y, SGDClassifier(random_state=21), test_X, test_y)
sgd.train()

precision    recall  f1-score   support

           0       0.63      0.87      0.73       105
           1       0.59      0.27      0.37        74

    accuracy                           0.62       179
   macro avg       0.61      0.57      0.55       179
weighted avg       0.61      0.62      0.58       179



In [502]:
from sklearn.ensemble import VotingClassifier
# Weighted hard-voting ensemble over the three estimators defined earlier.
voting_members = [
    ("naive", naive.get_estimator()),
    ("random_forest", random_forest.get_estimator()),
    ("sgd", sgd.get_estimator()),
]
hard_voter = VotingClassifier(estimators=voting_members, voting="hard", weights=[2, 2.5, 1])
voting_classifier_hard = ClassifierModel(train_X, train_y, hard_voter, test_X, test_y)
voting_classifier_hard.train()

precision    recall  f1-score   support

           0       0.84      0.90      0.87       105
           1       0.85      0.76      0.80        74

    accuracy                           0.84       179
   macro avg       0.84      0.83      0.84       179
weighted avg       0.84      0.84      0.84       179



### Predict with RandomForest

In [503]:
# Predict survival for the test rows with the trained random forest.
titanic_pred = random_forest.predict(titanic_test)

# Build the two-column output table. PassengerId is read from the test frame
# itself instead of being reconstructed as "row position + 892", so ids and
# predictions stay paired whatever ids the test file contains.
submission = pd.DataFrame(
    {
        "PassengerId": titanic_test["PassengerId"].values,
        "Survived": titanic_pred,
    },
    columns=["PassengerId", "Survived"],
)
submission.to_csv("/home/straw/Data_mining/TechnicalTutorial/PythonLib/titanic/predict.csv", index=False)
titanic_pred.shape

(418,)