In [147]:
# import necessary modules
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import numpy as np





In [148]:
# download Titanic dataset
# use train.csv as whole dataset(train and test)
data = pd.read_csv("./titanic/train.csv")

# check what the dataset looks like
print(data.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [149]:
# preprocessing
# drop PassengerID (bacause it's just a ID, which doesn't explain each passenger's feature)
# drop "Name" because it is string data that doesn't contribute to prediction without further processing 
# drop "Cabin" because it has many missing values and is difficult to impute
# drop 'Ticket' because it is just random numbers of tickets
data = data.drop(['PassengerId','Name','Cabin', 'Ticket'],axis=1)





In [150]:
# separate features and target variable
X = data.drop(columns=['Survived'])
Y = data['Survived']


In [151]:
# encode category values
catCols = ['Sex', 'Embarked']

# I serched how to use LabelEncoder on ChatGPT
le = LabelEncoder()
for col in catCols:
    X[col] = le.fit_transform(X[col])

In [152]:
# train_test_split
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=seed)


In [153]:
# fill missing values in 'Age' with average value
mean_age = X_train['Age'].mean()
X_train['Age'] = X_train['Age'].fillna(mean_age)
X_test['Age'] = X_test['Age'].fillna(mean_age)



In [154]:
# comment out below because this will be contained in pipeline below
# test = SelectKBest(score_func=chi2, k=7)
# test.fit(X_train, Y_train)

# print(test.scores_)
# X_train = test.transform(X_train)
# X_test = test.transform(X_test)

# #I searched how to get names of transformed columns on ChatGPT 
# selected_columns = X.columns[test.get_support()]

In [155]:
# construct model
models = {
    "decision tree classifier": DecisionTreeClassifier(random_state=seed),
    "random forest classifier": RandomForestClassifier(random_state=seed),
    "logistic regression": LogisticRegression(random_state=seed),
    "SVM":SVC(random_state=seed)
}

In [156]:
# add grid serch for hyper parameter tuning

k_vals = np.array([1,2,3,4,5,6,7])
depths = np.array([1,2,3,4,5,6,7,8,9,10,None])

for model_name, model in models.items():
    print(model_name)
    if model_name in ["logistic regression","SVM"]:
        pipeline = Pipeline([
            ('select', SelectKBest(score_func=chi2)),
            ('scaler', StandardScaler()),
            ('classifier', model)
        ])
        param_grid = {
            'select__k' : k_vals
        }
    else:
        pipeline = Pipeline([
            ('select', SelectKBest(score_func=chi2)),
            ('classifier', model)
        ])
        
        param_grid = {
            'select__k' : k_vals,
            'classifier__max_depth' : depths
        }
    grid = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5)
    grid.fit(X_train,Y_train)

    print("Best score", grid.best_score_)
    print("Best params:", grid.best_params_)

decision tree classifier
Best score 0.837310924369748
Best params: {'classifier__max_depth': 3, 'select__k': 7}
random forest classifier
Best score 0.8473949579831933
Best params: {'classifier__max_depth': 7, 'select__k': 5}
logistic regression
Best score 0.8121288515406162
Best params: {'select__k': 6}
SVM
Best score 0.8507002801120448
Best params: {'select__k': 7}


In [157]:
# Check feature importances for tree based models

for model_name, model in models.items():
    if model_name == "decision tree classifier" or model_name == "random forest classifier":
        print(f"model name : {model_name}")
        model.fit(X_train, Y_train)
        for feature, importance in zip(selected_columns, model.feature_importances_):
            print(f"{feature} : {importance}")
        print("\n")

model name : decision tree classifier
Pclass : 0.09623324305327119
Sex : 0.34003122864683594
Age : 0.20084448863408566
SibSp : 0.06334857788334443
Parch : 0.04144702003099539
Fare : 0.2435455288802713
Embarked : 0.01454991287119608


model name : random forest classifier
Pclass : 0.07468120681505755
Sex : 0.28435953091206606
Age : 0.249739957511038
SibSp : 0.05494368017033649
Parch : 0.04242774393633285
Fare : 0.2541714567816247
Embarked : 0.03967642387354438




In [158]:
# for feature, importance in zip(selected_columns, model.feature_importances_):
#     print(f"{feature} : {importance}")

# As a result of the feature_importances_, 'Sex' is the most important feature (0.33)

for model_name, model in models.items():
    print(model_name)
    if model_name == "SVM":
        pipeline = Pipeline([
            ('select', SelectKBest(score_func=chi2,k=7)),
            ('scaler', StandardScaler()),
            ('classifier', model)
        ])
    elif model_name == "logistic regression":
        pipeline = Pipeline([
            ('select', SelectKBest(score_func=chi2, k=6)),
            ('scaler', StandardScaler()),
            ('classifier', model)
        ])
    elif model_name == "decision tree classifier":
        pipeline = Pipeline([
            ('select', SelectKBest(score_func=chi2,k=7)),
            ('classifier', model)
        ])
    elif model_name == "random forest classifier":
            pipeline = Pipeline([
            ('select', SelectKBest(score_func=chi2,k=7)),
            ('classifier', model)
        ])

    model.fit(X_train, Y_train)

    Y_pred = model.predict(X_test)
    cm = confusion_matrix(Y_test, Y_pred)
    report = classification_report(Y_test, Y_pred)

    print(f"confusion matrix \n {cm} \n")
    print(f"classification report \n {report}")

decision tree classifier
confusion matrix 
 [[144  32]
 [ 53  66]] 

classification report 
               precision    recall  f1-score   support

           0       0.73      0.82      0.77       176
           1       0.67      0.55      0.61       119

    accuracy                           0.71       295
   macro avg       0.70      0.69      0.69       295
weighted avg       0.71      0.71      0.71       295

random forest classifier
confusion matrix 
 [[146  30]
 [ 41  78]] 

classification report 
               precision    recall  f1-score   support

           0       0.78      0.83      0.80       176
           1       0.72      0.66      0.69       119

    accuracy                           0.76       295
   macro avg       0.75      0.74      0.75       295
weighted avg       0.76      0.76      0.76       295

logistic regression
confusion matrix 
 [[148  28]
 [ 39  80]] 

classification report 
               precision    recall  f1-score   support

           0     

###Results###
When I used 6 feature, the accuracy scores were below.
model name : decision tree classifier

classification report 
               precision    recall  f1-score   support

           0       0.74      0.82      0.78       176
           1       0.68      0.58      0.63       119

    accuracy                           0.72       295
   macro avg       0.71      0.70      0.70       295
weighted avg       0.72      0.72      0.72       295

classification report 
               precision    recall  f1-score   support

           0       0.78      0.85      0.81       176
           1       0.74      0.66      0.70       119

    accuracy                           0.77       295
   macro avg       0.76      0.75      0.76       295
weighted avg       0.77      0.77      0.77       295


classification report 
               precision    recall  f1-score   support

           0       0.79      0.83      0.81       176
           1       0.73      0.68      0.70       119

    accuracy                           0.77       295
   macro avg       0.76      0.76      0.76       295
weighted avg       0.77      0.77      0.77       295

classification report 
               precision    recall  f1-score   support

           0       0.78      0.86      0.82       176
           1       0.75      0.64      0.69       119

    accuracy                           0.77       295
   macro avg       0.77      0.75      0.75       295
weighted avg       0.77      0.77      0.77       295
