In [1]:
# import necessary modules
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier


In [2]:
# Load the Titanic dataset from the local ./titanic folder.
# train.csv is used as the whole dataset (both the train and the test portion come from it).
data = pd.read_csv("./titanic/train.csv")

# Peek at the first five rows to see what the dataset looks like.
print(data.head(n=5))

   PassengerId  Survived  Pclass  ...     Fare Cabin  Embarked
0            1         0       3  ...   7.2500   NaN         S
1            2         1       1  ...  71.2833   C85         C
2            3         1       3  ...   7.9250   NaN         S
3            4         1       1  ...  53.1000  C123         S
4            5         0       3  ...   8.0500   NaN         S

[5 rows x 12 columns]


In [3]:
# Preprocessing: remove columns that are not used as features.
#   - 'PassengerId': just an identifier; it does not describe the passenger
#   - 'Name': free-text data that does not contribute to prediction without further processing
#   - 'Cabin': has many missing values and is difficult to impute
#   - 'Ticket': treated here as arbitrary ticket numbers
data = data.drop(columns=['PassengerId', 'Name', 'Cabin', 'Ticket'])





In [4]:
# Split the frame into the target column (Y) and the remaining feature columns (X).
Y = data['Survived']
X = data.drop('Survived', axis=1)


In [5]:
# Encode the categorical (string) columns as integer codes.
catCols = ['Sex', 'Embarked']

# I searched how to use LabelEncoder on ChatGPT
# One encoder is kept per column (in `encoders`) so every mapping can still be
# inspected or inverted later; the original refit a single shared encoder, which
# only remembered the classes of the last column.
encoders = {}
for col in catCols:
    # LabelEncoder has no notion of "missing": depending on the scikit-learn
    # version, a NaN in a string column either raises a TypeError or silently
    # becomes an extra integer class. Fill gaps with the most frequent category first.
    # NOTE(review): the Kaggle train.csv is believed to have a few missing
    # 'Embarked' values - confirm with data.isna().sum(). The mode is taken over
    # all rows here because encoding happens before the train/test split.
    if X[col].isna().any():
        X[col] = X[col].fillna(X[col].mode().iloc[0])

    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    encoders[col] = le
# As before, `le` is left bound to the encoder of the last column in catCols.

In [6]:
# Hold out 33% of the rows as a test set; the fixed seed makes the split reproducible.
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=seed,
)


In [7]:
# Fill missing values in 'Age' with the average age.
# The mean is computed on the training rows only and reused for the test rows,
# so no statistic from the test set leaks into preprocessing.
mean_age = X_train['Age'].mean()

# `assign` returns new frames instead of writing a column into the objects
# handed back by train_test_split. Those can be flagged as slices of X, in which
# case `X_train['Age'] = ...` emits pandas' SettingWithCopyWarning (depends on
# the pandas / scikit-learn versions in use). The resulting values are the same.
X_train = X_train.assign(Age=X_train['Age'].fillna(mean_age))
X_test = X_test.assign(Age=X_test['Age'].fillna(mean_age))



In [8]:
# Univariate feature selection: score each feature against the target with the
# chi-squared statistic (fitted on the training split only) and keep the best 5.
test = SelectKBest(score_func=chi2, k=5).fit(X_train, Y_train)
print(test.scores_)

# Names of the kept columns, read from the selector's boolean support mask
# (I searched how to get names of transformed columns on ChatGPT).
selected_columns = X.columns[test.get_support()]

# Reduce both splits to the selected features.
X_train, X_test = test.transform(X_train), test.transform(X_test)

[1.84560548e+01 6.89950162e+01 3.60447369e+01 1.39522532e+00
 1.23627560e+01 2.92818330e+03 7.91119791e+00]


In [9]:
# Construct the models.
# Candidate classifiers, all created with the same seed.
models = {
    "decision tree classifier": DecisionTreeClassifier(random_state=seed),
    "random forest classifier": RandomForestClassifier(random_state=seed),
    "logistic regression": LogisticRegression(random_state=seed),
    "SVM": SVC(random_state=seed),
}

# The cells below call `model.predict(...)`, so a fitted estimator has to be
# bound to `model`. With the assignment commented out, a fresh
# "Restart Kernel -> Run All" fails with NameError: name 'model' is not defined.
# This restores the previously commented-out decision tree as that model.
model = models["decision tree classifier"]
model.fit(X_train, Y_train)


In [None]:
# Feature-importance report, currently disabled. It reads `model.feature_importances_`,
# so it can only run once a fitted estimator exposing that attribute is bound to `model`.
# for feature, importance in zip(selected_columns, model.feature_importances_):
#     print(f"{feature} : {importance}")

# Noted from an earlier run of the loop above: 'Sex' was the most important feature (0.33)

NameError: name 'model' is not defined

In [None]:
# Predict on the held-out test split and measure the share of correct predictions.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=Y_test, y_pred=y_pred)

print(accuracy)



0.7423728813559322


In [None]:
# Confusion matrix of the test-set predictions against the true labels.
cm = confusion_matrix(y_true=Y_test, y_pred=y_pred)
print(cm)

# Classification report for the same predictions (per-class precision / recall / f1).
report = classification_report(y_true=Y_test, y_pred=y_pred)
print(report)


[[146  30]
 [ 46  73]]
              precision    recall  f1-score   support

           0       0.76      0.83      0.79       176
           1       0.71      0.61      0.66       119

    accuracy                           0.74       295
   macro avg       0.73      0.72      0.73       295
weighted avg       0.74      0.74      0.74       295

