In [3]:
# https://financedata.github.io/posts/display-all-values-in-cell-jupyter-notebook.html
# Q: Jupyter Notebook 셀에서 마지막 값 하나만이 아니라 중간에 모든 값을 차례로 출력하고 싶습니다.
# A: print(), IPython.display를 써서 매번 출력하거나 InteractiveShell의 옵션을 지정합니다.

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sklearn as sk
from sklearn.impute import KNNImputer

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# os walk: https://wikidocs.net/39
#os.walk는 시작 디렉터리부터 시작하여 그 하위 모든 디렉터리를 차례대로 방문하게 해주는 함수이다

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [5]:
df_train = pd.read_csv("/kaggle/input/titanic/train.csv")

In [6]:
df_test = pd.read_csv("/kaggle/input/titanic/test.csv")

# Data Pre-processing

In [7]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [8]:
df_train["Sex"].replace("female",0, inplace = True)
df_train["Sex"].replace("male",1, inplace = True)

In [9]:
df_test["Sex"].replace("female",0, inplace = True)
df_test["Sex"].replace("male",1, inplace = True)

In [10]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


In [11]:
df_train.nunique()

PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
dtype: int64

In [12]:
df_train["Survived"] = df_train["Survived"].astype("category")
df_train["Pclass"] = df_train["Pclass"].astype("category")
df_train["Sex"] = df_train["Sex"].astype("category")
df_train["SibSp"] = df_train["SibSp"].astype("category")
df_train["Parch"] = df_train["Parch"].astype("category")
df_train["Embarked"] = df_train["Embarked"].astype("category")
df_train["Ticket"] = df_train["Ticket"].astype("category")
df_train["Cabin"] = df_train["Cabin"].astype("category")

In [13]:
df_test["Pclass"] = df_test["Pclass"].astype("category")
df_test["Sex"] = df_test["Sex"].astype("category")
df_test["SibSp"] = df_test["SibSp"].astype("category")
df_test["Parch"] = df_test["Parch"].astype("category")
df_test["Embarked"] = df_test["Embarked"].astype("category")

In [14]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   PassengerId  891 non-null    int64   
 1   Survived     891 non-null    category
 2   Pclass       891 non-null    category
 3   Name         891 non-null    object  
 4   Sex          891 non-null    category
 5   Age          714 non-null    float64 
 6   SibSp        891 non-null    category
 7   Parch        891 non-null    category
 8   Ticket       891 non-null    category
 9   Fare         891 non-null    float64 
 10  Cabin        204 non-null    category
 11  Embarked     889 non-null    category
dtypes: category(8), float64(2), int64(1), object(1)
memory usage: 64.6+ KB


In [15]:
df_train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [16]:
df_train.drop(["Name", "Ticket", "Cabin", "Fare","Embarked"], axis =1, inplace = True)
df_test.drop(["Name", "Ticket", "Cabin", "Fare","Embarked"], axis =1, inplace = True)

# KNN Imputation

In [17]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df_train = pd.DataFrame(scaler.fit_transform(df_train), columns = df_train.columns)
df_test = pd.DataFrame(scaler.fit_transform(df_test), columns = df_test.columns)

In [18]:
imputer = KNNImputer(n_neighbors = 5)
df_train = pd.DataFrame(imputer.fit_transform(df_train), columns = df_train.columns)

In [19]:
df_train.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
dtype: int64

In [20]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    float64
 1   Pclass       418 non-null    float64
 2   Sex          418 non-null    float64
 3   Age          332 non-null    float64
 4   SibSp        418 non-null    float64
 5   Parch        418 non-null    float64
dtypes: float64(6)
memory usage: 19.7 KB


In [21]:
df_test.isna().sum()

PassengerId     0
Pclass          0
Sex             0
Age            86
SibSp           0
Parch           0
dtype: int64

In [22]:
df_test = pd.DataFrame(imputer.fit_transform(df_test), columns =df_test.columns)

In [23]:
x_train = np.array(df_train.drop(['Survived'], axis =1))
y_train = np.array(df_train['Survived'])

# Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [25]:
model1 = LogisticRegression(solver='liblinear', random_state=0, max_iter = 1000).fit(x_train,y_train)

In [26]:
model1.coef_

array([[ 0.19865294, -2.06922355, -2.48476887, -2.18795473, -1.59358922,
        -0.36286297]])

In [27]:
pd.DataFrame(confusion_matrix(y_train, model1.predict(x_train)))
conf1 = confusion_matrix(y_train, model1.predict(x_train))
Sensitivity = round(conf1[1,1]/(conf1[1,1]+conf1[0,1]),2)
Specificity = round(conf1[0,0]/(conf1[0,0]+conf1[1,0]),2)
Precision = round(conf1[1,1]/(conf1[1,1]+conf1[1,0]),2)
accuracy = round((conf1[1,1]+conf1[0,0])/(conf1[1,1]+conf1[0,0] + conf1[1,0]+conf1[0,1]),2)
Sensitivity, Specificity, Precision, accuracy

Unnamed: 0,0,1
0,477,72
1,105,237


(0.77, 0.82, 0.69, 0.8)

In [28]:
##AUC ROC 
from sklearn.metrics import roc_auc_score
round(roc_auc_score(y_train, model1.predict_proba(x_train)[:,1]),2)

0.86

# DECISION TREE

In [29]:
from sklearn.tree import DecisionTreeClassifier

model2= DecisionTreeClassifier(max_depth = 4, min_samples_leaf = 10).fit(x_train, y_train)


In [30]:
model2.score(x_train, y_train)

0.8361391694725028

In [31]:
pd.DataFrame(confusion_matrix(y_train, model2.predict(x_train)))
conf2 = confusion_matrix(y_train, model2.predict(x_train))
Sensitivity = round(conf2[1,1]/(conf2[1,1]+conf2[0,1]),2)
Specificity = round(conf2[0,0]/(conf2[0,0]+conf2[1,0]),2)
Precision = round(conf2[1,1]/(conf2[1,1]+conf2[1,0]),2)
accuracy = round((conf2[1,1]+conf2[0,0])/(conf2[1,1]+conf2[0,0] + conf2[1,0]+conf2[0,1]),2)
Sensitivity, Specificity, Precision, accuracy

Unnamed: 0,0,1
0,490,59
1,87,255


(0.81, 0.85, 0.75, 0.84)

In [32]:
##AUC ROC 
from sklearn.metrics import roc_auc_score
round(roc_auc_score(y_train, model2.predict_proba(x_train)[:,1]),2)

0.88

# Random Forest

In [33]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
# Instantiate model with 1000 decision trees
model3 = RandomForestClassifier(n_estimators = 100, random_state = 1, max_depth =5).fit(x_train,y_train)


In [34]:
round(model3.score(x_train,y_train),2)

0.86

In [35]:
pd.DataFrame(confusion_matrix(y_train, model3.predict(x_train)))
conf3 = confusion_matrix(y_train,model3.predict(x_train))
Sensitivity = round(conf3[1,1]/(conf3[1,1]+conf3[0,1]),2)
Specificity = round(conf3[0,0]/(conf3[0,0]+conf3[1,0]),2)
Precision = round(conf3[1,1]/(conf3[1,1]+conf3[1,0]),2)
accuracy = round((conf3[1,1]+conf3[0,0])/(conf3[1,1]+conf3[0,0] + conf3[1,0]+conf3[0,1]),2)
Sensitivity, Specificity, Precision, accuracy

Unnamed: 0,0,1
0,520,29
1,99,243


(0.89, 0.84, 0.71, 0.86)

In [36]:
##AUC ROC 
from sklearn.metrics import roc_auc_score
round(roc_auc_score(y_train, model3.predict_proba(x_train)[:,1]),2)

0.91

# Final Submission

In [37]:
pred = (model3.predict(df_test))

In [38]:
test = pd.read_csv("/kaggle/input/titanic/test.csv")
output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': pred})

In [39]:
output.to_csv('submission1.csv')
