In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the Titanic training and test tables, then preview the first five rows of each.
train_path = '/kaggle/input/titanic/train.csv'
test_path = '/kaggle/input/titanic/test.csv'
train_data, test_data = pd.read_csv(train_path), pd.read_csv(test_path)

for frame in (train_data, test_data):
    print(frame.head(5))

In [None]:
# Remove columns that are not used as features.
# PassengerId of the test set is kept aside first: it is needed later to
# build the result file.
pID = test_data['PassengerId']

# Name, PassengerId and Ticket are not used; Cabin is reduced to its first
# character only.
unused_columns = ['Name', 'PassengerId', 'Ticket']
for frame in (train_data, test_data):
    frame.drop(columns=unused_columns, inplace=True)
    frame['Cabin'] = frame['Cabin'].str.slice(stop=1)

for frame in (train_data, test_data):
    print(frame.head(5))

In [None]:
# Check NaN: print the per-column count of missing values for the training
# frame and the test frame, separated by a dashed line.
print(train_data.isnull().sum(), '------------', test_data.isnull().sum(), sep='\n')

In [None]:
# Remove NaN from Age, Cabin, Embarked and Fare.
#
# Each filled column is assigned back to the frame rather than calling
# fillna(..., inplace=True) on a column selection: that chained in-place form
# acts on an intermediate object, triggers a FutureWarning on pandas 2.x and
# leaves the frame unchanged when Copy-on-Write is active.

# Training set: Age -> column mean, Cabin / Embarked -> placeholder category 'N'.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean())
train_data['Cabin'] = train_data['Cabin'].fillna('N')
train_data['Embarked'] = train_data['Embarked'].fillna('N')

# Test set: Age -> mean of the test Age column, Cabin -> 'N', Fare -> 0.
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].mean())
test_data['Cabin'] = test_data['Cabin'].fillna('N')
test_data['Fare'] = test_data['Fare'].fillna(0)

# Check the result.
print(train_data.head(5))
print(test_data.head(5))

In [None]:
# Convert textual data to numeric data.
# Sex, Cabin and Embarked are the text columns.
from sklearn import preprocessing

features = ['Sex', 'Cabin', 'Embarked']
for f in features :
    le = preprocessing.LabelEncoder()
    # Fit on the test and training values together so both frames share one
    # label -> integer mapping. pd.concat replaces Series.append, which was
    # removed in pandas 2.0.
    concat_series = pd.concat([test_data[f], train_data[f]])
    le = le.fit(concat_series)
    train_data[f] = le.transform(train_data[f])
    test_data[f] = le.transform(test_data[f])

# Check the result.
print(train_data.head(5))
print(test_data.head(5))

In [None]:
# Convert numeric data to categorical data => age, fare
def age_convert(age) :
    """Map an age to an integer bucket number.

    Ages of -1 or below give 0. Any other age gives ``int(age / 10 + 1)``,
    so ages 0-9 fall in bucket 1, 10-19 in bucket 2, and so on.
    """
    return 0 if age <= -1 else int(age / 10 + 1)

def fare_convert(fare) :
    """Map a fare to an integer bucket: the fare divided by 10, truncated by ``int``."""
    tenths = fare / 10
    return int(tenths)

# Replace the raw Age and Fare values in both frames with their bucket numbers.
for frame in (train_data, test_data):
    frame['Age'] = frame['Age'].apply(age_convert)
    frame['Fare'] = frame['Fare'].apply(fare_convert)

# Check the result.
for frame in (train_data, test_data):
    print(frame.head(5))

In [None]:
# Training data: separate the target column from the feature columns.
train_y = train_data['Survived']
train_x = train_data.drop(columns='Survived')

In [None]:
# Random Forest - author's recorded accuracy: 76.07%
# NOTE(review): the value printed at the end of this cell is accuracy on the
# training data itself; the recorded figure presumably comes from elsewhere
# (e.g. the submission score) - confirm.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier(random_state = 1)
rf_grid = {'n_estimators' : range(100, 1000, 100)}
rf_search = GridSearchCV(rf, param_grid = rf_grid, verbose = True)
rf_search.fit(train_x, train_y)

# Keep only the best estimator found by the search and score it on the
# training features.
model = rf_search.best_estimator_
pred = model.predict(train_x)
print(accuracy_score(train_y, pred))

In [None]:
# Save the model's predictions.
# Output columns: PassengerId, Survived

pred = pd.Series(model.predict(test_data), name = 'Survived')
save_data = pd.concat([pID, pred], axis = 1)

# Preview the first and last three rows before writing the file.
for preview in (save_data.head(3), save_data.tail(3)):
    print(preview)

save_data.to_csv('result_rf_with_fare.csv', index = False)

In [None]:
# Logistic Regression - author's recorded accuracy: 77.03%
# NOTE(review): that figure was recorded with the default solver; it may shift
# now that the l1 candidates are actually fitted.
from sklearn.linear_model import LogisticRegression
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including fit-failure and convergence warnings from the grid search.
warnings.filterwarnings('ignore')

# The grid below searches both 'l1' and 'l2' penalties. The default 'lbfgs'
# solver rejects 'l1', so those candidates would fail to fit and be scored as
# NaN. 'liblinear' supports both penalties, so the whole grid is evaluated.
lr = LogisticRegression(random_state = 1, solver = 'liblinear')
params = {'C' : [0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.5, 1, 5, 10], 'penalty' : ['l1', 'l2']}

model = GridSearchCV(lr, params, cv = 5)
model.fit(train_x, train_y)
print(model.best_estimator_)

# Keep only the best estimator and report its accuracy on the training data.
model = model.best_estimator_
pred = model.predict(train_x)
print(accuracy_score(train_y, pred))

In [None]:
# Save the model's predictions.
# Output columns: PassengerId, Survived

pred = pd.Series(model.predict(test_data), name = 'Survived')
save_data = pd.concat([pID, pred], axis = 1)

# Preview the first and last three rows before writing the file.
for preview in (save_data.head(3), save_data.tail(3)):
    print(preview)

save_data.to_csv('result_lr_with_fare.csv', index = False)

In [None]:
# Do we really need 'Fare'? Remove it from both frames to compare results.
for frame in (train_data, test_data):
    frame.drop(columns = 'Fare', inplace = True)

# Check the result.
for frame in (train_data, test_data):
    print(frame.head(5))

In [None]:
# Separate the target column from the feature columns of the training frame.
train_y = train_data['Survived']
train_x = train_data.drop(columns='Survived')

# Random Forest - author's recorded accuracy: 75.35%
# NOTE(review): the value printed below is accuracy on the training data
# itself; the recorded figure presumably comes from elsewhere - confirm.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier(random_state = 1)
rf_grid = {'n_estimators' : range(100, 1000, 100)}
rf_search = GridSearchCV(rf, param_grid = rf_grid, verbose = True)
rf_search.fit(train_x, train_y)

# Keep only the best estimator found by the search and score it on the
# training features.
model = rf_search.best_estimator_
pred = model.predict(train_x)
print(accuracy_score(train_y, pred))

# Predict the test set and write PassengerId / Survived to the result file.
pred = pd.Series(model.predict(test_data), name = 'Survived')
save_data = pd.concat([pID, pred], axis = 1)
save_data.to_csv('result_rf_without_fare.csv', index = False)



# Logistic Regression - author's recorded accuracy: 77.27%
# NOTE(review): that figure was recorded with the default solver; it may shift
# now that the l1 candidates are actually fitted.
from sklearn.linear_model import LogisticRegression
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including fit-failure and convergence warnings from the grid search.
warnings.filterwarnings('ignore')

# The grid below searches both 'l1' and 'l2' penalties. The default 'lbfgs'
# solver rejects 'l1', so those candidates would fail to fit and be scored as
# NaN. 'liblinear' supports both penalties, so the whole grid is evaluated.
lr = LogisticRegression(random_state = 1, solver = 'liblinear')
params = {'C' : [0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.5, 1, 5, 10], 'penalty' : ['l1', 'l2']}

model = GridSearchCV(lr, params, cv = 5)
model.fit(train_x, train_y)

# Keep only the best estimator and report its accuracy on the training data.
model = model.best_estimator_
pred = model.predict(train_x)
print(accuracy_score(train_y, pred))

# Predict the test set and write PassengerId / Survived to the result file.
pred = model.predict(test_data)
pred = pd.Series(pred, name = 'Survived')
save_data = pd.concat([pID, pred], axis = 1)
save_data.to_csv('result_lr_without_fare.csv', index = False)