# Load data & details


## Load data 

In [1]:
# module import 
import Func_T1  #Func_sets(custom)
import matplotlib.pyplot as plt
from sklearn import metrics
import numpy as np
import pandas as pd
import seaborn as sns
import warnings 
import copy
import os



# Silence library warnings so the cell output stays readable
warnings.filterwarnings('ignore')

# Load the Titanic training set (path for the other machine kept for reference)
#df = pd.read_csv("C:\\Users\\icecr\\OneDrive\\CSV_file\\titanic\\train.csv")
df = pd.read_csv("C:\\Users\\82103\\OneDrive\\CSV_file\\titanic\\train.csv")  # for notebook

# Display limits: number of rows / columns shown before pandas abbreviates the output
pd.options.display.max_rows = 50
pd.options.display.max_columns = 20
df.iloc[1:20]

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\icecr\\OneDrive\\CSV_file\\titanic\\train.csv'

## details

In [None]:
# Inspect the training frame: column names, dtypes and non-null counts
df.info()

In [None]:
# Share of each label class (Survived) among all training rows
print("<Label Class Ratio>")
survived_counts = df['Survived'].value_counts()
survived_counts / len(df)

In [None]:
# Value counts for every raw feature column, each printed under its own header.
# The header is built from the column name so label and data cannot drift apart
# (the "Pclass" header used to print the counts of 'Sex').
value_count_columns = ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
                       'Ticket', 'Fare', 'Embarked', 'Cabin']
for column in value_count_columns:
    print("<Column : {}> \n".format(column), df[column].value_counts(), end='\n\n')

In [None]:
# Missing values per column, followed by the grand total
null_counts = df.isnull().sum()
print("<Isnull>")
print(null_counts)
print('')
print("Total :", null_counts.sum())

# Pre-processing & Analysis

## Fill NaN values

In [None]:
# Fill missing values with the custom Func_T1 helpers, then print the remaining nulls.
# NOTE(review): helper behaviour below is inferred from names/arguments only - confirm in Func_T1.
df2 = Func_T1.fillna_int(df, method="mean", c1="Age")  # presumably mean-imputes Age
df2 = Func_T1.fillna_str(df2, method="mode", c1="Embarked")  # presumably mode-imputes Embarked
df2 = Func_T1.fillna_str2(df2, value='N', c1="Cabin")  # presumably fills Cabin with the constant 'N'
print(df2.isnull().sum())  # null count per column after filling

## Analysis : Visualization

In [None]:
# Correlation heatmap of the numeric features.
# df2 still contains text columns (Name, Sex, Ticket, ...). pandas >= 2.0 raises
# on those inside corr(), so restrict the frame to numeric/bool columns first.
plt.figure(figsize=(10,10))
corr = df2.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(corr, cmap='RdBu')

In [None]:
# Box plot of Fare to eyeball its spread and extreme values
fig, ax = plt.subplots(figsize=(10,10))
sns.boxplot(data=df2['Fare'], color='red', ax=ax)
plt.show()

In [None]:
# import module (already imported in the first cell; repeated so this cell runs on its own)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Mean of Survived per passenger class, split by sex
# Alternative view by sex only: sns.barplot(x='Sex', y='Survived', data=df2)
sns.barplot(x='Pclass', y='Survived', hue='Sex', data=df2)

In [None]:
# Survived ~ Age

def get_category(age):
    """Map a numeric age to a coarse age-group label.

    Parameters
    ----------
    age : int or float
        Passenger age. Missing values (NaN / None) and the sentinel range
        ``age <= -1`` are both reported as ``'Unknown'``.

    Returns
    -------
    str
        One of 'Unknown', 'Baby', 'Child', 'Teenager', 'Student',
        'Young Adult', 'Adult' or 'Elderly'.
    """
    # NaN fails every `<=` comparison, so without this guard a missing age
    # would fall through all branches and be labelled 'Elderly'.
    if pd.isna(age):
        return 'Unknown'

    # (inclusive upper bound, label) pairs in ascending order
    age_bins = (
        (-1, 'Unknown'),
        (5, 'Baby'),
        (12, 'Child'),
        (18, 'Teenager'),
        (25, 'Student'),
        (35, 'Young Adult'),
        (60, 'Adult'),
    )
    for upper_bound, label in age_bins:
        if age <= upper_bound:
            return label

    # Anything above the last bound
    return 'Elderly'

# Mean of Survived per age group, split by sex.
# The helper column lives only on a temporary frame, so df2 itself is never modified.
group_names=['Unknown', 'Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Elderly']
age_labelled = df2.assign(Age_cat=df2['Age'].apply(get_category))

plt.figure(figsize=(10,6))
sns.barplot(x='Age_cat', y='Survived', hue='Sex', data=age_labelled, order=group_names)
            

## Drop features

In [None]:
# Remove the PassengerId, Name, Ticket and Cabin columns before modelling.
# NOTE(review): assumes drop_features returns a frame without c1..c4 - confirm in Func_T1.
df3 = Func_T1.drop_features(df2, c1="PassengerId", c2="Name", c3="Ticket", c4="Cabin")
df3

## Detect and replace outliers

In [None]:
# Handle outliers in Fare. weight=1.5 is presumably the IQR multiplier and
# method="both" presumably covers both tails - TODO confirm in Func_T1.transform_outlier.
df4 = Func_T1.transform_outlier(dataframe=df3, method="both", column="Fare", weight=1.5)

In [None]:
# Check the outlier handling: re-run the routine on the already-transformed frame.
# The return value is not assigned; it is only shown as the cell output.
Func_T1.transform_outlier(dataframe=df4, method="both", column="Fare", weight=1.5)

## Get_dummy

In [None]:
# One-hot encode the remaining categorical columns.
# get_dummies returns a new frame, so df4 is left as it was.
df5 = pd.get_dummies(df4, drop_first=True)  # drop_first: omit the first level of each feature

# Cabin_T note: the test set has no Cabin_T, the train set has a single such row
#df5 = df5.drop(['Cabin_T'], axis=1)
df5

# Training

## import XGBoost

In [None]:
# import module
import xgboost as xgb
from xgboost import XGBClassifier

# Print the installed xgboost version (the trailing note records the author's value)
print(xgb.__version__)  #1.3.3

## Train / validation split

In [None]:
# Divide feature & label
y_df = df5['Survived']
X_df = df5.drop('Survived', axis=1)

# Divide trainset:testset (80:20, fixed seed so the split is repeatable)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state=0)

# Verify label ratio
train_cnt = y_train.count()
test_cnt = y_test.count()

print('trainset :', X_train.shape)
print('testset :',X_test.shape, end='\n\n')

print("trainset label ratio")
print(y_train.value_counts()/train_cnt, end='\n\n')

# This header used to read "trainset" although the values below belong to the test split
print("testset label ratio")
print(y_test.value_counts()/test_cnt)

## Training without grid search

In [None]:
## Models

# Import module
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from xgboost import XGBClassifier


def _fit_and_score(clf):
    """Fit ``clf`` on the training split and score it on the hold-out split.

    Returns
    -------
    tuple
        ``(pred, acc, auc)``: hold-out class predictions, accuracy, and the
        ROC AUC computed from the probability of the second class column.
    """
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    acc = accuracy_score(y_test, pred)
    auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    return pred, acc, auc


# Make object : classifier
# The tree-based models are randomised; a fixed seed makes their scores
# repeatable across re-runs of the notebook.
dt_clf = DecisionTreeClassifier(random_state=0)
rf_clf = RandomForestClassifier(random_state=0)
lr_clf = LogisticRegression()
xgb_clf = XGBClassifier(n_estimators=100)


#DecisionTree
dt_pred, dt_acc_score, dt_roc_score = _fit_and_score(dt_clf)
print('dt_Acc : {0:.4f}'.format(dt_acc_score))
print('dt_roc_auc : {0:.4f}'.format(dt_roc_score))
print('')


#RandomForest
rf_pred, rf_acc_score, rf_roc_score = _fit_and_score(rf_clf)
print('rf_Acc : {0:.4f}'.format(rf_acc_score))
print('rf_roc_auc : {0:.4f}'.format(rf_roc_score))
print('')


#LogisticRegression
lr_pred, lr_acc_score, lr_roc_score = _fit_and_score(lr_clf)
print('lr_Acc : {0:.4f}'.format(lr_acc_score))
print('lr_roc_auc : {0:.4f}'.format(lr_roc_score))
print('')


##XGBoost
xgb_pred, xgb_acc_score, xgb_roc_score = _fit_and_score(xgb_clf)
print('')
print('xgb_Acc : {0:.4f}'.format(xgb_acc_score))
print('xgb_roc_auc : {0:.4f}'.format(xgb_roc_score))

## Get Feature importance

In [None]:
# Feature importance = visualization
# Plots the importance of the untuned xgb_clf model, limited to the top 12 features.

from xgboost import plot_importance
import matplotlib.pyplot as plt
%matplotlib inline

fig, ax = plt.subplots(1, 1, figsize=(10,8))
plot_importance(xgb_clf, ax=ax, max_num_features=12, height=0.4)

## GridCV : XGBoost

In [None]:
#Import modules 
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier


# Search space for the XGBoost grid search. Single-element lists pin a value
# explicitly; multi-element lists are the dimensions that are actually searched.
params = {#general params
          'booster':['gbtree'],  #default=gbtree, other:gblinear
          # 'verbosity' replaces the old 'silent' flag, which current XGBoost no longer accepts
          'verbosity':[1],  #default=1(warning), other:0(=silent), 2(info), 3(debug)
          
          #Booster params
          'learning_rate':[0.1, 0.05, 0.01],  #default=0.1
          'n_estimators':[100,150,200,500],  #default=100
          'min_child_weight':[1,2],  #default=1
          'min_split_loss':[0,10,50],  #default=0  #gamma
          'max_depth':[3,4,5],  #default=3
          'subsample':[1],  #default=1
          'colsample_bytree':[0.8,1],  #default=1
          'reg_lambda':[1],  #default=1  (was misspelled 'reg_lamda', a name XGBoost does not recognise)
          'reg_alpha':[0],  #default=0
          'scale_pos_weight':[1],  #default=1
    
          #Training task params    
          'objective':['binary:logistic'],  #default=logistic, other:index top
          'eval_metric':['error']  #default=rmse|error  #other : mae, logloss, merror, mlogloss
          }


xgb_clf_t = XGBClassifier()

# 5-fold cross-validated search; refit=True retrains the best candidate on all
# data passed to fit(), so xgb_grid can be used directly for prediction.
xgb_grid = GridSearchCV(xgb_clf_t, 
                        param_grid=params, 
                        cv=5, 
                        scoring='accuracy',  #default=accuracy  #options : roc_auc, f1
                        refit=True,
                        n_jobs=-1, 
                        verbose=0
                       )

# Fitted on the full labelled data (X_df, y_df), not only on the training split
xgb_grid.fit(X_df, y_df)


## [Early stopping code]
# xgb_grid.fit(X_train, y_train, early_stopping_rounds=100, eval_metric='auc',
#              eval_set=[(X_train, y_train), (X_test, y_test)])

In [None]:
#Result
# best_score_ is the mean cross-validated score (scoring='accuracy') of the best candidate

print('Best score :', xgb_grid.best_score_, end='\n\n')
print('Best parameters :')
xgb_grid.best_params_

In [None]:
#Result details
# Grid-search candidates ordered by rank; show the five best

cv_res_df = pd.DataFrame(xgb_grid.cv_results_).sort_values(by=['rank_test_score'])
cv_res_df.loc[:, ['params', 'mean_test_score', 'rank_test_score']].head()

# Prepare Testset

## Load data

In [None]:
# Load the Titanic test set (path for the other machine kept for reference)
#df_test = pd.read_csv("C:\\Users\\icecr\\OneDrive\\CSV_file\\titanic\\test.csv")
df_test = pd.read_csv("C:\\Users\\82103\\OneDrive\\CSV_file\\titanic\\test.csv")  # for notebook

# Display limits: number of rows / columns shown before pandas abbreviates the output
pd.options.display.max_rows = 50
pd.options.display.max_columns = 20
df_test.iloc[1:20]

## details

In [None]:
# Inspect the test frame: column names, dtypes and non-null counts
df_test.info()

In [None]:
# Disabled: per-column value counts for the test set (uncomment to run)
# # Value count : X
# print("<Column : Pclass> \n", df_test['Pclass'].value_counts(), end='\n\n')
# print("<Column : Name> \n", df_test['Name'].value_counts(), end='\n\n')
# print("<Column : Sex> \n", df_test['Sex'].value_counts(), end='\n\n')
# print("<Column : Age> \n", df_test['Age'].value_counts(), end='\n\n')
# print("<Column : SibSp> \n", df_test['SibSp'].value_counts(), end='\n\n')
# print("<Column : Parch> \n", df_test['Parch'].value_counts(), end='\n\n')
# print("<Column : Ticket> \n", df_test['Ticket'].value_counts(), end='\n\n')
# print("<Column : Fare> \n", df_test['Fare'].value_counts(), end='\n\n')
# print("<Column : Embarked> \n", df_test['Embarked'].value_counts(), end='\n\n')
# print("<Column : Cabin> \n", df_test['Cabin'].value_counts(), end='\n\n')

In [None]:
# Missing values per test-set column, followed by the grand total
test_null_counts = df_test.isnull().sum()
print("<Isnull>")
print(test_null_counts)
print('')
print("Total :", test_null_counts.sum())

## Pre-processing

In [None]:
# Fill missing values in the test set with the custom Func_T1 helpers.
# Unlike the training set, Fare is filled here and Embarked is not.
# NOTE(review): helper behaviour is inferred from names/arguments only - confirm in Func_T1.
df_test = Func_T1.fillna_int(df_test, method="mean", c1="Age")  # presumably mean-imputes Age
df_test = Func_T1.fillna_str2(df_test, value='N', c1="Cabin")  # presumably fills Cabin with 'N'
df_test = Func_T1.fillna_int(df_test, method='mean', c1="Fare")  # presumably mean-imputes Fare
print(df_test.isnull().sum())  # null count per column after filling

In [None]:
#Get outlier & Exchange
# Same call and settings as used for the training frame (method="both", weight=1.5).

df_test = Func_T1.transform_outlier(dataframe=df_test, method="both", column="Fare", weight=1.5)

In [None]:
#Checkout outlier
# Re-run the routine on the transformed *test* frame. This cell used to pass
# df4, the training frame, so it never checked the test data.

Func_T1.transform_outlier(dataframe=df_test, method="both", column="Fare", weight=1.5)

In [None]:
# Keep the passenger ids for the submission file; the column is dropped from df_test below
test_id = df_test['PassengerId']

In [None]:
# Drop features: the same four columns that were removed from the training frame
df_test = Func_T1.drop_features(df_test, c1="PassengerId", c2="Name", c3="Ticket", c4="Cabin")
df_test

In [None]:
#Create dummy column
# get_dummies builds a new frame, so no explicit copy is needed beforehand

df_test = pd.get_dummies(df_test, drop_first=True)  # drop_first: omit the first level of each feature
df_test

In [None]:
#Define X_df_test & check out

# Align the test features with the training design matrix (same columns, same order).
# A dummy column missing from the test set is added as all-zero, and a column
# never seen in training is dropped, so the model gets the layout it was fitted on.
X_df_test = df_test.reindex(columns=X_df.columns, fill_value=0)
X_df_test.info()

## Predict

In [None]:
# Predict labels for the test set; the commented line is the untuned alternative
#y_hat = xgb_clf.predict(X_df_test)  #XGB_raw(not tuning)
y_hat = xgb_grid.predict(X_df_test)  #GridCV tuned
print(len(y_hat))  # number of predictions

In [None]:
#Save submission in local path

# Two-column submission frame: passenger id and predicted label
submission = pd.DataFrame({'PassengerId':test_id, 'Survived':y_hat})
#submission.to_csv("C:\\Users\\icecr\\Desktop\\downloads\\Sub001.csv", index=False)
# The trailing note below was bare text after the call (a SyntaxError); it is now a comment
submission.to_csv("C:\\Users\\82103\\Desktop\\downloads\\Sub001.csv", index=False)  # for notebook

In [None]:
#Log

print(y_hat)
# Pair every prediction with its real PassengerId instead of assuming that
# the ids start at 892 and are consecutive.
for passenger_id, survived in zip(test_id, y_hat):
    print(str(passenger_id) + ',' + str(survived))