# load data

In [None]:
import numpy as np # np.random.seed(2020)
import pandas as pd; pd.options.display.float_format='{:.1f}'.format
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(style='white',context='talk')
import sklearn 
import lightgbm as lgb
import optuna

In [None]:
# Read the Titanic train/test splits and the sample submission from the relative input directory.
train=pd.read_csv('../input/titanic/train.csv')
test=pd.read_csv('../input/titanic/test.csv')
# `sub` is reused at the end of the notebook as the template for the submission file.
sub=pd.read_csv('../input/titanic/gender_submission.csv')

In [None]:
# Stack train on top of test (columns sorted, index renumbered 0..n-1) so that the
# cleaning and feature steps below are applied to both splits at once.
data=pd.concat([train,test],sort=True,ignore_index=True)
data.head()

# browse data

In [None]:
def datainfo(df):
    """Summarise every column of ``df`` in a small overview table.

    Returns a DataFrame with one row per column of ``df`` holding the column
    name, the number of distinct non-null values, the number of missing
    values, the dtype and the first (up to) five distinct values.
    """
    header=['name','nunique','missing','dtype','value :5']
    rows=[]
    for name in df.columns:
        series=df[name]
        rows.append((name,series.nunique(),series.isna().sum(),series.dtype,series.unique()[:5]))
    return pd.DataFrame(rows,columns=header)

# Column overview (distinct values, missing counts, dtypes) of the combined frame.
datainfo(data)

In [None]:
# Report the (rows, columns) shape of each split.
print(f'train shape {train.shape}, test shape {test.shape}')

# age

In [None]:
def corr(df,col):
    """Return the absolute correlations of ``col`` with every numeric column.

    Parameters
    ----------
    df : DataFrame; only its numeric and boolean columns take part.
    col : name of a numeric column of ``df``.

    Returns
    -------
    DataFrame with the columns ``level_0`` (always ``col``), ``level_1`` (the
    other column) and ``0`` (absolute correlation), sorted strongest first.
    """
    # Restrict to numeric/bool columns explicitly: DataFrame.corr() no longer drops
    # string columns silently on pandas >= 2.0 and raises on them instead.
    numeric=df.select_dtypes(include=['number','bool'])
    df_corr=numeric.corr().abs().unstack().reset_index()
    return df_corr[df_corr['level_0']==col].sort_values(by=0,ascending=False)

# Absolute correlation of Age with the other columns, strongest first.
corr(data,'Age')

* Age is related to Pclass

In [None]:
# Median Age within each (Sex, Pclass) group.
data.groupby(['Sex','Pclass'])['Age'].median()

In [None]:
# Fill each missing Age with the median Age of the passenger's (Sex, Pclass) group.
# transform() returns a result aligned to `data`'s own index; groupby(...).apply(...)
# prepends the group keys to the index on pandas >= 2.0, so assigning its result
# back to the column no longer lines up.
age_group_median=data.groupby(['Sex','Pclass'])['Age'].transform('median')
data['Age']=data['Age'].fillna(age_group_median)

# embarked

In [None]:
# Rows whose Embarked value is missing.
data[data['Embarked'].isna()]

* Research shows she boarded at S: [Martha Evelyn Stone: Titanic Survivor](https://www.encyclopedia-titanica.org/titanic-survivor/martha-evelyn-stone.html)

In [None]:
# Assign the filled column back: fillna(inplace=True) on a column selection acts on
# an intermediate object and does not update `data` under pandas copy-on-write.
data['Embarked']=data['Embarked'].fillna('S')

# fare

In [None]:
# Absolute correlation of Fare with the other columns, strongest first.
corr(data,'Fare')

In [None]:
# Rows whose Fare value is missing.
data[data['Fare'].isna()]

* The passenger with the missing fare is about 60 years old and has no Parch or SibSp

In [None]:
# Median Fare per Age for passengers with Pclass == 3, Parch == 0 and SibSp == 0
# (the result is indexed by Age in ascending order).
fare_by_age=data.groupby(['Pclass','Parch','SibSp','Age'])['Fare'].median()[3][0][0]
# NOTE(review): 55 is a positional offset into the sorted Age index; it presumably
# keeps only the oldest ages (the passenger is about 60) - confirm the cut-off.
mean_fare=fare_by_age.iloc[55:].mean()
# Assign the filled column back: fillna(inplace=True) on a column selection acts on
# an intermediate object and does not update `data` under pandas copy-on-write.
data['Fare']=data['Fare'].fillna(mean_fare)

# cabin

In [None]:
# Deck is the first character of the cabin code; rows without a cabin get 'M'.
data['Deck']=[cabin[0] if pd.notnull(cabin) else 'M' for cabin in data['Cabin']]

* According to a Google search, Deck is related to Pclass

In [None]:
# Number of non-null Name entries, i.e. passengers, per (Pclass, Deck) pair.
deck=data.groupby(['Pclass','Deck'])['Name'].count()

In [None]:
# Display the (Pclass, Deck) counts.
deck

In [None]:
decks=['A','B','C','D','E','F','G','M','T']

# Recompute the counts from `data` instead of reading the global `deck`: this cell
# rebinds `deck` to a DataFrame, so on a re-run the old lookup `deck[i][deck_]`
# failed and the bare `except:` silently turned every count into 0.
pclass_deck=data.groupby(['Pclass','Deck'])['Name'].count()

# One {pclass: count} dict per deck letter; combinations that never occur count as 0.
deck_counts={deck_:{i:pclass_deck.get((i,deck_),0) for i in range(1,4)} for deck_ in decks}
# Rows are the passenger classes 1-3, columns the deck letters.
deck=pd.DataFrame(deck_counts)

In [None]:
# Convert counts to shares: every deck column is divided by its own total.
deck=deck.div(deck.sum(axis=0),axis='columns')

deck

In [None]:
# Number of non-null Name entries, i.e. passengers, per (Survived, Deck) pair.
deck=data.groupby(['Survived','Deck'])['Name'].count()

# One {survived: count} dict per deck letter. `.get(..., 0)` supplies 0 for
# combinations that do not occur, replacing a bare `except:` that also hid
# every unrelated error.
deck_counts={deck_:{i:deck.get((i,deck_),0) for i in range(2)} for deck_ in decks}

# Rows are Survived 0/1, columns the deck letters.
deck=pd.DataFrame(deck_counts)

In [None]:
# Convert counts to shares: every deck column is divided by its own total.
deck=deck.div(deck.sum(axis=0),axis='columns')

deck

* Group the decks as A/B/C/T, D/E and F/G, and keep M (missing cabin) on its own

In [None]:
# Collapse the deck letters into the groups ABCT, DE and FG; any other value
# (e.g. 'M') is left untouched.
deck_groups={'A':'ABCT','B':'ABCT','C':'ABCT','T':'ABCT',
             'D':'DE','E':'DE',
             'F':'FG','G':'FG'}
data['Deck']=data['Deck'].replace(deck_groups)

In [None]:
# Column overview again, after the imputation and deck steps above.
datainfo(data)

# feature engineering

In [None]:
# Replace the two continuous columns by 10 quantile-based bins each.
for col in ('Fare','Age'):
    data[col]=pd.qcut(data[col],q=10)

In [None]:
# Family size = Parch + SibSp + the passenger.
family_size=data['Parch']+data['SibSp']+1
# Bucket by size range instead of a fixed lookup table: the previous dict listed only
# the sizes 1-8 and 11, so any other size was mapped to NaN.
# Bins are right-closed: 1 -> Alone, 2-4 -> Small, 5-6 -> Medium, 7 and up -> Large.
data['Family']=pd.cut(family_size,bins=[0,1,4,6,np.inf],
                      labels=['Alone','Small','Medium','Large']).astype(object)

In [None]:
# The title is the text between the first ', ' and the following '.' of the name.
after_surname=data['Name'].str.split(', ',expand=True)[1]
data['Title']=after_surname.str.split('.',expand=True)[0]

# Fold the listed titles into the two labels 'Miss' and 'Dr'; titles that are not
# listed keep their own value.
title_groups={'Miss':['Miss','Mrs','Ms','Mlle','Lady','Mme','the Countess','Dona'],
              'Dr':['Dr','Col','Major','Jonkheer','Capt','Sir','Don','Rev']}
for label,members in title_groups.items():
    data['Title']=data['Title'].replace(members,label)

In [None]:
# Remove the raw columns that are not part of the feature lists used below.
data=data.drop(columns=['Cabin','Name','PassengerId','Ticket'])

In [None]:
# Replace the values of each listed column by integer codes, using a fresh
# LabelEncoder per column.
features=['Age','Embarked','Fare','Sex','Deck','Title','Family']

for feature in features:
    encoder=sklearn.preprocessing.LabelEncoder()
    data[feature]=encoder.fit_transform(data[feature])

In [None]:
# Preview the encoded frame; only the first rows are shown instead of the whole frame.
data.head()

In [None]:
# One-hot encode each integer-coded feature and append the indicator columns,
# named '<feature>_1' ... '<feature>_<k>', to `data`. The integer-coded source
# columns stay in the frame alongside them.
features=['Age','Embarked','Fare','Sex','Deck','Title','Pclass','Family']
encodes=[]

for feature in features:
    column=data[feature].values.reshape(-1,1)
    onehot=sklearn.preprocessing.OneHotEncoder().fit_transform(column).toarray()
    n=data[feature].nunique()
    cols=[f'{feature}_{k}' for k in range(1,n+1)]
    # Share `data`'s index so the column-wise concat below lines the rows up.
    encodes.append(pd.DataFrame(onehot,columns=cols,index=data.index))

data=pd.concat([data,*encodes],axis=1)

In [None]:
# The first len(train) rows of `data` are the training rows, the remainder the test rows.
n_train=train.shape[0]
train_part,test_part=data.iloc[:n_train],data.iloc[n_train:]

X_train=train_part.drop(columns='Survived')
Y_train=train_part['Survived']
X_test=test_part.drop(columns='Survived')

# model

In [None]:
def rmse(y_true,y_pred):
    """Root-mean-squared error between ``y_true`` and ``y_pred``.

    Takes the square root of scikit-learn's ``mean_squared_error``.
    """
    mse=sklearn.metrics.mean_squared_error(y_true,y_pred)
    return np.sqrt(mse)

In [None]:
def objective(trial):
    """Optuna objective: cross-validated RMSE of a LightGBM model.

    Parameters
    ----------
    trial : optuna trial used to sample the regularisation, tree-shape and
        sub-sampling parameters listed in ``params``.

    Returns
    -------
    float : mean RMSE over 5 held-out folds of the notebook globals
        ``X_train`` / ``Y_train`` (lower is better).
    """
    # Local import so the function does not rely on `sklearn.model_selection`
    # having been loaded as a side effect of another import.
    from sklearn.model_selection import KFold

    params={
        'objective':'regression',
        'metric':'rmse',
        'verbosity':-1,
        'max_bin':15,
        "lambda_l1": trial.suggest_float("lambda_l1", 0.01,0.1, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.01,0.1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 31, 128),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4,0.7),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4,0.7),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 50),
        "max_depth":trial.suggest_int("max_depth",5,10)
    }

    # Score every trial on rows the model has not seen. Scoring on the training
    # rows themselves (as before) rewards whichever parameters memorise the data best.
    folds=KFold(n_splits=5,shuffle=True,random_state=2020)
    scores=[]
    for train_idx,valid_idx in folds.split(X_train):
        lgb_train=lgb.Dataset(X_train.iloc[train_idx],Y_train.iloc[train_idx])
        # `verbose_eval` is not passed: it had no effect without validation sets
        # and is no longer accepted by lgb.train in LightGBM 4.0.
        model=lgb.train(params,lgb_train,num_boost_round=1000)

        pred=model.predict(X_train.iloc[valid_idx])
        scores.append(rmse(Y_train.iloc[valid_idx],pred))

    return float(np.mean(scores))

In [None]:
# Seed the (default) TPE sampler so the search proposes the same trials on every run.
sampler=optuna.samplers.TPESampler(seed=2020)
study=optuna.create_study(direction='minimize',sampler=sampler)
study.optimize(objective,n_trials=100)

In [None]:
# Fixed settings that were not part of the search; they are merged into the best
# trial's sampled parameters to form the final training configuration.
x=dict(objective='regression',metric='rmse',verbosity=-1,max_bin=15)
best_params=study.best_trial.params
best_params.update(x)

In [None]:
# Train one model per fold with the tuned parameters and collect each model's
# predictions for the test rows.
folds=sklearn.model_selection.KFold(n_splits=5,shuffle=True,random_state=2020)
preds=[]

# Log the evaluation metric every 200 rounds. `verbose_eval` was removed from
# lgb.train in LightGBM 4.0; the log_evaluation callback replaces it where available.
if hasattr(lgb,'log_evaluation'):
    log_kwargs={'callbacks':[lgb.log_evaluation(period=200)]}
else:
    log_kwargs={'verbose_eval':200}

for train_idx,valid_idx in folds.split(X_train):
    # KFold yields positional indices, so select rows by position with .iloc.
    X_tr,X_val=X_train.iloc[train_idx],X_train.iloc[valid_idx]
    y_tr,y_val=Y_train.iloc[train_idx],Y_train.iloc[valid_idx]

    lgb_train=lgb.Dataset(X_tr,y_tr)
    lgb_val=lgb.Dataset(X_val,y_val,reference=lgb_train)

    model=lgb.train(best_params,lgb_train,num_boost_round=1000,
                   valid_sets=[lgb_train,lgb_val],**log_kwargs)

    pred=model.predict(X_test)
    preds.append(pred)

In [None]:
# Average the per-fold test predictions element-wise.
final_pred=np.mean(preds,axis=0)

In [None]:
# Display the averaged predictions before thresholding.
final_pred

In [None]:
# Turn the averaged scores into 0/1 labels: 1 when the score is at least 0.5.
final_pred=[int(score>=0.5) for score in final_pred]

In [None]:
# Put the 0/1 predictions into the Survived column of the submission template.
sub['Survived']=final_pred

In [None]:
# Write the submission file without the DataFrame index column.
sub.to_csv('submission.csv',index=False)