In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk

from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.preprocessing import Normalizer,MinMaxScaler
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import KFold,StratifiedKFold,cross_val_score,GridSearchCV,train_test_split

In [2]:
def read_files(Train,Test):
    titanic_train=pd.read_csv(Train,index_col=0)
    titanic_test=pd.read_csv(Test,index_col=0)
    return titanic_train,titanic_test

In [3]:
def fix_names(df_for_names):
    Titles_dict={'Mrs.':'Mrs.','Miss.':'Miss.','Master.':'Master.','Lady.':'Lady.','Dr.':'Sir.','Sir.':'Sir.','Major.':'Sir.',
             'Capt.':'Sir.','Col.':'Sir.','Mr.':'Mr.','Rev.':'Sir.','Don.':'Sir.','Countess.':'Lady.',
             'Mlle.':'Miss.','Ms.':'Miss.','Mme.':'Mr.','Jonkheer.':'Mr.','Dona.':'Miss.'}
    for lab,row in df_for_names.iterrows():
        words=row['Name'].split()
        for word in words:
            if word in Titles_dict.keys():
                df_for_names.loc[lab,'Name']=Titles_dict[word]
    return df_for_names

In [4]:
def fix_tickets(df_tickets):
    Ticket_dict={'1':1,'2':2,'3':3,'A':4,'C':5,'P':6,'S':7}
    for lab,row in df_tickets.iterrows():
        if row['Ticket'][0] in Ticket_dict.keys():
            df_tickets.loc[lab,'Ticket']=Ticket_dict[row['Ticket'][0]]
        else:
            df_tickets.loc[lab,'Ticket']=0     
    return  df_tickets

In [5]:
def fix_cabin_with_similarities(df_cabin):
    for lab, row in df_cabin.iterrows():
        if pd.notna(row['Cabin']):
            df_cabin.loc[lab,'Cabin']=row['Cabin'][0]
    Cabin_df=df_cabin.groupby(['Cabin']).mean()
    Cabin_labels=list(Cabin_df.index)
    Cabin_Fare=Cabin_df[['Fare']]
    Cabin_Parch=Cabin_df[['Parch']]
    Cabin_Fare_np=Cabin_Fare.to_numpy()
    Cabin_Parch_np=Cabin_Parch.to_numpy()
    Cabin_Fare_Kmeans=KMeans(n_clusters=7).fit(Cabin_Fare_np)
    Cabin_Parch_Kmeans=KMeans(n_clusters=7).fit(Cabin_Parch_np)
    Cabin_Fare_Labels=list(Cabin_Fare_Kmeans.labels_)
    Cabin_Parch_Labels=list(Cabin_Parch_Kmeans.labels_)
    Cabin_Parch_dict=dict(zip(Cabin_Parch_Labels,Cabin_labels))
    Cabin_Fare_dict=dict(zip(Cabin_Fare_Labels,Cabin_labels))
    for lab, row in df_cabin.iterrows():
        if pd.isna(row['Cabin']):
            if pd.notna(row['Fare']):
                point=row[['Fare']].to_numpy()
                CB_key=Cabin_Fare_Kmeans.predict([point])[0]
                CB_Value=Cabin_Fare_dict[CB_key]
                df_cabin.loc[lab,'Cabin']=CB_Value
            else:
                point=row[['Parch']].to_numpy()
                CB_key=Cabin_Parch_Kmeans.predict([point])[0]
                CB_Value=Cabin_Parch_dict[CB_key]
                df_cabin.loc[lab,'Cabin']=CB_Value
    return df_cabin

In [6]:
def fix_cabin_with_most_frequent(df_cabin):
    for lab, row in df_cabin.iterrows():
        if pd.notna(row['Cabin']):
            df_cabin.loc[lab,'Cabin']=row['Cabin'][0]
    Max_Frequency=df_cabin['Cabin'].value_counts().idxmax()
    for lab, row in df_cabin.iterrows():
        if pd.isna(row['Cabin']):
            df_cabin.loc[lab,'Cabin']=Max_Frequency
    return df_cabin

In [7]:
def fix_age(df_age):
    for lab,row in df_age.iterrows():
        if pd.notna(row['Age']):
            if row['Age']<7:
                df_age.loc[lab,'Age']=0
            elif row['Age']>55:
                df_age.loc[lab,'Age']=2
            else:
                df_age.loc[lab,'Age']=1
        else:
            df_age.loc[lab,'Age']=3
    return df_age

In [8]:
def fix_age_median(df_age):
    Title_Age_Table=df_age.groupby(['Name']).median()[['Age']]
    Count=0
    for lab,row in df_age.iterrows():
        if pd.isna(row['Age']):
            label=row['Name']
            df_age.loc[lab,'Age']=Title_Age_Table.loc[label,'Age']
    df_age.loc[:,'Age'].fillna(df_age['Age'].mean(),inplace=True)
    return df_age

In [9]:
def model_maker(n_neurons1,optimizer='adam',initializer='glorot_uniform'):
    model=Sequential()
    model.add(Dense(units=n_neurons1,activation='relu',kernel_initializer=initializer,input_shape=(28,)))
    #model.add(Dense(units=n_neurons2,activation='relu'))
    model.add(Dense(units=1,activation='sigmoid'))
    model.compile(loss='binary_crossentropy',optimizer=optimizer,metrics=['accuracy'],verbose=0)
    return model

In [10]:
titanic_train,titanic_test=read_files('train.csv','test.csv')
X0=fix_tickets(titanic_train)
T0=fix_tickets(titanic_test)
X1=fix_names(X0)
T1=fix_names(T0)
X2=fix_cabin_with_most_frequent(X1)
T2=fix_cabin_with_most_frequent(T1)
X3=fix_age_median(X2)
T3=fix_age_median(T2)

In [11]:
Parch_dict={0:0,1:1,2:1,3:2,4:2,5:2,6:2,9:2}
X3.loc[:,'Parch'].replace(Parch_dict,inplace=True)
T3.loc[:,'Parch'].replace(Parch_dict,inplace=True)

In [12]:
SibSP_dict={0:0,1:1,2:2,3:3,4:3,5:3,8:2}
X3.loc[:,'SibSp'].replace(SibSP_dict,inplace=True)
T3.loc[:,'SibSp'].replace(SibSP_dict,inplace=True)

In [13]:
Embark_list=X3.loc[:,'Embarked'].value_counts()[:].index.tolist()
print(Embark_list)

['S', 'C', 'Q']


In [14]:
X3.loc[:,'Embarked'].fillna(Embark_list[0],inplace=True)
T3.loc[:,'Embarked'].fillna(Embark_list[0],inplace=True)

In [15]:
X3.isna().sum()

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Cabin       0
Embarked    0
dtype: int64

In [16]:
Train=X3.iloc[:,1:]
Test=T3
y=X3['Survived']

In [17]:
cs = MinMaxScaler()

In [18]:
Titanic_Columns=list(Train.columns)
print(Titanic_Columns)

['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [19]:
ContinousParameters=['Age','Fare']

In [20]:
CategoricalParameters=[member for member in Titanic_Columns if (member not in ContinousParameters and member!='Survived' and member!='Parch' and member!='Ticket')]

In [21]:
TrainContinuous = cs.fit_transform(Train[ContinousParameters])
TestContinuous = cs.transform(Test[ContinousParameters])
One_Hot = OneHotEncoder().fit(Train[CategoricalParameters])
TrainCategorical = One_Hot.transform(Train[CategoricalParameters]).toarray()
TestCategorical = One_Hot.transform(Test[CategoricalParameters]).toarray()

In [22]:
TTrainX = np.hstack((TrainCategorical, TrainContinuous))
TTestX = np.hstack((TestCategorical, TestContinuous))

In [23]:
my_KerasClassifier=KerasClassifier(build_fn=model_maker,batch_size=10,epochs=20)

In [24]:
SKF=StratifiedKFold(n_splits=5,shuffle=True,random_state=10)

initializers=['glorot_uniform','uniform','normal']
epochss=[20,50]
batch_sizes=[10,20]
param_grids={'initializer':initializers,'epochs':epochss,'batch_size':batch_sizes}

In [25]:
N11=[4,6]
param_grids={'n_neurons1':N11}

In [26]:
print(param_grids)

{'n_neurons1': [4, 6]}


In [27]:
grid_search=GridSearchCV(estimator=my_KerasClassifier,param_grid=param_grids,cv=SKF)

In [28]:
results=grid_search.fit(TTrainX,y)

Train on 712 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 713 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20


Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 713 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20


Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 713 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20


Epoch 19/20
Epoch 20/20
Train on 713 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Train on 712 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 713 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 713 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20


Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 713 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20


Epoch 19/20
Epoch 20/20
Train on 713 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Train on 891 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [29]:
results.cv_results_

{'mean_fit_time': array([2.57189412, 2.56415915]),
 'std_fit_time': array([0.21299448, 0.13841984]),
 'mean_score_time': array([0.21691279, 0.17159996]),
 'std_score_time': array([0.06838845, 0.00249793]),
 'param_n_neurons1': masked_array(data=[4, 6],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_neurons1': 4}, {'n_neurons1': 6}],
 'split0_test_score': array([0.84916198, 0.84357542]),
 'split1_test_score': array([0.75842696, 0.78089887]),
 'split2_test_score': array([0.86516851, 0.86516851]),
 'split3_test_score': array([0.77528089, 0.76966292]),
 'split4_test_score': array([0.8202247 , 0.85955054]),
 'mean_test_score': array([0.81365261, 0.82377125]),
 'std_test_score': array([0.04118314, 0.04037784]),
 'rank_test_score': array([2, 1])}

In [30]:
results.best_score_

0.8237712502479553

In [31]:
results.best_params_

{'n_neurons1': 6}

In [32]:
results.refit_time_

3.082000732421875

In [33]:
results.n_splits_

5

In [34]:
Titanic_prediction_SVM=results.predict(TTestX)

  return (proba > 0.5).astype('int32')


In [35]:
TTestX.shape

(418, 28)

In [36]:
Titanic_prediction_SVM.shape

(418, 1)

In [37]:
T3['Survived']=Titanic_prediction_SVM

In [38]:
titanic_test.head(10)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
892,3,Mr.,male,34.5,0,0,3,7.8292,C,Q,0
893,3,Mrs.,female,47.0,1,0,3,7.0,C,S,1
894,2,Mr.,male,62.0,0,0,2,9.6875,C,Q,0
895,3,Mr.,male,27.0,0,0,3,8.6625,C,S,0
896,3,Mrs.,female,22.0,1,1,3,12.2875,C,S,1
897,3,Mr.,male,14.0,0,0,0,9.225,C,S,0
898,3,Miss.,female,30.0,0,0,3,7.6292,C,Q,1
899,2,Mr.,male,26.0,1,1,2,29.0,C,S,0
900,3,Mrs.,female,18.0,0,0,2,7.2292,C,C,1
901,3,Mr.,male,21.0,2,0,4,24.15,C,S,0


In [39]:
prediction_df=T3[['Survived']]

In [40]:
prediction_df.to_csv('Downloads/Datascience/KaggleStuff/NN_prediction.csv')