In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.svm import SVR,SVC
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.preprocessing import Normalizer,MinMaxScaler
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import KFold,StratifiedKFold,cross_val_score,GridSearchCV,train_test_split
from sklearn.feature_selection import VarianceThreshold,SelectKBest,SelectPercentile,GenericUnivariateSelect,chi2
from sklearn.feature_selection import SelectKBest, f_classif,mutual_info_classif,f_regression,RFECV

In [2]:
def read_files(Train,Test):
    titanic_train=pd.read_csv(Train,index_col=0)
    titanic_test=pd.read_csv(Test,index_col=0)
    return titanic_train,titanic_test

In [3]:
def fix_names(df_for_names):
    Titles_dict={'Mrs.':'Mrs.','Miss.':'Miss.','Master.':'Master.','Lady.':'Lady.','Dr.':'Officer.','Sir.':'Royalty.','Major.':'Officer.',
             'Capt.':'Officer.','Col.':'Officer.','Mr.':'Mr.','Rev.':'Officer.','Don.':'Royalty.','Countess.':'Royalty.',
             'Mlle.':'Miss.','Ms.':'Mrs.','Mme.':'Mr.','Jonkheer.':'Royalty.','Dona.':'Miss.'}
    for lab,row in df_for_names.iterrows():
        words=row['Name'].split()
        for word in words:
            if word in Titles_dict.keys():
                df_for_names.loc[lab,'Name']=Titles_dict[word]
    return df_for_names

In [4]:
def fix_tickets(df_tickets):
    Ticket_dict={'1':1,'2':2,'3':3,'A':4,'C':5,'P':6,'S':7}
    for lab,row in df_tickets.iterrows():
        if row['Ticket'][0] in Ticket_dict.keys():
            df_tickets.loc[lab,'Ticket']=Ticket_dict[row['Ticket'][0]]
        else:
            df_tickets.loc[lab,'Ticket']=0     
    return  df_tickets

In [5]:
def fix_cabin_with_similarities(df_cabin):
    for lab, row in df_cabin.iterrows():
        if pd.notna(row['Cabin']):
            df_cabin.loc[lab,'Cabin']=row['Cabin'][0]
    Cabin_df=df_cabin.groupby(['Cabin']).mean()
    Cabin_labels=list(Cabin_df.index)
    Cabin_Fare=Cabin_df[['Fare']]
    Cabin_Parch=Cabin_df[['Parch']]
    Cabin_Fare_np=Cabin_Fare.to_numpy()
    Cabin_Parch_np=Cabin_Parch.to_numpy()
    Cabin_Fare_Kmeans=KMeans(n_clusters=7).fit(Cabin_Fare_np)
    Cabin_Parch_Kmeans=KMeans(n_clusters=7).fit(Cabin_Parch_np)
    Cabin_Fare_Labels=list(Cabin_Fare_Kmeans.labels_)
    Cabin_Parch_Labels=list(Cabin_Parch_Kmeans.labels_)
    Cabin_Parch_dict=dict(zip(Cabin_Parch_Labels,Cabin_labels))
    Cabin_Fare_dict=dict(zip(Cabin_Fare_Labels,Cabin_labels))
    for lab, row in df_cabin.iterrows():
        if pd.isna(row['Cabin']):
            if pd.notna(row['Fare']):
                point=row[['Fare']].to_numpy()
                CB_key=Cabin_Fare_Kmeans.predict([point])[0]
                CB_Value=Cabin_Fare_dict[CB_key]
                df_cabin.loc[lab,'Cabin']=CB_Value
            else:
                point=row[['Parch']].to_numpy()
                CB_key=Cabin_Parch_Kmeans.predict([point])[0]
                CB_Value=Cabin_Parch_dict[CB_key]
                df_cabin.loc[lab,'Cabin']=CB_Value
    return df_cabin

In [6]:
def fix_cabin_with_most_frequent(df_cabin):
    for lab, row in df_cabin.iterrows():
        if pd.notna(row['Cabin']):
            df_cabin.loc[lab,'Cabin']=row['Cabin'][0]
    Max_Frequency=df_cabin['Cabin'].value_counts().idxmax()
    for lab, row in df_cabin.iterrows():
        if pd.isna(row['Cabin']):
            df_cabin.loc[lab,'Cabin']=Max_Frequency
    return df_cabin

In [7]:
def fix_age(df_age):
    for lab,row in df_age.iterrows():
        if pd.notna(row['Age']):
            if row['Age']<7:
                df_age.loc[lab,'Age']=0
            elif row['Age']>55:
                df_age.loc[lab,'Age']=2
            else:
                df_age.loc[lab,'Age']=1
        else:
            df_age.loc[lab,'Age']=3
    return df_age

In [8]:
def fix_age_median(df_age):
    Title_Age_Table=df_age.groupby(['Name']).median()[['Age']]
    Count=0
    for lab,row in df_age.iterrows():
        if pd.isna(row['Age']):
            label=row['Name']
            df_age.loc[lab,'Age']=Title_Age_Table.loc[label,'Age']
    df_age.loc[:,'Age'].fillna(df_age['Age'].mean(),inplace=True)
    return df_age

In [9]:
def model_maker(optimizer='adam',initializer='glorot_uniform'):
    model=Sequential()
    model.add(Dense(units=8,activation='relu',kernel_initializer=initializer,input_shape=(28,)))
    #model.add(Dense(units=n_neurons2,activation='relu'))
    model.add(Dense(units=1,activation='sigmoid'))
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'],verbose=0)
    return model

In [10]:
titanic_train,titanic_test=read_files('train.csv','test.csv')
X0=fix_tickets(titanic_train)
T0=fix_tickets(titanic_test)
X1=fix_names(X0)
T1=fix_names(T0)
X2=fix_cabin_with_most_frequent(X1)
T2=fix_cabin_with_most_frequent(T1)
X3=fix_age_median(X2)
T3=fix_age_median(T2)

In [11]:
Parch_dict={6:6,9:6}
X3.loc[:,'Parch'].replace(Parch_dict,inplace=True)
T3.loc[:,'Parch'].replace(Parch_dict,inplace=True)

In [12]:
Embark_list=X3.loc[:,'Embarked'].value_counts()[:].index.tolist()
print(Embark_list)

['S', 'C', 'Q']


In [13]:
X3.loc[:,'Embarked'].fillna(Embark_list[0],inplace=True)
T3.loc[:,'Embarked'].fillna(Embark_list[0],inplace=True)

In [14]:
T3.loc[:,'Fare'].fillna(T3.loc[:,'Fare'].median(),inplace=True)

In [15]:
T3.isna().sum()

Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Cabin       0
Embarked    0
dtype: int64

In [16]:
Train=X3.iloc[:,1:]
Test=T3
y=X3[['Survived']].to_numpy()

In [17]:
cs = MinMaxScaler()

In [18]:
Titanic_Columns=list(Train.columns)
print(Titanic_Columns)

['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [19]:
ContinousParameters=['Age','Fare']

In [20]:
CategoricalParameters=[member for member in Titanic_Columns if (member not in ContinousParameters)]

In [21]:
TrainContinuous = cs.fit_transform(Train[ContinousParameters])
TestContinuous = cs.transform(Test[ContinousParameters])
One_Hot = OneHotEncoder().fit(Train[CategoricalParameters])
TrainCategorical = One_Hot.transform(Train[CategoricalParameters]).toarray()
TestCategorical = One_Hot.transform(Test[CategoricalParameters]).toarray()

In [22]:
TTrainX = np.hstack((TrainCategorical, TrainContinuous))
TTestX = np.hstack((TestCategorical, TestContinuous))

In [23]:
TrainX,TestX,TrainY,TestY=train_test_split(TTrainX,y,test_size=0.25,shuffle=True)

In [23]:
svc = SVC(kernel="linear")

In [24]:
selector= RFECV(estimator=svc, step=1, cv=5)

In [26]:
selector.fit(TTrainX, y)

  y = column_or_1d(y, warn=True)


RFECV(cv=5,
      estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                    coef0=0.0, decision_function_shape='ovr', degree=3,
                    gamma='scale', kernel='linear', max_iter=-1,
                    probability=False, random_state=None, shrinking=True,
                    tol=0.001, verbose=False),
      min_features_to_select=1, n_jobs=None, scoring=None, step=1, verbose=0)

In [27]:
selector.support_

array([False, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False])

In [28]:
selector.ranking_

array([15, 17, 16, 23,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  3,  5,  6, 22,  1,  2,  4,
       11, 13,  8, 12,  1,  9,  7, 10, 20, 21, 19, 18, 14])

In [29]:
X_newTest=selector.transform(TTestX)
X_newTrain=selector.transform(TTrainX)

In [43]:
X_newTest=selector.transform(TestX)
X_newTrain=selector.transform(TrainX)

In [30]:
print(X_newTest.shape,X_newTrain.shape)

(418, 25) (891, 25)


In [32]:
model=Sequential()
model.add(Dropout(0.1, input_shape=(25,)))
model.add(Dense(units=8,activation='relu'))
#model.add(Dropout(0.2))
#model.add(Dense(units=8,activation='relu'))
model.add(Dense(units=1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'],verbose=0)
NN_model=model.fit(X_newTrain,y,epochs=30,batch_size=20)

Train on 891 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [49]:
scores = model.evaluate(X_newTest, TestY, verbose=0)
print(scores[1])

0.83856505


In [32]:
AllScores={}

In [55]:
AllScores.update(Features25Dropout2Epoch200Batch10NN9Chi2=scores[1])
print(AllScores)

{'Features46Dropout2Epoch200Batch10NN9': 0.793722, 'Features20Dropout2Epoch200Batch10NN9Chi2': 0.7892377, 'Features25Dropout2Epoch200Batch10NN9Chi2': 0.80717486}


In [33]:
Titanic_prediction_NN=model.predict_classes(X_newTest)

In [56]:
print(Titanic_prediction_NN)

[[0]
 [1]
 [0]
 [0]
 [1]
 [0]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [0]
 [1]
 [0]
 [1]
 [0]
 [1]
 [0]
 [0]
 [1]
 [0]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [1]
 [1]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [0]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]
 [0]
 [0]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [1]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [1]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]


In [34]:
Titanic_prediction_NN.shape

(418, 1)

In [35]:
T3['Survived']=Titanic_prediction_NN

In [59]:
titanic_test.head(10)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
892,3,Mr.,male,34.5,0,0,3,7.8292,C,Q,0
893,3,Mrs.,female,47.0,1,0,3,7.0,C,S,1
894,2,Mr.,male,62.0,0,0,2,9.6875,C,Q,0
895,3,Mr.,male,27.0,0,0,3,8.6625,C,S,0
896,3,Mrs.,female,22.0,1,1,3,12.2875,C,S,1
897,3,Mr.,male,14.0,0,0,0,9.225,C,S,0
898,3,Miss.,female,30.0,0,0,3,7.6292,C,Q,1
899,2,Mr.,male,26.0,1,1,2,29.0,C,S,0
900,3,Mrs.,female,18.0,0,0,2,7.2292,C,C,1
901,3,Mr.,male,21.0,2,0,4,24.15,C,S,0


In [36]:
prediction_df=T3[['Survived']]

In [37]:
prediction_df.to_csv('Downloads/Datascience/KaggleStuff/NN_prediction.csv')