In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

pd.options.display.max_columns = 999

In [2]:
data = sns.load_dataset('titanic')
print(data.shape)
data.head(5)

(891, 15)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB


In [4]:
data.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


Column 'survived' looks like our target. 
Column 'alive' looks like the duplication of 'survived', so we have to remove it from the dataset.
Columns 'who' and 'adult_male' combine columns 'age' and 'sex', so we'll remove it.
Column 'pclass' and 'class' are also duplicates, we'll drop 'class'.
The same is with columns 'embarked' and 'embark_town', we'll leave only 'embarked'.
Also, we'll remove column 'alone', because it sums up columns 'sibsp' and 'parch', that we'll use as features.
There are some categorical features in the dataset, that we will have to preprocess with one-hot encoding.
Also, columns 'age', 'deck' and 'embarked' contain missing values, that we will have to fill in.

# Data preprocessing

In [5]:
data.drop(['alive', 'who', 'adult_male', 'class', 'embark_town', 'alone'], axis=1, inplace=True)

data.age.fillna(data.age.mean(), inplace=True)

data['gender_female'] = data.sex == 'female'
data.drop('sex', axis=1, inplace=True)

In [6]:
categorical_columns = data.select_dtypes(include=['category', 'object', 'string']).columns
for cat in categorical_columns:
    data[cat] = data[cat].astype(str).replace('nan', 'unknown')

In [7]:
X_train, X_test, y_train, y_test = train_test_split(data.drop('survived', axis=1), data['survived'], test_size=0.2)

In [8]:
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False, drop=None, dtype=int)\
    .fit(X_train[categorical_columns])
X_train = np.concatenate((
    X_train.drop(categorical_columns, axis=1).values,
    ohe.transform(X_train[categorical_columns])
), axis=1)
X_test = np.concatenate((
    X_test.drop(categorical_columns, axis=1).values,
    ohe.transform(X_test[categorical_columns])
), axis=1)

In [9]:
ss = StandardScaler().fit(X_train)
X_train_scaled = ss.transform(X_train)
X_test_scaled = ss.transform(X_test)

# Tensorflow implementation

In [10]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping

In [11]:
model_tf = Sequential([
    Input(shape=(X_train_scaled.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [12]:
# Let's see model results using 'Adam' optimizer
model_tf.compile(
    optimizer=Adam(learning_rate=0.01),
    loss='binary_crossentropy',
    metrics=['accuracy']
)
model_tf.fit(
    X_train_scaled, y_train, 
    epochs=20, 
    batch_size=32, 
    validation_split=0.2, 
    callbacks=[early_stop]
)
print(f'Train accuracy: {model_tf.evaluate(X_train_scaled, y_train)[1]: .4f}, test accuracy: {model_tf.evaluate(X_test_scaled, y_test)[1]: .4f}')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Train accuracy:  0.8427, test accuracy:  0.8045


In [13]:
# Let's see model results using 'SGD' optimizer
model_tf.compile(
    optimizer=SGD(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)
model_tf.fit(
    X_train_scaled, y_train, 
    epochs=20, 
    batch_size=32, 
    validation_split=0.2,
    callbacks=[early_stop]
)
print(f'Train accuracy: {model_tf.evaluate(X_train_scaled, y_train)[1]: .4f}, test accuracy: {model_tf.evaluate(X_test_scaled, y_test)[1]: .4f}')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Train accuracy:  0.8427, test accuracy:  0.8045


In [14]:
# Let's add to the model l2 regularization
model_tf = Sequential([
    Input(shape=(X_train_scaled.shape[1],)),
    Dense(64, activation='relu', kernel_regularizer=l2(0.01)),
    Dense(32, activation='relu', kernel_regularizer=l2(0.01)),
    Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01))
])
model_tf.compile(
    optimizer=Adam(learning_rate=0.01),
    loss='binary_crossentropy',
    metrics=['accuracy']
)
model_tf.fit(
    X_train_scaled, y_train, 
    epochs=20, 
    batch_size=32, 
    validation_split=0.2,
    callbacks=[early_stop]
)
print(f'Train accuracy: {model_tf.evaluate(X_train_scaled, y_train)[1]: .4f}, test accuracy: {model_tf.evaluate(X_test_scaled, y_test)[1]: .4f}')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Train accuracy:  0.8371, test accuracy:  0.8101


In [15]:
# Let's also add the Dropout to the model
model_tf = Sequential([
    Input(shape=(X_train_scaled.shape[1],)),
    Dense(64, activation='relu', kernel_regularizer=l2(0.01)),
    Dropout(0.3),
    Dense(32, activation='relu', kernel_regularizer=l2(0.01)),
    Dropout(0.3),
    Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01))
])
model_tf.compile(
    optimizer=Adam(learning_rate=0.01),
    loss='binary_crossentropy',
    metrics=['accuracy']
)
model_tf.fit(
    X_train_scaled, y_train, 
    epochs=20, 
    batch_size=32, 
    validation_split=0.2,
    callbacks=[early_stop]
)
print(f'Train accuracy: {model_tf.evaluate(X_train_scaled, y_train)[1]: .4f}, test accuracy: {model_tf.evaluate(X_test_scaled, y_test)[1]: .4f}')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Train accuracy:  0.8258, test accuracy:  0.8324


In [16]:
# Let's remove l2 regularization, but leave Dropout
model_tf = Sequential([
    Input(shape=(X_train_scaled.shape[1],)),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])
model_tf.compile(
    optimizer=Adam(learning_rate=0.01),
    loss='binary_crossentropy',
    metrics=['accuracy']
)
model_tf.fit(
    X_train_scaled, y_train, 
    epochs=20, 
    batch_size=32, 
    validation_split=0.2,
    callbacks=[early_stop]
)
print(f'Train accuracy: {model_tf.evaluate(X_train_scaled, y_train)[1]: .4f}, test accuracy: {model_tf.evaluate(X_test_scaled, y_test)[1]: .4f}')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Train accuracy:  0.8272, test accuracy:  0.8045


# Pytorch implementation

In [17]:
import torch
import torch.nn as nn
import torch.optim as optim

In [18]:
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

In [19]:
class MLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )
    def forward(self, X):
        return self.layers(X)

model = MLP(input_dim=X_train_scaled.shape[1])
criterion = nn.BCELoss()

In [20]:
# Let's experiment with Adam optimizer
optimizer = optim.Adam(model.parameters(), lr=0.01)

for epoch in range(20):
    model.train()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test_tensor)
        test_loss = criterion(test_outputs, y_test_tensor)
        
    print(f'Epoch {epoch+1}, Train loss: {loss.item(): .4f}, Test loss: {test_loss.item(): .4f}')

model.eval()
with torch.no_grad():
    y_train_pred = (model(X_train_tensor)>=0.5).float()
    train_accuracy = (y_train_pred == y_train_tensor).float().mean()
    y_test_pred = (model(X_test_tensor)>=0.5).float()
    test_accuracy = (y_test_pred == y_test_tensor).float().mean()
print(f'Train accuracy: {train_accuracy: .4f}, Test accuracy: {test_accuracy: .4f}')

Epoch 1, Train loss:  0.6831, Test loss:  0.6346
Epoch 2, Train loss:  0.6430, Test loss:  0.5946
Epoch 3, Train loss:  0.6062, Test loss:  0.5542
Epoch 4, Train loss:  0.5687, Test loss:  0.5145
Epoch 5, Train loss:  0.5309, Test loss:  0.4778
Epoch 6, Train loss:  0.4946, Test loss:  0.4472
Epoch 7, Train loss:  0.4616, Test loss:  0.4284
Epoch 8, Train loss:  0.4358, Test loss:  0.4282
Epoch 9, Train loss:  0.4229, Test loss:  0.4445
Epoch 10, Train loss:  0.4219, Test loss:  0.4672
Epoch 11, Train loss:  0.4255, Test loss:  0.4834
Epoch 12, Train loss:  0.4269, Test loss:  0.4873
Epoch 13, Train loss:  0.4236, Test loss:  0.4807
Epoch 14, Train loss:  0.4175, Test loss:  0.4688
Epoch 15, Train loss:  0.4105, Test loss:  0.4563
Epoch 16, Train loss:  0.4032, Test loss:  0.4457
Epoch 17, Train loss:  0.3951, Test loss:  0.4385
Epoch 18, Train loss:  0.3879, Test loss:  0.4345
Epoch 19, Train loss:  0.3830, Test loss:  0.4322
Epoch 20, Train loss:  0.3807, Test loss:  0.4301
Train acc

In [21]:
# Let's experiment with SGD optimizer
optimizer = optim.SGD(model.parameters(), lr=0.01)

for epoch in range(20):
    model.train()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test_tensor)
        test_loss = criterion(test_outputs, y_test_tensor)
        
    print(f'Epoch {epoch+1}, Train loss: {loss.item(): .4f}, Test loss: {test_loss.item(): .4f}')

model.eval()
with torch.no_grad():
    y_train_pred = (model(X_train_tensor)>0.5).float()
    train_accuracy = (y_train_pred == y_train_tensor).float().mean()
    y_test_pred = (model(X_test_tensor)>0.5).float()
    test_accuracy = (y_test_pred == y_test_tensor).float().mean()
print(f'Train accuracy: {train_accuracy: .4f}, Test accuracy: {test_accuracy: .4f}')

Epoch 1, Train loss:  0.3795, Test loss:  0.4300
Epoch 2, Train loss:  0.3795, Test loss:  0.4299
Epoch 3, Train loss:  0.3794, Test loss:  0.4298
Epoch 4, Train loss:  0.3794, Test loss:  0.4297
Epoch 5, Train loss:  0.3793, Test loss:  0.4296
Epoch 6, Train loss:  0.3793, Test loss:  0.4295
Epoch 7, Train loss:  0.3792, Test loss:  0.4295
Epoch 8, Train loss:  0.3792, Test loss:  0.4294
Epoch 9, Train loss:  0.3791, Test loss:  0.4293
Epoch 10, Train loss:  0.3791, Test loss:  0.4292
Epoch 11, Train loss:  0.3790, Test loss:  0.4291
Epoch 12, Train loss:  0.3790, Test loss:  0.4291
Epoch 13, Train loss:  0.3789, Test loss:  0.4290
Epoch 14, Train loss:  0.3789, Test loss:  0.4289
Epoch 15, Train loss:  0.3789, Test loss:  0.4289
Epoch 16, Train loss:  0.3788, Test loss:  0.4288
Epoch 17, Train loss:  0.3788, Test loss:  0.4287
Epoch 18, Train loss:  0.3787, Test loss:  0.4287
Epoch 19, Train loss:  0.3787, Test loss:  0.4286
Epoch 20, Train loss:  0.3786, Test loss:  0.4286
Train acc

In [22]:
#Let's return optimization Adam and add l2 regularization
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.01)

for epoch in range(20):
    model.train()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test_tensor)
        test_loss = criterion(test_outputs, y_test_tensor)
        
    print(f'Epoch {epoch+1}, Train loss: {loss.item(): .4f}, Test loss: {test_loss.item(): .4f}')

model.eval()
with torch.no_grad():
    y_train_pred = (model(X_train_tensor)>=0.5).float()
    train_accuracy = (y_train_pred == y_train_tensor).float().mean()
    y_test_pred = (model(X_test_tensor)>=0.5).float()
    test_accuracy = (y_test_pred == y_test_tensor).float().mean()
print(f'Train accuracy: {train_accuracy: .4f}, Test accuracy: {test_accuracy: .4f}')

Epoch 1, Train loss:  0.3786, Test loss:  0.4136
Epoch 2, Train loss:  0.3759, Test loss:  0.4132
Epoch 3, Train loss:  0.3737, Test loss:  0.4145
Epoch 4, Train loss:  0.3729, Test loss:  0.4112
Epoch 5, Train loss:  0.3721, Test loss:  0.4073
Epoch 6, Train loss:  0.3715, Test loss:  0.4042
Epoch 7, Train loss:  0.3709, Test loss:  0.4030
Epoch 8, Train loss:  0.3699, Test loss:  0.4027
Epoch 9, Train loss:  0.3689, Test loss:  0.4016
Epoch 10, Train loss:  0.3677, Test loss:  0.3998
Epoch 11, Train loss:  0.3662, Test loss:  0.3995
Epoch 12, Train loss:  0.3646, Test loss:  0.4007
Epoch 13, Train loss:  0.3632, Test loss:  0.4017
Epoch 14, Train loss:  0.3620, Test loss:  0.4008
Epoch 15, Train loss:  0.3610, Test loss:  0.3988
Epoch 16, Train loss:  0.3600, Test loss:  0.3981
Epoch 17, Train loss:  0.3588, Test loss:  0.3997
Epoch 18, Train loss:  0.3576, Test loss:  0.4023
Epoch 19, Train loss:  0.3565, Test loss:  0.4019
Epoch 20, Train loss:  0.3555, Test loss:  0.4005
Train acc

In [23]:
# Now let's add Dropout layer to the model class
class MLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )
    def forward(self, X):
        return self.layers(X)

model = MLP(input_dim=X_train_scaled.shape[1])
criterion = nn.BCELoss()

In [24]:
#Let's use Adam optimizer again
optimizer = optim.Adam(model.parameters(), lr=0.01)

for epoch in range(20):
    model.train()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test_tensor)
        test_loss = criterion(test_outputs, y_test_tensor)
        
    print(f'Epoch {epoch+1}, Train loss: {loss.item(): .4f}, Test loss: {test_loss.item(): .4f}')

model.eval()
with torch.no_grad():
    y_train_pred = (model(X_train_tensor)>=0.5).float()
    train_accuracy = (y_train_pred == y_train_tensor).float().mean()
    y_test_pred = (model(X_test_tensor)>=0.5).float()
    test_accuracy = (y_test_pred == y_test_tensor).float().mean()
print(f'Train accuracy: {train_accuracy: .4f}, Test accuracy: {test_accuracy: .4f}')

Epoch 1, Train loss:  0.7104, Test loss:  0.6779
Epoch 2, Train loss:  0.6792, Test loss:  0.6473
Epoch 3, Train loss:  0.6562, Test loss:  0.6113
Epoch 4, Train loss:  0.6261, Test loss:  0.5699
Epoch 5, Train loss:  0.5957, Test loss:  0.5248
Epoch 6, Train loss:  0.5593, Test loss:  0.4818
Epoch 7, Train loss:  0.5096, Test loss:  0.4461
Epoch 8, Train loss:  0.5065, Test loss:  0.4232
Epoch 9, Train loss:  0.4622, Test loss:  0.4142
Epoch 10, Train loss:  0.4735, Test loss:  0.4189
Epoch 11, Train loss:  0.4780, Test loss:  0.4358
Epoch 12, Train loss:  0.4502, Test loss:  0.4562
Epoch 13, Train loss:  0.4700, Test loss:  0.4735
Epoch 14, Train loss:  0.4570, Test loss:  0.4858
Epoch 15, Train loss:  0.4552, Test loss:  0.4847
Epoch 16, Train loss:  0.4699, Test loss:  0.4730
Epoch 17, Train loss:  0.4663, Test loss:  0.4597
Epoch 18, Train loss:  0.4285, Test loss:  0.4485
Epoch 19, Train loss:  0.4353, Test loss:  0.4402
Epoch 20, Train loss:  0.4325, Test loss:  0.4332
Train acc

In [25]:
#Let's add l2 regularization to the last version
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.01)

for epoch in range(20):
    model.train()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test_tensor)
        test_loss = criterion(test_outputs, y_test_tensor)
        
    print(f'Epoch {epoch+1}, Train loss: {loss.item(): .4f}, Test loss: {test_loss.item(): .4f}')

model.eval()
with torch.no_grad():
    y_train_pred = (model(X_train_tensor)>=0.5).float()
    train_accuracy = (y_train_pred == y_train_tensor).float().mean()
    y_test_pred = (model(X_test_tensor)>=0.5).float()
    test_accuracy = (y_test_pred == y_test_tensor).float().mean()
print(f'Train accuracy: {train_accuracy: .4f}, Test accuracy: {test_accuracy: .4f}')

Epoch 1, Train loss:  0.4250, Test loss:  0.4387
Epoch 2, Train loss:  0.4341, Test loss:  0.4359
Epoch 3, Train loss:  0.4199, Test loss:  0.4274
Epoch 4, Train loss:  0.4097, Test loss:  0.4186
Epoch 5, Train loss:  0.4026, Test loss:  0.4128
Epoch 6, Train loss:  0.4045, Test loss:  0.4105
Epoch 7, Train loss:  0.4024, Test loss:  0.4109
Epoch 8, Train loss:  0.4096, Test loss:  0.4137
Epoch 9, Train loss:  0.4236, Test loss:  0.4173
Epoch 10, Train loss:  0.3994, Test loss:  0.4194
Epoch 11, Train loss:  0.3948, Test loss:  0.4188
Epoch 12, Train loss:  0.3949, Test loss:  0.4176
Epoch 13, Train loss:  0.4012, Test loss:  0.4154
Epoch 14, Train loss:  0.3927, Test loss:  0.4138
Epoch 15, Train loss:  0.3965, Test loss:  0.4124
Epoch 16, Train loss:  0.3958, Test loss:  0.4109
Epoch 17, Train loss:  0.4090, Test loss:  0.4095
Epoch 18, Train loss:  0.3927, Test loss:  0.4077
Epoch 19, Train loss:  0.3932, Test loss:  0.4056
Epoch 20, Train loss:  0.4012, Test loss:  0.4038
Train acc

# Conclusions

In the homework I trained binary classification models using Temsorflow and Pytorch.
I conducted following exeperiments for both implementations:
1. Used Adam optimizer.
2. Used SGD Optimizer.
3. Returned to Adam optimizer and applied l2 regularization.
4. Adam + l2, and also added Dropout.
5. Adam optimizer + Dropout without l2.

In general, without l2 and Dropout, the models did not overfit a lot, so l2 and Dropout just slightly improved the results.
Experiments showed that regularization helped to improve results more in tensorflow implementation than in pytorch.

Please see below the accuracy results from experiments.
The best accuracy on the test set was on tensorflow implementation with Adam optimizer including Dropout and l2 - 0.8324. The train accuracy in that experiment was on 0.66p.p. lower than the test accuracy, because Dropout is applied only on the train set, so the results are more strict during the training. 


Tensorflow implementation
Adam optimizer
Train accuracy:  0.8427, test accuracy:  0.8045
SGD optimizer
Train accuracy:  0.8427, test accuracy:  0.8045
Adam + l2 regularization
Train accuracy:  0.8371, test accuracy:  0.8101
Adam + Dropout
Train accuracy:  0.8272, test accuracy:  0.8045
Adam + Dropout + l2
Train accuracy:  0.8258, test accuracy:  0.8324

Pytorch implementation
Adam optimizer
Train accuracy:  0.8497, test accuracy:  0.8101
SGD optimizer
Train accuracy:  0.8497, test accuracy:  0.8212
Adam + l2 regularization
Train accuracy:  0.8596, test accuracy:  0.8212
Adam + Dropout
Train accuracy:  0.8315, test accuracy:  0.7821
Adam + Dropout + l2
Train accuracy:  0.8525, test accuracy:  0.8268