In [337]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Data acquisition and cleaning

In [338]:
train = pd.read_csv('titanic/train.csv')
test = pd.read_csv('titanic/test.csv')
labels = train.pop('Survived')

In [339]:
train.tail()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [340]:
test.tail()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S
417,1309,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C


In [341]:
data = train.append(test, ignore_index = True)
data.tail()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1304,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S
1305,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C
1306,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S
1307,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S
1308,1309,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C


In [342]:
data.drop(columns = ['Name', 'Ticket', 'Cabin', 'Fare', 'PassengerId', 'SibSp', 'Parch'], inplace=True)

In [343]:
data['Embarked'] = data['Embarked'].replace({'S' : 0, 'C' : 1, 'Q' : 2})
data['Sex'] = data['Sex'].replace({'male' : 0, 'female' : 1})
data

Unnamed: 0,Pclass,Sex,Age,Embarked
0,3,0,22.0,0.0
1,1,1,38.0,1.0
2,3,1,26.0,0.0
3,1,1,35.0,0.0
4,3,0,35.0,0.0
...,...,...,...,...
1304,3,0,,0.0
1305,1,1,39.0,1.0
1306,3,0,38.5,0.0
1307,3,0,,0.0


In [344]:
nan_values = data.isna()
nan_columns = nan_values.any()
columns_with_nan = data.columns[nan_columns].tolist()
print(columns_with_nan)

['Age', 'Embarked']


In [345]:
#RECTIFYING AGE
df = ((data.groupby(['Pclass', 'Sex']).mean())['Age']).round(0)
print(df)
filt = np.isnan(data['Age'])
data.loc[filt, ['Pclass', 'Sex', 'Age']]
#df.loc[train[['Pclass', 'Sex']]]

Pclass  Sex
1       0      41.0
        1      37.0
2       0      31.0
        1      27.0
3       0      26.0
        1      22.0
Name: Age, dtype: float64


Unnamed: 0,Pclass,Sex,Age
5,3,0,
17,2,0,
19,3,1,
26,3,0,
28,3,1,
...,...,...,...
1299,3,1,
1301,3,1,
1304,3,0,
1307,3,0,


In [346]:
#Establishing filters for each group
filt1 = (data['Pclass']==1) & (data['Sex']==0)
filt2 = (data['Pclass']==1) & (data['Sex']==1)
filt3 = (data['Pclass']==2) & (data['Sex']==0)
filt4 = (data['Pclass']==2) & (data['Sex']==1)
filt5 = (data['Pclass']==3) & (data['Sex']==0)
filt6 = (data['Pclass']==3) & (data['Sex']==1)


In [347]:
data.loc[filt1&filt, 'Age'] = df[1,0]
data.loc[filt2&filt, 'Age'] = df[1,1]
data.loc[filt3&filt, 'Age'] = df[2,0]
data.loc[filt4&filt, 'Age'] = df[2,1]
data.loc[filt5&filt, 'Age'] = df[3,0]
data.loc[filt6&filt, 'Age'] = df[3,1]

In [348]:
data

Unnamed: 0,Pclass,Sex,Age,Embarked
0,3,0,22.0,0.0
1,1,1,38.0,1.0
2,3,1,26.0,0.0
3,1,1,35.0,0.0
4,3,0,35.0,0.0
...,...,...,...,...
1304,3,0,26.0,0.0
1305,1,1,39.0,1.0
1306,3,0,38.5,0.0
1307,3,0,26.0,0.0


In [349]:
#RECTIFYING EMBARKED
data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)

In [350]:
data

Unnamed: 0,Pclass,Sex,Age,Embarked
0,3,0,22.0,0.0
1,1,1,38.0,1.0
2,3,1,26.0,0.0
3,1,1,35.0,0.0
4,3,0,35.0,0.0
...,...,...,...,...
1304,3,0,26.0,0.0
1305,1,1,39.0,1.0
1306,3,0,38.5,0.0
1307,3,0,26.0,0.0


In [351]:
train_df = data.loc[:train.shape[0]-1,:]
test_df = data.loc[train.shape[0]:,:]
train_df

Unnamed: 0,Pclass,Sex,Age,Embarked
0,3,0,22.0,0.0
1,1,1,38.0,1.0
2,3,1,26.0,0.0
3,1,1,35.0,0.0
4,3,0,35.0,0.0
...,...,...,...,...
886,2,0,27.0,0.0
887,1,1,19.0,0.0
888,3,1,22.0,0.0
889,1,0,26.0,1.0


In [352]:

test_df

Unnamed: 0,Pclass,Sex,Age,Embarked
891,3,0,34.5,2.0
892,3,1,47.0,0.0
893,2,0,62.0,2.0
894,3,0,27.0,0.0
895,3,1,22.0,0.0
...,...,...,...,...
1304,3,0,26.0,0.0
1305,1,1,39.0,1.0
1306,3,0,38.5,0.0
1307,3,0,26.0,0.0


In [353]:
labels = labels.replace({ 0 : -1})

In [354]:

h_1 = train_df.pop('Sex')
nb_1 = 2
h_2 = train_df.pop('Embarked')
nb_2 = 3
ht_1 = test_df.pop('Sex')
nbt_1 = 2
ht_2 = test_df.pop('Embarked')
nbt_2 = 3
titanic_train = train_df.to_numpy()
titanic_test = test_df.to_numpy()
titanic_labels = labels.to_numpy()
oh_1 = h_1.to_numpy()
oh_2 = h_2.to_numpy()
oht_1 = ht_1.to_numpy()
oht_2 = ht_2.to_numpy()

In [355]:
titanic_train.shape

(891, 2)

In [356]:
titanic_test.shape

(418, 2)

In [357]:
titanic_labels = titanic_labels.reshape((-1,1))

In [358]:
labels
#titanic_labels

0     -1
1      1
2      1
3      1
4     -1
      ..
886   -1
887    1
888   -1
889    1
890   -1
Name: Survived, Length: 891, dtype: int64

ONE HOT ENCODING

In [359]:
def get_one_hot(targets, nb_classes):
    res = np.eye(nb_classes)[targets]
    return res

In [360]:
oh_2 = oh_2.astype(int)
oht_2 = oht_2.astype(int)

train_sex = get_one_hot(oh_1,nb_1)
train_embarked = get_one_hot(oh_2,nb_2)

test_sex = get_one_hot(oht_1,nb_1)
test_embarked = get_one_hot(oht_2,nb_2)
print(train_sex.shape, train_embarked.shape)

(891, 2) (891, 3)


In [361]:
train_final = np.concatenate((titanic_train, train_sex, train_embarked), axis=1)
train_final.shape

(891, 7)

In [362]:
test_final = np.concatenate((titanic_test, test_sex, test_embarked), axis=1)

In [363]:
train_final = train_final.T
test_final = test_final.T
titanic_labels = titanic_labels.T
titanic_labels.shape

(1, 891)

# LOGISTIC REGRESSION

# MAXIMUM MARGIN CLASSIFIER

In [364]:
def decay(j):
    return 1/(2*j)

def Dist(x, th, th0):
    return np.dot(th.T,x) + th0

def Hinge_Loss(x, y, th, th0):
    d = Dist(x, th, th0)
    return np.where(y*d>1, 0, 1-(y*d))

def d_th(x, y, th, th0):
    d = Dist(x, th, th0)
    return np.where(y*d>1, np.zeros((x.shape[0], 1)), -y*x)
    

def d_th0(x, y, th, th0):
    d = Dist(x, th, th0)
    return np.where(y*d>1, 0, -y)

def grad_desc(x, y, epoch, rate=0.03):
    d, n = x.shape
    th = np.zeros((d,1))
    th0 = 0
    k = 1
    for i in range(epoch):
        for j in range(n):
            th = th - rate*d_th(x[:,j].reshape(-1,1), y[0,j], th, th0)
            th0 = th0 - rate*d_th0(x[:,j].reshape(-1,1), y[0,j], th, th0)
            k += 1
            
    return th, th0
     

In [365]:
th, th0 = grad_desc(train_final, titanic_labels, 20)
print(th, th0)


[[-7.77 ]
 [ 0.042]
 [-7.59 ]
 [15.45 ]
 [ 0.09 ]
 [ 6.   ]
 [ 1.77 ]] [[14.94]]


In [366]:
#Prediction
print(th, th0)
pred = np.dot(th.T,test_final) + th0
sig_pred =  1/(1 + np.exp(-pred)) 
#print(sig_pred)
a = np.array([1, 2, 3, 4])
final_pred = np.where(sig_pred>0.5, 1,0)
final_pred

[[-7.77 ]
 [ 0.042]
 [-7.59 ]
 [15.45 ]
 [ 0.09 ]
 [ 6.   ]
 [ 1.77 ]] [[14.94]]


array([[0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0,
        1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
        1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
        1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1,
        1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
        1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1,
        0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
        1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1,
        0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
        1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
        1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
        0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
        0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 

In [367]:
surv = pd.DataFrame(final_pred.T, columns=['Survived'])
index = test['PassengerId']
surv = surv.join(index)
survived = surv[['PassengerId', 'Survived']]
survived.set_index('PassengerId')
Answer = survived.to_csv('SGD_MaxMargin_Prediction.csv')
survived

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
