In [61]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [62]:
path = '/content/drive/MyDrive/Colab_Notebooks/keggle_titanic/'
train_df = pd.read_csv(path + 'train.csv')
test_df = pd.read_csv(path + 'test.csv')

Exploring and cleaning data


> Training data



In [63]:
print(train_df.index.size)
train_df.isna().sum()

891


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


> Columns ['Age', 'Cabin', 'Embarked'] need treatment.



Dropping passengers with unknown 'Embarked' status;
at the cost of 2 entries.

In [64]:
nan_index = train_df[train_df['Embarked'].isna() == True].index
train_df.drop(index=nan_index, inplace=True)

Switching null values in 'Age' with average age.

In [65]:
age_mean = train_df['Age'].mean()
train_df['Age'].fillna(age_mean, inplace=True)

For now, leaving unknown cabin values to their own separate group, as cabin number and position in the ship could be an important factor in identifying survivability. LabelEncoder already encodes NaN values in a new category.



> Dropping columns evaluated as uninformative



In [66]:
train_df.drop(columns=['Name', 'PassengerId'], inplace=True)
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,female,35.0,1,0,113803,53.1,C123,S
4,0,3,male,35.0,0,0,373450,8.05,,S


Encoding non numerical values
['Sex', 'Ticket', 'Cabin', 'Embarked']
[:1], [:5], [:7], [:8]

In [67]:
encoder_sex = LabelEncoder()
encoder_ticket = LabelEncoder()
encoder_cabin = LabelEncoder()
encoder_embarked = LabelEncoder()

In [68]:
x_train = train_df.iloc[:, 1:10].values
y_train = train_df.iloc[:, 0].values
x_train[0,:].size

9

In [69]:
x_train[:, 1] = encoder_sex.fit_transform(x_train[:,1])
x_train[:, 5] = encoder_ticket.fit_transform(x_train[:,5])
x_train[:, 7] = encoder_cabin.fit_transform(x_train[:,7])
x_train[:, 8] = encoder_embarked.fit_transform(x_train[:,8])



> Testing data



In [70]:
test_df.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

Switching null values in 'Age' with average age.

In [71]:
age_mean = test_df['Age'].mean()
test_df['Age'].fillna(age_mean, inplace=True)

Switching null values in 'Fare' with average age.

In [72]:
fare_mean = test_df['Fare'].mean()
test_df['Fare'].fillna(fare_mean, inplace=True)

Dropping uninformative columns and preparing results df

In [73]:
results = test_df['PassengerId'].to_frame()

test_df.drop(columns=['Name', 'PassengerId'], inplace=True)
test_df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,male,34.5,0,0,330911,7.8292,,Q
1,3,female,47.0,1,0,363272,7.0,,S
2,2,male,62.0,0,0,240276,9.6875,,Q
3,3,male,27.0,0,0,315154,8.6625,,S
4,3,female,22.0,1,1,3101298,12.2875,,S


Encoding

In [74]:
x_test = test_df.values

x_test[:, 1] = encoder_sex.fit_transform(x_test[:,1])
x_test[:, 5] = encoder_ticket.fit_transform(x_test[:,5])
x_test[:, 7] = encoder_cabin.fit_transform(x_test[:,7])
x_test[:, 8] = encoder_embarked.fit_transform(x_test[:,8])

x_test.sum()

133520.10755957363

> SUPPORT VECTOR MACHINE


In [76]:
from sklearn.svm import SVC

svm_titanic = SVC(kernel='linear', random_state=1, tol=1e-4, C=2.0) # 2 -> 4
svm_titanic.fit(x_train, y_train)

svm_predictions = svm_titanic.predict(x_test)

results['Survived'] = svm_predictions

In [77]:
results.to_csv('Results', encoding='utf-8', index=False)

Kaggle Score = 0.77511