In [19]:
!pip install numpy
!pip install pandas
!pip install -U scikit-learn



In [20]:
# Imports
import numpy as np
import pandas as pd
import sklearn as sk

In [21]:
# Obtain CSV Files
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_test_comp = pd.read_csv('gender_submission.csv')

# Get info
df_train.info()
df_test.info()
df_test_comp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass  

In [22]:
# Can see that there is only 204 of 889 entries in the Cabin column, so I will drop that column
df_train = df_train.drop(['Cabin', 'Name', 'Ticket'], axis=1)
df_test = df_test.drop(['Cabin', 'Name', 'Ticket'], axis=1)

# Show
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
 8   Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(2)
memory usage: 62.8+ KB


In [23]:
# Drop All rows with NaN in the age
df_train = df_train.dropna()
df_test = df_test.dropna()
# Show shape
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Survived     712 non-null    int64  
 2   Pclass       712 non-null    int64  
 3   Sex          712 non-null    object 
 4   Age          712 non-null    float64
 5   SibSp        712 non-null    int64  
 6   Parch        712 non-null    int64  
 7   Fare         712 non-null    float64
 8   Embarked     712 non-null    object 
dtypes: float64(2), int64(5), object(2)
memory usage: 55.6+ KB


In [24]:
# Split into label and feature for each
train_label = df_train['Survived']
df_train = df_train.drop('Survived', axis=1)

In [25]:
# Tweak test to match on passengerId
df_test = pd.merge(df_test, df_test_comp, on='PassengerId', how='inner')
test_label = df_test['Survived']
df_test = df_test.drop(['Survived'], axis=1)

In [33]:
# Change Male and Female fro male =0 and female =1
df_train['Sex'] = df_train['Sex'].replace({'male':0, 'female':1})
df_test['Sex'] = df_test['Sex'].replace({'male':0, 'female':1})
df_train.info()

# Check which values are in the Embarked column
print(df_train['Embarked'].unique())

# Replace with numeric values
df_train['Embarked'] = df_train['Embarked'].replace({'S':0, 'C':1, 'Q':2})
df_test['Embarked'] = df_test['Embarked'].replace({'S':0, 'C':1, 'Q':2})

df_test.sample(20)

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 0 to 890
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Pclass       712 non-null    int64  
 2   Sex          712 non-null    int64  
 3   Age          712 non-null    float64
 4   SibSp        712 non-null    int64  
 5   Parch        712 non-null    int64  
 6   Fare         712 non-null    float64
 7   Embarked     712 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 50.1 KB
[0 1 2]


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
314,1288,3,0,24.0,0,0,7.25,2
281,1248,1,1,59.0,2,0,51.4792,0
255,1218,2,1,12.0,2,1,39.0,0
181,1121,2,0,36.0,0,0,13.0,0
135,1063,3,0,27.0,0,0,7.225,1
51,953,2,0,32.0,0,0,13.5,0
187,1128,1,0,64.0,1,0,75.25,1
90,1004,1,1,36.0,0,0,31.6792,1
124,1048,1,1,29.0,0,0,221.7792,0
12,905,2,0,63.0,1,0,26.0,0


# Using SVM as a Benchmark



In [38]:
from sklearn import svm
from sklearn import metrics

In [41]:
svm_model = svm.SVC()
svm_model.fit(df_train, train_label)

In [43]:
# Test Data
y_pred = svm_model.predict(df_test)
acc = metrics.accuracy_score(test_label, y_pred)
print(acc)

0.6344410876132931
