In [26]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV



## Data manipulation & Feature engineering

In [51]:
test_data = pd.read_csv("test.csv")
train_data = pd.read_csv("train.csv")
feat = train_data.columns

# Peek at the raw training rows before any encoding or imputation.
print(train_data.head(10))

# Integer codes for the two categorical columns that are fed to the model.
EMBARKED_CODES = {'S': 1, 'C': 2, 'Q': 3}
SEX_CODES = {'male': 1, 'female': 2}


def _engineer_features(raw):
    """Return an encoded, imputed copy of a passenger frame with derived columns.

    Steps, applied in this order:
      1. Encode 'Embarked' and 'Sex' with EMBARKED_CODES / SEX_CODES.
      2. Fill missing 'Age' and 'Fare' with that frame's own column median.
      3. Fill every remaining missing value with 0.
      4. Add 'FamilySize' (SibSp + Parch + 1), 'IsAlone' (1 when FamilySize
         is 1, else 0) and 'AgeClass' (Age * Pclass).

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame containing at least the columns 'Embarked', 'Sex', 'Age',
        'Fare', 'SibSp', 'Parch' and 'Pclass'.

    Returns
    -------
    pandas.DataFrame
        A new frame; `raw` itself is left untouched.
    """
    frame = raw.copy()

    # Encode only the two intended columns. A whole-frame replace would also
    # rewrite any other column that happens to hold 'S', 'C', 'Q', 'male' or
    # 'female'. Values outside the mapping become NaN and are zero-filled below.
    frame['Embarked'] = frame['Embarked'].map(EMBARKED_CODES)
    frame['Sex'] = frame['Sex'].map(SEX_CODES)

    # Assign the filled column back instead of calling
    # frame[col].fillna(..., inplace=True): the chained in-place form acts on
    # a temporary Series and does not update the frame under Copy-on-Write.
    # Series.median() skips NaN by default.
    for column in ('Age', 'Fare'):
        frame[column] = frame[column].fillna(frame[column].median())

    # Everything still missing (in any column) becomes 0.
    frame = frame.fillna(0)

    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1
    frame['IsAlone'] = (frame['FamilySize'] == 1).astype('int64')
    frame['AgeClass'] = frame['Age'] * frame['Pclass']
    return frame


# Each frame is imputed with its own medians.
test_data = _engineer_features(test_data)
train_data = _engineer_features(train_data)

# Columns used as model inputs ('Ticket', 'Cabin', 'SibSp', 'Parch' are not used).
used_feats = ['Pclass', 'Sex', 'Age', 'Fare',
              'Embarked', 'FamilySize', 'IsAlone', 'AgeClass']
train_x = train_data[used_feats]
test_x = test_data[used_feats]
train_y = train_data['Survived']
test_id = test_data['PassengerId']

# The final features used
train_x.head(10)
 

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   
5            6         0       3   
6            7         0       1   
7            8         0       3   
8            9         1       3   
9           10         1       2   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   
5                                   Moran, Mr. James    male   NaN      0   
6                            McCarthy, Mr. Timothy J    male  54

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,FamilySize,IsAlone,AgeClass
0,3,1,22.0,7.25,1.0,2,0,66.0
1,1,2,38.0,71.2833,2.0,2,0,38.0
2,3,2,26.0,7.925,1.0,1,1,78.0
3,1,2,35.0,53.1,1.0,2,0,35.0
4,3,1,35.0,8.05,1.0,1,1,105.0
5,3,1,28.0,8.4583,3.0,1,1,84.0
6,1,1,54.0,51.8625,1.0,1,1,54.0
7,3,1,2.0,21.075,1.0,5,0,6.0
8,3,2,27.0,11.1333,1.0,3,0,81.0
9,2,2,14.0,30.0708,2.0,2,0,28.0


## Training a model - Support Vector Machines

In [59]:

# Hyper-parameter selection: 5-fold cross-validated grid search over the
# kernel type and ten evenly spaced values of C between 0.01 and 0.1.
# (An earlier hand-set attempt with kernel='rbf', gamma=1, C=1000 was
# dropped in favour of this search.)
params = {
    'kernel': ('linear', 'rbf', 'poly'),
    'C': np.linspace(0.01, 0.1, 10),
}

model = SVC()
# n_jobs=-1 runs the search on all available cores; fit() returns the
# fitted search object itself.
cf = GridSearchCV(estimator=model, param_grid=params, cv=5, n_jobs=-1).fit(train_x, train_y)

# Best parameter combination found by the search.
cf.best_params_


{'C': 0.020000000000000004, 'kernel': 'linear'}

In [60]:
# Final model: linear-kernel SVM trained on the full training set.
# NOTE(review): C=0.0242 is presumably hand-tuned around the grid-search
# optimum (C ~= 0.02) - TODO confirm.
model = SVC(C=0.0242, kernel='linear').fit(train_x, train_y)
pred = model.predict(test_x)

# score() is the mean accuracy on the data it is given. Here that is the
# training set, so this is training accuracy (in percent), not a held-out
# estimate.
acc = round(100 * model.score(train_x, train_y), 2)
print(acc)

79.46


## Saving predictions

In [61]:
# Assemble the submission table: one row per test passenger, pairing the
# passenger id with the predicted 'Survived' label.
df_pred = pd.DataFrame({'PassengerId': test_id}).assign(Survived=pred)

# Write the submission without the index column. to_csv() returns None
# when given a path, so `preds` is None; the binding is kept only for
# compatibility.
preds = df_pred.to_csv('dbjrelind2_pred.csv', index=False)

https://www.kaggle.com/soham1024/titanic-data-science-eda-with-meme-solution/comments