In [26]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV



## Data manipulation & Feature engineering

In [49]:
# Load the two CSV splits. `feat` captures the training column names as read
# from disk, before any engineered columns are added.
test_data = pd.read_csv("test.csv")
train_data = pd.read_csv("train.csv")
feat = train_data.columns

print(train_data.head(10))

# Encode the categorical columns as integers. The mapping is applied to the
# specific column (not frame-wide) so that a cell equal to e.g. 'C' or 'S' in
# any other column is left untouched.
# NOTE(review): `map` turns values outside the mapping into NaN, which the
# final fillna(0) below converts to 0 - assumes Embarked only holds S/C/Q or
# missing, and Sex only male/female; confirm against the data.
embarked_codes = {'S': 1, 'C': 2, 'Q': 3}
sex_codes = {'male': 1, 'female': 2}
test_data['Embarked'] = test_data['Embarked'].map(embarked_codes)
train_data['Embarked'] = train_data['Embarked'].map(embarked_codes)
test_data['Sex'] = test_data['Sex'].map(sex_codes)
train_data['Sex'] = train_data['Sex'].map(sex_codes)

# Replace missing Age/Fare with that split's own median. `median()` skips NaN
# by default, and the result is assigned back to the column instead of calling
# `fillna(inplace=True)` on a column selection, which is chained assignment and
# does not reliably update the parent frame.
for frame in (test_data, train_data):
    for col in ('Age', 'Fare'):
        frame[col] = frame[col].fillna(frame[col].median())

# Every remaining missing value (in any column) becomes 0.
test_data = test_data.fillna(0)
train_data = train_data.fillna(0)

# Engineered features, built identically for both splits:
#   FamilySize = SibSp + Parch + 1
#   IsAlone    = 1 where FamilySize == 1, otherwise 0
for split in (test_data, train_data):
    split['FamilySize'] = split['SibSp'] + split['Parch'] + 1
    split['IsAlone'] = (split['FamilySize'] == 1).astype('int64')

# Columns handed to the model.
# Not used: 'Ticket', 'Cabin', 'SibSp', 'Parch'
used_feats = [
    'Pclass', 'Sex', 'Age', 'Fare',
    'Embarked', 'FamilySize', 'IsAlone',
]

# Target column and passenger ids, then the two feature matrices.
train_y = train_data['Survived']
test_id = test_data['PassengerId']
train_x = train_data.loc[:, used_feats]
test_x = test_data.loc[:, used_feats]

# Preview of the features that are left
train_x.head(10)
 

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   
5            6         0       3   
6            7         0       1   
7            8         0       3   
8            9         1       3   
9           10         1       2   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   
5                                   Moran, Mr. James    male   NaN      0   
6                            McCarthy, Mr. Timothy J    male  54

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone
0,3,1,22.0,1,0,7.25,1.0,2,0
1,1,2,38.0,1,0,71.2833,2.0,2,0
2,3,2,26.0,0,0,7.925,1.0,1,1
3,1,2,35.0,1,0,53.1,1.0,2,0
4,3,1,35.0,0,0,8.05,1.0,1,1
5,3,1,0.0,0,0,8.4583,3.0,1,1
6,1,1,54.0,0,0,51.8625,1.0,1,1
7,3,1,2.0,3,1,21.075,1.0,5,0
8,3,2,27.0,0,2,11.1333,1.0,3,0
9,2,2,14.0,1,0,30.0708,2.0,2,0


## Training a model - Support Vector Machines

In [36]:

# Hyper-parameter search: 5-fold cross-validated grid over the kernel type and
# 20 evenly spaced values of C between 0.01 and 0.1, using all CPU cores.
params = {
    'kernel': ('linear', 'rbf'),
    'C': np.linspace(0.01, 0.1, num=20),
}

model = SVC()
cf = GridSearchCV(estimator=model, param_grid=params, cv=5, n_jobs=-1)
cf.fit(train_x, train_y)

# Best parameter combination found by the search (displayed as cell output).
cf.best_params_



#model = svm.SVC(kernel='rbf', gamma=1, C=1000)
#model.fit(train_x, train_y)
#pred = model.predict(test_x)
#acc = round(model.score(train_x, train_y) * 100, 2)
#print(acc) # 0.976
# Cross validation


{'C': 0.024210526315789474, 'kernel': 'linear'}

In [37]:
# Fit the final SVM with the hyper-parameters selected by the grid search
# (`cf`), instead of a hand-copied, truncated value that silently goes stale
# whenever the grid or the data changes.
model = svm.SVC(**cf.best_params_)
model.fit(train_x, train_y)
pred = model.predict(test_x)

# `score` returns the mean accuracy on the data it is given. It is evaluated
# on the training set here, so this is an in-sample (optimistic) accuracy,
# expressed as a percentage rounded to two decimals.
acc = round(model.score(train_x, train_y) * 100, 2)
print(acc)

78.68


{'mean_fit_time': array([0.04501185, 0.02895441, 0.06817493, 0.02430801, 0.06762671,
        0.02272654, 0.12958961, 0.02405605, 0.12351298, 0.02738976,
        0.17139697, 0.03240724, 0.16719594, 0.02593617, 0.13120065,
        0.02281418, 0.13587008, 0.0244072 , 0.13776484, 0.02407498,
        0.16894159, 0.02877078, 0.16627059, 0.02498698, 0.17185802,
        0.024125  , 0.21397295, 0.02407727, 0.20486426, 0.02356763,
        0.22764516, 0.0259706 , 0.27614479, 0.02286644, 0.30704842,
        0.02607384, 0.28843465, 0.03230572, 0.22842693, 0.02282825]),
 'std_fit_time': array([0.00704413, 0.00694943, 0.01610348, 0.00247647, 0.01211737,
        0.00109054, 0.05908278, 0.00151138, 0.0417564 , 0.00456923,
        0.09728765, 0.01012022, 0.09267753, 0.00286219, 0.05783339,
        0.00098275, 0.04604452, 0.00268628, 0.03655053, 0.00219636,
        0.05781352, 0.01042427, 0.07761406, 0.00434191, 0.06081962,
        0.00255172, 0.07115896, 0.00362867, 0.06796088, 0.00275162,
        0.084

## Saving predictions

In [40]:
# Predictions table: one row per test passenger id with its predicted label.
df_pred = pd.DataFrame({'PassengerId': test_id}).assign(Survived=pred)

# Saving as csv (without the index column).
# NOTE: `to_csv` returns None when given a path, so `preds` is None.
preds = df_pred.to_csv('dbjrelind2_pred.csv', index=False)

Notes to self:

- Compute an evaluation metric for the test predictions
- Find which parameters work best
- Cross-validation?

https://www.kaggle.com/soham1024/titanic-data-science-eda-with-meme-solution/comments