In [1]:
#%matplotlib inline

import pandas as pd
import numpy as np
from scipy import stats
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.mosaicplot import mosaic

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
#import xgboost as xgb

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

sns.set(style='white', context='notebook', palette='bright')
from sklearn.externals.six import StringIO
#import pydotplus
import sys
sys.executable
pd.options.display.max_rows = 999
pd.options.mode.chained_assignment = None



Load the train and test dataset. We will use train variable to learn the model 

In [2]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
df_Survived = df_train['Survived']
print ("The shape of the Train set",df_train.shape )
print ("The shape of the Test set",df_test.shape )

The shape of the Train set (891, 12)
The shape of the Test set (418, 11)


We need to understand the type of attributes it has.

In [3]:
def attributes(df_data):
    df_attributes = df_data.dtypes.reset_index()
    df_attributes.columns = ['Column_name','Column_type']
    return df_attributes
print("Datails of the Train set Column")
data = attributes(df_train)
print(data)
print(" ")
print("Datails of the Test set Column")
data = attributes(df_test)
print(data)

Datails of the Train set Column
    Column_name Column_type
0   PassengerId       int64
1      Survived       int64
2        Pclass       int64
3          Name      object
4           Sex      object
5           Age     float64
6         SibSp       int64
7         Parch       int64
8        Ticket      object
9          Fare     float64
10        Cabin      object
11     Embarked      object
 
Datails of the Test set Column
    Column_name Column_type
0   PassengerId       int64
1        Pclass       int64
2          Name      object
3           Sex      object
4           Age     float64
5         SibSp       int64
6         Parch       int64
7        Ticket      object
8          Fare     float64
9         Cabin      object
10     Embarked      object


Calculate the missing count for each column and the missing ratio

In [4]:
def missing_values(df_data):
    df_missing = df_data.isnull().sum().reset_index()
    df_missing.columns = ['Column_name','missing_count']
    df_missing['missing_ratio'] = df_missing.missing_count/df_data.shape[0]
    return df_missing.loc[df_missing.missing_ratio > 0.1]

print ("Missing values from the Train set")
data = missing_values(df_train)
print(data)
print(" ")
data = missing_values(df_test)
print ("Missing values from the Test set")
print(data)

Missing values from the Train set
   Column_name  missing_count  missing_ratio
5          Age            177       0.198653
10       Cabin            687       0.771044
 
Missing values from the Test set
  Column_name  missing_count  missing_ratio
4         Age             86       0.205742
9       Cabin            327       0.782297


We will tackle the Age column 

In [5]:
df_train.Age.describe().reset_index()

Unnamed: 0,index,Age
0,count,714.0
1,mean,29.699118
2,std,14.526497
3,min,0.42
4,25%,20.125
5,50%,28.0
6,75%,38.0
7,max,80.0


As we will update the missing values in the Age attribute, we need to look into the Name attribute as we cannot update directly with mean value. So we will update the missing value according to the title used for the Name in the Name attribute

In [6]:
def updating_missing_values (df_data):
    name_title = ['Mrs.','Miss.','Mr.','Dr.','Rev.','Major.','Capt.','Col.','Master.','Don.','Ms.','Mme.','Mlle.','Lady.','Countess.','Sir.','Jonkheer.']
    for title in name_title:
        df_update = df_data.loc[df_train.Name.str.contains(title,regex=False)]
        df_update.Age.fillna(inplace=True,value=np.around(df_update.Age.mean()))
        print("Mean value for the title",title, "is",np.around(df_update.Age.mean()))
        df_data.update(df_update,join='left',overwrite=True)

    df_missing = df_data.isnull().sum().reset_index()
    df_missing.columns = ['Column_name','missing_count']
    df_missing['missing_ratio'] = df_missing.missing_count/df_data.shape[0]
    return df_missing.loc[df_missing.missing_ratio > 0.1],df_data
    
missing_data , df_train = updating_missing_values(df_train)
print(" ")
missing_data , df_test = updating_missing_values(df_test)

Mean value for the title Mrs. is 36.0
Mean value for the title Miss. is 22.0
Mean value for the title Mr. is 32.0
Mean value for the title Dr. is 42.0
Mean value for the title Rev. is 43.0
Mean value for the title Major. is 48.0
Mean value for the title Capt. is 70.0
Mean value for the title Col. is 58.0
Mean value for the title Master. is 5.0
Mean value for the title Don. is 40.0
Mean value for the title Ms. is 28.0
Mean value for the title Mme. is 24.0
Mean value for the title Mlle. is 24.0
Mean value for the title Lady. is 48.0
Mean value for the title Countess. is 33.0
Mean value for the title Sir. is 49.0
Mean value for the title Jonkheer. is 38.0
 
Mean value for the title Mrs. is 29.0
Mean value for the title Miss. is 32.0
Mean value for the title Mr. is 30.0
Mean value for the title Dr. is 27.0
Mean value for the title Rev. is 26.0
Mean value for the title Major. is nan
Mean value for the title Capt. is nan
Mean value for the title Col. is nan
Mean value for the title Master. i

In [7]:
df_train.drop(inplace=True,labels=['Cabin','Ticket','Embarked'],axis=1)
df_test.drop(inplace=True,labels=['Cabin','Ticket','Embarked'],axis=1)

In [8]:
df_pclass = df_train.Pclass.value_counts().reset_index()
df_pclass.columns = ['Pclass','Pclass_count']
df_pclass.sort_values(by='Pclass',inplace=True)
df_pclass
df_survived_plcass  = df_train.Pclass[df_train.Survived == 1.0].value_counts().reset_index()
df_survived_plcass.columns = ['Pclass','Pclass_survived_count']
df_survived_plcass.sort_values(by='Pclass',inplace=True)
df_survived_plcass['survived_ratio'] = df_survived_plcass['Pclass_survived_count']/df_pclass['Pclass_count']
df_survived_plcass['Pclass_count'] = df_pclass['Pclass_count']
df_survived_plcass

Unnamed: 0,Pclass,Pclass_survived_count,survived_ratio,Pclass_count
0,1.0,136,0.276986,491
2,2.0,87,0.472826,184
1,3.0,119,0.550926,216


In [9]:
df_train.head(100)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare
0,1.0,0.0,3.0,"Braund, Mr. Owen Harris",male,22.0,1.0,0.0,7.25
1,2.0,1.0,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1.0,0.0,71.2833
2,3.0,1.0,3.0,"Heikkinen, Miss. Laina",female,26.0,0.0,0.0,7.925
3,4.0,1.0,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1.0,0.0,53.1
4,5.0,0.0,3.0,"Allen, Mr. William Henry",male,35.0,0.0,0.0,8.05
5,6.0,0.0,3.0,"Moran, Mr. James",male,32.0,0.0,0.0,8.4583
6,7.0,0.0,1.0,"McCarthy, Mr. Timothy J",male,54.0,0.0,0.0,51.8625
7,8.0,0.0,3.0,"Palsson, Master. Gosta Leonard",male,2.0,3.0,1.0,21.075
8,9.0,1.0,3.0,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0.0,2.0,11.1333
9,10.0,1.0,2.0,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1.0,0.0,30.0708


In [19]:
df_train['Family_size'] = df_train['SibSp'] + df_train['Parch']
df_family_size = df_train.Family_size.value_counts().reset_index()
df_family_size.columns = ['Family_size','Family_size_count']
df_family_size.sort_values(by='Family_size',inplace=True)
df_family_size
df_survived_family_size  = df_train.Family_size[df_train.Survived == 1.0].value_counts().reset_index()
df_survived_family_size.columns = ['Family_size','Family_size_survived_count']
df_survived_family_size.sort_values(by='Family_size',inplace=True)
df_survived_family_size['survived_ratio'] = df_survived_family_size['Family_size_survived_count']/df_family_size['Family_size_count']
df_survived_family_size['Family_size_count'] = df_family_size['Family_size_count']
df_survived_family_size

Unnamed: 0,Family_size,Family_size_survived_count,survived_ratio,Family_size_count
0,0.0,163,0.303538,537
1,1.0,89,0.552795,161
2,2.0,59,0.578431,102
3,3.0,21,0.724138,29
5,4.0,3,0.2,15
6,5.0,3,0.25,12
4,6.0,4,0.181818,22


In [29]:
df_test['Family_size'] = df_test['SibSp'] + df_test['Parch']
df_family_size = df_test.Family_size.value_counts().reset_index()
df_family_size.columns = ['Family_size','Family_size_count']
df_family_size.sort_values(by='Family_size',inplace=True)
df_family_size
df_survived_family_size  = df_test.Family_size[df_train.Survived == 1.0].value_counts().reset_index()
df_survived_family_size.columns = ['Family_size','Family_size_survived_count']
df_survived_family_size.sort_values(by='Family_size',inplace=True)
df_survived_family_size['survived_ratio'] = df_survived_family_size['Family_size_survived_count']/df_family_size['Family_size_count']
df_survived_family_size['Family_size_count'] = df_family_size['Family_size_count']
df_survived_family_size

Unnamed: 0,Family_size,Family_size_survived_count,survived_ratio,Family_size_count
0,0.0,106,0.418972,253
1,1.0,28,0.378378,74
2,2.0,21,0.368421,57
4,3.0,3,0.428571,7
3,4.0,3,0.214286,14
5,5.0,2,0.5,4


In [31]:
#df_train.drop(inplace=True,labels=['SibSp','Parch'],axis=1)
df_test.drop(inplace=True,labels=['SibSp','Parch'],axis=1)

In [26]:
x_fit = svm.LinearSVC()
df_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'Fare',
       'Family_size'],
      dtype='object')

In [28]:
x_fit.fit(df_train[['Age','Pclass','Family_size']],y=df_Survived)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [32]:
x_fit.predict(df_test[['Age','Pclass','Family_size']])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0,

In [38]:
clf = svm.SVC(kernel='linear',C =0.1)
clf.fit(df_train[['Age','Pclass','Family_size']],df_Survived)
test_value_predict = clf.predict(df_test[['Age','Pclass','Family_size']])
test_value = pd.DataFrame(test_value_predict)
#print(metrics.log_loss(y_true=df_Survived,y_pred=df_train[['Age','Pclass','Family_size']]))
test_value.columns = ['Survived']
test_value.insert(0,'PassengerId',pd.Series(np.arange(892,1310,1),index= test_value.index))
test_value.to_csv(path_or_buf='values.csv',index=False)