# Imports

- Here we import the libraries that we are using for this project. Most are machine learning related
- In this step we are also importing the training and test set.

In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm

train_set = pd.read_csv("train.csv")
test_set = pd.read_csv("test.csv")

train_set.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Data Wrangling

- Upon review, we decide to remove some features, because they cannot be easily processed, because they have NaN values or because they cannot give us some value for the experiment that we are conducting.

In [3]:
#Drop Features
train_set = train_set.drop(['PassengerId', 'Ticket',"Name","Cabin"], axis=1)
test_set = test_set.drop(['PassengerId', 'Ticket',"Name","Cabin"], axis=1)

#Treat NaNs values
train_set = train_set.dropna(how='any', subset=['Embarked'])
test_set = test_set.dropna(how='any', subset=['Embarked'])

train_set = train_set.fillna(train_set.mean())
test_set = test_set.fillna(test_set.mean())

train_set['Age'] = train_set['Age'].round().astype('Int64')
test_set['Age'] = test_set['Age'].round().astype('Int64')

both_sets = [train_set,test_set]


## Feature Engineering

- In the following lines we are converting some categorical features to numeric ones. Machine learning approaches require numeric features eventually.

In [4]:
#Convert Discrete features to Numeric Features
for dataset in both_sets:
    dataset['Sex'] = dataset['Sex'].map( {'female': 1, 'male': 0} ).astype(int)
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
    dataset['Fare'] = dataset['Fare'].astype(int)
    


## Machine Learning - Model Training 1

- In the following line, we train the algorithm using a cross validation split using the SVM algorithm

In [5]:
X = train_set.loc[:, train_set.columns != 'Survived' ]
y = train_set.loc[:, train_set.columns == 'Survived' ]
y = y["Survived"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
clf.score(X_test, y_test)


0.7640449438202247

## Machine Learning - Model Training 2

- In the following line we are using different algorithms to check their accuracy score

In [48]:

svc = SVC(max_iter=1000,gamma='scale')
knn = KNeighborsClassifier(n_neighbors = 3)
gaussian = GaussianNB()
perceptron = Perceptron()
linear_svc = LinearSVC(max_iter=1000)
random_forest = RandomForestClassifier(n_estimators=100)
decision_tree = DecisionTreeClassifier()
sgd = SGDClassifier(max_iter=1000, tol=1e-3)
logreg = LogisticRegression(solver = "lbfgs")

list_clf = [logreg, svc,knn,gaussian,perceptron,linear_svc,random_forest,decision_tree,sgd]

for clsf in list_clf:
    clsf.fit(X_train,y_train)
    Y_pred = clsf.predict(X_test)
    score_final = round(clsf.score(X_train,y_train)*100,2)
    print (clsf)
    print (score_final)
    print ()
    print ()
    print ()

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)
82.36



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=1000, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
67.35



KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')
82.36



GaussianNB(priors=None, var_smoothing=1e-09)
81.43



Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
      fit_intercept=True, max_iter=None, n_iter=None, n_iter_no_change=5,
      n_jobs=None, penalty=None, random_state=0, shuffle=True, tol=None,
      validation_fraction=0.1, verbo



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
97.56



DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
97.56



SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=100