# Titanic Challenge Enrico

Useful libraries import

In [None]:
import pandas as pd
import matplotlib as plt
import numpy as np
import sklearn as sl
import matplotlib.pyplot as plt

## Importing training set

In [None]:
# Load the Kaggle Titanic training CSV into a DataFrame.
training_set = pd.read_csv('/kaggle/input/titanic/train.csv')

In [None]:
# Print a summary of the training DataFrame (columns, non-null counts, dtypes).
training_set.info()

In [None]:
# Show the first rows of the training data.
training_set.head()

## Importing testing set

In [None]:
# Load the Kaggle Titanic test CSV into a DataFrame.
testing_set = pd.read_csv('/kaggle/input/titanic/test.csv')

In [None]:
# Print a summary of the test DataFrame (columns, non-null counts, dtypes).
testing_set.info()

In [None]:
# Show the first rows of the test data.
testing_set.head()

## Data selection

We assume that some of the columns are not useful to reach our goal. 
The first step is therefore to select columns based on their (hypothetical) influence on passenger survival. 

* **PassengerId** (we keep it only to have a reference);
* **Pclass** (i.e. the third class was located in bow and stern: it is likely that passengers in these positions died in different proportions);
* **Sex** (it may seem a cognitive bias, but if we study the disposition of unmarried people we can see that they were located in different parts of the ship);
* **Age** (we can assume old people were more likely to die in an emergency situation: this is a weak hypothesis);
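* **Parch** (being the parent of a child might have increased the chances of getting on a lifeboat);
* **SibSp** (travelling with siblings or a spouse might also have influenced the chances of survival);
* **Survived** (whether the passenger survived or not). 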

In [None]:
# Keep only the reference id, the candidate feature columns and the Survived label.
training_columns = ["PassengerId", "Pclass", "Sex", "Age", "Parch", "SibSp", "Survived"]
clean_training_set = training_set[training_columns]

In [None]:
# Print a summary of the reduced training DataFrame.
clean_training_set.info()

### Correlation between class and death

Passenger death is strongly correlated with (travel) class: 

In [None]:
# Mean of Survived per passenger class, drawn as a bar chart.
survival_by_class = clean_training_set.groupby('Pclass')['Survived'].mean()
class_scheme = survival_by_class.plot(kind='bar')

## Correlation between sex and death

As we can see, fewer females died than males. 

In [None]:
# Mean of Survived per sex, drawn as a bar chart.
survival_by_sex = clean_training_set.groupby('Sex')['Survived'].mean()
sex_scheme = survival_by_sex.plot(kind='bar')

## Correlation between age and death

Children on the Titanic died less often than middle-aged and old people (probably because they had priority on lifeboats): 

In [None]:
# Mean of Survived for each distinct age value, drawn as a line chart.
survival_by_age = clean_training_set.groupby('Age')['Survived'].mean()
age_scheme = survival_by_age.plot(kind='line')

## Correlation between SibSp and death

In [None]:
# Mean of Survived per SibSp value, drawn as a bar chart.
survival_by_sibsp = clean_training_set.groupby('SibSp')['Survived'].mean()
survival_by_sibsp.plot(kind='bar')

## Training set normalization

## Training set encoding (strings to numbers)

Here we convert males to value 0 and females to value 1

In [None]:
# Work on an independent copy that contains only rows with a known Sex.
has_sex = clean_training_set['Sex'].notnull()
df = clean_training_set[has_sex].copy()

# Encode Sex numerically: male -> 0, female -> 1.
sex_codes = {"male": 0, "female": 1}
df['Sex'] = df['Sex'].astype(str).map(sex_codes)

df

As we can see, the Age column has NaN values that have to be replaced with the mean age

In [None]:
# True when at least one Age entry is missing.
check_nan = df['Age'].isna().to_numpy().any()
print(check_nan)

In [None]:
# Mean of the known ages, truncated to an integer.
mean_age = int(df['Age'].mean())
print(f"Mean age: {mean_age}")

In [None]:
# Replace every missing Age with the (integer) mean age computed above.
df['Age'] = df['Age'].fillna(mean_age)

In [None]:
# Re-run the missing-value check; the printed flag is expected to be False now.
check_nan = df['Age'].isna().to_numpy().any()
print(f"Now we don't have NaN values anymore: {check_nan}")

In [None]:
# Count the missing values remaining in each column.
df.isna().sum()

In [None]:
# Display the working DataFrame after Sex encoding and Age imputation.
df

## Now we group ages into ranges to better characterize death probability

We split Age into three different columns as follows: 

In [None]:
# One-hot encode Age into three mutually exclusive bands that together cover
# every age: Child [0, 18), Adult [18, 50), Elderly [50, inf).
# The Elderly bound is inclusive (>= 50): with a strict "> 50", passengers aged
# exactly 50 matched no band and ended up with all three flags set to 0.
age = df['Age']

child_list = (age < 18).astype(int)
df.insert(4, "Child", child_list, True)

adult_list = ((age >= 18) & (age < 50)).astype(int)
df.insert(5, "Adult", adult_list, True)

elderly_list = (age >= 50).astype(int)
df.insert(6, "Elderly", elderly_list, True)

# The raw Age column is fully represented by the three flags, so drop it.
df = df.drop(columns=['Age'])

In [None]:
# Mean of Survived for non-children (0) versus children (1), as a bar chart.
child_survival = df.groupby('Child')['Survived'].mean()
child_survival.plot(kind='bar')
# The same chart can be produced for the other age bands:
#df.groupby('Adult').Survived.mean().plot(kind='bar')
#df.groupby('Elderly').Survived.mean().plot(kind='bar')

In [None]:
# Store an independent copy of the prepared frame under the training-set name
# (the name `df` is reused later for the test data), then print its summary.
clean_training_set = df.copy()
clean_training_set.info()

## Testing set preparation

In [None]:
# Same column selection as for the training data, minus the Survived label.
testing_columns = ["PassengerId", "Pclass", "Sex", "Age", "Parch", "SibSp"]
clean_testing_set = testing_set[testing_columns]

In [None]:
# Work on an independent copy that contains only rows with a known Sex.
has_sex = clean_testing_set['Sex'].notnull()
df = clean_testing_set[has_sex].copy()

# Encode Sex numerically: male -> 0, female -> 1.
sex_codes = {"male": 0, "female": 1}
df['Sex'] = df['Sex'].astype(str).map(sex_codes)

df

In [None]:
# True when at least one Age entry is missing in the test data.
check_nan = df['Age'].isna().to_numpy().any()
print(check_nan)

In [None]:
# Mean of the known ages in the test data, truncated to an integer.
# NOTE(review): this imputes the test set with its own mean rather than the
# mean learned from the training data — confirm that this is intended.
mean_age = int(df['Age'].mean())
print(f"Mean age: {mean_age}")

In [None]:
# Replace every missing Age with the (integer) mean age computed above.
df['Age'] = df['Age'].fillna(mean_age)

In [None]:
# Re-run the missing-value check; the printed flag is expected to be False now.
check_nan = df['Age'].isna().to_numpy().any()
print(f"Now we don't have NaN values anymore: {check_nan}")

In [None]:
# Count the missing values remaining in each column.
df.isna().sum()

In [None]:
# One-hot encode Age into three mutually exclusive bands that together cover
# every age: Child [0, 18), Adult [18, 50), Elderly [50, inf).
# The Elderly bound is inclusive (>= 50): with a strict "> 50", passengers aged
# exactly 50 matched no band and ended up with all three flags set to 0.
age = df['Age']

child_list = (age < 18).astype(int)
df.insert(4, "Child", child_list, True)

adult_list = ((age >= 18) & (age < 50)).astype(int)
df.insert(5, "Adult", adult_list, True)

elderly_list = (age >= 50).astype(int)
df.insert(6, "Elderly", elderly_list, True)

# The raw Age column is fully represented by the three flags, so drop it.
df = df.drop(columns=['Age'])

In [None]:
# Store an independent copy of the prepared frame under the testing-set name,
# then print its summary.
clean_testing_set = df.copy()
clean_testing_set.info()

# Now we try to implement a Support Vector Machine algorithm

In [None]:
from sklearn.model_selection import train_test_split

# Feature columns used for both the training data and the test-CSV data.
feature_columns = ["Pclass", "Sex", "Child", "Adult", "Elderly", "Parch", "SibSp"]

# X_test (capital X): features taken from the prepared test CSV.
# Not to be confused with x_test (lowercase) below, which is a hold-out slice
# of the training data.
X_test = clean_testing_set[feature_columns]

# Features and target taken from the prepared training data.
X = clean_training_set[feature_columns]
Y = clean_training_set['Survived']

# Hold out 20% of the training rows (x_test / y_test) for scoring the model;
# random_state is fixed so the split is reproducible.
X_train, x_test, Y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

In [None]:
# Print a summary of the training features produced by the split.
X_train.info()

In [None]:
# Show the first rows of the training features.
X_train.head()

In [None]:
# Show the first values of the training target (Survived).
Y_train.head()

In [None]:
# Importing C-Support Vector Classification from scikit-learn
# C-Support Vector Classification from scikit-learn
from sklearn.svm import SVC

# Build the classifier with its default hyper-parameters (no tuning) and
# fit it on the training split; fit() returns the fitted estimator itself.
classifier = SVC().fit(X_train, Y_train)

# Mean accuracy on the hold-out split of the training data.
score = classifier.score(x_test, y_test)

print(score)

In [None]:
# Report the hold-out accuracy as a percentage rounded to two decimals.
accuracy_percent = round(score * 100, 2)
print('So the accuracy of the Support Vector Machines Classifier is', accuracy_percent)

In [None]:
# Predict Survived for the test-CSV features (X_test, not the hold-out x_test)
# and display the resulting array.
prediction = classifier.predict(X_test)
prediction