## Titanic
The objective of this project is to use Machine Learning to create a model that predicts which passengers survived the Titanic shipwreck.

In [55]:
# First the imports

import pandas as pd
import statistics  as sts
from sklearn import model_selection 
from sklearn import linear_model
from sklearn import preprocessing
import xgboost as xgb
from sklearn.metrics import accuracy_score

In [58]:
# Here i read the data file that later will be used to train the model
dataset = pd.read_csv('/content/drive/MyDrive/Ciência de Dados/train.csv')
dataset

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [14]:
dataset_test = pd.read_csv('/content/drive/MyDrive/Ciência de Dados/test.csv')
dataset_test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [None]:
# Here i check if the "Survived" column is consistent and the distribution of the two groups. This column is the most important, since it shows the objective of the model.
dataset.groupby(['Survived']).size()

Survived
0    549
1    342
dtype: int64

In [None]:
# Then i check how many classes the ship had. This column is important because i'm assuming that the higher the class, higher is the chance for survivor, since they've must prioritized saving the higher class people.
dataset.groupby(['Pclass']).size()

Pclass
1    216
2    184
3    491
dtype: int64

In [None]:
# The Name column will be important later, because it contains the person's title. 
dataset.groupby(['Name']).size()

Name
Abbing, Mr. Anthony                      1
Abbott, Mr. Rossmore Edward              1
Abbott, Mrs. Stanton (Rosa Hunt)         1
Abelson, Mr. Samuel                      1
Abelson, Mrs. Samuel (Hannah Wizosky)    1
                                        ..
de Mulder, Mr. Theodore                  1
de Pelsmaeker, Mr. Alfons                1
del Carlo, Mr. Sebastiano                1
van Billiard, Mr. Austin Blyler          1
van Melkebeke, Mr. Philemon              1
Length: 891, dtype: int64

In [None]:
# The following sex and age columns are important because i'm assuming they prioritized saving woman and children on the lifeboats.
dataset.groupby(['Sex']).size()

Sex
female    314
male      577
dtype: int64

In [None]:
dataset.groupby(['Age']).size()

Age
0.42     1
0.67     1
0.75     2
0.83     2
0.92     1
        ..
70.00    2
70.50    1
71.00    2
74.00    1
80.00    1
Length: 88, dtype: int64

In [None]:
# Now that i separeted the most important data, is time to check the inconsistent data, such as null values.
dataset.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [None]:
dataset['Age'].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [None]:
meanAge = dataset['Age'].mean()
meanAge

29.69911764705882

In [None]:
dataset['Age'].fillna(meanAge, inplace=True)

In [None]:
dataset.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [None]:
dataset.groupby(['Embarked']).size()

Embarked
C    168
Q     77
S    644
dtype: int64

In [None]:
dataset.groupby(['Ticket']).size()

Ticket
110152         3
110413         3
110465         2
110564         1
110813         1
              ..
W./C. 6608     4
W./C. 6609     1
W.E.P. 5734    1
W/C 14208      1
WE/P 5735      2
Length: 681, dtype: int64

In [None]:
onlyNumbers = dataset.Ticket.str.extract('(\d+)')
onlyNumbers

Unnamed: 0,0
0,5
1,17599
2,2
3,113803
4,373450
...,...
886,211536
887,112053
888,6607
889,111369


In [None]:
df = dataset['Ticket'].squeeze()
print (type(df))



<class 'pandas.core.series.Series'>


In [None]:
onlyNumbers = df.str.isnumeric()
onlyNumbers

0      False
1      False
2      False
3       True
4       True
       ...  
886     True
887     True
888    False
889     True
890     True
Name: Ticket, Length: 891, dtype: bool

In [None]:
dataset.loc[onlyNumbers == True, 'Ticket'] = "Common"

In [None]:
dataset.groupby(['Ticket']).size()

Ticket
A./5. 2152     1
A./5. 3235     1
A.5. 11206     1
A.5. 18509     1
A/4 45380      1
              ..
W./C. 6608     4
W./C. 6609     1
W.E.P. 5734    1
W/C 14208      1
WE/P 5735      2
Length: 168, dtype: int64

In [None]:
g = dataset.groupby('Ticket')

g.groups.keys()

dict_keys(['A./5. 2152', 'A./5. 3235', 'A.5. 11206', 'A.5. 18509', 'A/4 45380', 'A/4 48871', 'A/4. 20589', 'A/4. 34244', 'A/4. 39886', 'A/5 21171', 'A/5 21172', 'A/5 21173', 'A/5 21174', 'A/5 2466', 'A/5 2817', 'A/5 3536', 'A/5 3540', 'A/5 3594', 'A/5 3902', 'A/5. 10482', 'A/5. 13032', 'A/5. 2151', 'A/5. 3336', 'A/5. 3337', 'A/5. 851', 'A/S 2816', 'A4. 54510', 'C 17369', 'C 4001', 'C 7075', 'C 7076', 'C 7077', 'C.A. 17248', 'C.A. 18723', 'C.A. 2315', 'C.A. 24579', 'C.A. 24580', 'C.A. 2673', 'C.A. 29178', 'C.A. 29395', 'C.A. 29566', 'C.A. 31026', 'C.A. 31921', 'C.A. 33111', 'C.A. 33112', 'C.A. 33595', 'C.A. 34260', 'C.A. 34651', 'C.A. 37671', 'C.A. 5547', 'C.A. 6212', 'C.A./SOTON 34068', 'CA 2144', 'CA. 2314', 'CA. 2343', 'Common', 'F.C. 12750', 'F.C.C. 13528', 'F.C.C. 13529', 'F.C.C. 13531', 'Fa 265302', 'LINE', 'P/PP 3381', 'PC 17318', 'PC 17473', 'PC 17474', 'PC 17475', 'PC 17476', 'PC 17477', 'PC 17482', 'PC 17483', 'PC 17485', 'PC 17558', 'PC 17569', 'PC 17572', 'PC 17582', 'PC 175

In [60]:
def get_title(s):
  title = s.split(", ")
  title = title[1].split(". ")
  s = title[0]  
  return s


In [61]:
dataset['Name'] = dataset['Name'].apply(get_title)
dataset

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,Mr,male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,Mrs,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,Miss,female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,Mrs,female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,Mr,male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,Rev,male,27.0,0,0,211536,13.0000,,S
887,888,1,1,Miss,female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,Miss,female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,Mr,male,26.0,0,0,111369,30.0000,C148,C


In [62]:
dataset_test['Name'] = dataset_test['Name'].apply(get_title)
dataset_test

IndexError: ignored

In [None]:
dataset.groupby(['Name']).size()

Name
Capt              1
Col               2
Don               1
Dr                7
Jonkheer          1
Lady              1
Major             2
Master           40
Miss            182
Mlle              2
Mme               1
Mr              517
Mrs             125
Ms                1
Rev               6
Sir               1
the Countess      1
dtype: int64

In [None]:
dataset_test.groupby(['Name']).size()

Name
Col         2
Dona        1
Dr          1
Master     21
Miss       78
Mr        240
Mrs        72
Ms          1
Rev         2
dtype: int64

In [63]:
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,Mr,male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,Mrs,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,Miss,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,Mrs,female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,Mr,male,35.0,0,0,373450,8.05,,S


In [16]:
dataset_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,Mr,male,34.5,0,0,330911,7.8292,,Q
1,893,3,Mrs,female,47.0,1,0,363272,7.0,,S
2,894,2,Mr,male,62.0,0,0,240276,9.6875,,Q
3,895,3,Mr,male,27.0,0,0,315154,8.6625,,S
4,896,3,Mrs,female,22.0,1,1,3101298,12.2875,,S


In [65]:
df = dataset.drop(['PassengerId', 'SibSp', 'Parch', 'Ticket', 'Cabin'], axis=1)
df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Fare,Embarked
0,0,3,Mr,male,22.0,7.2500,S
1,1,1,Mrs,female,38.0,71.2833,C
2,1,3,Miss,female,26.0,7.9250,S
3,1,1,Mrs,female,35.0,53.1000,S
4,0,3,Mr,male,35.0,8.0500,S
...,...,...,...,...,...,...,...
886,0,2,Rev,male,27.0,13.0000,S
887,1,1,Miss,female,19.0,30.0000,S
888,0,3,Miss,female,,23.4500,S
889,1,1,Mr,male,26.0,30.0000,C


In [17]:
df_test = dataset_test.drop(['PassengerId', 'SibSp', 'Parch', 'Ticket', 'Cabin'], axis=1)
df_test

Unnamed: 0,Pclass,Name,Sex,Age,Fare
0,3,Mr,male,34.5,7.8292
1,3,Mrs,female,47.0,7.0000
2,2,Mr,male,62.0,9.6875
3,3,Mr,male,27.0,8.6625
4,3,Mrs,female,22.0,12.2875
...,...,...,...,...,...
413,3,Mr,male,,8.0500
414,1,Dona,female,39.0,108.9000
415,3,Mr,male,38.5,7.2500
416,3,Mr,male,,8.0500


In [67]:
meanAge = dataset['Age'].mean()
meanAge

29.69911764705882

In [68]:
meanAgeTest = dataset_test['Age'].mean()
meanAgeTest

30.272590361445783

In [69]:
mean_fare_test = dataset_test['Fare'].mean()
mean_fare_test

35.627188489208635

In [70]:
df['Age'].fillna(meanAge, inplace=True)

In [71]:
df_test['Age'].fillna(meanAgeTest, inplace=True)

In [66]:
df.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
Fare          0
Embarked      2
dtype: int64

In [74]:
df.groupby(['Embarked']).size()

Embarked
C    168
Q     77
S    644
dtype: int64

In [76]:
df['Embarked'].fillna("S", inplace=True)

In [77]:
df_test.isnull().sum()

Pclass    0
Name      0
Sex       0
Age       0
Fare      0
dtype: int64

In [22]:
df_test['Fare'].fillna(mean_fare_test, inplace=True)

In [23]:
df_test.isnull().sum()

Pclass    0
Name      0
Sex       0
Age       0
Fare      0
dtype: int64

In [78]:
df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Fare,Embarked
0,0,3,Mr,male,22.000000,7.2500,S
1,1,1,Mrs,female,38.000000,71.2833,C
2,1,3,Miss,female,26.000000,7.9250,S
3,1,1,Mrs,female,35.000000,53.1000,S
4,0,3,Mr,male,35.000000,8.0500,S
...,...,...,...,...,...,...,...
886,0,2,Rev,male,27.000000,13.0000,S
887,1,1,Miss,female,19.000000,30.0000,S
888,0,3,Miss,female,29.699118,23.4500,S
889,1,1,Mr,male,26.000000,30.0000,C


In [24]:
df_test

Unnamed: 0,Pclass,Name,Sex,Age,Fare
0,3,Mr,male,34.50000,7.8292
1,3,Mrs,female,47.00000,7.0000
2,2,Mr,male,62.00000,9.6875
3,3,Mr,male,27.00000,8.6625
4,3,Mrs,female,22.00000,12.2875
...,...,...,...,...,...
413,3,Mr,male,30.27259,8.0500
414,1,Dona,female,39.00000,108.9000
415,3,Mr,male,38.50000,7.2500
416,3,Mr,male,30.27259,8.0500


In [97]:
le = preprocessing.LabelEncoder()
df['Name'] = le.fit_transform(df['Name'])

In [98]:
leSex = preprocessing.LabelEncoder()
df['Sex'] = leSex.fit_transform(df['Sex'])

In [99]:
leEmb = preprocessing.LabelEncoder()
df['Embarked'] = leEmb.fit_transform(df['Embarked'])

In [100]:
X = df.drop(['Survived'], axis=1)
X

Unnamed: 0,Pclass,Name,Sex,Age,Fare,Embarked
0,3,11,1,22.000000,7.2500,2
1,1,12,0,38.000000,71.2833,0
2,3,8,0,26.000000,7.9250,2
3,1,12,0,35.000000,53.1000,2
4,3,11,1,35.000000,8.0500,2
...,...,...,...,...,...,...
886,2,14,1,27.000000,13.0000,2
887,1,8,0,19.000000,30.0000,2
888,3,8,0,29.699118,23.4500,2
889,1,11,1,26.000000,30.0000,0


In [101]:
y = df['Survived']
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [102]:
df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Fare,Embarked
0,0,3,11,1,22.000000,7.2500,2
1,1,1,12,0,38.000000,71.2833,0
2,1,3,8,0,26.000000,7.9250,2
3,1,1,12,0,35.000000,53.1000,2
4,0,3,11,1,35.000000,8.0500,2
...,...,...,...,...,...,...,...
886,0,2,14,1,27.000000,13.0000,2
887,1,1,8,0,19.000000,30.0000,2
888,0,3,8,0,29.699118,23.4500,2
889,1,1,11,1,26.000000,30.0000,0


In [103]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)

In [104]:
X_train

Unnamed: 0,Pclass,Name,Sex,Age,Fare,Embarked
5,3,11,1,29.699118,8.4583,1
487,1,11,1,58.000000,29.7000,0
830,3,12,0,15.000000,14.4542,0
735,3,11,1,28.500000,16.1000,2
37,3,11,1,21.000000,8.0500,2
...,...,...,...,...,...,...
747,2,8,0,30.000000,13.0000,2
805,3,11,1,31.000000,7.7750,2
658,2,11,1,23.000000,13.0000,2
779,1,12,0,43.000000,211.3375,2


In [105]:
lm = linear_model.LogisticRegression()
lm.fit(X_train, y_train)

LogisticRegression()

In [106]:
lm.score(X_test, y_test)

0.7932960893854749

In [107]:
xgb_cl = xgb.XGBClassifier()

In [108]:
xgb_cl.fit(X_train, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [109]:
preds = xgb_cl.predict(X_test)

In [110]:
accuracy_score(y_test, preds)

0.8268156424581006