## Importing important libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas_profiling import ProfileReport
sns.set_theme(style='darkgrid')

### Importing our Titanic_dataset

In [3]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

### EDA Preprocessing

In [4]:
df_train.shape

(891, 12)

In [5]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
df_train.Sex.min()

'female'

In [8]:
df_train.Sex.max()

'male'

In [9]:
df_train.Sex.value_counts()

male      577
female    314
Name: Sex, dtype: int64

### Sex and Survived columns correlation

In [10]:
pd.crosstab(df_train['Sex'], df_train['Survived'])

Survived,0,1
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,81,233
male,468,109


In [11]:
df_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [12]:
df_train.columns.value_counts().sum()

12

In [13]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [14]:
df_train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [15]:
df_train['Survived'].mean()

0.3838383838383838

In [16]:
len(df_train)

891

In [17]:
df_train.iloc[:5]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [18]:
## Let's remove Survived column and fit to the end of the table
survived = df_train.pop('Survived')

In [19]:
df_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [20]:
df_train.columns.value_counts().sum()

11

In [21]:
col_name = 'Survived'

In [22]:
## Let's fit the column to the end of the table

df_train.insert(11, col_name ,survived)

In [23]:
df_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [24]:
df_train.iloc[:3,2:]

Unnamed: 0,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1


In [25]:
df_train[df_train.Survived == 1].head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
8,9,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,1
9,10,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,1


In [26]:
df_age = np.where((df_train.Age <= 20), df_train.Survived == 1,0)

In [27]:
df_train['age'] = np.where((df_train.Age <=20) & (df_train.Survived == 1), 'yes','no')

In [28]:
df_train[(df_train.Age <= 20) & (df_train.Survived == 1) & (df_train.Sex == 'male')].head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived,age
78,79,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,248738,29.0,,S,1,yes
125,126,3,"Nicola-Yarred, Master. Elias",male,12.0,1,0,2651,11.2417,,C,1,yes
165,166,3,"Goldsmith, Master. Frank John William ""Frankie""",male,9.0,0,2,363291,20.525,,S,1,yes
183,184,2,"Becker, Master. Richard F",male,1.0,2,1,230136,39.0,F4,S,1,yes
193,194,2,"Navratil, Master. Michel M",male,3.0,1,1,230080,26.0,F2,S,1,yes


In [29]:
df_train['new'] = np.where((df_train.Sex == 'male') & (df_train.Age <= 20) & (df_train.Survived == 1), 'yes', 'no')

In [30]:
df_train[df_train.new == 'yes'].head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived,age,new
78,79,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,248738,29.0,,S,1,yes,yes
125,126,3,"Nicola-Yarred, Master. Elias",male,12.0,1,0,2651,11.2417,,C,1,yes,yes
165,166,3,"Goldsmith, Master. Frank John William ""Frankie""",male,9.0,0,2,363291,20.525,,S,1,yes,yes
183,184,2,"Becker, Master. Richard F",male,1.0,2,1,230136,39.0,F4,S,1,yes,yes
193,194,2,"Navratil, Master. Michel M",male,3.0,1,1,230080,26.0,F2,S,1,yes,yes


In [31]:
np.where(df_train.age == 'yes')

(array([  9,  10,  22,  39,  43,  44,  58,  68,  78,  84, 125, 136, 156,
        165, 172, 183, 184, 192, 193, 204, 208, 220, 226, 233, 237, 261,
        283, 291, 305, 307, 311, 329, 340, 348, 381, 389, 407, 417, 427,
        435, 445, 446, 448, 469, 479, 489, 504, 530, 535, 546, 549, 550,
        585, 618, 622, 644, 651, 664, 677, 689, 691, 700, 720, 750, 751,
        755, 762, 777, 780, 781, 786, 788, 802, 803, 827, 830, 831, 853,
        855, 869, 875, 887], dtype=int64),)

In [32]:
df_train[(df_train.age == 'yes') & (df_train.new == 'yes')].head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived,age,new
78,79,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,248738,29.0,,S,1,yes,yes
125,126,3,"Nicola-Yarred, Master. Elias",male,12.0,1,0,2651,11.2417,,C,1,yes,yes
165,166,3,"Goldsmith, Master. Frank John William ""Frankie""",male,9.0,0,2,363291,20.525,,S,1,yes,yes
183,184,2,"Becker, Master. Richard F",male,1.0,2,1,230136,39.0,F4,S,1,yes,yes
193,194,2,"Navratil, Master. Michel M",male,3.0,1,1,230080,26.0,F2,S,1,yes,yes


In [33]:
df_train['new'] = np.where((df_train.Age <= 18), 'adult', 
                           np.where((df_train.Age <=5 ), 'baby', 'agad'))

where function use for filtering...
np.where(condition, 'value one', if false 'value two')
np.where(condition_one 'operator' condition_two 'operrator' condition_three, 'value one', 'value two')

> np.where(df.age >= 18, 'adult', 'teenage')

> np.where((df.age >= 18 ) & (df.age <=30), 'adult', np.where(df.age <= 5), 'baby', 'aged')

In [34]:
df_train[['Age', 'Survived']].groupby(df_train['Sex']).median()

Unnamed: 0_level_0,Age,Survived
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,27.0,1
male,29.0,0


In [35]:
df_train[['Sex', 'Survived']].groupby(['Sex'], as_index=False).median().sort_values(by='Survived')

Unnamed: 0,Sex,Survived
1,male,0
0,female,1


In [36]:
df_train[['Pclass', 'Survived']].groupby(df_train['Sex']).median()

Unnamed: 0_level_0,Pclass,Survived
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,2,1
male,3,0


In [37]:
df_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived,age,new
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,no,agad
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,no,agad
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,no,agad
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,no,agad
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,no,agad


## Data Visualization with matplotlib and Seaborn

In [56]:
df_train.shape

(891, 14)

In [57]:
df_train = df_train.drop('Cabin', axis=1)

In [58]:
df_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Survived,age,new
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,0,no,agad
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,1,no,agad
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,1,no,agad
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,1,no,agad
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,0,no,agad


In [59]:
df_train = df_train.dropna()

In [60]:
df_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Survived,age,new
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,0,no,agad
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,1,no,agad
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,1,no,agad
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,1,no,agad
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,0,no,agad


In [61]:
df_train.isna().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
Survived       0
age            0
new            0
dtype: int64

In [62]:
df_train.Age.sum()

21105.17

In [63]:
df_train.mean()

PassengerId    448.589888
Pclass           2.240169
Age             29.642093
SibSp            0.514045
Parch            0.432584
Fare            34.567251
Survived         0.404494
dtype: float64

In [64]:
df_train.median()

PassengerId    445.00000
Pclass           2.00000
Age             28.00000
SibSp            0.00000
Parch            0.00000
Fare            15.64585
Survived         0.00000
dtype: float64

In [65]:
df_train.loc[:3]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Survived,age,new
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,0,no,agad
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,1,no,agad
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,1,no,agad
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,1,no,agad


In [66]:
df_train.iloc[:4,:2]

Unnamed: 0,PassengerId,Pclass
0,1,3
1,2,1
2,3,3
3,4,1


In [67]:
df_train.iloc[6:10, 3:8]

Unnamed: 0,Sex,Age,SibSp,Parch,Ticket
7,male,2.0,3,1,349909
8,female,27.0,0,2,347742
9,female,14.0,1,0,237736
10,female,4.0,1,1,PP 9549


In [68]:
df_train[df_train.Sex == 'female'].head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Survived,age,new
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,1,no,agad
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,1,no,agad


In [69]:
df_train.Age[df_train.Age == 1].value_counts()

1.0    7
Name: Age, dtype: int64

In [70]:
df_train[(df_train.Age >= 1) & (df_train.Survived == 1)].head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Survived,age,new
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,1,no,agad
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,1,no,agad
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,1,no,agad
8,9,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,S,1,no,agad
9,10,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,C,1,yes,adult


In [71]:
pd.crosstab(df_train.Survived, df_train.Age)

Age,0.42,0.67,0.75,0.83,0.92,1.00,2.00,3.00,4.00,5.00,...,62.00,63.00,64.00,65.00,66.00,70.00,70.50,71.00,74.00,80.00
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,2,7,1,3,0,...,2,0,2,3,1,2,1,2,1,0
1,1,1,2,2,1,5,3,5,7,4,...,1,2,0,0,0,0,0,0,0,1


In [72]:
df_train['Name'] = df_train.Name.str.lower()

In [73]:
df_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Survived,age,new
0,1,3,"braund, mr. owen harris",male,22.0,1,0,A/5 21171,7.25,S,0,no,agad
1,2,1,"cumings, mrs. john bradley (florence briggs th...",female,38.0,1,0,PC 17599,71.2833,C,1,no,agad
2,3,3,"heikkinen, miss. laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,1,no,agad
3,4,1,"futrelle, mrs. jacques heath (lily may peel)",female,35.0,1,0,113803,53.1,S,1,no,agad
4,5,3,"allen, mr. william henry",male,35.0,0,0,373450,8.05,S,0,no,agad


In [74]:
df_train['Name'] = df_train.Name.str.upper()

In [75]:
df_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Survived,age,new
0,1,3,"BRAUND, MR. OWEN HARRIS",male,22.0,1,0,A/5 21171,7.25,S,0,no,agad
1,2,1,"CUMINGS, MRS. JOHN BRADLEY (FLORENCE BRIGGS TH...",female,38.0,1,0,PC 17599,71.2833,C,1,no,agad
2,3,3,"HEIKKINEN, MISS. LAINA",female,26.0,0,0,STON/O2. 3101282,7.925,S,1,no,agad
3,4,1,"FUTRELLE, MRS. JACQUES HEATH (LILY MAY PEEL)",female,35.0,1,0,113803,53.1,S,1,no,agad
4,5,3,"ALLEN, MR. WILLIAM HENRY",male,35.0,0,0,373450,8.05,S,0,no,agad


In [76]:
df_train.Name = df_train.Name.str.title()

In [77]:
df_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Survived,age,new
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,0,no,agad
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,1,no,agad
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,1,no,agad
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,1,no,agad
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,0,no,agad


In [78]:
df_train = pd.read_csv('train.csv')

In [79]:
df_train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [80]:
df_train.Embarked.fillna('Missing', inplace=True)

In [81]:
df_train.Embarked.replace(['Missing'], 'Unknown', inplace=True)

In [82]:
df_train[df_train.Embarked == 'Unknown']

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,Unknown
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,Unknown


In [83]:
df_train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [84]:
df_train.Age.mean()

29.69911764705882

In [85]:
df_train.Age.median()

28.0

In [86]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [87]:
df_train['Age Level'] = np.where((df_train.Age <=18), 'Teen Age', 
                                 np.where((df_train.Age <=8 ), 'Baby', 'aged'))

In [88]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age Level
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,aged
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,aged
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,aged
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,aged
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,aged


In [89]:
df_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
Age Level       object
dtype: object

In [90]:
df_train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
Age Level        0
dtype: int64

In [91]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age Level
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,aged
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,aged
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,aged
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,aged
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,aged


In [92]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age Level
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,aged
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,aged
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,aged
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,aged
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,aged


In [93]:
df_train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
Age Level        0
dtype: int64

In [94]:
df_train['Age'] = df_train.Age.fillna(df_train.Age.median())

In [95]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age Level
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,aged
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,aged
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,aged
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,aged
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,aged


In [96]:
df_train['Cabin'] = df_train.Cabin.fillna(method='bfill')

In [97]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age Level
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,C85,S,aged
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,aged
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,C123,S,aged
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,aged
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,E46,S,aged


In [98]:
df_train['Cabin'] = df_train['Cabin'].fillna(method='ffill')

In [99]:
df_train.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
Age Level      0
dtype: int64

In [100]:
df_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
Age Level       object
dtype: object

In [101]:
df_model = df_train.copy()

In [102]:
df_model.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age Level
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,C85,S,aged
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,aged
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,C123,S,aged
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,aged
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,E46,S,aged


In [103]:
df_train.describe(include=['object'])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Age Level
count,891,891,891,891,891,891
unique,891,2,681,147,4,2
top,"Harris, Mr. Henry Birkhardt",male,CA. 2343,C78,S,aged
freq,1,577,7,33,644,752


In [104]:
df_train.rename(columns={'Age Level': 'Age stages'}, inplace=True)

In [105]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age stages
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,C85,S,aged
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,aged
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,C123,S,aged
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,aged
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,E46,S,aged


In [106]:
for label, content in df_train.items():
    if pd.api.types.is_string_dtype(content):
        print(label)

Name
Sex
Ticket
Cabin
Embarked
Age stages


In [107]:
df_train.Age.dtype

dtype('float64')

In [108]:
for label, content in df_train.items():
    if pd.api.types.is_string_dtype(content):
        df_train[label] = content.astype('category').cat.as_ordered()

In [109]:
df_train.dtypes

PassengerId       int64
Survived          int64
Pclass            int64
Name           category
Sex            category
Age             float64
SibSp             int64
Parch             int64
Ticket         category
Fare            float64
Cabin          category
Embarked       category
Age stages     category
dtype: object

In [110]:
for label, content in df_train.items():
    if pd.api.types.is_categorical_dtype(content):
        df_train[label] = content.cat.codes + 1

In [111]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age stages
0,1,0,3,109,2,22.0,1,0,524,7.25,82,3,2
1,2,1,1,191,1,38.0,1,0,597,71.2833,82,1,2
2,3,1,3,354,1,26.0,0,0,670,7.925,56,3,2
3,4,1,1,273,1,35.0,1,0,50,53.1,56,3,2
4,5,0,3,16,2,35.0,0,0,473,8.05,130,3,2


In [112]:
x = df_train[df_train['Age'] < 1]

In [113]:
df_train['Age'] = df_train['Age'].replace([x], 'NaN')

In [114]:
df_train[df_train.Age == 'NaN']

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age stages
1,2,1,1,191,1,,1,0,597,71.2833,82,1,2
6,7,0,1,516,2,,0,0,86,51.8625,130,3,2
7,8,0,3,625,2,,3,1,396,21.0750,146,3,1
15,16,1,2,360,1,,0,0,154,16.0000,112,3,2
16,17,0,3,683,2,,4,1,481,29.1250,112,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
824,825,0,3,631,2,,4,1,250,39.6875,22,3,1
827,828,1,2,504,2,,0,2,619,37.0042,22,1,1
831,832,1,2,687,2,,1,1,238,18.7500,131,3,1
844,845,0,3,190,2,,0,0,265,8.6625,87,3,1


In [115]:
df_train 

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age stages
0,1,0,3,109,2,22,1,0,524,7.2500,82,3,2
1,2,1,1,191,1,,1,0,597,71.2833,82,1,2
2,3,1,3,354,1,26,0,0,670,7.9250,56,3,2
3,4,1,1,273,1,35,1,0,50,53.1000,56,3,2
4,5,0,3,16,2,35,0,0,473,8.0500,130,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,549,2,27,0,0,102,13.0000,31,3,2
887,888,1,1,304,1,19,0,0,15,30.0000,31,3,2
888,889,0,3,414,1,28,1,2,676,23.4500,61,3,2
889,890,1,1,82,2,26,0,0,9,30.0000,61,1,2


In [116]:
df_train.to_csv('Filled-titanic.csv')

## Modeling our data with Sklearn

importing important models for classification from sklearn library

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

importing important libraries from sklearn for feature-engineering

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import plot_roc_curve
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

**Split the Data with the help of train_test_split**

In [None]:
X = df_model.drop('Survived', axis=1)
y = df_model['Survived']

In [None]:
for label, content in df_model.items():
    if pd.api.types.is_string_dtype(content):
        print(label)

In [None]:
for label, content in df_model.items():
    if pd.api.types.is_string_dtype(content):
        df_model[label] = content.astype(str)

        
df_model.info()

## Feature Engineering

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
conde = df_model[df_model.columns[:]].apply(le.fit_transform)

In [None]:
conde

## Feature engineering with Pandas categorical to numeic

In [None]:
XX = X.copy()

In [None]:
for label, content in XX.items():
    if pd.api.types.is_string_dtype(content):
        XX[label] = content.astype('category').cat.as_ordered()

In [None]:
for label, content in XX.items():
    if pd.api.types.is_categorical_dtype(content):
        XX[label] = content.cat.codes+1

In [None]:
XX.head()

In [None]:
X.head()

split the test and train data and fit into train_test_split()

In [None]:
XX_train, XX_test, y_train, y_test = train_test_split(XX, y, test_size=0.2)

## let's fit different models
1. LogisticRegression()

In [None]:
model = LogisticRegression()
model.fit(XX_train, y_train)

In [None]:
model.score(XX_test, y_test)

2. RandomForestClassifier()

In [None]:
model = RandomForestClassifier()
model.fit(XX_train, y_train)

In [None]:
model.score(XX_test, y_test)

3. KneighboresClassifier

In [None]:
model = KNeighborsClassifier()
model.fit(XX_train, y_train)

In [None]:
model.score(XX_test, y_test)

4. DecisionTreeClassifier()

In [None]:
model = DecisionTreeClassifier()
model.fit(XX_train, y_train)

In [None]:
model.score(XX_test, y_test)

5. GradientBoostingClassifier

In [None]:
model = GradientBoostingClassifier()
model.fit(XX_train, y_train)
model.score(XX_test, y_test)

#### will pick GradientBoostingClassifier()

### Let's predict on our data
* predict()
* predict_proba()

In [None]:
model.predict(XX_test)

In [None]:
y_preds = model.predict(XX_test)
np.mean(y_preds == y_test)

In [None]:
model.score(XX_test, y_test)

In [None]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_preds)

In [None]:
from sklearn.model_selection import cross_val_score

cross_val_score(model, XX, y)

In [None]:
cross_val_score(model, XX, y, cv=5)

In [None]:
np.random.seed()

model_single_score = model.score(XX_test, y_test)

model_cross_val_mean = np.mean(cross_val_score(model, XX, y, cv=10))

model_single_score, model_cross_val_mean

In [None]:
model_cross_val_mean = np.mean(cross_val_score(model, XX, y, cv=10, scoring=None))
model_cross_val_mean

classification model evaluation metrics
1. Accuracy
2. Area under ROC curve
3. Confusion Metrix
4. Classification Report

In [None]:
print(f'Survived people in titanic score on titanic dataset: {model_cross_val_mean * 100: .2f}%')

In [None]:
sns.set(font_scale = 1.5)

con_mat = confusion_matrix(y_test, y_preds)

sns.heatmap(con_mat);

In [None]:
con_mat

In [None]:
def plot_con_mat(con_mat):
    
    fig, ax = plt.subplots(figsize=(3,3))
    ax = sns.heatmap(con_mat, annot=True, cbar=False)
    
    plt.xlabel('Actual Label')
    plt.ylabel('Predicted Label')
    
    
plot_con_mat(con_mat)

In [None]:
from sklearn.metrics import plot_confusion_matrix

plot_confusion_matrix(model, XX_test, y_test)

In [None]:

from sklearn.metrics import classification_report
class_rep = classification_report(y_test, y_preds)

print(class_rep)

In [None]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [None]:
reg = Ridge(normalize=True)
lasso = Lasso(alpha=0.4, normalize=True)

In [None]:
Xn = XX
yn = y

In [None]:
X_train, X_test, y_train, y_test = train_test_split(Xn, yn, test_size=0.2, random_state=42)

In [None]:
reg.fit(X_train, y_train)
reg.score(X_train, y_train)

In [None]:
reg.score(X_test, y_test)

### Cross Validation


In [None]:
cross_val = cross_val_score(reg, XX, y, cv=10)

In [None]:
cross_val = cross_val.mean()
cross_val

In [None]:
lasso.fit(X_train, y_train).coef_
lasso.score(X_train, y_train)

### Classification model Evaluation metrics
1. Accuracy
2. roc_curve
3. confusion_metrics
4. Classification Report

In [None]:
df_model.head()

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

X = df_model.d