## Import the dependencies

In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer


## Data collection and analysis

In [2]:
# Read the passenger records from "titanic.csv" (path is relative to the working directory).
titanic = pd.read_csv("titanic.csv")

# Report the (rows, columns) shape, then preview the first 15 records.
print(titanic.shape)
titanic.head(n=15)

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [3]:
# Summarise the frame: per-column dtype and non-null count, plus memory usage.
titanic.info()

# Tally the missing entries column by column.
titanic.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
# Feature selection: discard the PassengerId, Ticket and Cabin columns,
# then shuffle all rows (frac=1) with a fixed seed so the order is repeatable.
titanic = (
    titanic
    .drop(columns=["PassengerId", "Ticket", "Cabin"])
    .sample(frac=1, random_state=55)
)
titanic.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
713,0,3,"Larsson, Mr. August Viktor",male,29.0,0,0,9.4833,S
726,1,2,"Renouf, Mrs. Peter Henry (Lillian Jefferys)",female,30.0,3,0,21.0,S
262,0,1,"Taussig, Mr. Emil",male,52.0,1,1,79.65,S
266,0,3,"Panula, Mr. Ernesti Arvid",male,16.0,4,1,39.6875,S
186,1,3,"O'Brien, Mrs. Thomas (Johanna ""Hannah"" Godfrey)",female,,1,0,15.5,Q


In [5]:
# Class balance of the target: how many rows carry each Survived value.
titanic["Survived"].value_counts()

0    549
1    342
Name: Survived, dtype: int64

## Separate features and target

In [6]:
# Split the frame into the target (Survived) and the remaining feature columns.
# Selecting/dropping instead of DataFrame.pop leaves `titanic` unmodified, so
# this cell can be re-run without a KeyError on an already-removed column.
y = titanic["Survived"]
X = titanic.drop(columns="Survived")

## Instantiate preprocessors and RandomForestClassifier

In [7]:
# Preprocessing building blocks and the estimator used by the cells below.

# handle_unknown="ignore": a category that appears only in a validation fold
# (for instance a rare constant-filled value) is encoded as all zeros instead
# of raising at transform time and turning that fold's score into NaN.
ohe = OneHotEncoder(handle_unknown="ignore")

# Median imputation; add_indicator=True also emits missing-value indicator columns.
imp_median = SimpleImputer(strategy="median", add_indicator=True)

# Constant imputation using the imputer's default fill value.
imp_constant = SimpleImputer(strategy="constant")

tfidf = TfidfVectorizer()
scaler = StandardScaler()

# Fixed seed so the classifier's randomness is repeatable between runs.
clf = RandomForestClassifier(random_state=1)

## Preprocessing

In [8]:
# Column-wise preprocessing, one branch per kind of feature:
#   numeric columns     -> median imputation, then standardisation
#   categorical columns -> constant imputation, then one-hot encoding
#   Name                -> TF-IDF vectorisation
numeric_columns = ["Pclass", "Age", "SibSp", "Parch", "Fare"]
categorical_columns = ["Sex", "Embarked"]

numeric_steps = make_pipeline(imp_median, scaler)
categorical_steps = make_pipeline(imp_constant, ohe)

ct = make_column_transformer(
    (numeric_steps, numeric_columns),
    (categorical_steps, categorical_columns),
    # "Name" stays a bare string, not a list: per the ColumnTransformer docs a
    # scalar selector passes a 1-D column, which text vectorizers expect.
    (tfidf, "Name"),
)

In [9]:
# Chain the column preprocessing and the classifier into a single estimator,
# with the steps named "preprocessor" and "classifier".
pipeline_steps = [("preprocessor", ct), ("classifier", clf)]
pipe = Pipeline(steps=pipeline_steps)

## Model training and evaluation

In [13]:
# Score the whole pipeline, left at its default parameters, with 5-fold
# cross-validation, then average the per-fold accuracies.
fold_accuracies = cross_val_score(pipe, X, y, scoring="accuracy", cv=5)
fold_accuracies.mean()

0.8294394576611637