# First we Import necessary library

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn import set_config
from IPython.display import display
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import pickle

# Importing our Dataset
 # train.csv

In [None]:
from google.colab import files
uploaded = files.upload()

Saving train.csv to train.csv


In [None]:
df = pd.read_csv("train.csv")

In [None]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Info.
  * Here is some information about our datset

In [None]:
df.shape

(891, 12)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [None]:
df.isnull().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0


In [None]:
df["Sex"].value_counts()

Unnamed: 0_level_0,count
Sex,Unnamed: 1_level_1
male,577
female,314


In [None]:
df["Embarked"].value_counts()

Unnamed: 0_level_0,count
Embarked,Unnamed: 1_level_1
S,644
C,168
Q,77


# Info-conclusion
  * By that info we conclude that all of the columns is not necessary to use so we remove some.
  * Removable columns :
 1. Passenger id
 2. Name.
 3. Cabin.
 4. Ticket
  * And Age , Embarked had missing values.
 * first is we need to fill missing values

# Removing unnecessary columns

In [None]:
df.drop(columns = ["PassengerId",
"Name", "Ticket", "Cabin"], inplace= True)

In [None]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


# Train-Test-Split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,1:8], df.iloc[:, 0],
test_size=0.2, random_state=42)

In [None]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S
382,3,male,32.0,0,0,7.925,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.275,S


In [50]:
y_train.head()

Unnamed: 0,Survived
331,0
733,0
382,0
704,0
813,0


In [None]:
y_test.head()

Unnamed: 0,Survived
709,1
439,0
840,0
720,1
39,1


# SIMPLE IMPUTER
 * This is a test prediction model so we just assign all values of age with their mean.
 * And embarked with their most frequent value.

# All things will done Using Column Transformer

In [None]:
trf1 = ColumnTransformer([
("Impute_Age", SimpleImputer(), [2]),
("Impute_embark", SimpleImputer(strategy="most_frequent"), [6])],
remainder="passthrough")

# OneHotEncoding

In [None]:
trf2 = ColumnTransformer([
("Ohe_sex_embarked", OneHotEncoder(sparse_output=False,handle_unknown="ignore"), [1,6])],
remainder="passthrough")

# Scaling

In [None]:
trf3 = ColumnTransformer([
("Scaler", MinMaxScaler(), slice(0,10))])

# Here we train the model

In [None]:
trf4 = DecisionTreeClassifier()

# Creating Pipelines

In [None]:
set_config(display="diagram")

In [None]:
pipe = Pipeline([
('trf1',trf1),
('trf2',trf2),
('trf3',trf3),
('trf4', trf4)
])

# Pipeline V/S make_pipeline
 * pipeline requires name of our object but make pipeline not
 * make_pipeline(trf1, trf2....)
# Same with ColumnTransformer and make_ColumnTransformer

# TRAINING THE DATSET

In [None]:
pipe.fit(X_train, y_train)

In [None]:
pipe.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('Impute_Age', SimpleImputer(), [2]),
                                 ('Impute_embark',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('Ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 'trf3': ColumnTransformer(transformers=[('Scaler', MinMaxScaler(), slice(0, 10, None))]),
 'trf4': DecisionTreeClassifier()}

In [None]:
display(pipe)

# Our model Predictions and it's Accuracy Score

In [None]:
y_pred = pipe.predict(X_test)

In [None]:
y_pred

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0])

In [None]:
y_test

Unnamed: 0,Survived
709,1
439,0
840,0
720,1
39,1
...,...
433,0
773,0
25,1
84,1


In [None]:
from sklearn.metrics import accuracy_score

In [None]:
accuracy_score(y_test, y_pred)

0.6256983240223464

# Cross Validation For increasing accuracy

In [None]:
new_acc = cross_val_score(pipe , X_train, y_train, cv=5, scoring="accuracy").mean()
print(new_acc)

0.6391214419383433


# GridSearchCV

In [None]:
params = {
'trf4__max_depth':[1,2,3,4,5,None]
}

In [None]:
Grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')

In [None]:
Grid.fit(X_train, y_train)

In [None]:
pickle.dump(pipe,open('pipe.pkl','wb'))

In [None]:
files.download('pipe.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>