<h3>Imports</h3>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
import pickle

<h3>Loading the data into the data frame</h3>

In [2]:
# Read the training CSV into the working frame that the later cells operate on.
train_csv_path = "../../Data/train.csv"
df = pd.read_csv(train_csv_path)

<h3>Discovering Data</h3>

In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# PassengerId and Name are treated as non-informative for the model and removed.
df = df.drop(columns=["PassengerId", "Name"])
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,female,35.0,1,0,113803,53.1,C123,S
4,0,3,male,35.0,0,0,373450,8.05,,S


In [5]:
# Show the dtype of every remaining column.
df.dtypes


Survived      int64
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Ticket       object
Fare        float64
Cabin        object
Embarked     object
dtype: object

In [6]:
# Count the missing values in each column.
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

<ol>
    <li>Age, Cabin, and Embarked have null values.</li>
    <li>Age is quantitative; Cabin and Embarked are categorical.</li>
</ol>


<h3>Dealing with Categorical Data</h3>

In [7]:
# Sex, Ticket, Cabin and Embarked are categorical columns.
# Cabin and Embarked have missing values.

In [8]:
# Share of rows with no Cabin value (a fraction in [0, 1], despite the name).
percent_Cabin_missing = df["Cabin"].isna().sum() / len(df)
print(percent_Cabin_missing)
# More than 70% of Cabin is missing, so the whole column is dropped.
df = df.drop(columns=["Cabin"])

0.7710437710437711


In [9]:
# Author's note: every ticket has between 1 and 4 survivors, so Ticket was
# judged unimportant and is removed.
df = df.drop(columns=["Ticket"])

<h3>Splitting Data</h3>

In [10]:
# Target is the Survived column; the features are every other remaining column.
y = df["Survived"]
X = df.drop(columns=["Survived"])

# Hold out 20% of the rows for validation; random_state=0 fixes the shuffle.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

<h3>Dealing with Missing Data</h3>

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Split the feature columns by dtype. Selecting "everything that is not numeric"
# (rather than comparing each dtype to "object") keeps string columns in the
# categorical branch even when pandas stores them with a dedicated string dtype.
categorical_cols = X_train.select_dtypes(exclude="number").columns.tolist()
numerical_cols = [col for col in X_train.columns if col not in categorical_cols]

# Preprocessing for numerical data: fill gaps with each column's most frequent value.
# NOTE(review): a mode fill is unusual for a continuous column such as Age;
# strategy='median' may be a better fit. Left unchanged so the recorded
# validation score stays comparable.
numerical_transformer = SimpleImputer(strategy='most_frequent')

# Preprocessing for categorical data: fill gaps with the most frequent category,
# then one-hot encode. handle_unknown='ignore' lets categories that were not
# seen during fit pass through transform without raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle both branches so each column list is routed to its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

<h3>Building model</h3>

In [12]:
from sklearn import metrics
# Classifier placed at the end of the pipeline (hyperparameters unchanged).
model = XGBClassifier(n_estimators=130,learning_rate=0.03,use_label_encoder=False,gamma=0.08)

# Bundle preprocessing and the model so both are fitted, applied and saved together.
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

# Fit the preprocessing steps and the model on the training split.
my_pipeline.fit(X_train, y_train)

# Apply the fitted preprocessing to the validation split and score the predictions.
preds = my_pipeline.predict(X_valid)
print("Validation Accuracy:",(metrics.accuracy_score(y_valid, preds))*100)

# Persist the whole fitted pipeline. The context manager guarantees the file
# handle is flushed and closed even if pickling raises.
filename = 'model_v1.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(my_pipeline, model_file)

Validation Accuracy: 86.03351955307262
