In [1]:
#Getting modules ready
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
train = pd.read_csv('train.csv')

In [3]:
train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [4]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Imputing the data in age and dropping Name, Ticket, Cabin.PassengerId
x_data = train.drop(columns=['Name', 'PassengerId', 'Ticket', 'Cabin'])

In [6]:
x_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [7]:
x_data.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [8]:
x_data.dropna(subset=["Embarked"],inplace=True)
x_data.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      0
dtype: int64

In [9]:
x_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [23]:
len(x_data)

889

In [24]:
# Fill the age by the mean of the ages and encode the non numeric data
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
cat_features = ['Sex','Embarked']
cat_transformer = Pipeline([('One_hot', OneHotEncoder(handle_unknown='ignore'))])
age_feature = ["Age"]
age_transformer = Pipeline([('imputer', SimpleImputer(strategy='mean'))])

# Setup preprocessing steps (fill missing values, then convert to numbers)

preprocessor = ColumnTransformer(transformers=[
                                ('cat',cat_transformer,cat_features),
                                ('age',age_transformer,age_feature)])
# Create a preprocessing and modelling pipeline
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', RandomForestClassifier())
])
# splitting the x_data in train and test
x = x_data.drop('Survived',axis =1)
y = x_data['Survived']
x_train,x_test,y_train,y_test = train_test_split(x, y, test_size=0.2)
clf.fit(x_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('One_hot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Sex', 'Embarked']),
                                                 ('age',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer())]),
                                                  ['Age'])])),
                ('clf', RandomForestClassifier())])

In [25]:
clf.score(x_test,y_test)

0.7303370786516854

In [26]:
clf.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'preprocessor', 'clf', 'preprocessor__n_jobs', 'preprocessor__remainder', 'preprocessor__sparse_threshold', 'preprocessor__transformer_weights', 'preprocessor__transformers', 'preprocessor__verbose', 'preprocessor__verbose_feature_names_out', 'preprocessor__cat', 'preprocessor__age', 'preprocessor__cat__memory', 'preprocessor__cat__steps', 'preprocessor__cat__verbose', 'preprocessor__cat__One_hot', 'preprocessor__cat__One_hot__categories', 'preprocessor__cat__One_hot__drop', 'preprocessor__cat__One_hot__dtype', 'preprocessor__cat__One_hot__handle_unknown', 'preprocessor__cat__One_hot__sparse', 'preprocessor__age__memory', 'preprocessor__age__steps', 'preprocessor__age__verbose', 'preprocessor__age__imputer', 'preprocessor__age__imputer__add_indicator', 'preprocessor__age__imputer__copy', 'preprocessor__age__imputer__fill_value', 'preprocessor__age__imputer__missing_values', 'preprocessor__age__imputer__strategy', 'preprocessor__age__imputer__ver

In [27]:
# Doing Evaluation using GridSearchCV
pipe_grid = {
    'clf__n_estimators': [100,1000],
    'clf__max_depth': [None, 5],
    'clf__max_features' : ['auto', 'sqrt'],
    'clf__min_samples_split':[2, 4]
}
gs_clf = GridSearchCV(clf, 
                        pipe_grid,
                        cv =5,
                       verbose=2)
gs_clf.fit(x_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END clf__max_depth=None, clf__max_features=auto, clf__min_samples_split=2, clf__n_estimators=100; total time=   0.0s
[CV] END clf__max_depth=None, clf__max_features=auto, clf__min_samples_split=2, clf__n_estimators=100; total time=   0.0s
[CV] END clf__max_depth=None, clf__max_features=auto, clf__min_samples_split=2, clf__n_estimators=100; total time=   0.0s
[CV] END clf__max_depth=None, clf__max_features=auto, clf__min_samples_split=2, clf__n_estimators=100; total time=   0.0s
[CV] END clf__max_depth=None, clf__max_features=auto, clf__min_samples_split=2, clf__n_estimators=100; total time=   0.0s
[CV] END clf__max_depth=None, clf__max_features=auto, clf__min_samples_split=2, clf__n_estimators=1000; total time=   1.0s
[CV] END clf__max_depth=None, clf__max_features=auto, clf__min_samples_split=2, clf__n_estimators=1000; total time=   1.0s
[CV] END clf__max_depth=None, clf__max_features=auto, clf__min_samples_split=2, clf

[CV] END clf__max_depth=5, clf__max_features=sqrt, clf__min_samples_split=2, clf__n_estimators=1000; total time=   1.0s
[CV] END clf__max_depth=5, clf__max_features=sqrt, clf__min_samples_split=2, clf__n_estimators=1000; total time=   1.0s
[CV] END clf__max_depth=5, clf__max_features=sqrt, clf__min_samples_split=4, clf__n_estimators=100; total time=   0.0s
[CV] END clf__max_depth=5, clf__max_features=sqrt, clf__min_samples_split=4, clf__n_estimators=100; total time=   0.0s
[CV] END clf__max_depth=5, clf__max_features=sqrt, clf__min_samples_split=4, clf__n_estimators=100; total time=   0.0s
[CV] END clf__max_depth=5, clf__max_features=sqrt, clf__min_samples_split=4, clf__n_estimators=100; total time=   0.0s
[CV] END clf__max_depth=5, clf__max_features=sqrt, clf__min_samples_split=4, clf__n_estimators=100; total time=   0.0s
[CV] END clf__max_depth=5, clf__max_features=sqrt, clf__min_samples_split=4, clf__n_estimators=1000; total time=   0.9s
[CV] END clf__max_depth=5, clf__max_features=

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('cat',
                                                                         Pipeline(steps=[('One_hot',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         ['Sex',
                                                                          'Embarked']),
                                                                        ('age',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer())]),
                                                                         ['Age'])])),
                                       ('clf', RandomForestClassi

In [28]:
gs_clf.score(x_test, y_test)

0.8258426966292135

In [29]:
# getting the values for test data set
test = pd.read_csv('test.csv')

In [30]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [31]:
y_preds = gs_clf.predict(test)

In [32]:
y_preds = pd.DataFrame(y_preds)

In [46]:
PassengerId = test["PassengerId"]
PassengerId

0       892
1       893
2       894
3       895
4       896
       ... 
413    1305
414    1306
415    1307
416    1308
417    1309
Name: PassengerId, Length: 418, dtype: int64

In [52]:
submission = pd.DataFrame({'PassengerId': PassengerId})

submission["Survived"] = y_preds

In [53]:
submission.head()


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [54]:
submission.to_csv('submission2.csv', index = False)