In [6]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.impute import SimpleImputer

In [7]:
# Load the competition files from the working directory.
train_set = pd.read_csv('train.csv')                          # labelled rows
test_set = pd.read_csv('test.csv')                            # rows to predict
gender_submission = pd.read_csv('gender_submission.csv')      # sample submission

In [8]:
# Disabled alternative: drop the rows whose 'Embarked' is missing. It stays
# commented out, so all rows are kept and the categorical pipeline defined
# further down fills object columns with their most frequent value.
#train_set = train_set.dropna(subset=['Embarked'])
# (rows, columns) of the raw training frame.
train_set.shape

(891, 12)

In [9]:
# Target first, then the feature frames.
Y_train_all = train_set['Survived']

# Identifier / free-text columns are removed from both frames; the training
# frame additionally loses the target so it cannot leak into the features.
X_train_all = train_set.drop(
    columns=['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'])
X_test = test_set.drop(
    columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

In [10]:
# Preview the first ten rows of the training features.
X_train_all.head(10)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S
5,3,male,,0,0,8.4583,Q
6,1,male,54.0,0,0,51.8625,S
7,3,male,2.0,3,1,21.075,S
8,3,female,27.0,0,2,11.1333,S
9,2,female,14.0,1,0,30.0708,C


In [11]:
# Columns stored with the generic 'object' dtype are treated as categorical.
object_mask = X_train_all.dtypes == 'object'
object_cols = [column for column, is_object in object_mask.items() if is_object]
object_cols

['Sex', 'Embarked']

In [12]:
# Every column that is not 'object' dtype is treated as numerical.
numeric_mask = X_train_all.dtypes != 'object'
numerical_cols = [column for column, is_numeric in numeric_mask.items() if is_numeric]
numerical_cols

['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [53]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Exploratory: print the SimpleImputer docstring to review the available
# imputation strategies before building the preprocessors.
help(SimpleImputer)

Help on class SimpleImputer in module sklearn.impute._base:

class SimpleImputer(_BaseImputer)
 |  SimpleImputer(missing_values=nan, strategy='mean', fill_value=None, verbose=0, copy=True, add_indicator=False)
 |  
 |  Imputation transformer for completing missing values.
 |  
 |  Read more in the :ref:`User Guide <impute>`.
 |  
 |  Parameters
 |  ----------
 |  missing_values : number, string, np.nan (default) or None
 |      The placeholder for the missing values. All occurrences of
 |      `missing_values` will be imputed.
 |  
 |  strategy : string, default='mean'
 |      The imputation strategy.
 |  
 |      - If "mean", then replace missing values using the mean along
 |        each column. Can only be used with numeric data.
 |      - If "median", then replace missing values using the median along
 |        each column. Can only be used with numeric data.
 |      - If "most_frequent", then replace missing using the most frequent
 |        value along each column. Can be used wi

In [85]:
# Categorical branch shared by every preprocessor: fill missing values with
# the most frequent category, then one-hot encode (categories unseen during
# fit are ignored at transform time instead of raising).
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Candidate imputers for the numerical columns.
# NOTE: strategy='constant' without fill_value fills numeric data with 0
# (scikit-learn's documented default).
numerical_transformer = SimpleImputer(strategy='constant')
numerical_transformer1 = SimpleImputer(strategy='median')
numerical_transformer2 = SimpleImputer(strategy='most_frequent')


def _make_preprocessor(numeric_imputer):
    """Return a ColumnTransformer pairing `numeric_imputer` with the shared categorical branch.

    `numeric_imputer` is applied to `numerical_cols`; `categorical_transformer`
    is applied to `object_cols`.
    """
    return ColumnTransformer(
        transformers=[
            ('num', numeric_imputer, numerical_cols),
            ('cat', categorical_transformer, object_cols),
        ])


preprocessor = _make_preprocessor(numerical_transformer)     # constant fill
preprocessor1 = _make_preprocessor(numerical_transformer1)   # median
preprocessor2 = _make_preprocessor(numerical_transformer2)   # most frequent

In [86]:
from sklearn.ensemble import RandomForestRegressor

# Two regression forests that differ only in tree depth. Their continuous
# outputs are thresholded at 0.5 further down to obtain 0/1 labels.
model = RandomForestRegressor(
    n_estimators=500,
    max_depth=9,
    n_jobs=-1,
    random_state=1,
)
model1 = RandomForestRegressor(
    n_estimators=500,
    max_depth=10,
    n_jobs=-1,
    random_state=1,
)

In [87]:
# Median-imputing preprocessor followed by the depth-9 regression forest.
regression_steps = [
    ('preprocessor', preprocessor1),
    ('model', model),
]
my_pipeline = Pipeline(steps=regression_steps)

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Hold out 20% of the labelled rows for validation; fixed seed for repeatability.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_all,
    Y_train_all,
    test_size=0.2,
    random_state=0,
)

In [88]:
# Fit preprocessing and model on the training split only; X_val stays unseen
# so it can be used for evaluation below.
my_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='median',
                                                                verbose=0),
                                                  ['Pclass', 'Age', 'SibSp',
                                                   'Parch', 'Fare']),
                                                 ('cat',
                

In [89]:
# Continuous regressor outputs -> hard 0/1 labels (1 when the score exceeds 0.5).
val = (my_pipeline.predict(X_val) > 0.5).astype(int)

In [22]:
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error,accuracy_score

In [90]:
# Fraction of validation rows whose thresholded prediction equals the true label.
accuracy_score(y_val, val)

0.8324022346368715

In [91]:
# Refit the same pipeline on every labelled row (training + validation split)
# before predicting the test set.
my_pipeline.fit(X_train_all, Y_train_all)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='median',
                                                                verbose=0),
                                                  ['Pclass', 'Age', 'SibSp',
                                                   'Parch', 'Fare']),
                                                 ('cat',
                

In [92]:
# Raw (continuous) regressor outputs for the test features.
preds = my_pipeline.predict(X_test)

In [93]:
# Convert scores to 0/1 labels: 1 when the score is strictly above 0.5.
preds = (preds > 0.5).astype(int)

In [94]:
# Display the test-set predictions.
preds

array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [95]:
# Build the submission directly from the test frame so every prediction is
# paired with the PassengerId of the row it was computed from. The previous
# version re-read 'gender_submission.csv' (already loaded earlier) and relied
# on that sample file having the same row order as test.csv.
ss1 = pd.DataFrame({
    'PassengerId': test_set['PassengerId'],
    'Survived': preds,
})
ss1.to_csv('submission127.csv', index=False)

In [46]:
from sklearn.ensemble import RandomForestClassifier
# Exploratory: print the RandomForestClassifier docstring to review its parameters.
help(RandomForestClassifier)

Help on class RandomForestClassifier in module sklearn.ensemble._forest:

class RandomForestClassifier(ForestClassifier)
 |  RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)
 |  
 |  A random forest classifier.
 |  
 |  A random forest is a meta estimator that fits a number of decision tree
 |  classifiers on various sub-samples of the dataset and uses averaging to
 |  improve the predictive accuracy and control over-fitting.
 |  The sub-sample size is always the same as the original
 |  input sample size but the samples are drawn with replacement if
 |  `bootstrap=True` (default).
 |  
 |  Read more in the :ref:`User Guide <forest>`.
 |  
 |  Parameters


In [55]:
# Classification forests. model2 and model4 are configured identically
# (default split criterion); model3 switches the criterion to entropy.
model2 = RandomForestClassifier(
    n_estimators=500,
    max_depth=9,
    n_jobs=-1,
    random_state=1,
)
model3 = RandomForestClassifier(
    n_estimators=500,
    criterion="entropy",
    max_depth=9,
    n_jobs=-1,
    random_state=1,
)
model4 = RandomForestClassifier(
    n_estimators=500,
    max_depth=9,
    n_jobs=-1,
    random_state=1,
)

In [65]:
# Classifier pipeline.
# Uses `preprocessor2` (most-frequent imputation of the numerical columns):
# that is the configuration shown in this pipeline's recorded fit output.
# `preprocessor` is now defined with strategy='constant', so referencing it
# here would make a fresh top-to-bottom run diverge from the recorded results.
pipeline = Pipeline(steps=[('preprocessor', preprocessor2),
                           ('model', model2)
                          ])

In [66]:
# Fit the classifier pipeline on the training split only.
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='most_frequent',
                                                                verbose=0),
                                                  ['Pclass', 'Age', 'SibSp',
                                                   'Parch', 'Fare']),
                                                 ('cat',
         

In [67]:
# Predicted labels for the held-out validation rows.
y_pred=pipeline.predict(X_val)

In [36]:
from sklearn.metrics import classification_report, confusion_matrix

In [68]:
# Confusion matrix on the validation split: rows are true labels, columns are
# predicted labels (scikit-learn convention).
confusion_matrix(y_val,y_pred)

array([[103,   7],
       [ 17,  52]], dtype=int64)

In [69]:
# classification_report returns a pre-formatted multi-line string; print it so
# the table renders with real line breaks instead of a repr full of '\n'.
print(classification_report(y_val, y_pred))

'              precision    recall  f1-score   support\n\n           0       0.86      0.94      0.90       110\n           1       0.88      0.75      0.81        69\n\n    accuracy                           0.87       179\n   macro avg       0.87      0.84      0.85       179\nweighted avg       0.87      0.87      0.86       179\n'

In [70]:
# Refit the classifier pipeline on every labelled row before predicting the test set.
pipeline.fit(X_train_all, Y_train_all)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='most_frequent',
                                                                verbose=0),
                                                  ['Pclass', 'Age', 'SibSp',
                                                   'Parch', 'Fare']),
                                                 ('cat',
         

In [71]:
# Predicted labels for the test features (the classifier outputs labels
# directly, so no thresholding step is applied here).
Y_pred=pipeline.predict(X_test)

In [72]:
# Build the submission directly from the test frame so every prediction is
# paired with the PassengerId of the row it was computed from. The previous
# version re-read 'gender_submission.csv' (already loaded earlier) and relied
# on that sample file having the same row order as test.csv.
ss2 = pd.DataFrame({
    'PassengerId': test_set['PassengerId'],
    'Survived': Y_pred,
})
ss2.to_csv('submission15.csv', index=False)