In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv("train.csv")
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
def  impute_age(cols):
    Age = cols[0]
    Pclass = cols[1]
    if pd.isnull(Age):
        if Pclass == 1:
            return 37
        elif Pclass == 2:
            return 29
        elif Pclass == 3:
            return 24
    else:
        return Age

In [6]:
data['Age'] = data[['Age', 'Pclass']].apply(impute_age, axis=1)
data['Embarked'].fillna('S', inplace=True)
data.drop(columns=['Cabin', 'Ticket'], inplace=True)
data.isnull().sum()

  Age = cols[0]
  Pclass = cols[1]
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Embarked'].fillna('S', inplace=True)


PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [7]:
dummy_sex = pd.get_dummies(data['Sex'], drop_first=True, dtype=int)
dummy_sex.head()

Unnamed: 0,male
0,1
1,0
2,0
3,0
4,1


In [8]:
dummy_embarked = pd.get_dummies(data['Embarked'], drop_first=True, dtype=int, prefix='Embarked', prefix_sep='_')
dummy_embarked.head()

Unnamed: 0,Embarked_Q,Embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


In [9]:
data = pd.concat([data, dummy_sex, dummy_embarked], axis=1)
data.drop(columns=['Sex', 'Embarked'], inplace=True)
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Fare,male,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,7.25,1,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,71.2833,0,0,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,7.925,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,53.1,0,0,1
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,8.05,1,0,1


In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
 8   male         891 non-null    int32  
 9   Embarked_Q   891 non-null    int32  
 10  Embarked_S   891 non-null    int32  
dtypes: float64(2), int32(3), int64(5), object(1)
memory usage: 66.3+ KB


In [11]:
Names = data['Name']
data['Name'][1]

'Cumings, Mrs. John Bradley (Florence Briggs Thayer)'

In [12]:
def  modify_names(Name):
    Name = Name.split(',')
    Surname = Name[0].strip()
    return Surname

data['Name'] = data['Name'].apply(modify_names)

In [13]:
data['Name']

0         Braund
1        Cumings
2      Heikkinen
3       Futrelle
4          Allen
         ...    
886     Montvila
887       Graham
888     Johnston
889         Behr
890       Dooley
Name: Name, Length: 891, dtype: object

In [14]:
dummy_names = pd.get_dummies(data['Name'], dtype=int, prefix='Surname', prefix_sep='_')
dummy_names.head()

Unnamed: 0,Surname_Abbing,Surname_Abbott,Surname_Abelson,Surname_Adahl,Surname_Adams,Surname_Ahlin,Surname_Aks,Surname_Albimona,Surname_Alexander,Surname_Alhomaki,...,Surname_Yousseff,Surname_Yrois,Surname_Zabour,Surname_Zimmerman,Surname_de Messemaeker,Surname_de Mulder,Surname_de Pelsmaeker,Surname_del Carlo,Surname_van Billiard,Surname_van Melkebeke
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
data = pd.concat([data, dummy_names], axis=1)
data.drop('Name', axis=1, inplace=True)
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,male,Embarked_Q,Embarked_S,...,Surname_Yousseff,Surname_Yrois,Surname_Zabour,Surname_Zimmerman,Surname_de Messemaeker,Surname_de Mulder,Surname_de Pelsmaeker,Surname_del Carlo,Surname_van Billiard,Surname_van Melkebeke
0,1,0,3,22.0,1,0,7.25,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1,2,1,1,38.0,1,0,71.2833,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1,3,26.0,0,0,7.925,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,4,1,1,35.0,1,0,53.1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,0,3,35.0,0,0,8.05,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [16]:
X = data.drop(columns=['PassengerId', 'Survived'])
y = data['Survived']
all_features = X.columns
X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,male,Embarked_Q,Embarked_S,Surname_Abbing,Surname_Abbott,...,Surname_Yousseff,Surname_Yrois,Surname_Zabour,Surname_Zimmerman,Surname_de Messemaeker,Surname_de Mulder,Surname_de Pelsmaeker,Surname_del Carlo,Surname_van Billiard,Surname_van Melkebeke
0,3,22.0,1,0,7.2500,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,38.0,1,0,71.2833,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,26.0,0,0,7.9250,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,35.0,1,0,53.1000,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3,35.0,0,0,8.0500,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,27.0,0,0,13.0000,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
887,1,19.0,0,0,30.0000,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
888,3,24.0,1,2,23.4500,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
889,1,26.0,0,0,30.0000,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=104)

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

from sklearn.metrics import accuracy_score

accuracy_score(y_pred, y_test)

0.8435754189944135

In [18]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Columns: 675 entries, Pclass to Surname_van Melkebeke
dtypes: float64(2), int32(670), int64(3)
memory usage: 2.3 MB


In [19]:
model.fit(X,y)

# Test

In [20]:
data = pd.read_csv("test.csv")
data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [21]:
data.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [22]:
def  impute_age(cols):
    Age = cols[0]
    Pclass = cols[1]
    if pd.isnull(Age):
        if Pclass == 1:
            return 37
        elif Pclass == 2:
            return 29
        elif Pclass == 3:
            return 24
    else:
        return Age

In [23]:
data['Age'] = data[['Age', 'Pclass']].apply(impute_age, axis=1)
data['Embarked'].fillna('S', inplace=True)
data.drop(columns=['Cabin', 'Ticket'], inplace=True)
data['Fare'].fillna(data['Fare'].mean(), inplace=True)
data.isnull().sum()

  Age = cols[0]
  Pclass = cols[1]
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Embarked'].fillna('S', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Fare'].fillna(data['Fare'].mean(), inplace=True)


PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [24]:
dummy_sex = pd.get_dummies(data['Sex'], drop_first=True, dtype=int)
dummy_embarked = pd.get_dummies(data['Embarked'], drop_first=True, dtype=int, prefix='Embarked', prefix_sep='_')
data = pd.concat([data, dummy_sex, dummy_embarked], axis=1)
data.drop(columns=['Sex', 'Embarked'], inplace=True)
data.head()

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Fare,male,Embarked_Q,Embarked_S
0,892,3,"Kelly, Mr. James",34.5,0,0,7.8292,1,1,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",47.0,1,0,7.0,0,0,1
2,894,2,"Myles, Mr. Thomas Francis",62.0,0,0,9.6875,1,1,0
3,895,3,"Wirz, Mr. Albert",27.0,0,0,8.6625,1,0,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",22.0,1,1,12.2875,0,0,1


In [25]:
def  modify_names(Name):
    Name = Name.split(',')
    Surname = Name[0].strip()
    return Surname

data['Name'] = data['Name'].apply(modify_names)
data['Name']

0              Kelly
1             Wilkes
2              Myles
3               Wirz
4           Hirvonen
           ...      
413          Spector
414    Oliva y Ocana
415          Saether
416             Ware
417            Peter
Name: Name, Length: 418, dtype: object

In [26]:
dummy_names = pd.get_dummies(data['Name'], dtype=int, prefix='Surname', prefix_sep='_')
dummy_names.head()

Unnamed: 0,Surname_Abbott,Surname_Abelseth,Surname_Abrahamsson,Surname_Abrahim,Surname_Aks,Surname_Aldworth,Surname_Allison,Surname_Andersen,Surname_Andersson,Surname_Andrew,...,Surname_Williams,Surname_Wilson,Surname_Wirz,Surname_Wittevrongel,Surname_Wright,Surname_Zakarian,Surname_de Brito,Surname_de Messemaeker,Surname_del Carlo,Surname_van Billiard
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
data = pd.concat([data, dummy_names], axis=1)
data.drop('Name', axis=1, inplace=True)
data.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,male,Embarked_Q,Embarked_S,Surname_Abbott,...,Surname_Williams,Surname_Wilson,Surname_Wirz,Surname_Wittevrongel,Surname_Wright,Surname_Zakarian,Surname_de Brito,Surname_de Messemaeker,Surname_del Carlo,Surname_van Billiard
0,892,3,34.5,0,0,7.8292,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,893,3,47.0,1,0,7.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,894,2,62.0,0,0,9.6875,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,895,3,27.0,0,0,8.6625,1,0,1,0,...,0,0,1,0,0,0,0,0,0,0
4,896,3,22.0,1,1,12.2875,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
X = data.drop(columns=['PassengerId'])

In [29]:
for feature in all_features:
    if feature not in X.columns:
        X[feature]=0


  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[feature]=0
  X[featur

In [30]:
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,male,Embarked_Q,Embarked_S,Surname_Abbott,Surname_Abelseth,...,Surname_Young,Surname_Youseff,Surname_Yousif,Surname_Yousseff,Surname_Yrois,Surname_Zabour,Surname_Zimmerman,Surname_de Mulder,Surname_de Pelsmaeker,Surname_van Melkebeke
0,3,34.5,0,0,7.8292,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,47.0,1,0,7.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,62.0,0,0,9.6875,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,27.0,0,0,8.6625,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3,22.0,1,1,12.2875,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
for feature in X.columns:
    if feature not in all_features:
        X.drop(feature, axis=1, inplace=True)

In [32]:
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,male,Embarked_Q,Embarked_S,Surname_Abbott,Surname_Aks,...,Surname_Young,Surname_Youseff,Surname_Yousif,Surname_Yousseff,Surname_Yrois,Surname_Zabour,Surname_Zimmerman,Surname_de Mulder,Surname_de Pelsmaeker,Surname_van Melkebeke
0,3,34.5,0,0,7.8292,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,47.0,1,0,7.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,62.0,0,0,9.6875,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,27.0,0,0,8.6625,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3,22.0,1,1,12.2875,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
all_features

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'male', 'Embarked_Q',
       'Embarked_S', 'Surname_Abbing', 'Surname_Abbott',
       ...
       'Surname_Yousseff', 'Surname_Yrois', 'Surname_Zabour',
       'Surname_Zimmerman', 'Surname_de Messemaeker', 'Surname_de Mulder',
       'Surname_de Pelsmaeker', 'Surname_del Carlo', 'Surname_van Billiard',
       'Surname_van Melkebeke'],
      dtype='object', length=675)

In [34]:
X.columns

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'male', 'Embarked_Q',
       'Embarked_S', 'Surname_Abbott', 'Surname_Aks',
       ...
       'Surname_Young', 'Surname_Youseff', 'Surname_Yousif',
       'Surname_Yousseff', 'Surname_Yrois', 'Surname_Zabour',
       'Surname_Zimmerman', 'Surname_de Mulder', 'Surname_de Pelsmaeker',
       'Surname_van Melkebeke'],
      dtype='object', length=675)

In [35]:
List = []
for feature in all_features:
    index = X.columns.get_loc(feature)
    List.append(index)
List

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 152,
 8,
 153,
 154,
 155,
 156,
 9,
 157,
 158,
 159,
 160,
 161,
 10,
 162,
 163,
 164,
 11,
 165,
 12,
 166,
 13,
 167,
 168,
 169,
 170,
 14,
 15,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185,
 186,
 187,
 188,
 189,
 190,
 191,
 192,
 16,
 193,
 194,
 195,
 196,
 197,
 198,
 17,
 199,
 200,
 201,
 202,
 203,
 204,
 205,
 206,
 18,
 207,
 19,
 208,
 20,
 209,
 210,
 21,
 211,
 212,
 213,
 22,
 23,
 214,
 24,
 215,
 216,
 217,
 218,
 219,
 25,
 220,
 221,
 26,
 222,
 223,
 224,
 27,
 225,
 28,
 226,
 29,
 30,
 31,
 32,
 227,
 33,
 228,
 34,
 229,
 35,
 230,
 231,
 232,
 233,
 234,
 36,
 37,
 38,
 235,
 236,
 237,
 238,
 239,
 240,
 241,
 242,
 243,
 39,
 244,
 40,
 245,
 41,
 42,
 246,
 43,
 247,
 248,
 44,
 45,
 249,
 46,
 250,
 251,
 252,
 253,
 47,
 48,
 254,
 255,
 256,
 49,
 50,
 257,
 51,
 52,
 258,
 53,
 259,
 260,
 261,
 54,
 262,
 263,
 264,
 265,
 55,
 266,
 267,
 268,
 56,
 269,
 270,
 57,
 271,


In [36]:
X = X.iloc[:, List]
X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,male,Embarked_Q,Embarked_S,Surname_Abbing,Surname_Abbott,...,Surname_Yousseff,Surname_Yrois,Surname_Zabour,Surname_Zimmerman,Surname_de Messemaeker,Surname_de Mulder,Surname_de Pelsmaeker,Surname_del Carlo,Surname_van Billiard,Surname_van Melkebeke
0,3,34.5,0,0,7.8292,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,47.0,1,0,7.0000,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,62.0,0,0,9.6875,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,27.0,0,0,8.6625,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3,22.0,1,1,12.2875,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,24.0,0,0,8.0500,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
414,1,39.0,0,0,108.9000,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
415,3,38.5,0,0,7.2500,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
416,3,24.0,0,0,8.0500,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
y = model.predict(X)

In [38]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Columns: 675 entries, Pclass to Surname_van Melkebeke
dtypes: float64(2), int32(147), int64(526)
memory usage: 1.9 MB


In [39]:
data['Survived'] = y

In [40]:
submission_data = data[['PassengerId', 'Survived']]
submission_data

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [41]:
submission_data.to_csv('submission_random_forest.csv', index=False)