In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [143]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [144]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [145]:
train['Has_Cabin'] = train["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
test['Has_Cabin'] = test["Cabin"].apply(lambda x: 0 if type(x) == float else 1)

In [146]:
train['Embarked'].isnull().sum()

2

In [147]:
test['Embarked'].isnull().sum()

0

In [148]:
train['Cabin'].isnull().sum()

687

In [149]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
Has_Cabin      891 non-null int64
dtypes: float64(2), int64(6), object(5)
memory usage: 90.6+ KB


In [150]:
train['Embarked'] = train['Embarked'].fillna('S')
test['Embarked'] = test['Embarked'].fillna('S')

In [151]:
train['Age'].mean()

29.69911764705882

In [152]:
test['Age'].mean()

30.272590361445783

In [153]:
mean = (train['Age'].mean() + test['Age'].mean()) / 2
mean

29.985854004252303

In [154]:
train['Age'] = train['Age'].fillna(mean)
test['Age'] = test['Age'].fillna(mean)

In [155]:
train = train.drop(['Name', 'PassengerId', 'Ticket', 'Cabin'], axis=1)
test = test.drop(['Name', 'PassengerId', 'Ticket', 'Cabin'], axis=1)

In [156]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Has_Cabin
0,0,3,male,22.0,1,0,7.25,S,0
1,1,1,female,38.0,1,0,71.2833,C,1
2,1,3,female,26.0,0,0,7.925,S,0
3,1,1,female,35.0,1,0,53.1,S,1
4,0,3,male,35.0,0,0,8.05,S,0


In [157]:
train.isnull().sum()

Survived     0
Pclass       0
Sex          0
Age          0
SibSp        0
Parch        0
Fare         0
Embarked     0
Has_Cabin    0
dtype: int64

In [158]:
train['Sex'] = train['Sex'].map( {'female': 0, 'male': 1} )
test['Sex'] = test['Sex'].map( {'female': 0, 'male': 1} )

In [159]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Has_Cabin
0,0,3,1,22.0,1,0,7.25,S,0
1,1,1,0,38.0,1,0,71.2833,C,1
2,1,3,0,26.0,0,0,7.925,S,0
3,1,1,0,35.0,1,0,53.1,S,1
4,0,3,1,35.0,0,0,8.05,S,0


In [160]:
train['Embarked'] = train['Embarked'].map( {'S': 0, 'C': 1, 'Q': 1} )
test['Embarked'] = test['Embarked'].map( {'S': 0, 'C': 1, 'Q': 1} )

In [162]:
tree = DecisionTreeClassifier()

In [163]:
y_train = train['Survived']

In [164]:
x_train = train.copy()

In [165]:
x_train = x_train.drop(['Survived'], axis=1)

In [166]:
x_train.values

array([[  3.      ,   1.      ,  22.      , ...,   7.25    ,   0.      ,
          0.      ],
       [  1.      ,   0.      ,  38.      , ...,  71.2833  ,   1.      ,
          1.      ],
       [  3.      ,   0.      ,  26.      , ...,   7.925   ,   0.      ,
          0.      ],
       ..., 
       [  3.      ,   0.      ,  29.985854, ...,  23.45    ,   0.      ,
          0.      ],
       [  1.      ,   1.      ,  26.      , ...,  30.      ,   1.      ,
          1.      ],
       [  3.      ,   1.      ,  32.      , ...,   7.75    ,   1.      ,
          0.      ]])

In [167]:
x_train = x_train.values

In [168]:
x_train

array([[  3.      ,   1.      ,  22.      , ...,   7.25    ,   0.      ,
          0.      ],
       [  1.      ,   0.      ,  38.      , ...,  71.2833  ,   1.      ,
          1.      ],
       [  3.      ,   0.      ,  26.      , ...,   7.925   ,   0.      ,
          0.      ],
       ..., 
       [  3.      ,   0.      ,  29.985854, ...,  23.45    ,   0.      ,
          0.      ],
       [  1.      ,   1.      ,  26.      , ...,  30.      ,   1.      ,
          1.      ],
       [  3.      ,   1.      ,  32.      , ...,   7.75    ,   1.      ,
          0.      ]])

In [169]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
Survived     891 non-null int64
Pclass       891 non-null int64
Sex          891 non-null int64
Age          891 non-null float64
SibSp        891 non-null int64
Parch        891 non-null int64
Fare         891 non-null float64
Embarked     891 non-null int64
Has_Cabin    891 non-null int64
dtypes: float64(2), int64(7)
memory usage: 62.7 KB


In [170]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Has_Cabin
0,0,3,1,22.0,1,0,7.25,0,0
1,1,1,0,38.0,1,0,71.2833,1,1
2,1,3,0,26.0,0,0,7.925,0,0
3,1,1,0,35.0,1,0,53.1,0,1
4,0,3,1,35.0,0,0,8.05,0,0


In [171]:
train.isnull().sum()

Survived     0
Pclass       0
Sex          0
Age          0
SibSp        0
Parch        0
Fare         0
Embarked     0
Has_Cabin    0
dtype: int64

In [172]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Has_Cabin
0,0,3,1,22.0,1,0,7.25,0,0
1,1,1,0,38.0,1,0,71.2833,1,1
2,1,3,0,26.0,0,0,7.925,0,0
3,1,1,0,35.0,1,0,53.1,0,1
4,0,3,1,35.0,0,0,8.05,0,0


In [173]:
tree.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [174]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Has_Cabin
0,0,3,1,22.0,1,0,7.25,0,0
1,1,1,0,38.0,1,0,71.2833,1,1
2,1,3,0,26.0,0,0,7.925,0,0
3,1,1,0,35.0,1,0,53.1,0,1
4,0,3,1,35.0,0,0,8.05,0,0


In [82]:
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Has_Cabin
0,3,1,34.5,0,0,7.8292,1,0
1,3,0,47.0,1,0,7.0,0,0
2,2,1,62.0,0,0,9.6875,1,0
3,3,1,27.0,0,0,8.6625,0,0
4,3,0,22.0,1,1,12.2875,0,0


In [179]:
x_test = test

In [182]:
x_test.isnull().sum()

Pclass       0
Sex          0
Age          0
SibSp        0
Parch        0
Fare         1
Embarked     0
Has_Cabin    0
dtype: int64

In [177]:
predictions = tree.predict(x_test)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [86]:
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Has_Cabin
0,3,1,34.5,0,0,7.8292,1,0
1,3,0,47.0,1,0,7.0,0,0
2,2,1,62.0,0,0,9.6875,1,0
3,3,1,27.0,0,0,8.6625,0,0
4,3,0,22.0,1,1,12.2875,0,0


In [87]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Has_Cabin
0,0,3,1,22.0,1,0,7.25,0,0
1,1,1,0,38.0,1,0,71.2833,1,1
2,1,3,0,26.0,0,0,7.925,0,0
3,1,1,0,35.0,1,0,53.1,0,1
4,0,3,1,35.0,0,0,8.05,0,0


In [88]:
x_test = test.copy()

In [89]:
x_test.drop('Survived', axis=1)

ValueError: labels ['Survived'] not contained in axis

In [90]:
x_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Has_Cabin
0,3,1,34.5,0,0,7.8292,1,0
1,3,0,47.0,1,0,7.0,0,0
2,2,1,62.0,0,0,9.6875,1,0
3,3,1,27.0,0,0,8.6625,0,0
4,3,0,22.0,1,1,12.2875,0,0


In [91]:
x_test = x_test.drop('Survived', axis=1)

ValueError: labels ['Survived'] not contained in axis

In [92]:
x_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Has_Cabin
0,3,1,34.50000,0,0,7.8292,1,0
1,3,0,47.00000,1,0,7.0000,0,0
2,2,1,62.00000,0,0,9.6875,1,0
3,3,1,27.00000,0,0,8.6625,0,0
4,3,0,22.00000,1,1,12.2875,0,0
5,3,1,14.00000,0,0,9.2250,0,0
6,3,0,30.00000,0,0,7.6292,1,0
7,2,1,26.00000,1,1,29.0000,0,0
8,3,0,18.00000,0,0,7.2292,1,0
9,3,1,21.00000,2,0,24.1500,0,0


In [93]:
predictions = tree.predict(x_test.values)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [94]:
score = round(tree.score(x_train, y_train) * 100, 2)
score

98.650000000000006

In [95]:
score = round(tree.score(x_test, y_test) * 100, 2)
score

NameError: name 'y_test' is not defined

In [96]:
predictions

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1,

In [97]:
submission = pd.DataFrame({
        "PassengerId": PassengerId,
        "Survived": predictions
    })

In [98]:
test = pd.read_csv('data/train.csv')

In [99]:
PassengerId = test['PassengerId']

In [100]:
submission = pd.DataFrame({
        "PassengerId": PassengerId,
        "Survived": predictions
    })

In [101]:
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 2 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
dtypes: int64(2)
memory usage: 14.0 KB


In [102]:
# WAY TOO HIGH A SCORE, SCORE IS 98.650000000000006%

#POSSIBLE ISSUES:

# I DIDN'T RUN THE TEST PROPERLY ?

# OR

# I OVERFITTED THE DECISION TREES ?

In [103]:
submission.to_csv('submission.csv', index=False)

In [104]:
submission.head()

Unnamed: 0,PassengerId,Survived
0,1,0
1,2,1
2,3,1
3,4,1
4,5,0


In [105]:
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 2 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
dtypes: int64(2)
memory usage: 14.0 KB


In [106]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
