In [1]:
# linear algebra
import numpy as np 

# data processing
import pandas as pd 

# data visualization
import seaborn as sns
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import style

# Algorithms
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB

In [8]:
# Read in train data as pandas dataframe and display first 5 rows
trainData = pd.read_csv('train.csv')
testData = pd.read_csv('test.csv')

In [3]:
trainData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


From the above information, some of the features have missing values

In [4]:
trainData.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
trainData.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


From the table above, we need to one-hot encode some features into numeric ones, so that the machine learning algorithms can process them. Some of the features have widely different ranges, therefore we need to scale the features. We can also spot some more features, that contain missing values (NaN = not a number), that we need to deal with.

The Embarked feature has only 2 missing values, which can easily be filled. It will be much more tricky, to deal with the ‘Age’ feature, which has 177 missing values. The ‘Cabin’ feature needs further investigation, but it looks like we might want to drop it from the dataset, since 77 % of it is missing.

In [7]:
trainData.columns.values

array(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype=object)

## Data Preprocessing

### Missing Data

In [6]:
# Number of columns with missing values
total = trainData.isnull().sum().sort_values(ascending=False)
percent_1 = trainData.isnull().sum()/trainData.isnull().count()*100
percent_2 = (round(percent_1, 1)).sort_values(ascending=False)
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data.head()

Unnamed: 0,Total,%
Cabin,687,77.1
Age,177,19.9
Embarked,2,0.2
Fare,0,0.0
Ticket,0,0.0


#### Carbin: 

In [10]:
# Drop the Carbin feature in both train and test dataset, since it has 77% of its data missing
trainData = trainData.drop(['Cabin'], axis=1)
testData = testData.drop(['Cabin'], axis=1)

#### Age:

In [14]:
# Create an array that contains random numbers, which are computed based on the mean age value in regards to the 
# standard deviation and is_null

data = [trainData, testData]

for dataset in data:
    mean = trainData["Age"].mean()
    std = testData["Age"].std()
    is_null = dataset["Age"].isnull().sum()

# compute random numbers between the mean, std and is_null
    rand_age = np.random.randint(mean - std, mean + std, size = is_null)

# fill NaN values in Age column with random values generated
    age_slice = dataset["Age"].copy()
    age_slice[np.isnan(age_slice)] = rand_age
    dataset["Age"] = age_slice
    dataset["Age"] = trainData["Age"].astype(int)
    
trainData["Age"].isnull().sum()


0

#### Embarked: 

In [15]:
#Since the Embarked feature has only 2 missing values, we will just fill these with the most common one.

trainData['Embarked'].describe()

count     889
unique      3
top         S
freq      644
Name: Embarked, dtype: object

In [16]:
common_value = 'S'
data = [trainData, testData]

for dataset in data:
    dataset['Embarked'] = dataset['Embarked'].fillna(common_value)

### Converting features

In [19]:
trainData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null int32
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Embarked       891 non-null object
dtypes: float64(1), int32(1), int64(5), object(4)
memory usage: 73.2+ KB


#### Fare: 

In [21]:
# We need to convert 'Fare' to integer.
for dataset in data:
    dataset['Fare'] = dataset['Fare'].fillna(0)
    dataset['Fare'] = dataset['Fare'].astype(int)

#### Name:

In [22]:
#We will use the Name feature to extract the Titles from the Name, so that we can build a new feature out of that.

data = [trainData, testData]
titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

for dataset in data:
    # extract titles
    dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
    # replace titles with a more common title or as Rare
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr',\
                                            'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    # convert titles into numbers
    dataset['Title'] = dataset['Title'].map(titles)
    # filling NaN with 0, to get safe
    dataset['Title'] = dataset['Title'].fillna(0)
trainData = trainData.drop(['Name'], axis=1)
testData = testData.drop(['Name'], axis=1)

#### Tickets:

In [24]:
trainData['Ticket'].describe()


count        891
unique       681
top       347082
freq           7
Name: Ticket, dtype: object

In [25]:
#Also drop the Ticket feature since it has 681 unique tickets
trainData = trainData.drop(['Ticket'], axis=1)
testData = testData.drop(['Ticket'], axis=1)

#### Sex:

In [26]:
#Convert Sex feature to numeric

genders = {"male": 0, "female": 1}
data = [trainData, testData]

for dataset in data:
    dataset['Sex'] = dataset['Sex'].map(genders)

#### Embarked:

In [27]:
#Convert Embarked feature to numeric

ports = {"S": 0, "C": 1, "Q": 2}
data = [trainData, testData]

for dataset in data:
    dataset['Embarked'] = dataset['Embarked'].map(ports)

In [30]:
trainData.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Fare', 'Embarked', 'Title'],
      dtype='object')

In [36]:
features = ['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch','Fare', 'Embarked', 'Title']
train_X = trainData[features]
train_y = trainData.Survived

test_X = testData[features]


### Build Random Forest Model

In [38]:
model = RandomForestClassifier(random_state = 1)
model.fit(train_X, train_y)
predictions = model.predict(test_X)



In [40]:
#Put features and prediction in dataframe for submission
Submission = pd.DataFrame({'PassengerId':testData.PassengerId, 'Survived':predictions})

In [43]:
#Put submission in csv file for kaggle submission
Submission.to_csv('DSNT1.csv', index = False)

### Tune Model

In [59]:
model = RandomForestClassifier( n_estimators = 1000, criterion = 'mae', random_state = 1)
my_model.fit(train_X, train_y)
my_predictions = my_model.predict(test_X)



In [63]:
Submission1 = pd.DataFrame({'PassengerId':testData.PassengerId, 'Survived':my_predictions})

Submission1.to_csv('DSNT3.csv', index = False)

### Build Logistic Regression Model

In [62]:
logreg = LogisticRegression()
logreg.fit(train_X, train_y)

Y_pred = logreg.predict(test_X)

acc_log = round(logreg.score(train_X, train_y) * 100, 2)



In [64]:
Submit = pd.DataFrame({'PassengerId':testData.PassengerId, 'Survived':Y_pred})

Submit.to_csv('DSNT4.csv', index = False)

#### Test new Random Forest Parameters

In [66]:
# Random Forest
random_forest = RandomForestClassifier(criterion = "gini", min_samples_leaf = 1, min_samples_split = 10, n_estimators=100, max_features='auto', oob_score=True, random_state=1, n_jobs=-1)

random_forest.fit(train_X, train_y)

Y_prediction = random_forest.predict(test_X)

random_forest.score(train_X, train_y)

print("oob score:", round(random_forest.oob_score_, 4)*100, "%")

oob score: 82.49 %


In [67]:
Submits = pd.DataFrame({'PassengerId':testData.PassengerId, 'Survived':Y_prediction})

Submits.to_csv('DSNT5.csv', index = False)

### Another RF Tuning

In [92]:
m = RandomForestClassifier(criterion = "gini", min_samples_leaf = 3, min_samples_split = 7, n_estimators=1000, max_features='auto', oob_score=True, random_state=0, n_jobs=-1)

m.fit(train_X, train_y)

m_prediction = m.predict(test_X)

m.score(train_X, train_y)

print("oob score:", round(m.oob_score_, 4)*100, "%")

oob score: 82.49 %


In [93]:
m_pred = m.predict(test_X)


In [81]:
Sub = pd.DataFrame({'PassengerId':testData.PassengerId, 'Survived':m_pred})

Sub.to_csv('DSNT6.csv', index = False)