# Kaggle Tutorial for Machine Learning



Necessary modules to run this notebook:
* Numpy
* Scikit-Learn
* Pandas

In [None]:
#Import the Numpy library
import numpy as np
#Import 'tree' from scikit-learn library
from sklearn import tree
# Import the Pandas library
import pandas as pd

Import the data from the web. We'll be using the Kaggle train and test datasets hosted by DataCamp.

In [None]:
# Remote CSV locations of the train and test datasets
train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"

# Download each CSV into its own DataFrame: `train` first, then `test`
train, test = (pd.read_csv(csv_url) for csv_url in (train_url, test_url))

In [None]:
# Show the first rows of the train DataFrame as the cell output.
# (Only `train` is displayed here; call `test.head()` in its own cell to inspect the test DataFrame.)
train.head()

Here is some useful information about our train and test datasets.

In [None]:
# Report the (rows, columns) shape of each DataFrame.
# print is called as a function, like every other print in this notebook,
# so the cell also runs on Python 3 (the statement form is a SyntaxError there).
print("Train data set shape: ", train.shape)
print("Test data set shape: ", test.shape)

In [None]:
# Summary statistics (count, mean, spread, quartiles) for the train DataFrame
train.describe()

In [None]:
# Same summary statistics for the test DataFrame
test.describe()

How many passengers survived in the train dataset?

In [None]:
# Absolute number of rows for each value of the Survived column
train["Survived"].value_counts()

In [None]:
# Same breakdown as percentages: normalized fractions multiplied by 100
train["Survived"].value_counts(normalize = True)*100

In [None]:
# Survived values restricted to the rows where Sex is 'male'
male_survived = train.loc[train["Sex"] == 'male', "Survived"]

# Absolute counts first, then the same counts as fractions
print(male_survived.value_counts())
print(male_survived.value_counts(normalize=True))

In [None]:
# Survived values restricted to the rows where Sex is 'female'
female_survived = train.loc[train["Sex"] == 'female', "Survived"]

# Absolute counts first, then the same counts as fractions
print(female_survived.value_counts())
print(female_survived.value_counts(normalize=True))

### Some data treatment

In [None]:
# Add a Child column filled with NaN; the next cell assigns the real flags
train["Child"] = np.nan

In [None]:
# Flag passengers under 18 with 1 and those 18 or older with 0, then show the first rows.
# Rows with a missing Age match neither mask and are left untouched.
#
# .loc writes directly into `train`. The chained form train["Child"][mask] = ...
# assigns through an intermediate object, which raises SettingWithCopyWarning and
# is not guaranteed to modify `train` at all.
train.loc[train["Age"] < 18, "Child"] = 1
train.loc[train["Age"] >= 18, "Child"] = 0
train.head()

In [None]:
# Split the Survived column by the Child flag (1 = under 18, 0 = 18 or older)
survived_minors = train["Survived"][train["Child"] == 1]
survived_adults = train["Survived"][train["Child"] == 0]

# Normalized survival rates for passengers under 18
print("Survival Rate for under 18:")
print(survived_minors.value_counts(normalize=True))

# Normalized survival rates for passengers 18 or older
print("\nSurvival Rate for 18 or older:")
print(survived_adults.value_counts(normalize=True))

### Let's start some Machine Learning

In [None]:
# test_one: an independent copy of test, so the baseline below leaves `test` untouched
test_one = test.copy()

# Start from the prediction "nobody survived": Survived = 0 on every row
test_one["Survived"] = 0

In [None]:
# First rows of test_one, including the newly added Survived column
test_one.head()

In [None]:
# Gender baseline: predict Survived = 1 for every row whose Sex is "female",
# then show Sex next to the resulting Survived value for the first rows.
#
# .loc writes directly into `test_one`; the chained form test_one["Survived"][mask] = 1
# raises SettingWithCopyWarning and is not guaranteed to modify the DataFrame.
test_one.loc[test_one["Sex"] == "female", "Survived"] = 1
test_one[["Sex", "Survived"]].head()

Now let's build our Decision Tree Model.

But first, we need to work a little more on the data...

In [None]:
# Convert the male and female groups to integer form: male -> 0, female -> 1.
# The whole column is replaced in one step instead of using chained assignment
# (train["Sex"][mask] = ...), which raises SettingWithCopyWarning and is not
# guaranteed to modify `train`. It also avoids writing integers one group at a
# time into a text column, which stricter string dtypes may reject.
# Values that are not keys of the mapping pass through unchanged, so re-running
# this cell on an already-encoded column is a no-op.
sex_codes = {"male": 0, "female": 1}
train["Sex"] = train["Sex"].map(lambda value: sex_codes.get(value, value))

# Impute the Embarked variable: missing values become "S"
train["Embarked"] = train["Embarked"].fillna("S")

In [None]:
# Convert the Embarked classes to integer form: S -> 0, C -> 1, Q -> 2.
# The column is replaced in one step instead of using chained assignment
# (train["Embarked"][mask] = ...), which raises SettingWithCopyWarning and is not
# guaranteed to modify `train`.
# Values that are not keys of the mapping pass through unchanged, so re-running
# this cell on an already-encoded column is a no-op.
embarked_codes = {"S": 0, "C": 1, "Q": 2}
train["Embarked"] = train["Embarked"].map(lambda value: embarked_codes.get(value, value))

In [None]:
# Inspect the first rows after the Sex and Embarked columns were converted to integers
train.head()

In [None]:
# Keep only the rows that are complete in the columns the models below actually read
# (the Survived target plus every feature used by features_one/two/three).
# A blanket dropna() also throws away rows whose only NaN sits in a column no model
# uses, which needlessly shrinks the training set.
# NOTE(review): presumably this matters most for sparsely filled columns such as Cabin;
# confirm with train.isna().sum().
# The result is re-bound rather than modified in place, so the cell is explicit about
# what `train` becomes.
model_columns = ["Survived", "Pclass", "Sex", "Age", "Fare", "SibSp", "Parch", "Embarked"]
train = train.dropna(axis=0, subset=model_columns)

In [None]:
# Target vector: the Survived labels as a numpy array
target = train["Survived"].values

# First feature matrix: passenger class, sex, age and fare, in that column order
feature_names_one = ["Pclass", "Sex", "Age", "Fare"]
features_one = train[feature_names_one].values

Rows with NaN values were dropped from `train` above, before the feature arrays were built.

In [None]:
# Fit your first decision tree: my_tree_one
# random_state is pinned (same value as my_tree_two) so that Restart & Run All
# reproduces the same tree, importances and score every time.
my_tree_one = tree.DecisionTreeClassifier(random_state = 1)
my_tree_one = my_tree_one.fit(features_one, target)

The `feature_importances_` attribute makes it simple to interpret the significance of the predictors you include.

In [None]:
# Importance of each feature in the fitted tree, in the column order of
# features_one: Pclass, Sex, Age, Fare (the score is printed in the next cell)
print(my_tree_one.feature_importances_)

In [None]:
# Score of my_tree_one on the same data it was fitted on (not a held-out set),
# so this number says nothing about how well the tree generalizes
print(my_tree_one.score(features_one, target))

Make some predictions

In [None]:
# Impute missing Fare values with the median fare.
# fillna covers every missing Fare instead of one hard-coded row label, avoids the
# chained assignment test.Fare[152] = ..., and uses the median as this comment always
# stated (the previous code used the mean).
test["Fare"] = test["Fare"].fillna(test["Fare"].median())

# Impute missing Age values with the median age of the training rows, so that every
# test row can be predicted and the submission keeps one row per passenger.
# This is a no-op if Age has no gaps in `test`.
# NOTE(review): the training median is used so no statistic is derived from the test
# set; confirm this is the imputation you want.
test["Age"] = test["Age"].fillna(train["Age"].median())

# Convert the male and female groups to integer form: male -> 0, female -> 1.
# The column is replaced in one step rather than through chained assignment; values
# that are not keys of the mapping pass through unchanged, so re-running is a no-op.
sex_codes = {"male": 0, "female": 1}
test["Sex"] = test["Sex"].map(lambda value: sex_codes.get(value, value))

# Impute the Embarked variable: missing values become "S"
test["Embarked"] = test["Embarked"].fillna("S")

# Convert the Embarked classes to integer form: S -> 0, C -> 1, Q -> 2
embarked_codes = {"S": 0, "C": 1, "Q": 2}
test["Embarked"] = test["Embarked"].map(lambda value: embarked_codes.get(value, value))

In [None]:
# Feature matrix for the test set, with the same columns and order used to fit my_tree_one
test_feature_names = ["Pclass", "Sex", "Age", "Fare"]
test_features = test[test_feature_names].values

# Predict Survived for every row of the test set
my_prediction = my_tree_one.predict(test_features)

In [None]:
# Build the submission frame: a single Survived column holding the predictions,
# indexed by the integer PassengerId of each test row
PassengerId = test["PassengerId"].values.astype(int)
my_solution = pd.DataFrame(data=my_prediction, index=PassengerId, columns=["Survived"])
print(my_solution)

# The frame is expected to have 418 entries (one per test passenger)
print(my_solution.shape)

# Write the solution to my_solution_one.csv, labelling the index column PassengerId
my_solution.to_csv("my_solution_one.csv", index_label=["PassengerId"])

### Overfitting and how to control it

In [None]:
# Second feature matrix: the original predictors plus SibSp, Parch and Embarked
feature_names_two = ["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]
features_two = train[feature_names_two].values

# Limit tree growth to control overfitting: at most 10 levels deep, and a node
# needs at least 5 samples before it may be split
max_depth = 10
min_samples_split = 5
my_tree_two = tree.DecisionTreeClassifier(
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    random_state=1,
).fit(features_two, target)

# Score of the constrained tree on its own training data
print(my_tree_two.score(features_two, target))

### Feature Engineering

In [None]:
# Create train_two, a copy of train with the engineered feature family_size:
# siblings/spouses + parents/children + the passenger themself
train_two = train.copy()
train_two["family_size"] = train_two["SibSp"] + train_two["Parch"] + 1

# Create a new feature set that includes the new feature
features_three = train_two[["Pclass", "Sex", "Age", "Fare", "SibSp", "Parch", "family_size"]].values

# Define the tree classifier, then fit the model.
# random_state is pinned (same value as my_tree_two) so the fitted tree and its
# score are reproducible across runs.
my_tree_three = tree.DecisionTreeClassifier(random_state = 1)
my_tree_three = my_tree_three.fit(features_three, target)

# Print the score of this decision tree on its own training data
print(my_tree_three.score(features_three, target))