In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import random
import numpy as np
import pandas as pd
from sklearn import datasets, svm, cross_validation, tree, preprocessing, metrics
import sklearn.ensemble as ske

# Titanic Facts
http://www.titanicfacts.net/titanic-passengers.html

Total Passengers: 1317

Details:

https://blog.socialcops.com/engineering/machine-learning-python/

In [None]:
# NOTE: this is an absolute path on the original author's machine; point it at
# your own copy of titanic_list.csv before running the notebook.
titanic_csv_path = '/Users/avkashchauhan/learn/seattle-workshop/titanic_list.csv'
titanic_df = pd.read_csv(titanic_csv_path)

In [None]:
titanic_df.describe

In [None]:
titanic_df.shape

In [None]:
titanic_df.columns

In [None]:
titanic_df.head()

# DataSet details

survived: Survival (0 = no; 1 = yes)

pclass: Passenger class (1 = first; 2 = second; 3 = third)

name: Name

sex: Sex

age: Age

sibsp: Number of siblings/spouses aboard

parch: Number of parents/children aboard

ticket: Ticket number

fare: Passenger fare

cabin: Cabin

embarked: Port of embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)

boat: Lifeboat (if survived)

body: Body number (if did not survive and body was recovered)

In [None]:
titanic_df['survived'].mean()

In [None]:
titanic_df.groupby('pclass').mean()

In [None]:
class_sex_grouping = titanic_df.groupby(['pclass','sex']).mean()
class_sex_grouping

In [None]:
class_sex_grouping['survived'].plot.bar()

In [None]:
# Bucket ages into 10-year bins (edges 0, 10, ..., 80), then chart the mean of
# `survived` inside each bin.
group_by_age = pd.cut(titanic_df.age, bins=np.arange(0, 90, 10))
age_grouping = titanic_df.groupby(by=group_by_age).mean()
age_grouping.survived.plot(kind='bar')

In [None]:
# Non-null count per column; columns whose count is below the row count have gaps.
# print is written in call form so the cell is valid on Python 3 as well as
# Python 2 (matching the print(...) already used in test_classifier).
print("You can see the data set has lots of missing entities")
titanic_df.count()

In [None]:
# Fixing inconsistencies, in three steps:
#   1. a missing home/destination becomes the placeholder string "NA"
#   2. the body, cabin and boat features are removed
#   3. every row that still contains a missing value is removed
titanic_df = (
    titanic_df
    .fillna({"home.dest": "NA"})
    .drop(['body', 'cabin', 'boat'], axis=1)
    .dropna()
)

In [None]:
# After cleaning, every column should report the same non-null count.
# print is written in call form so the cell is valid on Python 3 as well as
# Python 2 (matching the print(...) already used in test_classifier).
print("You will see the values are consitant now")
titanic_df.count()

In [None]:
# The 'name', 'ticket' and 'home.dest' features will not help the model, so
# they are dropped as well.
unused_columns = ['name', 'ticket', 'home.dest']
titanic_df = titanic_df.drop(unused_columns, axis=1)
titanic_df.count()

In [None]:
# Replace the text labels in `sex` with integer codes: the column now holds
# 0 and 1 instead of male or female.
sex_encoder = preprocessing.LabelEncoder()
titanic_df['sex'] = sex_encoder.fit_transform(titanic_df['sex'])
titanic_df['sex']

In [None]:
# Replace the port-of-embarkation labels with integer codes
embarked_encoder = preprocessing.LabelEncoder()
titanic_df['embarked'] = embarked_encoder.fit_transform(titanic_df['embarked'])
titanic_df['embarked']

In [None]:
# Create a dataframe which has all features we will use for model building
X = titanic_df.drop(['survived'], axis=1).values

In [None]:
y = titanic_df['survived'].values

In [None]:
# Hold out 20% of the rows for testing. The seed is fixed (the same value the
# ShuffleSplit validator in this notebook uses) so the split, and every score
# computed from it, is reproducible after a kernel restart.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.2, random_state=0)

In [None]:
#Decision Tree Classifier
classify_dt = tree.DecisionTreeClassifier(max_depth=10)

In [None]:
print " This result means the model correctly predicted survival rate of given value %"
classify_dt.fit (X_train, y_train)
scr = classify_dt.score (X_test, y_test)
print "score : " , scr
print "Model is able to correctly predict survival rate of", scr *100 , "% time.."

In [None]:
# Cross-validation splitter working on an 80%-20% basis: 20 shuffled
# iterations, each holding out 20% of the rows, created with random_state=0.
shuffle_validator = cross_validation.ShuffleSplit(
    X.shape[0],
    n_iter=20,
    test_size=0.2,
    random_state=0,
)

In [None]:
def test_classifier(clf, features=None, labels=None, cv=None):
    """Cross-validate ``clf`` and print its mean accuracy and spread.

    Parameters
    ----------
    clf : estimator
        Classifier handed straight to ``cross_validation.cross_val_score``.
    features, labels : array-like, optional
        Data to score on. When omitted, the notebook-level ``X`` and ``y``
        are used, which is what the original one-argument form did.
    cv : cross-validation iterator, optional
        Splitter to use. When omitted, the notebook-level
        ``shuffle_validator`` is used.

    Prints one line of the form ``Accuracy: <mean> (+/- <std>)`` and returns
    nothing, so a cell ending in this call shows no extra output.
    """
    # Fall back to the notebook globals at call time, not at definition time,
    # so re-running the cells that build X / y / shuffle_validator is picked up.
    if features is None:
        features = X
    if labels is None:
        labels = y
    if cv is None:
        cv = shuffle_validator

    scores = cross_validation.cross_val_score(clf, features, labels, cv=cv)
    print("Accuracy: %0.4f (+/- %0.2f)" % (scores.mean(), scores.std()))

In [None]:
test_classifier(classify_dt)
# Note: shuffle_validator is created with random_state=0, so re-creating it yields
# the same splits each time. If the accuracy still changes between runs, it
# presumably comes from the classifier, which is built without a random_state
# -- TODO confirm.

# Random Forest
The “Random Forest” classification algorithm will create a multitude of (generally very poor) trees for the data set using different random subsets of the input variables, and will return whichever prediction was returned by the most trees. This helps to avoid “overfitting”, a problem that occurs when a model is so tightly fitted to arbitrary correlations in the training data that it performs poorly on test data.

In [None]:
clf_rf = ske.RandomForestClassifier(n_estimators=50)
test_classifier(clf_rf)

In [None]:
# Performing Prediction

clf_rf.fit(X_train, y_train)
clf_rf.score(X_test, y_test)

# Gradient Boosting

The “Gradient Boosting” classifier will generate many weak, shallow prediction trees and will combine, or “boost”, them into a strong model. This model performs very well on our data set, but has the drawback of being relatively slow and difficult to optimize, as the model construction happens sequentially so it cannot be parallelized.

In [None]:
clf_gb = ske.GradientBoostingClassifier(n_estimators=50)
test_classifier(clf_gb)

In [None]:
# Performing Prediction

clf_gb.fit(X_train, y_train)
clf_gb.score(X_test, y_test)

# Voting Classifier
A “Voting” classifier can be used to apply multiple conceptually divergent classification models to the same data set and will return the majority vote from all of the classifiers. For instance, if the gradient boosting classifier predicts that a passenger will not survive, but the decision tree and random forest classifiers predict that they will live, the voting classifier will choose the latter.

In [None]:
eclf = ske.VotingClassifier([('dt', classify_dt), ('rf', clf_rf), ('gb', clf_gb)])
test_classifier(eclf)

In [None]:
# Performing Prediction

eclf.fit(X_train, y_train)
eclf.score(X_test, y_test)

# Performing Prediction

In [None]:
# Collect the first 10 records of each passenger class and stack them into one
# data set (30 records, provided every class has at least 10 rows).
passengers_set_1 = titanic_df.loc[titanic_df['pclass'] == 1].head(10).copy()
passengers_set_2 = titanic_df.loc[titanic_df['pclass'] == 2].head(10).copy()
passengers_set_3 = titanic_df.loc[titanic_df['pclass'] == 3].head(10).copy()
passenger_set = pd.concat(
    [passengers_set_1, passengers_set_2, passengers_set_3]
)

In [None]:
# Non-null count per column of the sampled passengers
passenger_set.count()
# You must see 30 uniform records

In [None]:
# Number of labelled rows in the sample
passenger_set.survived.count()

In [None]:
# Non-null count per column of the full cleaned data set, for comparison
titanic_df.count()

In [None]:
# Strip the label column and ask the random forest for predictions.
# NOTE(review): passenger_set is sliced from titanic_df, the same frame X_train
# was drawn from, so some of these rows were probably seen during training and
# the agreement below is likely optimistic -- confirm before quoting it.
passenger_set_new = passenger_set.drop('survived', axis=1)
prediction = clf_rf.predict(passenger_set_new)

In [None]:
# Rows of the sample where the prediction disagrees with the recorded outcome
passenger_set.loc[passenger_set['survived'] != prediction]