# Kaggle Titanic Competition - could Jack have lived?

### Load Data Set and Libraries

In [None]:
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.ensemble
import sklearn.linear_model
import sklearn.metrics
import sklearn.preprocessing

try:
    from sklearn.model_selection import GridSearchCV, learning_curve, train_test_split
except ImportError:  # older scikit-learn keeps these in the legacy modules
    from sklearn.cross_validation import train_test_split
    from sklearn.grid_search import GridSearchCV
    from sklearn.learning_curve import learning_curve

%matplotlib inline

In [None]:
# Read into our datasets
dataset = pd.read_csv('train.csv', index_col='PassengerId')
submission = pd.read_csv('test.csv')

### Data Exploration and Feature Engineering

In this section, we'll explore the basic structure of our data and try to come up with creative ways of reformatting our features to make them more machine-readable.

In [None]:
dataset.head()

In [None]:
dataset.describe()

### Basic Data Structure Exploration

#### Gender (Sex)
Now that you have a rough understanding of what each feature entails, let's start by exploring how gender is related to the overall survival rate.

In [None]:
# Encode gender as 1 = female, 0 = anything else. Matching both the raw label and
# the already-encoded value keeps the cell idempotent: re-running it no longer
# turns every passenger into 0.
dataset['Sex'] = dataset.Sex.isin(['female', 1]).astype(int)

In [None]:
survived_by_sex = dataset[['Sex', 'Survived']].groupby('Sex').sum()
survived_by_sex['People'] = dataset.groupby('Sex').count().Survived
survived_by_sex['PctSurvived'] = survived_by_sex.Survived / survived_by_sex.People
survived_by_sex

In [None]:
survived_by_sex[['Survived', 'People']].plot(kind='bar', rot=0)

#### Passenger Class (Pclass)

What about survival with respect to each passenger class (Pclass)?

In [None]:
survived_by_pclass = dataset[['Pclass', 'Survived']].groupby('Pclass').sum()
survived_by_pclass['People'] = dataset.groupby('Pclass').count().Survived
survived_by_pclass['PctSurvived'] = survived_by_pclass.Survived / survived_by_pclass.People
survived_by_pclass

In [None]:
survived_by_pclass[['Survived', 'People']].plot(kind='bar', rot=0)

How about if we separated social class and gender, how does our survival rate look?

In [None]:
survived_by_pclass_sex = dataset.groupby(['Sex', 'Pclass']) \
    .apply(lambda x: x.Survived.sum() / len(x)) \
    .unstack()

survived_by_pclass_sex

In [None]:
survived_by_pclass_sex.plot(kind='bar', title='Pct Survived by Pclass')

#### Port of Embarkation

In [None]:
survived_by_embarked = dataset[['Embarked', 'Survived']].groupby('Embarked').sum()
survived_by_embarked['People'] = dataset[['Embarked', 'Survived']].groupby('Embarked').count()
survived_by_embarked['PctSurvived'] = survived_by_embarked.Survived/survived_by_embarked.People

survived_by_embarked

In [None]:
survived_by_embarked[['Survived', 'People']].plot(kind='bar', rot=0)

#### Age

Distribution of age

In [None]:
dataset['AgeGroup'] = dataset.Age.dropna().apply(lambda x: int(x/5)*5)
dataset['AgeGroup1'] = dataset.Age.dropna().apply(lambda x: int(x/1))

In [None]:
ax = dataset.Age.hist(bins=len(dataset.AgeGroup.unique()))
dataset[['Sex', 'AgeGroup', 'Name']].groupby(['AgeGroup', 'Sex']).count().unstack('Sex').plot(ax=ax)

In [None]:
survived_by_agegroup = dataset[['AgeGroup', 'Survived']].groupby('AgeGroup').sum()
survived_by_agegroup['People'] = dataset[['AgeGroup', 'Survived']].groupby('AgeGroup').count()
survived_by_agegroup['PctSurvived'] = survived_by_agegroup.Survived/survived_by_agegroup.People

survived_by_agegroup.head()

In [None]:
survived_by_agegroup[['Survived', 'People']].plot(kind='bar', figsize=(16,4))

In [None]:
# Survivors, head-count and survival rate per single-year age bucket.
survived_by_agegroup1 = dataset[['AgeGroup1', 'Survived']].groupby('AgeGroup1').sum()
survived_by_agegroup1['People'] = dataset[['AgeGroup1', 'Survived']].groupby('AgeGroup1').count()
# Divide by this table's own head-count; the 5-year table (survived_by_agegroup)
# has a different index and would give meaningless rates here.
survived_by_agegroup1['PctSurvived'] = survived_by_agegroup1.Survived / survived_by_agegroup1.People

survived_by_agegroup1.head()

In [None]:
survived_by_agegroup1[['Survived', 'People']].plot(kind='bar', figsize=(16,4))

In [None]:
survived_by_age = dataset[dataset.Age <= 60][['Age', 'Survived']].sort_values('Age')
survived_by_age['CumSurvived'] = survived_by_age.Survived.cumsum()
survived_by_age['CumCount'] = [x+1 for x in range(len(survived_by_age))]
survived_by_age['CumSurvivalRate'] = survived_by_age.CumSurvived / survived_by_age.CumCount

survived_by_age.plot(kind='scatter', x='Age', y='CumSurvivalRate')

In [None]:
survived_by_age[(survived_by_age.Age >= 5) & (survived_by_age.Age <= 8)]

### Fare

In [None]:
dataset.Fare.hist()

In [None]:
dataset[['Pclass', 'Fare']].groupby('Pclass').describe().unstack()

In [None]:
fare_pclass_mean = dataset[['Pclass', 'Fare']][dataset.Fare > 1].groupby('Pclass').mean()
fare_pclass_mean

In [None]:
# Fares below 1 are replaced by the mean paid fare of the passenger's class.
# Label-based lookup through .map avoids the row-wise apply and the positional
# `[0]` on a labelled Series.
low_fare = dataset.Fare < 1
dataset.loc[low_fare, 'Fare'] = dataset.loc[low_fare, 'Pclass'].map(fare_pclass_mean.Fare)

In [None]:
fig, ax = plt.subplots(figsize=(10,10))
sns.boxplot(data=dataset, x='Pclass', y='Fare', ax=ax)

Let's see if there's a relationship between fare price and those who survived.

In [None]:
dataset[['Pclass', 'Fare']][dataset.Survived == 1].groupby('Pclass').describe().unstack()

In [None]:
dataset[['Pclass', 'Fare']][dataset.Survived == 0].groupby('Pclass').describe().unstack()

In [None]:
sns.boxplot(data=dataset[dataset.Survived == 1], x='Pclass', y='Fare')
plt.title("Survivor's Fare Price by Pclass")

plt.figure()
sns.boxplot(data=dataset[dataset.Survived == 0], x='Pclass', y='Fare')
plt.title("Non-Survivor's Fare Price by Pclass")

Percentage of people who survived, by Pclass

And, not surprisingly, people in better passenger classes had a higher survival rate.

In [None]:
survived_by_fare_pclass = dataset[['Fare', 'Survived', 'Pclass']].groupby(['Pclass', 'Survived']).mean().unstack('Survived')
survived_by_fare_pclass.plot(kind='bar', rot=0)
plt.title('Survived by Pclass')

### Cabin

In [None]:
# Define cabin class
dataset['CabinClass'] = dataset.Cabin.dropna().str[0]

In [None]:
# Per-deck summary: survivors, head-count, survival rate and basic demographics.
by_cabin = dataset.groupby('CabinClass')
cabinclass = dataset[['CabinClass', 'Survived']].groupby('CabinClass').sum()
cabinclass['People'] = by_cabin.Survived.count()
cabinclass['PctSurvived'] = cabinclass.Survived / cabinclass.People
# Pick the column before aggregating: mean() over the whole frame also tries to
# average the text columns (Name, Ticket, Cabin), which recent pandas rejects.
cabinclass['AvgFare'] = by_cabin.Fare.mean()
cabinclass['AvgAge'] = by_cabin.Age.mean()
# Share of passengers on the deck whose Sex is encoded as 1 (female)
cabinclass['PctFemaleInCabin'] = by_cabin.Sex.apply(lambda sex: (sex == 1).mean())

cabinclass

In [None]:
cabinclass[['Survived', 'People']].plot(kind='bar', figsize=(16,8), title='Survived by Cabin Class')

In [None]:
dataset.head()

In [None]:
# Replace each deck letter by an integer code; passengers without a cabin stay NaN.
for i, cabin_class in enumerate(dataset.CabinClass.unique()):
    dataset.loc[dataset.CabinClass == cabin_class, 'CabinClass'] = i

# NOTE(review): 'Title', 'PersonType' and 'Mother' are only created further down
# (clean_data / fill_age) -- confirm this cell is meant to run after those steps.
cabin_features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title', 'PersonType', 'Mother']

clf_cabin = sklearn.ensemble.RandomForestRegressor(n_estimators=500)

cabin_known = dataset.CabinClass.notnull()
clf_cabin.fit(dataset.loc[cabin_known, cabin_features], dataset.loc[cabin_known, 'CabinClass'].astype(int))

# Impute only the passengers whose deck is unknown, and write through .loc:
# the chained form `dataset[mask]['CabinClass'] = ...` assigns into a temporary
# copy, and the original mask selected the rows that already had a deck.
if (~cabin_known).any():
    dataset.loc[~cabin_known, 'CabinClass'] = np.round(
        clf_cabin.predict(dataset.loc[~cabin_known, cabin_features]))
clf_cabin.feature_importances_

In [None]:
ax = dataset.CabinClass.hist()
#dataset.PredictedCabinClass.hist(ax=ax)

In [None]:
# dataset.PredictedCabinClass.hist()

In [None]:
# dataset.PredictedCabinClass.hist()

In [None]:
dataset.CabinClass.unique()

In [None]:
#np.round(clf_cabin.predict(dataset[cabin_features]))

### Feature Engineering

#### Extracting Titles from Name

And using the title and other features to predict our age

In [None]:
# Read into our datasets
dataset = pd.read_csv('train.csv', index_col='PassengerId')
submission = pd.read_csv('test.csv')
df_union = pd.concat([dataset, submission])

In [None]:
#dataset[pd.isnull(dataset.Age)]
#dataset.sort_values('Name').head(100)

In [None]:
df_union['Title'] = df_union.Name.apply(lambda x: (x.split(',')[1]).split('.')[0][1:])

In [None]:
df_union[['Age','Title']].groupby('Title').describe().unstack()

In [None]:
df_union[['Sex','Title','Survived']].groupby(['Title','Sex']).sum().unstack('Sex')

In [None]:
def title_mapping(x):
    """Collapse rare honorifics into a handful of common title groups.

    Noble and clerical male titles become 'Mr', noble female titles become
    'Mrs', French and alternative female titles become 'Miss', and military
    ranks become 'Officer'. Any other value is returned unchanged.
    """
    canonical_title = {
        'Don': 'Mr', 'Rev': 'Mr', 'Sir': 'Mr', 'Jonkheer': 'Mr',
        'Lady': 'Mrs', 'the Countess': 'Mrs',
        'Mlle': 'Miss', 'Mme': 'Miss', 'Dona': 'Miss', 'Ms': 'Miss',
        'Major': 'Officer', 'Col': 'Officer', 'Capt': 'Officer',
    }
    return canonical_title.get(x, x)


In [None]:
# Pull the honorific out of "Surname, Title. Given names" (the original read
# from an undefined `df`).
dataset['Title'] = dataset.Name.apply(lambda x: (x.split(',')[1]).split('.')[0][1:])
dataset.Title = dataset.Title.apply(title_mapping)

# `dataset` was re-read from CSV in this section, so Sex may hold the raw labels
# rather than 0/1; accept both encodings when resolving 'Dr'.
is_female = dataset.Sex.isin(['female', 1])
is_doctor = dataset.Title == 'Dr'
dataset.loc[is_doctor & is_female, 'Title'] = 'Mrs'
dataset.loc[is_doctor & ~is_female, 'Title'] = 'Mr'

dataset.Title.unique()

In [None]:
titles = { 'Master' : 0, 'Miss' : 1, 'Mr' : 2, 'Mrs': 4, 'Officer': 5}
dataset.Title = dataset.Title.apply(lambda x: titles.get(x))

In [None]:
dataset[['Age', 'Title']].groupby('Title').describe().unstack()

In [None]:
dataset[dataset.Title == 2].Age.hist()

#### Mother

In [None]:
# Flag adult women titled 'Mrs' (Title code 4) travelling with parents/children.
# Sex may be raw ('female') or encoded (1) at this point, so accept both; the
# column-wise form also replaces the row-by-row apply.
dataset['Mother'] = (
    dataset.Sex.isin(['female', 1])
    & (dataset.Age >= 18)
    & (dataset.Parch > 0)
    & (dataset.Title == 4)
)

In [None]:
dataset[dataset.Mother == 1].head(3)

#### Family Name

In [None]:
dataset['FamilyName'] = dataset.Name.apply(lambda x: x.split(",")[0])

In [None]:
# List every passenger whose surname is shared with someone travelling in a
# family. FamilySize is not a column of `dataset` yet, so derive it here with
# the same SibSp + Parch + 1 formula that clean_data uses.
family_size = dataset.SibSp + dataset.Parch + 1
family_names = dataset.loc[family_size > 1, 'FamilyName'].unique()
dataset[dataset.FamilyName.isin(family_names)].sort_values('FamilyName')

In [None]:
survived_by_family_name = dataset[['Survived', 'FamilyName']].groupby('FamilyName').sum()
# survived_by_family_name['FamilySize'] = dataset[['Survived', 'FamilyName']].groupby('FamilyName')


survived_by_family_name

#### Cabin

**Data Cleaning and Normalization**

In [None]:
# Read into our datasets
dataset = pd.read_csv('train.csv')
submission = pd.read_csv('test.csv')

In [None]:
def clean_data(df):
    """Encode and impute the raw Titanic columns of ``df`` in place.

    Converts ``Sex``, ``Embarked`` and ``Pclass`` to numeric codes, repairs
    missing or near-zero fares, and adds ``FamilySize``, ``Title`` and
    ``CabinClass``. All categorical codes come from fixed tables, so the
    training and test frames receive the same encoding no matter which one is
    cleaned first (the previous version numbered categories by order of
    appearance in whatever state the frames were in).

    Imputation statistics are read from the module-level ``dataset`` and
    ``submission`` frames, which may be raw or already cleaned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the raw Kaggle columns; modified in place.

    Returns
    -------
    None
    """
    sex_codes = {'male': 0, 'female': 1}
    embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
    title_codes = {'Master': 0, 'Miss': 1, 'Mr': 2, 'Mrs': 4, 'Officer': 5}
    deck_codes = {deck: code for code, deck in enumerate('ABCDEFGT')}

    def encode(series, codes):
        # Values that are not raw labels (NaN, or codes from an earlier pass) pass through.
        return series.apply(lambda value: codes.get(value, value))

    def zero_based(pclass):
        # Raw Pclass starts at 1; a minimum of 0 means the series was shifted already.
        # Assumes the frame contains first-class passengers.
        return pclass - 1 if pclass.min() >= 1 else pclass

    frames = [dataset, submission]

    # Sex: 0 = male, 1 = female
    df.Sex = encode(df.Sex, sex_codes)

    # Embarked: fill gaps with the most common port across both frames, then encode
    embarked_ref = pd.concat([encode(frame.Embarked, embarked_codes) for frame in frames],
                             ignore_index=True)
    df.Embarked = encode(df.Embarked, embarked_codes).fillna(embarked_ref.mode()[0]).astype(int)

    # Pclass: normalize to start at 0
    df.Pclass = zero_based(df.Pclass)

    # Missing fares: median fare of the passenger's class across both frames
    ref_pclass = pd.concat([zero_based(frame.Pclass) for frame in frames], ignore_index=True)
    ref_fare = pd.concat([frame.Fare for frame in frames], ignore_index=True)
    median_fare = ref_fare.groupby(ref_pclass).median()
    missing_fare = df.Fare.isnull()
    df.loc[missing_fare, 'Fare'] = df.loc[missing_fare, 'Pclass'].map(median_fare)

    # Family size: the passenger plus siblings/spouse and parents/children
    df['FamilySize'] = df.SibSp + df.Parch + 1

    # Fares below 1: mean paid fare of the passenger's class in the training data
    paid = dataset.Fare > 1
    mean_paid_fare = dataset.Fare[paid].groupby(zero_based(dataset.Pclass)[paid]).mean()
    low_fare = df.Fare < 1
    df.loc[low_fare, 'Fare'] = df.loc[low_fare, 'Pclass'].map(mean_paid_fare)

    # Title: honorific from "Surname, Title. Given names", grouped and encoded
    df['Title'] = df.Name.apply(lambda x: (x.split(',')[1]).split('.')[0][1:])
    df.Title = df.Title.apply(title_mapping)
    df.loc[(df.Sex == 1) & (df.Title == 'Dr'), 'Title'] = 'Mrs'
    df.loc[(df.Sex == 0) & (df.Title == 'Dr'), 'Title'] = 'Mr'
    df.Title = df.Title.apply(lambda x: title_codes.get(x))

    # Cabin: first letter of the cabin string is the deck; unknown cabins stay NaN
    df['CabinClass'] = df.Cabin.str[0].map(deck_codes)
    
    
def fill_age(df):
    """Impute missing ages in ``df`` and derive ``PersonType`` and ``Mother``, in place.

    Missing ages are replaced by the mean age of passengers with the same
    ``Title`` across the module-level ``dataset`` and ``submission`` frames.
    Both frames must already carry the ``Title`` column created by
    ``clean_data``.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned frame; modified in place.

    Returns
    -------
    None
    """
    # Aggregate only the Age column so text columns never reach mean()
    reference = pd.concat([frame[['Title', 'Age']] for frame in (dataset, submission)],
                          ignore_index=True)
    title_mean_age = reference.groupby('Title').Age.mean()

    # .map also copes with "no missing ages", where the row-wise apply returned an empty frame
    missing_age = df.Age.isnull()
    df.loc[missing_age, 'Age'] = df.loc[missing_age, 'Title'].map(title_mean_age)

    # PersonType starts as Sex (0/1) and is overridden for the young
    df['PersonType'] = df.Sex
    df.loc[df.Age <= 18, 'PersonType'] = 3  # For youth
    df.loc[df.Age <= 6, 'PersonType'] = 4  # For child

    # Mothers: adult women titled 'Mrs' (Title code 4) with Parch > 0
    is_mother = (df.Sex == 1) & (df.Age >= 18) & (df.Parch > 0) & (df.Title == 4)
    df['Mother'] = is_mother.astype(int)

def fill_cabin(df, n_estimators=1000, random_state=None):
    """Impute missing ``CabinClass`` values of ``df`` in place with a random forest.

    The regressor is trained on every row of the module-level ``dataset`` and
    ``submission`` frames whose ``CabinClass`` is present, and its rounded
    predictions are written to the rows of ``df`` where it is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that already went through ``clean_data`` and ``fill_age``.
    n_estimators : int, optional
        Number of trees in the forest (default 1000, the previous fixed value).
    random_state : int or None, optional
        Seed for the forest; pass an int for reproducible imputations.

    Returns
    -------
    None
    """
    cabin_features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title', 'Mother', 'PersonType']

    missing = df.CabinClass.isnull()
    if not missing.any():
        # Nothing to impute; predicting on an empty frame would raise
        return

    # NOTE(review): once one frame has been filled, its imputed decks count as
    # known labels when the next frame is filled -- confirm that is acceptable.
    reference = pd.concat([dataset, submission], ignore_index=True)
    known = reference.CabinClass.notnull()

    clf_cabin = sklearn.ensemble.RandomForestRegressor(n_estimators=n_estimators,
                                                       random_state=random_state)
    clf_cabin.fit(reference.loc[known, cabin_features],
                  reference.loc[known, 'CabinClass'].astype(float))

    df.loc[missing, 'CabinClass'] = np.round(clf_cabin.predict(df.loc[missing, cabin_features]))

In [None]:
clean_data(dataset)
clean_data(submission)
fill_age(dataset)
fill_age(submission)
fill_cabin(dataset)
fill_cabin(submission)

In [None]:
def age_group_classification(x):
    """Bucket an age into 0 (up to 10), 1 (up to 35) or 2 (everything else)."""
    upper_bounds = (10, 35)
    for group, upper_bound in enumerate(upper_bounds):
        if x <= upper_bound:
            return group
    # Older than every bound, or not comparable (e.g. NaN)
    return len(upper_bounds)

dataset['AgeGroup'] = dataset.Age.apply(age_group_classification)

In [None]:
#dataset.head(30)

df_union = pd.concat([dataset, submission])
age_features = ['Sex', 'SibSp', 'Parch', 'Fare', 'Title', 'Pclass', 'Embarked']
age_features = ['Title']

clf_age = sklearn.linear_model.RidgeCV(alphas=[0.1, 0.5, 1, 3, 5, 10])
#clf_age = sklearn.linear_model.LogisticRegression()
clf_age = sklearn.ensemble.RandomForestRegressor(n_estimators=500)

clf_age.fit(df_union[~pd.isnull(df_union.Age)][age_features], df_union[~pd.isnull(df_union.Age)].Age)

ax = dataset.Age.hist(alpha=0.3, label=['Original'], bins=10)

pd.DataFrame(clf_age.predict(df_union[age_features]), columns=['PredictedAge']) \
    .hist(alpha=0.3, label=['Predicted'], color='r', ax=ax)

#dataset['AgeAfter'] = dataset.Age
#dataset.loc[pd.isnull(dataset.Age), 'AgeAfter'] = dataset[pd.isnull(dataset.Age)][['Age', 'Title']].apply(lambda x: title_mean_age[x.Title], axis=1)

#dataset.AgeWithPredicted.hist(alpha=0.4, ax=ax, bins=10)
#dataset.AgeAfter.hist(alpha=0.4, ax=ax, bins=10)

Check for null entries in our features

In [None]:
dataset[x_features][pd.isnull(dataset[x_features]).any(axis=1)]

In [None]:
x_features = ['Pclass', 'Sex', 'PersonType', 'Age', 'FamilySize', 'Fare', 'Embarked', 'Title',
              'SibSp', 'Parch', 'Mother', 'CabinClass']
features = x_features.copy() + ['Survived']

# Make sure that there are no Nan entries in our dataset
print("Nan Entries (dataset, submission):",
      len(dataset[pd.isnull(dataset[x_features]).any(axis=1).values]),
      len(submission[pd.isnull(submission[x_features]).any(axis=1).values]))

### Model Fitting

**Base Line**

In [None]:
len(dataset[dataset.Survived == 0]) / len(dataset)

#### Cross Validate

In [None]:
# Cross Validation
# Hold out part of the labelled passengers for validation. random_state pins
# the split so the scores in the following cells are reproducible.
itrain, itest = train_test_split(range(dataset.shape[0]), train_size=.70, random_state=0)

# Boolean row mask: False marks validation rows, everything else is training
mask = np.ones(dataset.shape[0], dtype=bool)
mask[itest] = False
mask[:10]

In [None]:
train = dataset[mask]
test = dataset[~mask]

In [None]:
# Hyper-parameter grids: C for linear/SVM models, n_estimators for tree ensembles
params_c = {'C':[0.005, 0.1,0.5,1,2,5]}
params_n_estimators = {'n_estimators':[500,1000,2000,4000]}

# Candidate models; each one is tuned with 5-fold cross-validation on `train`
clfs = [
    GridSearchCV(sklearn.ensemble.RandomForestClassifier(), params_n_estimators, cv=5, n_jobs=4),
]

for clf in clfs:
    clf.fit(train[x_features], train.Survived)

print('models fitted')

In [None]:
for clf in clfs:
    # A fitted grid search exposes best_estimator_; a plain estimator does not.
    if hasattr(clf, 'best_estimator_'):
        print(clf.best_estimator_)
        print(clf.best_params_)
        print(clf.best_score_)
        # Newer scikit-learn reports per-candidate scores in cv_results_
        # instead of grid_scores_
        print(clf.cv_results_ if hasattr(clf, 'cv_results_') else clf.grid_scores_)
        print('\n')
    else:
        print(clf)

How well have we trained against our own training set

In [None]:
for clf in clfs:
    y_train_pred = clf.predict(train[x_features])
    print(type(getattr(clf, 'best_estimator_', clf)))
    # classification_report takes (y_true, y_pred); passing them the other way
    # round swaps precision and recall in the printed table
    print(sklearn.metrics.classification_report(train.Survived, y_train_pred))


Test our classifiers against our validation set

In [None]:
# Testing against our own test data

for clf in clfs:
    print(getattr(clf, 'best_estimator_', clf).__class__.__name__)
    y_test_pred = clf.predict(test[x_features])
    # classification_report takes (y_true, y_pred); passing them the other way
    # round swaps precision and recall in the printed table
    print(sklearn.metrics.classification_report(test.Survived, y_test_pred))

Let's check for over and underfitting with a learning curve

In [None]:
#for clf in clfs:
#    name = clf.best_estimator_.__class__.__name__ if isinstance(clf, sklearn.grid_search.GridSearchCV) else clf.__class__.__name__
#    plot_learning_curve(clf, name, train[x_features], train.Survived, train_sizes=np.linspace(0.3,1,5))

In [None]:
for clf in clfs:
    # Only tuned random forests expose per-feature importances here
    best = getattr(clf, 'best_estimator_', None)
    # Use the public class path; sklearn.ensemble.forest is a private module
    if isinstance(best, sklearn.ensemble.RandomForestClassifier):
        print('Random Forest:')
        print(*zip(best.feature_importances_, x_features))

**Submission**

In [None]:
final_submission = []
for i, clf in enumerate(clfs):
    # Retrain the best predictor on the entire labelled training set.
    # Plain estimators have no best_estimator_, so fall back to the estimator itself.
    model = getattr(clf, 'best_estimator_', clf)
    model.fit(dataset[x_features], dataset.Survived)

    final_submission.append(submission.copy())
    # Assign the prediction array directly instead of going through a DataFrame
    final_submission[i]['Survived'] = model.predict(submission[x_features])
    name = model.__class__.__name__
    final_submission[i].to_csv('titanic_{}.csv'.format(name), columns=['PassengerId', 'Survived'], index=False)

In [None]:
final_submission[0].head()

**Helpers**

In [None]:
def logit_to_prob(x):
    """Map log-odds ``x`` to a probability with the logistic function."""
    # Same value as np.exp(x) / (1 + np.exp(x))
    denominator = 1 + np.exp(-x)
    return 1 / denominator

In [None]:
def prob_to_logit(x):
    """Map a probability ``x`` to its log-odds."""
    odds_in_favour = x / (1 - x)
    return np.log(odds_in_favour)

In [None]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Relies on the ``learning_curve`` function imported at the top of the
    notebook. (Previously this cell ran ``from sklearn import learning_curve``,
    which binds a module and then fails when called.)

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : integer, cross-validation generator, optional
        Number of folds, or a cross-validation object. When omitted, the
        default of the installed scikit-learn version is used.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).

    train_sizes : array-like, optional
        Training-set sizes passed through to ``learning_curve``
        (default: five values from 0.1 to 1.0).

    Returns
    -------
    The ``matplotlib`` pyplot module, with the figure drawn on it.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    # One row per training size, one column per CV split: summarise across splits
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    # Shaded bands show +/- one standard deviation around each mean curve
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt

Logistic Regression Exploring

In [None]:
clf = sklearn.linear_model.LogisticRegression(C=100000000)

#new_train = train[pd.notnull(train.Age)]
new_train = train
features = ['Sex']

clf.fit(new_train[features], new_train.Survived)

In [None]:
#print('log odds ratio:', np.sum(np.exp(clf.coef_)) + np.exp(clf.intercept_))

# The model has a single 0/1 feature (Sex), so the intercept is the log-odds of
# survival for Sex == 0 (male) and intercept + coef is the log-odds for females.
# The labels below name the quantity that is actually printed.
print('baseline (male) odds of survival:', np.exp(clf.intercept_))
print('female log odds of survival:',  clf.intercept_ + clf.coef_)
print('probability of male surviving:', clf.intercept_, logit_to_prob(clf.intercept_))
# exp(coef) is the female-to-male odds ratio; minus 1 gives the relative increase in odds
print(np.exp(clf.coef_) - 1)

print('probability of survival when female:', clf.intercept_ + clf.coef_, logit_to_prob(clf.intercept_ + clf.coef_))

clf.intercept_, clf.coef_

In [None]:
clf.coef_, np.exp(clf.coef_), logit_to_prob(clf.coef_ + clf.intercept_)

In [None]:
df_sex = pd.DataFrame(train.Survived.groupby(train.Sex).sum())
df_sex['Sex'] = train.Sex.groupby(train.Sex).count()
df_sex['ProbOfSurvival'] = df_sex.Survived / df_sex.Sex
df_sex['Logit(ProbOfSurvival)'] = prob_to_logit(df_sex.ProbOfSurvival)

male_to_female_survival_odds = df_sex.iloc[0].ProbOfSurvival / df_sex.iloc[1].ProbOfSurvival
print('Males to Female survival ratio: ', male_to_female_survival_odds)
print('Log odds of Males to Female survival ratio: ', np.log(male_to_female_survival_odds))
print(109/577)

df_sex.head()

In [None]:
clf.coef_, np.exp(clf.coef_)

In [None]:
df_sex.iloc[1].ProbOfSurvival / df_sex.iloc[0].ProbOfSurvival 

In [None]:
train.Survived[(train.Survived == 1) & (train.Sex == 1)].sum()

In [None]:
0.742038/0.188908

In [None]:
features_2 = ['Sex', 'Age', 'Fare', 'Survived']
train_2 = train[features_2][pd.notnull(train.Sex) & pd.notnull(train.Age) & pd.notnull(train.Fare)]
#Y_train_2 = train.Survived[pd.notnull(train.Sex) & pd.notnull(train.Age) & pd.notnull(train.Fare)]
print(train_2.shape)
train_2.head()

In [None]:
clf_2 = sklearn.linear_model.LogisticRegression(C=1e10)
clf_2.fit(train_2.drop('Survived', axis=1), train_2.Survived)

In [None]:
clf_2.intercept_, clf_2.coef_

In [None]:
np.exp(clf_2.intercept_), np.exp(clf_2.coef_)

In [None]:
before = train_2[(train_2.Sex == 0) & (train_2.Fare <= 10) & (train_2.Age < 35)].groupby('Survived').count()
before

In [None]:
22/126

In [None]:
train_2[(train_2.Sex == 0) & (train_2.Fare <= 10) & (train_2.Age < 36)].groupby('Survived').count()

In [None]:
22/131

In [None]:
(22/131)/(22/126)

In [None]:
####

In [None]:
train_2[(train_2.Sex == 0) & (train_2.Fare <= 10) & (train_2.Age < 35)].groupby('Survived').count()

In [None]:
train_2[(train_2.Sex == 1) & (train_2.Fare <= 10) & (train_2.Age < 35)].groupby('Survived').count()

In [None]:
print(16/21, 22/126, (22/126)/(16/21), (16/21)/(22/126))
np.log((22/126)/(16/21))

In [None]:
np.log((25/160)/(19/21))

In [None]:
####

In [None]:
females = train_2[(train_2.Sex == 1) & (train_2.Fare <= 10) & (train_2.Age < 35)].groupby('Survived').count()
females

In [None]:
def odds(female, male):
    """Return the female-to-male survival odds ratio.

    Each argument is a table indexed by ``Survived`` (0 and 1) whose ``Sex``
    column holds head-counts, such as the ``groupby('Survived').count()``
    frames built in the surrounding cells. The previous version computed the
    female odds and discarded them.

    NOTE(review): returning the ratio of the two odds is the presumed intent --
    confirm.
    """
    female_survival = female.Sex[1] / female.Sex[0]
    male_survival = male.Sex[1] / male.Sex[0]
    return female_survival / male_survival

females.Sex[1] / females.Sex[0]

In [None]:
####

In [None]:
train_2[(train_2.Sex == 0) & (train_2.Fare <= 10) & (train_2.Age < 35)].groupby('Survived').count()

In [None]:
train_2[(train_2.Sex == 0) & (train_2.Fare <= 11) & (train_2.Age < 35)].groupby('Survived').count()

In [None]:
23/137

In [None]:
22/126

In [None]:
(22/126)/(23/137)

In [None]:
###

In [None]:
# Single-feature model: survival as a function of age only.
# Use the qualified class name; a bare `LogisticRegression` is never imported.
clf_3 = sklearn.linear_model.LogisticRegression(C=1e10)

train_age = train[(pd.notnull(train.Age))]
# A one-column frame gives the 2-d X scikit-learn expects, and the target stays 1-d
# (Series.reshape is not available in current pandas)
clf_3.fit(train_age[['Age']], train_age.Survived)

In [None]:
clf_3.intercept_, clf_3.coef_, np.exp(clf_3.intercept_)

In [None]:
train_age.Survived[train_age.Fare < 28].groupby(train_age.Survived).count()

In [None]:
118/160

In [None]:
train_age.Survived[train_age.Age < 29].groupby(train_age.Survived).count()

In [None]:
124/177

In [None]:
(124/177)/(118/160)

In [None]:
(148/216)/(118/160)

In [None]:
def sigmoid(x):
    """Logistic function: squash ``x`` into the open interval (0, 1)."""
    decay = np.exp(-x)
    return 1/(1 + decay)

(clf_3.intercept_ + clf_3.coef_)[0][0]

In [None]:
fig, ax = plt.subplots(figsize=(16,10))
# clf_3 was fitted on Age, so both the scatter and the fitted curve use Age on
# the x-axis (the original mixed Fare and Age ranges).
train_age.plot(kind='scatter', x='Age', y='Survived', ax=ax)

ages = np.linspace(train_age.Age.min(), train_age.Age.max())
ax.plot(ages, [sigmoid((clf_3.intercept_ + clf_3.coef_*x)[0][0]) for x in ages])

In [None]:
#### age factor for female

In [None]:
clf_4 = sklearn.linear_model.LogisticRegression(C=1e10)

# Define the column list before slicing `train` with it
features_4 = ['Sex', 'Age', 'Survived']
train_4 = train[features_4]

In [None]:
clf_4.fit(train[features_4].drop('Survived', axis=1), train[features_4].Survived)

In [None]:
clf_4.intercept_, clf_4.coef_[0][0], clf_4.coef_[0][1]

In [None]:
fig, ax = plt.subplots(figsize=(5,10))
plt.scatter(train_4[train_4.Sex == 1].Age, train_4[train_4.Sex == 1].Survived)
plt.plot(np.linspace(train_4[train_4.Sex == 1].Age.min(), train_4[train_4.Sex == 1].Age.max()),
         [sigmoid(clf_4.intercept_[0] + clf_4.coef_[0][0] + clf_4.coef_[0][1]*x) 
             for x in np.linspace(train_4[train_4.Sex == 1].Age.min(), train_4[train_4.Sex == 1].Age.max())])

In [None]:
xx = np.linspace(-50, 50)
plt.plot(xx, sigmoid(xx))

In [None]:
#####

In [None]:
y_pred = clf.predict(new_train[features])
# y_pred

In [None]:
# confusion_matrix takes (y_true, y_pred): rows are true labels, columns are predictions
sklearn.metrics.confusion_matrix(new_train.Survived, y_pred)

In [None]:
# classification_report takes (y_true, y_pred); the reverse order swaps precision and recall
print(sklearn.metrics.classification_report(new_train.Survived, y_pred))

In [None]:
sklearn.preprocessing.scale(train[['Pclass', 'Sex']])

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cross_validation
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.learning_curve import learning_curve


# NOTE(review): this re-defines plot_learning_curve, shadowing the definition in
# the earlier "Helpers" cell -- consider keeping only one.
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : integer, cross-validation generator, optional
        If an integer is passed, it is the number of folds (defaults to 3).
        Specific cross-validation objects can be passed, see
        sklearn.cross_validation module for the list of possible objects

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).

    train_sizes : array-like, optional
        Training-set sizes forwarded to ``learning_curve``
        (default: five values from 0.1 to 1.0).

    Returns
    -------
    The ``plt`` module, with the figure drawn on it.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    # Summarise the scores across axis 1 for every training size
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    # Shaded bands: mean +/- one standard deviation
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt


# Demo on the scikit-learn digits data set: learning curves for two estimators.
digits = load_digits()
X, y = digits.data, digits.target


title = "Learning Curves (Naive Bayes)"
# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
cv = cross_validation.ShuffleSplit(digits.data.shape[0], n_iter=100,
                                   test_size=0.2, random_state=0)

estimator = GaussianNB()
plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv, n_jobs=4)

# Raw string: "\g" is not a valid escape sequence, so the plain literal only
# worked by accident (and warns on recent Python). The text itself is unchanged.
title = r"Learning Curves (SVM, RBF kernel, $\gamma=0.001$)"
# SVC is more expensive so we do a lower number of CV iterations:
cv = cross_validation.ShuffleSplit(digits.data.shape[0], n_iter=10,
                                   test_size=0.2, random_state=0)
estimator = SVC(gamma=0.001)
plot_learning_curve(estimator, title, X, y, (0.7, 1.01), cv=cv, n_jobs=4)

plt.show()

**Scrap**

In [None]:
fig, axes = plt.subplots(3,3, figsize=(16,16))

# By this point Sex has normally been encoded (0 = male, 1 = female), so comparing
# against the raw labels alone selects nothing. Accept both encodings.
population_survival_count = dataset.Survived.value_counts()
females_survival_count = dataset.Survived[dataset.Sex.isin(['female', 1])].value_counts().sort_index(ascending=True)
male_surivival_count = dataset.Survived[dataset.Sex.isin(['male', 0])].value_counts()

panels = [
    ('by Population', population_survival_count),
    ('for Females', females_survival_count),
    ('for Males', male_surivival_count),
]

# Row 0: absolute counts, row 1: the same counts as a share of the group
for col, (label, survival_count) in enumerate(panels):
    axes[0, col].set_title('Survival Count {}'.format(label))
    axes[0, col].set_ylabel('# Survived')
    axes[0, col].set_xlabel('Survived')
    survival_count.plot(kind='bar', ax=axes[0, col])

    axes[1, col].set_title('Survival Rate {}'.format(label))
    axes[1, col].set_ylabel('% Survived')
    axes[1, col].set_xlabel('Survived')
    (survival_count/survival_count.sum()).plot(kind='bar', ax=axes[1, col])


In [None]:
# Plots out the count on the first row and the percentage breakdown on the second given some feature name
def explore_features(features, dependent='Survived', df=dataset):
    """Plot how ``dependent`` is distributed for each value of a feature.

    Parameters
    ----------
    features : list of str
        One column name: draws one column of plots per unique value, with
        counts of ``dependent`` on the first row and their shares on the second.
        Two column names: only an empty grid of axes is created so far.
    dependent : str, optional
        Column whose value counts are plotted (default 'Survived').
    df : pandas.DataFrame, optional
        Data to plot; defaults to the ``dataset`` object that existed when
        this function was defined.
    """
    # We got only 1 feature, let's plot the count and the ratio
    if len(features) == 1:
        feature = features[0]
        unique_features = df[feature].dropna().unique()
        columns = len(unique_features)

        # squeeze=False keeps `axes` 2-d even when the feature has a single
        # unique value, so axes[row, col] below always works
        fig, axes = plt.subplots(2, columns, figsize=(4*columns, 2*4), squeeze=False)

        for col, val in enumerate(np.sort(unique_features)):
            feature_count = df[dependent][df[feature] == val].value_counts().sort_index(ascending=True)
            axes[0, col].set_title('{} Count for {}={}'.format(dependent, feature, val))
            if col == 0:
                axes[0, col].set_ylabel('# {}'.format(dependent))
            axes[0, col].set_xlabel(dependent)
            feature_count.plot(kind='bar', ax=axes[0, col])

            axes[1, col].set_title('{} Pct for {}={}'.format(dependent, feature, val))
            if col == 0:
                axes[1, col].set_ylabel('% {}'.format(dependent))
            axes[1, col].set_xlabel(dependent)
            (feature_count/feature_count.sum()).plot(kind='bar', ax=axes[1, col])

    # We got a list of 2 features: lay out one panel per (x value, y value) pair.
    # Nothing is drawn on the panels yet.
    elif len(features) == 2:
        feature_x = features[0]
        feature_y = features[1]

        unique_feature_x = df[feature_x].dropna().unique()
        unique_feature_y = df[feature_y].dropna().unique()

        columns = len(unique_feature_x)
        rows = len(unique_feature_y)

        fig, axes = plt.subplots(rows, columns, figsize=(4*columns, 4*rows), squeeze=False)
        
        

In [None]:
explore_features(['Sex'])

In [None]:
explore_features(['Pclass'])

In [None]:
explore_features(['Embarked'])