In [None]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

In [None]:
# Read the training and test splits from the working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))
# Keep both frames in one list so later cells can apply the same transformation to each.
combine = [train_df, test_df]

In [None]:
# List the column names of the training frame to see which features are available.
print(train_df.columns.values)

- Which features are categorical?
- Which features are Numerical? Are they discrete or continuous?

In [None]:
# Preview the data: show the first 10 rows of the training frame.
train_df.head(10)

- Which features are mixed data types?
- Which features may contain errors or typos?
- Which features contain blank, null or empty values?
- What are the data types for various features?

In [None]:
# Print dtype and non-null count for every column of both frames,
# separated by a horizontal rule, to spot missing values and mixed types.
train_df.info()
print('_'*40)
test_df.info()

In [None]:
# What is the distribution of numerical feature values across the samples?
# With no arguments, describe() summarises the numeric columns only
# (count, mean, std, min, quartiles, max).
train_df.describe()

- Total samples are 891 or 40% of the actual number of passengers on board the Titanic (2,224).

- Most passengers (> 75%) did not travel with parents or children.

- Fares varied significantly with few passengers paying as high as $512.

- Few elderly passengers.

In [None]:
# What is the distribution of categorical features?
# include=['O'] restricts the summary to object-dtype (string) columns,
# reporting count, number of unique values, most frequent value and its frequency.
train_df.describe(include=['O'])

- Names are unique across the dataset (count=unique=891)
- Sex variable has two possible values with 577 males (65%)
- Cabin values have several duplicates across samples => several passengers shared a cabin.


# Correlating.

We want to know how well each feature correlates with Survival.


# Challenges:
----------------------
- Find correlation between Pclass and survival rate
- Find other features that are correlated with survival rate
- Find other features that are not correlated with survival rate

<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.

In [None]:
# Mean survival rate per passenger class, highest first.
(
    train_df
    .groupby('Pclass', as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Mean survival rate per sex, highest first.
(
    train_df
    .groupby('Sex', as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Mean survival rate per number of siblings/spouses aboard, highest first.
(
    train_df
    .groupby('SibSp', as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Mean survival rate per number of parents/children aboard, highest first.
(
    train_df
    .groupby('Parch', as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)

# Completing.

1. We may want to complete Age feature as it is definitely correlated to survival.
2. We may want to complete the Embarked feature as it may also correlate with survival or another important feature.


# Correcting

1. Ticket feature may be dropped from our analysis as it contains high ratio of duplicates (22%) and there may not be a correlation between Ticket and survival.
2. Cabin feature may be dropped as it is highly incomplete or contains many null values both in training and test dataset.
3. PassengerId may be dropped from training dataset as it does not contribute to survival.
4. Name feature is relatively non-standard, may not contribute directly to survival, so may be dropped.

# Creating

1. We may want to create a new feature called Family based on Parch and SibSp to get total count of family members on board.
2. We may want to engineer the Name feature to extract Title as a new feature.
3. We may want to create a new feature for Age bands. This turns a continuous numerical feature into an ordinal categorical feature.
4. We may also want to create a Fare range feature if it helps our analysis.

# Classifying

We may also add to our assumptions based on the problem description noted earlier.

1. Women (Sex=female) were more likely to have survived.
2. Children (Age<?) were more likely to have survived.
3. The upper-class passengers (Pclass=1) were more likely to have survived.

# Analyze by visualizing data

**Correlating numerical features**

In [None]:
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# One Age histogram (20 bins) per value of Survived, laid out side by side.
g = sns.FacetGrid(data=train_df, col='Survived')
g.map(plt.hist, 'Age', bins=20)

**Observations**

- Infants (Age <=4) had high survival rate.
- Oldest passengers (Age = 80) survived.
- Large number of 15-25 year olds did not survive.
- Most passengers are in 15-35 age range.

**Correlating numerical and ordinal features**

We can combine multiple features for identifying correlations using a single plot. This can be done with numerical and categorical features which have numeric values.

In [None]:
# Age histograms on a grid: one row per Pclass, one column per Survived value.
# FacetGrid's per-facet size keyword is `height` (it was renamed from `size`;
# current seaborn releases reject the old name).
grid = sns.FacetGrid(train_df, col='Survived', row='Pclass', height=2.2, aspect=1.6)
grid.map(plt.hist, 'Age', alpha=.5, bins=20)
grid.add_legend();

**Observations**

- Pclass=3 had most passengers, however most did not survive.
- Infant passengers in Pclass=2 and Pclass=3 mostly survived.
- Most passengers in Pclass=1 survived.

**Correlating categorical features**

In [None]:
# Survival rate vs. Pclass, split by Sex, with one row per port of embarkation.
# `order` / `hue_order` are fixed so every facet uses the same category layout.
# FacetGrid's per-facet size keyword is `height` (renamed from `size`).
grid = sns.FacetGrid(train_df, row='Embarked', height=2.2, aspect=1.6)
grid.map(sns.pointplot, 'Pclass', 'Survived', 'Sex', palette='deep', order=[1,2,3], hue_order=["female","male"])
grid.add_legend()

**Observations**

- Female passengers had much better survival rate than males.
- Ports of embarkation have varying survival rates for Pclass=3 and among male passengers.

**Correlating categorical and numerical features**

We may also want to correlate categorical features (with non-numeric values) and numeric features. We can consider correlating Embarked (Categorical non-numeric), Sex (Categorical non-numeric), Fare (Numeric continuous), with Survived (Categorical numeric).

In [None]:
# Mean Fare per Sex on a grid: one row per port of embarkation, one column per Survived value.
# FacetGrid's per-facet size keyword is `height` (renamed from `size`).
grid = sns.FacetGrid(train_df, row='Embarked', col='Survived', height=2.2, aspect=1.6)
# An explicit `order` keeps the Sex categories in the same bar position in every facet;
# without it each facet orders categories by its own subset of the data.
grid.map(sns.barplot, 'Sex', 'Fare', alpha=.5, ci=None, order=['female', 'male'])
grid.add_legend()

**Observations**

- Higher fare paying passengers had better survival. Confirms our assumption for creating fare ranges.
- Port of embarkation correlates with survival rates.

# Wrangle data


We have collected several assumptions and decisions regarding our datasets and solution requirements. So far we did not have to change a single feature or value to arrive at these. Let us now execute our decisions and assumptions for correcting, creating, and completing goals.

**Correcting by dropping features**

In [None]:
# Drop the Ticket and Cabin features from both frames and report the shapes
# before and after so the effect of the drop is visible.
print("Before", train_df.shape, test_df.shape, combine[0].shape,
      combine[1].shape)

train_df = train_df.drop(['Ticket', 'Cabin'], axis=1)
test_df = test_df.drop(['Ticket', 'Cabin'], axis=1)
# Rebuild the list: drop() returned new frames, so the old list entries are stale.
combine = [train_df, test_df]

# Printed explicitly: a bare tuple expression in the middle of a cell is never displayed.
print("After", train_df.shape, test_df.shape, combine[0].shape,
      combine[1].shape)

In [None]:
# Preview the first rows of the training frame.
train_df.head()

**Creating new feature extracting from existing**

We want to analyze if Name feature can be engineered to extract titles and test correlation between titles and survival. We can replace many titles with a more common name or classify them as Rare.

# Challenge

Replace uncommon titles with 'Rare'
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.


In [None]:
# Extract the title from each Name: the run of letters that follows a space
# and is terminated by a period (e.g. " Mr." -> "Mr").
for dataset in combine:
    # Raw string avoids the invalid "\." escape; expand=False returns a Series
    # rather than a one-column DataFrame.
    dataset['Title'] = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Cross-tabulate titles against Sex to see which titles are gender-specific and which are rare.
pd.crosstab(train_df['Title'], train_df['Sex'])

In [None]:
# Titles collapsed into a single 'Rare' bucket.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don',
               'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
# Spelling variants folded into their common equivalent.
title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
# One lookup table covering every replacement; no target is itself a key,
# so a single pass gives the same result as applying the rules one by one.
title_fixes = {**{title: 'Rare' for title in rare_titles}, **title_aliases}

for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(title_fixes)

# Mean survival rate per consolidated title.
train_df.groupby('Title', as_index=False)['Survived'].mean()

In [None]:
# Preview the training frame, which now carries the Title column.
train_df.head()

In [None]:
# Convert the categorical titles to ordinal codes.
# Titles absent from the mapping (and missing titles) become 0.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

for dataset in combine:
    dataset['Title'] = dataset['Title'].map(title_mapping).fillna(0)

train_df.head()

In [None]:
# Title has been extracted, so Name can be dropped from both frames.
# PassengerId is dropped from the training frame only; the test frame keeps it.
train_df = train_df.drop(columns=['Name', 'PassengerId'])
test_df = test_df.drop(columns=['Name'])
# Rebuild the list so it points at the new frames.
combine = [train_df, test_df]
train_df.shape, test_df.shape

In [None]:
# Preview the training frame after dropping Name and PassengerId.
train_df.head()

**Converting a categorical feature**

Now we can convert features which contain strings to numerical values

In [None]:
# Encode Sex as an integer in both frames: female -> 1, male -> 0.
sex_codes = {'female': 1, 'male': 0}
for dataset in combine:
    encoded_sex = dataset['Sex'].map(sex_codes)
    dataset['Sex'] = encoded_sex.astype(int)

train_df.head()

**Completing a numerical continuous feature**

Now we should start estimating and completing features with missing or null values. We will first do this for the Age feature.

We can consider three methods to complete a numerical continuous feature.

1- A simple way is to generate random numbers between (mean - standard deviation) and (mean + standard deviation).

2- More accurate way of guessing missing values is to use other correlated features. In our case we note correlation among Age, Gender, and Pclass. Guess Age values using median values for Age across sets of Pclass and Gender feature combinations. So, median Age for Pclass=1 and Gender=0, Pclass=1 and Gender=1, and so on...

3- Combine methods 1 and 2. So instead of guessing age values based on median, use random numbers between (mean - standard deviation) and (mean + standard deviation), computed for each Pclass and Gender combination.

Method 1 and 3 will introduce random noise into our models. The results from multiple executions might vary. We will prefer method 2.

In [None]:
# guess_ages[sex, pclass - 1] holds the guessed Age for one Sex x Pclass group.
guess_ages = np.zeros((2, 3))

for dataset in combine:
    # Pass 1: for each of the six Sex x Pclass groups, take the median of the
    # known ages in this frame and round it to the nearest 0.5.
    for sex in (0, 1):
        for pclass in (1, 2, 3):
            in_group = (dataset['Sex'] == sex) & (dataset['Pclass'] == pclass)
            median_age = dataset.loc[in_group, 'Age'].dropna().median()
            guess_ages[sex, pclass - 1] = int(median_age / 0.5 + 0.5) * 0.5

    # Pass 2: write the group's guess into every row of that group whose Age is missing.
    for sex in (0, 1):
        for pclass in (1, 2, 3):
            needs_fill = (
                dataset['Age'].isnull()
                & (dataset['Sex'] == sex)
                & (dataset['Pclass'] == pclass)
            )
            dataset.loc[needs_fill, 'Age'] = guess_ages[sex, pclass - 1]

    # No missing ages remain, so the column can be cast to int (fractions are truncated).
    dataset['Age'] = dataset['Age'].astype(int)

train_df.head()

In [None]:
# Split Age into 5 equal-width bands and look at the mean survival rate in each.
train_df['AgeBand'] = pd.cut(train_df['Age'], bins=5)
survival_by_age_band = train_df[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean()
survival_by_age_band.sort_values(by='AgeBand', ascending=True)


In [None]:
# Replace Age with an ordinal band code in both frames:
#   <=16 -> 0, (16, 32] -> 1, (32, 48] -> 2, (48, 64] -> 3, >64 -> 4
inner_age_bands = ((16, 32, 1), (32, 48, 2), (48, 64, 3))

for dataset in combine:
    dataset.loc[dataset['Age'] <= 16, 'Age'] = 0
    for lower, upper, band in inner_age_bands:
        in_band = (dataset['Age'] > lower) & (dataset['Age'] <= upper)
        dataset.loc[in_band, 'Age'] = band
    dataset.loc[dataset['Age'] > 64, 'Age'] = 4

train_df.head()

In [None]:
# AgeBand was only needed to pick the band edges; drop it from the training frame.
train_df = train_df.drop(columns=['AgeBand'])
# Rebuild the list so it points at the new training frame.
combine = [train_df, test_df]
train_df.head()

**Create new feature combining existing features**

In [None]:
# FamilySize = siblings/spouses + parents/children + the passenger themselves.
for dataset in combine:
    relatives_aboard = dataset['SibSp'] + dataset['Parch']
    dataset['FamilySize'] = relatives_aboard + 1

# Mean survival rate per family size, highest first.
(
    train_df
    .groupby('FamilySize', as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)




 # Challenge
 
 Create feature called IsAlone for passengers who don't have children/spouses/parents/siblings
 
 <br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
 

In [None]:
# IsAlone is 1 for passengers whose FamilySize is exactly 1, else 0.
for dataset in combine:
    travels_alone = dataset['FamilySize'] == 1
    dataset['IsAlone'] = travels_alone.astype('int64')

# Mean survival rate for accompanied (0) vs. solo (1) passengers.
train_df.groupby('IsAlone', as_index=False)['Survived'].mean()

In [None]:
# Parch, SibSp and FamilySize are summarised by IsAlone; drop them from both frames.
family_columns = ['Parch', 'SibSp', 'FamilySize']
train_df = train_df.drop(columns=family_columns)
test_df = test_df.drop(columns=family_columns)
# Rebuild the list so it points at the new frames.
combine = [train_df, test_df]

train_df.head()

**Completing a categorical feature**

Embarked feature takes S, Q, C values based on port of embarkation. Our training dataset has two missing values. We simply fill these with the most common occurrence.


In [None]:
# Most frequent port of embarkation in the training frame (mode() ignores missing values).
freq_port = train_df['Embarked'].mode()[0]
print(freq_port)

In [None]:
# Fill missing Embarked values in both frames with the most frequent training port.
for dataset in combine:
    embarked_filled = dataset['Embarked'].fillna(freq_port)
    dataset['Embarked'] = embarked_filled

# Mean survival rate per port of embarkation, highest first.
(
    train_df
    .groupby('Embarked', as_index=False)['Survived']
    .mean()
    .sort_values(by='Survived', ascending=False)
)

**Convert categorical feature to numeric**

In [None]:
# Encode the port of embarkation as an integer in both frames: S -> 0, C -> 1, Q -> 2.
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].map(embarked_codes).astype(int)

print("After")
train_df.head()

**Quick completing and converting a numeric feature**

In [None]:
# Fill missing Fare values in the test frame with the median of its known fares
# (median() skips missing values). The result is assigned back to the column:
# calling fillna(inplace=True) on the selected column acts on a temporary object
# under pandas copy-on-write and would leave the frame unchanged.
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())
test_df.head()

# Challenge

- Create FareBand feature
- Convert the Fare feature to ordinal values based on the FareBand.



<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.
<br />.


 Time: 10 mins

In [None]:
# Split Fare into quartile bands and look at the mean survival rate in each.
train_df['FareBand'] = pd.qcut(train_df['Fare'], q=4)
survival_by_fare_band = train_df[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean()
survival_by_fare_band.sort_values(by='FareBand', ascending=True)


In [None]:
# Replace Fare with an ordinal band code in both frames:
#   <=7.91 -> 0, (7.91, 14.454] -> 1, (14.454, 31] -> 2, >31 -> 3
inner_fare_bands = ((7.91, 14.454, 1), (14.454, 31, 2))

for dataset in combine:
    dataset.loc[dataset['Fare'] <= 7.91, 'Fare'] = 0
    for lower, upper, band in inner_fare_bands:
        in_band = (dataset['Fare'] > lower) & (dataset['Fare'] <= upper)
        dataset.loc[in_band, 'Fare'] = band
    dataset.loc[dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

# FareBand was only needed to pick the band edges; drop it and rebuild the list.
train_df = train_df.drop(columns=['FareBand'])
combine = [train_df, test_df]

train_df.head(10)

In [None]:
# Preview the first 10 rows of the test frame.
test_df.head(10)

In [None]:
# machine learning
from sklearn.linear_model import LogisticRegression

In [None]:
# Features are every training column except the target; the target is Survived.
Y_train = train_df["Survived"]
X_train = train_df.drop(columns="Survived")
# PassengerId is an identifier, not a feature, so it is removed from the test features.
X_test = test_df.drop(columns="PassengerId").copy()
X_train.shape, Y_train.shape, X_test.shape

In [None]:
# Logistic Regression: fit on the training features, predict the test set.
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)

# Accuracy is measured on the training data itself, as a percentage with two decimals.
train_accuracy = logreg.score(X_train, Y_train)
acc_log = round(train_accuracy * 100, 2)
acc_log