For more information about working with Colaboratory notebooks, see [Overview of Colaboratory](/notebooks/basic_features_overview.ipynb).


# package imports

In [0]:
import sklearn
import pandas as pd
import numpy as np

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning imports

from sklearn import ensemble
from sklearn import gaussian_process
from sklearn import linear_model
from sklearn import tree
from sklearn import naive_bayes
from sklearn import svm
from sklearn import discriminant_analysis
from sklearn import neighbors
from sklearn import model_selection
from sklearn import metrics

In [0]:
# Colab-specific helper module (NOTE(review): presumably only importable inside a Colab runtime).
from google.colab import drive
# Mount Google Drive under /content/drive, the root that the data paths in this
# notebook start from. force_remount=True is passed explicitly -- presumably to
# get a fresh mount even when one already exists; confirm against the Colab docs.
drive.mount('/content/drive', force_remount=True)

# Raw data imports

In [0]:
# Folder on the mounted Drive that holds the Titanic CSV files.
DIR = "/content/drive/My Drive/Colab Notebooks/titanic/"

# Read both CSV files from that folder into DataFrames.
train_df, test_df = (
    pd.read_csv(f"{DIR}{file_name}") for file_name in ("train.csv", "test.csv")
)

In [0]:
! ls drive/'My Drive'/'Colab Notebooks'/

# explore data

In [0]:
# Columns used as model features further down in the notebook.
cols = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
]

## explore - all in one

In [0]:
# A notebook cell only renders its *final* expression automatically, so the
# intermediate tables are passed to display() (available by default in
# IPython/Colab kernels); otherwise they would be computed and discarded.

# Column dtypes and non-null counts (info() prints its report itself).
train_df.info()

# First rows of the raw training data.
display(train_df.head())

# Summary statistics: numeric columns, then object (string) columns.
display(train_df.describe())
display(train_df.describe(include=['O']))

# Mean of Survived per Pclass, highest first.
display(
    train_df[['Pclass', 'Survived']]
    .groupby(['Pclass'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

# Counts of Survived (rows) against Sex (columns); left as the cell's final
# expression so it remains the cell's output value.
pd.crosstab(train_df['Survived'], train_df['Sex'])

## Group bys

In [0]:
cols_show = ["Pclass", "Sex", "Fare", "Survived"]

# Per-Sex mean of the other selected columns.
(
    train_df
    .loc[:, cols_show]
    .groupby("Sex", as_index=False)
    .mean()
    .head()
)


# prepare data

In [0]:
# Hold both frames in one list so the preprocessing loop can apply the same
# steps to each. The list stores references, so changes made through `combine`
# are made to train_df / test_df themselves.
combine = [train_df, test_df]

In [0]:
# Integer codes for the two categorical feature columns.
sex_codes      = {'female': 1, 'male': 0}
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}

# Step 1 -- encode the categorical columns in both frames.
# The dtype guard makes this cell safe to re-run: calling .map() on a column
# that is already numeric would turn every value into NaN, because the codes
# are not keys of the mapping.
for dataset in combine:
  if not pd.api.types.is_numeric_dtype(dataset['Sex']):
    dataset['Sex']      = dataset['Sex'].map(sex_codes)
  if not pd.api.types.is_numeric_dtype(dataset['Embarked']):
    dataset['Embarked'] = dataset['Embarked'].map(embarked_codes)

# Step 2 -- imputation values, taken from the training frame only and computed
# once, since they do not depend on which frame is being filled.
# freq_port is read after encoding, so it is a numeric code like the column it fills.
freq_port = train_df.Embarked.dropna().mode()[0]
mean_age  = train_df.Age.dropna().mean()
mean_fare = train_df.Fare.dropna().mean()

# Step 3 -- fill missing values in both frames with the training-set statistics.
for dataset in combine:
  dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)
  dataset['Age']      = dataset['Age'].fillna(mean_age)
  dataset['Fare']     = dataset['Fare'].fillna(mean_fare)

In [0]:
# The target column is read from the training frame only; no target is taken
# from test_df.
Y_train = train_df["Survived"]

# Both feature matrices use the same column selection.
X_train, X_test = train_df[cols], test_df[cols]

# model fitting and predictions

In [0]:
# A random forest is a randomised estimator: without a fixed random_state the
# fitted model, its predictions and every score computed from it change from
# run to run. Seeding it keeps "Restart & Run All" reproducible.
# Any other classifier can be substituted here.
model = ensemble.RandomForestClassifier(random_state=0)

# Fit on the training features/target, then predict for the test features.
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

# model evaluation

In [0]:
# Score on the same rows the model was fitted on (not a held-out estimate).
train_accuracy = model.score(X_train, Y_train)
train_accuracy

In [0]:
# Confusion matrix of the model's predictions on the training rows.
train_predictions = model.predict(X_train)
cnf_matrix = metrics.confusion_matrix(Y_train, train_predictions)
print(cnf_matrix)

# Off-diagonal cells are the misclassified rows; their share of all rows is the error rate.
print(f"error from confusion matrix is {(cnf_matrix[0,1]+cnf_matrix[1,0])/cnf_matrix.sum():2.4f}")

In [0]:
# Five-fold cross-validation of the model on the training data; one score per fold.
scores = model_selection.cross_val_score(
    estimator=model,
    X=X_train,
    y=Y_train,
    cv=5,
)
print(scores)

# Visualization

## Seaborn style

In [0]:
# One large axes; seaborn draws onto it through the `ax` argument.
fig, axis1 = plt.subplots(nrows=1, ncols=1, figsize=(14, 12))
sns.barplot(
    data=train_df,
    x='Sex',
    y='Survived',
    hue='Embarked',
    ax=axis1,
)
axis1.set_title('Sex vs Embarked Survival Comparison')


## Matplotlib style

In [0]:
plt.figure(figsize=(10, 6))

# Fare against Age, coloured by Survived and sized by Parch.
# The size uses (Parch + 1) so that rows with Parch == 0 still get a visible
# marker; with Parch * 30 their marker area is 0 and they are not drawn.
points = plt.scatter(
    x="Age",
    y="Fare",
    c="Survived",
    s=(train_df["Parch"] + 1) * 30,
    alpha=0.7,
    data=train_df,
    cmap=plt.cm.Accent,
)

# A bare plt.legend() finds no labelled artists here and produces an empty
# legend; build the entries from the scatter's colour values instead.
plt.legend(*points.legend_elements(), title="Survived")

plt.xlabel("Age")
plt.ylabel("Fare")
plt.title("Fare vs Age")
# Fixed y-range for the plot; fares above 300 fall outside the visible area.
plt.ylim((-10, 300))

## pandas style

In [0]:
# Kernel density estimate of the Age column, drawn through the pandas plotting API.
age_values = train_df["Age"]
ax = age_values.plot(kind="kde")
ax.set_title("probability distribution of age")
ax.set_xlabel("Age")