In [None]:
import numpy as np
import pandas as pd
import scipy as sp
import sklearn as sk
from sklearn import preprocessing, feature_extraction, feature_selection, model_selection, metrics
import xgboost as xgb
import matplotlib.pyplot as plt

In [None]:
project = 'kaggle_titanic'

import os.path
import sys
import warnings

# Locate the project root: everything in the working directory up to and
# including the last occurrence of the project name.
current_dir = os.path.abspath('./')
project_pos = current_dir.rfind(project)
if project_pos < 0:
    # rfind() returns -1 when the notebook is not run from inside the project;
    # slicing with that value would produce a meaningless prefix, so fall back
    # to the working directory and say so.
    warnings.warn("'{}' not found in '{}'; using the working directory as "
                  "project_dir".format(project, current_dir))
    project_root = current_dir
else:
    project_root = current_dir[:project_pos + len(project)]

# Joining with '' guarantees exactly one trailing separator, so that
# `project_dir + 'data/...'` is valid even when the working directory *is*
# the project root (where there is no character after the project name).
project_dir = os.path.join(project_root, '')
sys.path.insert(0, project_dir)

# import

In [None]:
# Raw Kaggle CSV files, addressed relative to the project root
# (project_dir is expected to end with a path separator).
train_path = ''.join((project_dir, 'data/raw/train.csv'))
test_path = ''.join((project_dir, 'data/raw/test.csv'))

In [None]:
# DataFrame.from_csv was deprecated and later removed from pandas; read_csv
# with index_col=0 and parse_dates=True reproduces its defaults (first column
# becomes the index, index parsed as dates where possible).
train_df = pd.read_csv(train_path, index_col=0, parse_dates=True)
test_df = pd.read_csv(test_path, index_col=0, parse_dates=True)

## validation

In [None]:
# Class balance of the target: the training set is slightly imbalanced.
train_df.loc[:, 'Survived'].value_counts()

In [None]:
# Print the frequency table of every training column, one after another.
for column_name in train_df.columns:
    counts = train_df[column_name].value_counts()
    print(counts)

## preprocessing

In [None]:
class MLPrep(object):
    """Turn a DataFrame into a feature matrix ``X`` and a label vector ``y``.

    The instance works on a private copy of ``df``. Typical use is
    ``replace()`` (value clean-up), ``feature()`` (derived columns), then
    ``get_X()`` / ``get_y()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is copied, the caller's frame is never modified.
    x_columns : list of str
        Columns handed to the vectorizer in ``get_X``.
    y_column : str
        Column holding the target labels.
    replace_dict : dict, optional
        Default mapping for ``replace()``: ``{column: {value: replacement}}``.
    feature_dict : dict, optional
        Default mapping for ``feature()``: ``{new_column: function}``.
    *args, **kwargs
        Accepted and ignored.
    """

    def __init__(self, df, x_columns, y_column,
                 replace_dict=None, feature_dict=None, *args, **kwargs):
        super(MLPrep, self).__init__()
        self.df = df.copy()
        self.x_columns = x_columns
        self.y_column = y_column
        self.replace_dict = replace_dict
        self.feature_dict = feature_dict

    @staticmethod
    def _is_nan(value):
        """Return True when ``value`` is a float NaN (any NaN object, not only ``np.nan``)."""
        return isinstance(value, (float, np.floating)) and bool(np.isnan(value))

    def replace(self, replace_dict=None):
        """Replace values in columns by static values or by function results.

        ``replace_dict`` has the form ``{column: {value: replacement}}``:

        * ``value`` may be NaN (matches missing entries), a callable that takes
          the column and returns a boolean mask, or a plain value compared
          with ``==``.
        * ``replacement`` may be a static value or a callable that is applied
          to the whole column (e.g. ``np.mean``) to compute the value.

        When ``replace_dict`` is omitted the dict given to the constructor is
        used; when neither exists a message is printed and nothing happens.
        """
        if replace_dict is not None:
            self.replace_dict = replace_dict
        if self.replace_dict is None:
            print('Nothing to do, no dict specified')
            # Return here: there is no mapping to iterate over.
            return

        for column, replace_item in self.replace_dict.items():
            for value, replacement in replace_item.items():
                series = self.df[column]
                # The replacement is computed on the column as it is *before*
                # this particular substitution is applied.
                if callable(replacement):
                    replace_val = replacement(series)
                else:
                    replace_val = replacement

                if self._is_nan(value):
                    mask = series.isnull()
                elif callable(value):
                    mask = value(series)
                else:
                    mask = series == value
                self.df.loc[mask, column] = replace_val

    def feature(self, feature_dict=None):
        """Call feature building functions and store their results as columns.

        feature_dict is of form:
        {'new_col': function}
        {'new_col': (function, (args,))}
        best way for function with extra arguments:
        {'new_col': partial(function, kwarg=val)}

        function must act on whole df
        function(df, **kwargs)

        When ``feature_dict`` is omitted the dict given to the constructor is
        used; when neither exists a message is printed and nothing happens.
        """
        if feature_dict is not None:
            self.feature_dict = feature_dict
        if self.feature_dict is None:
            print('Nothing to do, no dict specified')
            # Return here: there is no mapping to iterate over.
            return

        for column, func in self.feature_dict.items():
            if isinstance(func, tuple):
                # (function, args) form: extra positional arguments follow df.
                builder, extra_args = func[0], func[1]
                self.df[column] = builder(self.df, *extra_args)
            else:
                self.df[column] = func(self.df)

    def get_X(self, vectorizer=None, sparse=True, return_features=False):
        """Vectorize ``x_columns`` into a feature matrix.

        Parameters
        ----------
        vectorizer : optional
            Object exposing ``transform`` and ``vocabulary_``; it is used
            as-is (``transform`` only). When omitted a new ``DictVectorizer``
            is created and fitted on this data.
        sparse : bool
            Passed to ``DictVectorizer`` when a new one is created.
        return_features : bool
            When True, also return the vectorizer's ``vocabulary_``.

        Returns
        -------
        X, or ``(X, feature_columns_)`` when ``return_features`` is True.
        The vectorizer and its vocabulary are kept on the instance as
        ``vectorizer_`` and ``feature_columns_``.
        """
        X_dict = self.df[self.x_columns].to_dict(orient='records')
        if vectorizer is None:
            self.vectorizer_ = feature_extraction.DictVectorizer(sparse=sparse)
            X = self.vectorizer_.fit_transform(X_dict)
        else:
            self.vectorizer_ = vectorizer
            X = self.vectorizer_.transform(X_dict)

        self.feature_columns_ = self.vectorizer_.vocabulary_

        if return_features:
            return X, self.feature_columns_
        return X

    def get_y(self):
        """Return the target column encoded as integer codes.

        The distinct labels are sorted and numbered ``0 .. n-1``. Missing
        labels, if any, are encoded as ``-1`` (``pd.factorize`` default).
        """
        # factorize(sort=True) gives the same sorted-label numbering as a
        # value-by-value replace, but always yields an integer array.
        codes, _ = pd.factorize(self.df[self.y_column], sort=True)
        return np.asarray(codes)
        

In [None]:
# Model inputs; 'Deck' is not a raw column, it is derived from 'Cabin'.
x_columns = [
    'Pclass', 'Sex', 'Age', 'SibSp',
    'Parch', 'Fare', 'Embarked', 'Deck',
]
y_column = 'Survived'

# Missing-value handling per column: numeric columns get the column mean,
# text columns get the literal string 'nan'.
replace_dict = {
    'Age': {np.nan: np.mean},
    'Fare': {np.nan: np.mean},
    'Embarked': {np.nan: 'nan'},
    'Cabin': {np.nan: 'nan'},
}

def get_deck(df, col):
    """Return the first character of every entry in column ``col`` of ``df``.

    Used to derive the deck letter from a cabin string. Missing entries stay
    missing instead of raising.
    """
    # Honour `col` rather than a hard-coded column name.
    return df[col].str[:1]

from functools import partial

# Derived columns: 'Deck' comes from get_deck applied to 'Cabin';
# 'none' is a constant-zero column.
feature_dict = {
    'Deck': partial(get_deck, col='Cabin'),
    'none': lambda df: 0,
}

# feature model
fm = MLPrep(train_df, x_columns, y_column,
            replace_dict=replace_dict, feature_dict=feature_dict)

# Clean values first, then build the derived columns.
fm.replace()
fm.feature()

X, y = fm.get_X(), fm.get_y()

In [None]:
# Show the vocabulary_ of the vectorizer that get_X() used to build X.
fm.feature_columns_

In [None]:
# Run scikit-learn's chi2 feature-selection test on the vectorized features X
# against the encoded target y; the result is shown as the cell output.
feature_selection.chi2(X, y)

## xgboost

In [None]:
# Baseline hyper-parameters handed to the XGBoost classifier.
xgb_params = dict(
    objective='binary:logistic',
    max_depth=3,
    learning_rate=0.1,
    n_estimators=60,
    gamma=0,
    max_delta_step=0,
    nthread=-1,
    silent=False,
)

In [None]:
# Build the XGBoost classifier from the hyper-parameters in xgb_params.
model = xgb.XGBClassifier(**xgb_params)

In [None]:
# xgb feature importance: fit on the full data, then draw one importance
# chart per importance type.
model.fit(X, y)
for importance_type in ('weight', 'cover', 'gain'):
    xgb.plot_importance(model, importance_type=importance_type)
plt.show()

In [None]:
import pickle
import joblib

# Persist the fitted model twice, once with pickle and once with joblib.
# The files are opened in `with` blocks so the handles are flushed and closed
# deterministically instead of being left to the garbage collector.
with open('modelpickle', 'wb') as pickle_file:
    pickle.dump(model, pickle_file)

with open('modeljobib', 'wb') as joblib_file:
    joblib.dump(model, joblib_file)

In [None]:
# manual cross val: 4 stratified folds, refit and score the model on each.
# X_train / X_test / y_train / y_test keep the values of the last fold after
# the loop and are reused by the metrics cells further down.
cv_split = model_selection.StratifiedKFold(n_splits=4).split(X, y)

for train_idx, test_idx in cv_split:
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]
    model.fit(X_train, y_train)
    fold_accuracy = model.score(X_test, y_test)
    print(fold_accuracy)

In [None]:
# More compact than the manual loop above: 4-fold cross-validated accuracy,
# computed with all available cores (n_jobs=-1).
model_selection.cross_val_score(model, X, y, scoring='accuracy', cv=4, n_jobs=-1)

In [None]:
# parameter search
param_grid = {
    # parameters that are actually varied
    'max_depth': [3, 4, 5],
    'learning_rate': [0.03, 0.04, 0.1],
    'n_estimators': [60, 100, 200],
    # parameters pinned to a single value
    'gamma': [0],
    'min_child_weight': [1],
    'max_delta_step': [0],
    'subsample': [1],
    'colsample_bytree': [1],
    'colsample_bylevel': [1],
    'reg_alpha': [0],
    'reg_lambda': [1],
    'scale_pos_weight': [1],
}
search_model = model_selection.GridSearchCV(
    model,
    param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=4,
    verbose=1,
)
search_model.fit(X, y)
print(search_model.best_params_)
# Score of the fitted search on the same data it was fitted on.
search_model.score(X, y)

In [None]:
# Score as a function of one hyper-parameter. The result depends heavily on
# the CV train/test split, so it is better to average over multiple runs.
param_name = 'n_estimators'
param_range = range(10, 250, 20)
# param_name / param_range are passed by keyword: current scikit-learn
# releases only accept them as keyword arguments.
param_scores = model_selection.validation_curve(
    model, X, y,
    param_name=param_name, param_range=param_range,
    cv=4, scoring='accuracy', n_jobs=-1)

# mean score on the training folds
plt.plot(param_range, np.mean(param_scores[0], axis=1), label='train')
# mean score on the held-out folds
plt.plot(param_range, np.mean(param_scores[1], axis=1), label='validation')
plt.xlabel(param_name)
plt.ylabel('accuracy')
plt.legend()

plt.show()

In [None]:
# learning curve: how the score changes as the number of training samples grows
learning_scores = model_selection.learning_curve(
    model, X, y,
    train_sizes=np.linspace(0.1, 1.0, 5),
    cv=4,
    scoring='accuracy',
    n_jobs=-1,
)
train_sizes, train_scores, test_scores = learning_scores

# mean score over folds on the training set
plt.plot(train_sizes, train_scores.mean(axis=1))
# mean score over folds on the test set
plt.plot(train_sizes, test_scores.mean(axis=1))
plt.show()

In [None]:
# Permutation test of the 4-fold cross-validated accuracy; the result is
# shown as the cell output.
model_selection.permutation_test_score(model, X, y, cv=4, scoring='accuracy', n_jobs=-1)

## metrics

In [None]:
# finally: classification
# NOTE: X_test is whatever the manual cross-validation loop left behind
# (its last fold), so that cell must have been run before this one.
# NOTE(review): `model` is presumably still in the state of that loop's last
# fit -- confirm that the scikit-learn helper calls in between do not refit it.
y_pred = model.predict(X_test)        # predicted labels
y_proba = model.predict_proba(X_test) # predicted probabilities

In [None]:
# confusion matrix of the held-out labels (y_test) against the predicted
# labels (y_pred)
metrics.confusion_matrix(y_test, y_pred)

In [None]:
# Threshold-based curves need continuous scores: with hard 0/1 predictions
# the curve collapses to a single operating point. Column 1 of y_proba is
# used as the positive-class score (assumes the positive class is encoded
# as 1, as produced by the label encoding of y).
prec, rec, thresh = metrics.precision_recall_curve(y_test, y_proba[:, 1])

# Conventional orientation: recall on the x-axis, precision on the y-axis.
plt.plot(rec, prec)
plt.xlabel('recall')
plt.ylabel('precision')
plt.show()
# Average precision summarises the curve, so it also takes the scores.
print(metrics.average_precision_score(y_test, y_proba[:, 1]))
# F1 is defined on hard decisions, so it keeps using the predicted labels.
print(metrics.f1_score(y_test, y_pred))

In [None]:
# ROC curve
# Built from the positive-class probabilities (column 1 of y_proba); hard
# 0/1 predictions would give only a single interior point on the curve.
fpr, tpr, thresh = metrics.roc_curve(y_test, y_proba[:, 1], drop_intermediate=False)
plt.plot(fpr, tpr)
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.show()
# AUC
metrics.auc(fpr, tpr)

In [None]:
# Log loss scores predicted probabilities; feeding hard labels would treat
# every prediction as fully confident and distort the value.
metrics.log_loss(y_test, y_proba)

In [None]:
# Matthews correlation coefficient: a balanced metric for binary problems
# (even imbalanced ones); MCC lies in [-1, 1].
metrics.matthews_corrcoef(y_test, y_pred)

# generic models

In [None]:
from sklearn import tree, ensemble, naive_bayes, neighbors, svm

In [None]:
# Hyper-parameters for the decision-tree classifier.
tree_params = dict(
    criterion='gini',
    max_depth=4,
    min_samples_split=2,
)

In [None]:
# Build a decision-tree classifier from tree_params.
# Note: this rebinds `model`, replacing the XGBoost classifier used above.
model = tree.DecisionTreeClassifier(**tree_params)

# statistical analysis