In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import cos, sin

#!pip install category_encoders
#!pip install xgboost 
import xgboost as xgb


In [None]:
#df = pd.read_csv("../input/train_features.csv", header=0)

df = pd.read_csv("train_features.csv", header=0)
df_test = pd.read_csv("test_features.csv",header=0)
df_labels = pd.read_csv("train_labels.csv", header=0)
df_labels['status_group'].value_counts()
df = df.merge(df_labels, on='id')


In [None]:
# Based on the most voted answer we can easily define a function that gives us a dataframe to preview the missing values and the % of missing values in each column:
def missing_values_table(df):
        mis_val = df.isnull().sum()
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
            "There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")
        return mis_val_table_ren_columns
    
missing_values_table(df)

In [None]:
# Lets see how many objects there are in all object columns. 
counts_ = []
features = list(df.select_dtypes(include=['object']))
for x in features: counts_.append(df[x].nunique());
print(f'Encoding will create ~ {sum(counts_)} new features.')
print(f'Encoding will also delete {len(features)} features')

fc = []
for x in features: fc.append([x, df[x].nunique()])

pd.DataFrame(data=fc, columns = ['features','counts']).sort_values(by=['counts']).set_index(['features'])

In [None]:
# Encode my Y label and return a list of my labels. 
def labeler(dataframe, column):
    from sklearn.preprocessing import LabelEncoder
    dataframe =  dataframe.copy()
    le = LabelEncoder()
    dataframe['labels'] = le.fit_transform(dataframe[column])
    labels = [0,1,2]
    label_names = list(le.inverse_transform(labels))
    label_list = [labels, label_names]
    return dataframe['labels'], label_list

# Turn Lat/Long into x,y,z, coord plane. 
def lat_long(dataframe):
    from math import cos, sin 
    dataframe =  dataframe.copy()
    dataframe['x_coord'] = dataframe.latitude.apply(lambda x: cos(x)) * dataframe.longitude.apply(lambda x: cos(x))
    dataframe['y_coord'] = dataframe.latitude.apply(lambda x: cos(x)) * dataframe.longitude.apply(lambda x: sin(x))
    dataframe['z_coord'] = dataframe.latitude.apply(lambda x: sin(x))
    dataframe = dataframe.drop(columns=['latitude', 'longitude'])
    return dataframe

# Fix silly boolean issue. 
def no_bool(dataframe, columns):
    dataframe =  dataframe.copy()
    for column in columns:
        dataframe[column] = dataframe[column].replace({True: 'Yes', False: 'No'})
    return dataframe
        

In [None]:
# Fix or Enhance Features
data = df
data = lat_long(df)
data = no_bool(data, ['permit', 'public_meeting'])

# Define those datasets
X = data.drop(columns=['id', 'status_group','scheme_name', 'recorded_by',])
y, label_list = labeler(df, 'status_group')
y.head()

# First passthrough features. These are any I don't want to mess with. 
passthrough_features = []

# Ones to binary encode (high cardinality)
binary_features = ['date_recorded','lga','funder','installer', 'subvillage', 'wpt_name']

# Ones that aren't actually numeric.
not_numeric = ['region_code', 'district_code']

# Defining my one-hot variables. 
one_hot_features = list(X.select_dtypes(include=['object']))
for x in binary_features: one_hot_features.remove(x)
for x in not_numeric: one_hot_features.append(x)

# Define my numeric features
numeric_features = list(X.select_dtypes(include=['float64', 'int64']))
for x in not_numeric: numeric_features.remove(x)
one_hot_features

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, RobustScaler, StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_selection import f_classif, chi2, SelectKBest
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from category_encoders.hashing import HashingEncoder
from category_encoders.binary import BinaryEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

# Preprocessing pipelines for both numeric and categorical data.
# Using column_transformer https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html


# Define my custom pipeline functions for each type of data. Columns not expressly included are dropped.  
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

polynom_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()), 
    ('polynom', PolynomialFeatures())])

one_hot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ordinal', OrdinalEncoder()),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

binary_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('binary', BinaryEncoder(drop_invariant=True))])


# Create preprocessor pipeline
PreProcessor = ColumnTransformer(
    transformers=[
#        ('pass', 'passthrough', passthrough_features),
        ('biy', binary_transformer, binary_features),
        ('num', numeric_transformer, numeric_features),
        ('o-h', one_hot_transformer, one_hot_features)
    ],
    n_jobs = -2)

# Lets test it.

In [None]:
# Test Train Split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
print(X_train.shape, y_train.shape)
PreProcessor.fit_transform(X_train, y_train)

In [None]:
clf = make_pipeline(PreProcessor,  LogisticRegression())

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_val, y_val))

### Trying with Ridge Classifier

In [86]:
pipe = make_pipeline(
    PreProcessor, 
    SelectKBest(f_classif), 
    RidgeClassifier())

param_grid = {
    'selectkbest__k': range(1, len(X_train.columns)+1), 
    'ridgeclassifier__alpha': [0.001, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0]
    }

# Fit on the train set, with grid search cross-validation
gs = GridSearchCV(pipe, param_grid=param_grid, cv=5, 
                      scoring='accuracy', 
                      verbose=10, n_jobs=-2)
gs.fit(X_train, y_train)
print("model score: %.3f" % gs.score(X_train, y_train))
print("model score: %.3f" % gs.score(X_val, y_val))

Fitting 5 folds for each of 266 candidates, totalling 1330 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 11 concurrent workers.
[Parallel(n_jobs=-2)]: Done   3 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-2)]: Done  10 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-2)]: Done  19 tasks      | elapsed:   16.1s
[Parallel(n_jobs=-2)]: Done  28 tasks      | elapsed:   23.3s
[Parallel(n_jobs=-2)]: Done  39 tasks      | elapsed:   31.0s
[Parallel(n_jobs=-2)]: Done  50 tasks      | elapsed:   38.4s
[Parallel(n_jobs=-2)]: Done  63 tasks      | elapsed:   47.0s
[Parallel(n_jobs=-2)]: Done  76 tasks      | elapsed:   55.9s
[Parallel(n_jobs=-2)]: Done  91 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-2)]: Done 106 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-2)]: Done 123 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-2)]: Done 140 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-2)]: Done 159 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-2)]: Done 178 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-2)]: Done 199 tasks      | elapsed:  

model score: 0.753
model score: 0.702


In [96]:
pipe2 = make_pipeline(
    PreProcessor, 
    SelectKBest(f_classif), 
    LogisticRegression())

param_grid2 = {
    'selectkbest__k': range(1, len(X_train.columns)+1), 
    'logisticregression__C': [0.01, 0.1, 1, 10]
    }

# Fit on the train set, with grid search cross-validation
gs2 = GridSearchCV(pipe2, param_grid=param_grid2, cv=3, 
                      scoring='accuracy', 
                      verbose=10, n_jobs=-2)
gs2.fit(X_train, y_train)
print("model score: %.3f" % gs2.score(X_train, y_train))
print("model score: %.3f" % gs2.score(X_val, y_val))

Fitting 3 folds for each of 152 candidates, totalling 456 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 11 concurrent workers.
[Parallel(n_jobs=-2)]: Done   3 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-2)]: Done  10 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-2)]: Done  19 tasks      | elapsed:   14.1s
[Parallel(n_jobs=-2)]: Done  28 tasks      | elapsed:   19.4s
[Parallel(n_jobs=-2)]: Done  39 tasks      | elapsed:   25.6s
[Parallel(n_jobs=-2)]: Done  50 tasks      | elapsed:   31.7s
[Parallel(n_jobs=-2)]: Done  63 tasks      | elapsed:   38.9s
[Parallel(n_jobs=-2)]: Done  76 tasks      | elapsed:   45.6s
[Parallel(n_jobs=-2)]: Done  91 tasks      | elapsed:   55.4s
[Parallel(n_jobs=-2)]: Done 106 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-2)]: Done 123 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-2)]: Done 140 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-2)]: Done 159 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-2)]: Done 178 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-2)]: Done 199 tasks      | elapsed:  

model score: 0.717
model score: 0.707


In [98]:
# XGBoost
XGPipe = make_pipeline(PreProcessor, SelectKBest(f_classif), XGBClassifier())

param_grid = {
    'selectkbest__k': range(1, len(X_train.columns)+1), 
    'xgbclassifier__learning_rate':[0.001, 0.0045, 0.0065, 0.010], 
    'xgbclassifier__reg_lambda':[0, 0.01, 0.10, 0.50, 1]
    }
grid = {}


# Fit on the train set, with grid search cross-validation
XGsearch = GridSearchCV(XGPipe, param_grid=param_grid, cv=3, 
                      scoring='accuracy', 
                      verbose=10, n_jobs=-2)
XGsearch.fit(X_train, y_train)
print("model score: %.3f" % XGsearch.score(X_train, y_train))
print("model score: %.3f" % XGsearch.score(X_val, y_val))




#scores = cross_validate(clf, X, y, cv=3, n_jobs=-2, scoring='accuracy',return_train_score=True, return_estimator=True)
#print(scores)
#print('Accuracy: %.3f stdev: %.2f' % (np.mean(np.abs(scores)), np.std(scores)))
#clf.fit(X_train,y_train)

# You can just drop it into a pandas dataframe and BOOM: pretty print! 
# pd.DataFrame(scores).rename(columns={"test_score": 'validation_score'})
# The test score is actually the scores from each validation cycle.

Fitting 3 folds for each of 760 candidates, totalling 2280 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 11 concurrent workers.
[Parallel(n_jobs=-2)]: Done   3 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-2)]: Done  10 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-2)]: Done  19 tasks      | elapsed:   15.4s
[Parallel(n_jobs=-2)]: Done  28 tasks      | elapsed:   21.7s
[Parallel(n_jobs=-2)]: Done  39 tasks      | elapsed:   28.2s
[Parallel(n_jobs=-2)]: Done  50 tasks      | elapsed:   34.9s
[Parallel(n_jobs=-2)]: Done  63 tasks      | elapsed:   41.7s
[Parallel(n_jobs=-2)]: Done  76 tasks      | elapsed:   48.7s
[Parallel(n_jobs=-2)]: Done  91 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-2)]: Done 106 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-2)]: Done 123 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-2)]: Done 140 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-2)]: Done 159 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-2)]: Done 178 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-2)]: Done 199 tasks      | elapsed:  

model score: 0.699
model score: 0.691


In [None]:
from sklearn.feature_selection import RFECV
clf7 = make_pipeline(PreProcessor, RFECV(estimator=XGBClassifier(), step=1, min_features_to_select=1, cv=3, scoring='accuracy', verbose=10, n_jobs=-2))
clf7.fit(X_train, y_train)
print("model score: %.3f" % clf7.score(X_train, y_train))
print("model score: %.3f" % clf7.score(X_val, y_val))

In [None]:
from sklearn.feature_selection import RFECV
clf8 = make_pipeline(PreProcessor, RFECV(estimator=LogisticRegression(), step=1, min_features_to_select=1, cv=3, scoring='accuracy', verbose=10, n_jobs=-2))
clf8.fit(X_train, y_train)
print("model score: %.3f" % clf8.score(X_train, y_train))
print("model score: %.3f" % clf8.score(X_val, y_val))

In [None]:
# Rewriting my Dummy Regression Baseline one as a function
def baseline(data):
    name = "Dummy Regression Baseline"
    # Split data into train and test
    X_train, X_test, y_train, y_test = split(data)

    # Define an estimator and param_grid
    # WHEN DEFINING YOU CAN GIVE IT A NAME OTHERWISE IT WILL USE THE PIPELINE NAME AUTOGEN NAME (name of the function but lowercase)
    pipe = make_pipeline(
        PreProcesser(), 
        DummyRegressor(strategy='mean'))
    pipe.fit(X_train, y_train)

    scorer = 'MAE'

    ### Get the scores with the MAE Function
    y_pred_train = pipe.predict(X_train)  
    y_pred_test = pipe.predict(X_test)

    train_score = mean_absolute_error(y_train, y_pred_train)
    test_score = mean_absolute_error(y_test, y_pred_test)
    score_variance = test_score - train_score
    cv_score = 0.0000000000000
    best_params = pipe.get_params
    best_estimator = ""
    selected_names = list(X_train.columns)
    unselected_names = []

    return [name, scorer, train_score, test_score, score_variance, cv_score, selected_names, unselected_names, best_params, best_estimator]


# Rewriting my GridSearch CV as a function 
def compare(data, name):
    X_train, X_test, y_train, y_test = split(data)

    pipe = make_pipeline(
        PreProcessor, 
        SelectKBest(f_regression), 
        Ridge())

    param_grid = {
        'selectkbest__k': range(1, len(X_train.columns)+1), 
        'ridge__alpha': [0.1, 1.0, 10.]
    }

    scorer = 'MAE'

    # Fit on the train set, with grid search cross-validation
    gs = GridSearchCV(pipe, param_grid=param_grid, cv=3, 
                      scoring='neg_mean_absolute_error', 
                      verbose=0)
    gs.fit(X_train, y_train)

    train_score = -gs.score(X_train, y_train)
    test_score = -gs.score(X_test, y_test)
    score_variance = test_score - train_score
    cv_score = -gs.best_score_
    best_params = gs.best_params_
    best_estimator = gs.best_estimator_

    # selected features? 
    # 'selectkbest' is the autogenerated name of the SelectKBest() function in the pipeline
    selector = gs.best_estimator_.named_steps['selectkbest']
    all_names = X_train.columns

    # get_support returns a mask of the columns in True / False
    selected_mask = selector.get_support()
    # Passing the boolean list as the column names creates a masked list.  
    selected_names = list(all_names[selected_mask])
    unselected_names = list(all_names[~selected_mask])

    return [name, scorer, train_score, test_score, score_variance, cv_score, selected_names, unselected_names, best_params, best_estimator]

In [None]:
#Create a  DataFrame with the passengers ids and our prediction regarding whether they survived or not
submission = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':predictions})

#Visualize the first 5 rows
submission.head()

#Convert DataFrame to a csv file that can be uploaded
#This is saved in the same directory as your notebook
filename = 'Titanic Predictions 1.csv'
submission.to_csv(filename,index=False)
print('Saved file: ' + filename)