In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import cos, sin

In [3]:
# Load the training features, the test features and the training labels from
# CSV files in the current working directory.
# Alternative input path kept from an earlier version of this cell:
#df = pd.read_csv("../input/train_features.csv", header=0)

df = pd.read_csv("train_features.csv", header=0)
df_test = pd.read_csv("test_features.csv",header=0)
df_labels = pd.read_csv("train_labels.csv", header=0)
# Class counts of the target. This is not the last expression of the cell, so
# the result is computed and discarded rather than displayed.
df_labels['status_group'].value_counts()
# Attach the labels to the training features, matching rows on the shared 'id'
# column (pandas' default inner join).
df = df.merge(df_labels, on='id')


In [4]:
# Based on a highly voted online answer: a helper that returns a DataFrame summarizing, for each column with missing data, the number of missing values and their percentage of all rows.
def missing_values_table(df):
    """Summarise the missing values of ``df`` column by column.

    Prints the total number of columns and how many of them contain missing
    values, then returns a DataFrame indexed by column name with two columns,
    'Missing Values' (count) and '% of Total Values' (percentage of rows,
    rounded to one decimal). Only columns whose percentage is non-zero are
    kept, sorted from most to least missing.
    """
    null_counts = df.isnull().sum()
    null_percent = 100 * null_counts / len(df)

    summary = pd.concat([null_counts, null_percent], axis=1)
    summary = summary.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})

    # Keep only the columns that actually have something missing.
    has_missing = summary.iloc[:, 1] != 0
    summary = summary[has_missing]
    summary = summary.sort_values('% of Total Values', ascending=False)
    summary = summary.round(1)

    total_columns = str(df.shape[1])
    affected_columns = str(summary.shape[0])
    print("Your selected dataframe has " + total_columns + " columns.\n"
          "There are " + affected_columns +
          " columns that have missing values.")
    return summary
    
missing_values_table(df)

Your selected dataframe has 41 columns.
There are 7 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
scheme_name,28166,47.4
scheme_management,3877,6.5
installer,3655,6.2
funder,3635,6.1
public_meeting,3334,5.6
permit,3056,5.1
subvillage,371,0.6


In [5]:
# Count the distinct values in every object-dtype column to gauge how many
# columns an encoding of them would add.
features = list(df.select_dtypes(include=['object']))
counts_ = [df[name].nunique() for name in features]
print(f'Encoding will create ~ {sum(counts_)} new features.')
print(f'Encoding will also delete {len(features)} features')

# One [column name, distinct-value count] row per object column.
fc = [[name, df[name].nunique()] for name in features]

# Display the counts from lowest to highest cardinality, indexed by column name.
(
    pd.DataFrame(data=fc, columns=['features', 'counts'])
    .sort_values(by=['counts'])
    .set_index(['features'])
)

Encoding will create ~ 66174 new features.
Encoding will also delete 31 features


Unnamed: 0_level_0,counts
features,Unnamed: 1_level_1
recorded_by,1
public_meeting,2
permit,2
status_group,3
source_class,3
quantity_group,5
quantity,5
management_group,5
quality_group,6
waterpoint_type_group,6


In [13]:
# Encode my Y label and return a list of my labels. 
def labeler(dataframe, column):
    """Integer-encode a target column and report the code-to-name mapping.

    The distinct values of ``dataframe[column]`` are sorted and numbered from
    0, and the resulting codes are written to a ``'labels'`` column on
    ``dataframe`` itself (in place).

    The code list is derived from the classes actually present instead of
    being fixed to ``[0, 1, 2]``, so targets with fewer or more than three
    classes are handled.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the target column; gains/overwrites a 'labels' column.
    column : str
        Name of the column to encode.

    Returns
    -------
    tuple
        ``(dataframe['labels'], [codes, names])`` where ``codes`` is
        ``[0, 1, ..., n_classes - 1]`` and ``names[i]`` is the original value
        encoded as ``codes[i]``.

    Notes
    -----
    Missing values receive the code -1 (``pandas.factorize`` convention) and
    do not appear in the returned mapping.
    NOTE(review): because 'labels' is added to ``dataframe``, any feature
    matrix built from that frame afterwards must drop it to avoid leaking the
    target.
    """
    # sort=True numbers the classes in sorted order of their values.
    codes, classes = pd.factorize(dataframe[column], sort=True)
    dataframe['labels'] = codes
    labels = list(range(len(classes)))
    label_names = list(classes)
    label_list = [labels, label_names]
    return dataframe['labels'], label_list

# Convert latitude/longitude into 3-D Cartesian (x, y, z) coordinates.
def lat_long(dataframe, degrees=True):
    """Replace 'latitude'/'longitude' with unit-sphere Cartesian coordinates.

    Computes

        x_coord = cos(lat) * cos(lon)
        y_coord = cos(lat) * sin(lon)
        z_coord = sin(lat)

    and returns a new frame in which the 'latitude' and 'longitude' columns
    are removed and the three coordinate columns are appended. The input
    frame is left untouched.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain 'latitude' and 'longitude' columns.
    degrees : bool, default True
        The trigonometric functions work in radians. When True the two
        columns are treated as degrees and converted first; pass False if
        they already hold radians.
        NOTE(review): assumes the dataset stores coordinates in degrees --
        confirm against the data source.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataframe`` without 'latitude'/'longitude' and with
        'x_coord', 'y_coord' and 'z_coord' added.
    """
    lat = dataframe['latitude']
    lon = dataframe['longitude']
    if degrees:
        lat = np.radians(lat)
        lon = np.radians(lon)

    cos_lat = np.cos(lat)
    # drop() returns a new frame, so the caller's frame is never modified.
    return dataframe.drop(columns=['latitude', 'longitude']).assign(
        x_coord=cos_lat * np.cos(lon),
        y_coord=cos_lat * np.sin(lon),
        z_coord=np.sin(lat),
    )

# Fix silly boolean issue. 
def no_bool(dataframe, columns):
    """Replace boolean values with the strings 'Yes' / 'No' in given columns.

    True becomes 'Yes' and False becomes 'No'; every other value (including
    missing values) is left as it is. The work is done on a copy, so the
    caller's frame is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source frame.
    columns : str or iterable of str
        Column name(s) to convert. A single name may be given as a plain
        string.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataframe`` with the listed columns converted.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(columns, str):
        columns = [columns]

    bool_to_text = {True: 'Yes', False: 'No'}
    converted = dataframe.copy()
    for column in columns:
        converted[column] = converted[column].replace(bool_to_text)
    return converted
        

In [9]:
# Fix or Enhance Features
data = lat_long(df)
data = no_bool(data, ['permit', 'public_meeting'])

# Define those datasets
X = data.drop(columns=['id', 'status_group', 'recorded_by'])
# labeler() writes the encoded target to a 'labels' column on df. When this
# cell is re-run, that column is already present in df (and therefore in
# `data`), so remove it here to keep the target out of the feature matrix.
X = X.drop(columns=['labels'], errors='ignore')
y, label_list = labeler(df, 'status_group')

# First passthrough features. These are any I don't want to mess with.
passthrough_features = []

# Ones to binary encode (high cardinality)
binary_features = ['lga','date_recorded','funder','installer','scheme_name', 'subvillage', 'wpt_name']

# Ones that aren't actually numeric.
not_numeric = ['region_code', 'district_code']

# Defining my one-hot variables: every object column except the
# high-cardinality ones, plus the numeric-looking code columns.
one_hot_features = list(X.select_dtypes(include=['object']))
for x in binary_features:
    one_hot_features.remove(x)
for x in not_numeric:
    one_hot_features.append(x)

# Define my numeric features: float/int columns minus the code columns that
# are treated as categorical above.
numeric_features = list(X.select_dtypes(include=['float64', 'int64']))
for x in not_numeric:
    numeric_features.remove(x)
numeric_features

['amount_tsh',
 'gps_height',
 'num_private',
 'population',
 'construction_year',
 'x_coord',
 'y_coord',
 'z_coord']

In [None]:
# Define those datasets
# This cell rebuilds X directly from df (not from the lat_long/no_bool output
# held in `data`) and reassigns y and label_list, overwriting any values set
# by an earlier cell.
# NOTE(review): labeler() adds a 'labels' column to df. If labeler() has
# already been called before this cell runs, X will contain the encoded
# target -- confirm whether this cell is still needed.

X = df.drop(columns=['id', 'status_group'])
y, label_list = labeler(df, 'status_group')
y.head()

In [None]:
# Draft feature groupings (work in progress).

# Columns to leave out entirely (the duplicated 'funder' entry was removed).
drop_cols = ('lga', 'date_recorded', 'funder', 'subvillage', 'wpt_name',
             'scheme_name', 'latitude')

one_hot = list(df.select_dtypes(include=['object']))

# TODO: this cell originally contained a dangling 'region_code' entry followed
# by two assignments with no right-hand side (a SyntaxError). The groups start
# empty until the intended columns are decided; 'region_code' presumably
# belongs in one of them -- confirm.
numeric_ordinal = []
one_hot_drop = []

ordinal_features = [
    'basin', 'region', 'lga', 'ward', 'public_meeting', 'recorded_by',
    'scheme_management', 'scheme_name', 'permit', 'extraction_type',
    'extraction_type_group', 'extraction_type_class', 'management',
    'management_group', 'payment', 'payment_type', 'water_quality',
    'quality_group', 'quantity', 'quantity_group', 'source', 'source_type',
    'source_class', 'waterpoint_type', 'waterpoint_type_group',
]

In [10]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, RobustScaler, StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Preprocessing pipelines for both numeric and categorical data.
# Using column_transformer https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html


# Define my custom pipeline functions for each type of data. Columns not expressly included are dropped.  
# Numeric columns: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Same steps as numeric_transformer followed by PolynomialFeatures with its
# default settings. Not referenced by PreProcessor below.
polynom_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()), 
    ('polynom', PolynomialFeatures())])

# Categorical columns: fill missing values with the literal string 'missing',
# then one-hot encode; handle_unknown='ignore' keeps transform from failing on
# categories that were not seen during fit.
one_hot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Fills missing values with 'missing', maps categories to integer codes and
# then one-hot encodes those codes. Not referenced by PreProcessor below.
# NOTE(review): one-hot encoding the ordinal codes yields indicator columns
# again, so the ordinal step appears redundant -- confirm the intent.
ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ordinal', OrdinalEncoder()),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])


# Create preprocessor pipeline
# Only the numeric and one-hot groups are wired in; numeric_features and
# one_hot_features are lists built in an earlier cell. Columns in neither list
# are not passed through (see the note above about unlisted columns).
# n_jobs=-2: by the joblib convention used by scikit-learn this means all CPUs
# but one.
PreProcessor = ColumnTransformer(
    transformers=[
#        ('pass', 'passthrough', passthrough_features),
        ('num', numeric_transformer, numeric_features),
        ('o-h', one_hot_transformer, one_hot_features)
    ],
    n_jobs = -2)

# Lets test it.

In [12]:
# Test Train Split
# A fixed random_state makes the 80/20 split (and everything computed from it)
# reproducible across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, y_train.shape)
# Smoke-test the preprocessing on the training portion; the transformed matrix
# is displayed as the cell output.
PreProcessor.fit_transform(X_train, y_train)

(47520, 40) (47520,)


TypeError: '<' not supported between instances of 'str' and 'bool'

In [None]:
# f_classif (ANOVA F-test) is the univariate score meant for a categorical
# target; f_regression assumes a continuous one. Imported here because the
# notebook's import cell only brings in f_regression.
from sklearn.feature_selection import f_classif

# Equivalent pipeline with explicit step names:
#clf = Pipeline(steps=[('preprocessor', PreProcessor), ('selectkbest', SelectKBest(f_classif)), ('logisticregression', LogisticRegression())])
clf = make_pipeline(PreProcessor, SelectKBest(f_classif), LogisticRegression())

# Test Train Split (fixed random_state so the reported score is reproducible)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, y_train.shape)

# Fit on the training portion and report mean accuracy on the held-out part.
clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_val, y_val))

In [None]:
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
# NOTE(review): `preprocessor` (lowercase) is only assigned in a later cell of
# this notebook; the cells above define `PreProcessor`. On a fresh kernel run
# from top to bottom this name is undefined here -- confirm which one is meant.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(solver='lbfgs'))])

# NOTE(review): the target used elsewhere in this notebook is 'status_group';
# a 'survived' column is not created anywhere in the visible cells --
# presumably left over from a template, verify.
X = data.drop('survived', axis=1)
y = data['survived']

# X and y are reassigned above, but the fit uses X_train / y_train from an
# earlier split. X_test / y_test are only assigned in a later cell.
clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))


In [None]:

# We create the preprocessing pipelines for both numeric and categorical data.
# This overwrites the numeric_features list built in an earlier cell with
# every float64/int64 column of df.
# NOTE(review): that selection is taken from df itself, so it includes the
# 'id' column dropped from X earlier and any numeric columns added to df by
# other cells -- confirm.
numeric_features = list(df.select_dtypes(include=['float64', 'int64']))
# Median imputation followed by standardization.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# NOTE(review): 'embarked', 'sex' and 'pclass' do not appear among the columns
# used elsewhere in this notebook -- presumably left over from a template.
categorical_features = ['embarked', 'sex', 'pclass']
# Fill missing categories with 'missing', then one-hot encode, ignoring
# categories not seen during fit.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Lowercase `preprocessor`; distinct from the `PreProcessor` defined earlier.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(solver='lbfgs'))])

# NOTE(review): the target used elsewhere in this notebook is 'status_group';
# a 'survived' column is not created anywhere in the visible cells -- verify.
X = data.drop('survived', axis=1)
y = data['survived']

# Unseeded 80/20 split; the printed score will vary between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

In [None]:
# Rewriting my Dummy Regression Baseline one as a function
def baseline(data):
    """Fit a mean-predicting dummy regressor and report its MAE scores.

    The data is split with ``split(data)``, run through the shared
    ``PreProcessor`` column transformer and fitted with
    ``DummyRegressor(strategy='mean')``.

    NOTE(review): ``split`` is not defined in the visible notebook; it is
    assumed to return ``(X_train, X_test, y_train, y_test)`` -- confirm.

    Parameters
    ----------
    data
        Whatever ``split`` accepts.

    Returns
    -------
    list
        ``[name, scorer, train_score, test_score, score_variance, cv_score,
        selected_names, unselected_names, best_params, best_estimator]``,
        the same layout ``compare`` returns. ``cv_score`` is 0.0 and
        ``best_estimator`` is an empty string because no search is run;
        ``best_params`` is the pipeline's parameter dict.
    """
    # Imported locally: neither name is brought in by the notebook's import cells.
    from sklearn.dummy import DummyRegressor
    from sklearn.metrics import mean_absolute_error

    name = "Dummy Regression Baseline"
    # Split data into train and test
    X_train, X_test, y_train, y_test = split(data)

    # make_pipeline names each step after its class, lower-cased.
    # PreProcessor is the ColumnTransformer instance defined earlier; it is
    # passed as-is (not called), matching its use in compare().
    pipe = make_pipeline(
        PreProcessor,
        DummyRegressor(strategy='mean'))
    pipe.fit(X_train, y_train)

    scorer = 'MAE'

    # Score both portions with mean absolute error.
    y_pred_train = pipe.predict(X_train)
    y_pred_test = pipe.predict(X_test)

    train_score = mean_absolute_error(y_train, y_pred_train)
    test_score = mean_absolute_error(y_test, y_pred_test)
    score_variance = test_score - train_score
    cv_score = 0.0
    # Call get_params() so a dict is stored, not the bound method.
    best_params = pipe.get_params()
    best_estimator = ""
    selected_names = list(X_train.columns)
    unselected_names = []

    return [name, scorer, train_score, test_score, score_variance, cv_score, selected_names, unselected_names, best_params, best_estimator]


# Rewriting my GridSearch CV as a function 
def compare(data, name):
    """Grid-search a PreProcessor -> SelectKBest -> Ridge pipeline.

    The data is split with ``split(data)``; a 3-fold grid search over the
    number of selected features and the ridge penalty is fitted on the
    training part, scored by (negated) mean absolute error.

    NOTE(review): ``split`` is not defined in the visible notebook; it is
    assumed to return ``(X_train, X_test, y_train, y_test)`` -- confirm.

    Parameters
    ----------
    data
        Whatever ``split`` accepts.
    name : str
        Label stored as the first element of the result.

    Returns
    -------
    list
        ``[name, scorer, train_score, test_score, score_variance, cv_score,
        selected_names, unselected_names, best_params, best_estimator]``.
        Scores are mean absolute errors (sign flipped back to positive).
    """
    # Imported locally: the notebook's import cell brings in RidgeClassifier
    # and LogisticRegression but not Ridge.
    from sklearn.linear_model import Ridge

    X_train, X_test, y_train, y_test = split(data)

    pipe = make_pipeline(
        PreProcessor,
        SelectKBest(f_regression),
        Ridge())

    # Step names are the lower-cased class names generated by make_pipeline.
    # NOTE(review): k is bounded by the number of raw input columns; after
    # one-hot encoding the selector may see more features than that.
    param_grid = {
        'selectkbest__k': range(1, len(X_train.columns)+1),
        'ridge__alpha': [0.1, 1.0, 10.]
    }

    scorer = 'MAE'

    # Fit on the train set, with grid search cross-validation
    gs = GridSearchCV(pipe, param_grid=param_grid, cv=3,
                      scoring='neg_mean_absolute_error',
                      verbose=0)
    gs.fit(X_train, y_train)

    # The scorer is negated MAE, so flip the sign back.
    train_score = -gs.score(X_train, y_train)
    test_score = -gs.score(X_test, y_test)
    score_variance = test_score - train_score
    cv_score = -gs.best_score_
    best_params = gs.best_params_
    best_estimator = gs.best_estimator_

    # get_support() returns a boolean mask over the features the selector saw.
    selector = best_estimator.named_steps['selectkbest']
    selected_mask = selector.get_support()

    if len(selected_mask) == len(X_train.columns):
        # Preprocessing kept the column count: the raw names line up.
        all_names = X_train.columns
    else:
        # Preprocessing changed the number of columns (e.g. one-hot encoding),
        # so the mask cannot index the raw column names. Ask the fitted steps
        # that precede the selector for their output names instead.
        all_names = pd.Index(best_estimator[:-2].get_feature_names_out())

    selected_names = list(all_names[selected_mask])
    unselected_names = list(all_names[~selected_mask])

    return [name, scorer, train_score, test_score, score_variance, cv_score, selected_names, unselected_names, best_params, best_estimator]

In [None]:
# Create a DataFrame with the passenger ids and our prediction of whether they survived.
# NOTE(review): `test` and `predictions` are not defined in any visible cell
# (the test features are loaded as `df_test`), and the rest of this notebook
# works with 'id' / 'status_group' rather than 'PassengerId' / 'Survived' --
# presumably left over from a template; confirm before running.
submission = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':predictions})

# Visualize the first 5 rows.
# This is not the last expression of the cell, so nothing is displayed.
submission.head()

# Convert the DataFrame to a csv file that can be uploaded.
# The file is written to the current working directory, without the index column.
filename = 'Titanic Predictions 1.csv'
submission.to_csv(filename,index=False)
print('Saved file: ' + filename)