In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)  # Unlimited columns
import nbimporter
# Imported from my other notebook
from data_cleanup import cleanup1
from data_cleanup import cleanup2
from data_cleanup import cleanup3

Importing Jupyter notebook from data_cleanup.ipynb


In [2]:
# Load the four CSV files under original_data/ into DataFrames:
# the sample submission, the test features, the training features and
# the training labels.
sample_submission = pd.read_csv('original_data/sample_submission.csv')
test_features = pd.read_csv('original_data/test_features.csv')
train_features = pd.read_csv('original_data/train_features.csv')
train_labels = pd.read_csv('original_data/train_labels.csv')

In [None]:
# Clean the training features with cleanup1, the same cleanup function that
# is applied to the test features before the submission prediction.
# (Only cleanup1/cleanup2/cleanup3 are imported; a bare `cleanup` does not exist.)
train1 = cleanup1(train_features)

# Baseline prediction
Always start with a naive baseline model, no exceptions.  In this case, the baseline simply predicts the majority class for every row.

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Baseline: label every row with the most frequent status_group.
y_true = train_labels['status_group']
majority_class = y_true.mode()[0]
y_pred = np.full(len(y_true), majority_class)

# Accuracy of the constant majority-class prediction.
accuracy_score(y_true, y_pred)

# OHE + Logistic Regression
I'll start by one-hot encoding the categorical variables and running a simple logistic regression.  Many of the features have far too many distinct values (too high a cardinality) for one-hot encoding, so let's separate the categorical features into two lists by cardinality.

In [None]:
# Rank the non-numeric features by their number of distinct values,
# highest cardinality first.
non_numeric_features = train1.select_dtypes(exclude='number')
non_numeric_features.nunique().sort_values(ascending=False)

In [None]:
# Columns left out of the first model: the row id and the categorical
# variables with too many unique values, so that the regression doesn't
# take forever.  This list is kept for reference only; X below is built by
# selecting cols_to_keep rather than by dropping these.
cols_to_drop = [
    'id',
    'wpt_name',
    'subvillage',
    'scheme_name',
    'installer',
    'ward',
    'funder',
]

# Categorical columns that are kept as model features.
cols_to_keep = [
    'lga',
    'region_code',
    'region',
    'district_code',
    'extraction_type_group',
    'management',
    'source',
    'scheme_management',
    'extraction_type',
    'basin',
    'water_quality',
    'payment_type',
    'extraction_type_class',
    'waterpoint_type',
    'source_type',
    'payment',
    'waterpoint_type_group',
    'quality_group',
    'quantity',
    'quantity_group',
    'management_group',
    'public_meeting',
    'permit',
    'source_class',
]

# Feature matrix restricted to the kept columns, and the target labels.
X = train1[cols_to_keep]
y_true = train_labels['status_group']

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn_pandas import DataFrameMapper

# Split the feature columns by dtype so each group gets its own transformer.
numerical_cols = X.select_dtypes(include='number').columns.tolist()
categorical_cols = X.select_dtypes(exclude='number').columns.tolist()

# Use a mapper to apply transformations selectively:
#   * numerical columns   -> standardized
#   * categorical columns -> one-hot encoded
# handle_unknown='ignore' makes a category that first shows up at predict
# time (e.g. in the test set) encode as all zeros instead of raising a
# ValueError; fitting on the training data is unaffected.
mapper = DataFrameMapper(
    [([col], StandardScaler()) for col in numerical_cols] +
    [([col], OneHotEncoder(categories='auto', handle_unknown='ignore'))
     for col in categorical_cols]
)

# Full pipeline: the mapper followed by a one-vs-rest logistic regression.
pipe = make_pipeline(
    mapper,
    LogisticRegression(solver='lbfgs', multi_class='ovr',
                       max_iter=500))


In [None]:
%%time
# Fit the mapper and the logistic regression on the training features and labels.
pipe.fit(X,y_true)

In [None]:
# Predict on the same X the pipeline was fit on, so this is the
# training-set accuracy (not a held-out score).
y_pred = pipe.predict(X)
accuracy_score(y_true, y_pred)

Alright, that was the training-set accuracy using every categorical feature except those that have thousands of possible values, which would make the one-hot encoded dataframe far too big.

# Make a submission file

In [None]:
# Apply the same cleanup to the test features and keep only the columns
# the pipeline was trained on.
test1 = cleanup1(test_features)
X_test = test1[cols_to_keep]

# Predict with the pipeline that was fit on the training data.
y_pred = pipe.predict(X_test)

# Pair each test id with its predicted label and write the submission CSV.
submission_columns = {'id': test_features['id'], 'status_group': y_pred}
y_submit = pd.DataFrame(submission_columns)
y_submit.to_csv('DMAn.csv', index=False)

# Everything needed after a kernel restart

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)  # Unlimited columns
from sklearn.metrics import accuracy_score, classification_report

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV

In [None]:
# Reload the four CSV files under original_data/ into DataFrames so this
# section can run on its own.
sample_submission = pd.read_csv('original_data/sample_submission.csv')
test_features = pd.read_csv('original_data/test_features.csv')
train_features = pd.read_csv('original_data/train_features.csv')
train_labels = pd.read_csv('original_data/train_labels.csv')

In [None]:
def cleanup1(X):
    """
    Ensures that all the features are good to go for the first
    logistic regression.

    The input frame is not modified; every step works on a new frame.

    >> Input
    X: Full-featured dataset (DataFrame).  It must contain date_recorded,
       recorded_by and every column listed in the dtype table below.

    >> Output
    X2: Cleaned dataset, in which
        - missing values are replaced by the string 'unknown'
        - date_recorded is the (fractional) number of years since 2000-01-01
        - recorded_by is removed
        - region_code, district_code, public_meeting and permit are strings
        - the remaining numerical features (except id) are float64
    """

    # Looking at all the features with missing values, it looks like those
    # features are all categorical variables where 'unknown' would be a
    # category we can work with.  I'll replace the NANs accordingly.
    # fillna returns a new frame, so the caller's data stays untouched.
    X2 = X.fillna('unknown')

    # Regression on dates won't work.  Instead, I'll turn the
    # date_recorded column into the number of years since 2000
    # (the earliest date in the training data is from 2000, and the
    # latest from 2013.)  The day difference is computed for the whole
    # column at once rather than one Timedelta at a time.
    dates = pd.to_datetime(X2['date_recorded'])
    year2000 = pd.Timestamp('2000-01-01')
    X2['date_recorded'] = (dates - year2000).dt.days / 365

    # recorded_by has only one value everywhere, and is therefore useless
    X2 = X2.drop(columns='recorded_by')

    # Final dtypes, applied in one pass:
    # - To prevent data conversion warnings, all the numerical features
    #   (except id) become float64.
    # - region_code and district_code are int64, but they should really be
    #   treated as categories (and there's only 20-30 classes in each),
    #   so they are cast to strings.
    # - Some columns contained bool values and NANs (e.g., public_meeting,
    #   permit).  The NANs were replaced with strings above, so the whole
    #   series is cast to strings to prevent future problems with data
    #   type heterogeneity.
    type_dict = {'amount_tsh': 'float64',
                 'date_recorded': 'float64',
                 'gps_height': 'float64',
                 'longitude': 'float64',
                 'latitude': 'float64',
                 'num_private': 'float64',
                 'population': 'float64',
                 'construction_year': 'float64',
                 'region_code': 'str',
                 'district_code': 'str',
                 'public_meeting': 'str',
                 'permit': 'str'}

    return X2.astype(type_dict)

In [None]:
# Clean the training features with cleanup1 and show the resulting
# (rows, columns) shape.
train1 = cleanup1(train_features)
train1.shape

# LogReg with OHE and Binary Encoding

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn_pandas import DataFrameMapper
from sklearn.decomposition import PCA
from category_encoders import BinaryEncoder

# Categorical columns that are binary-encoded.
bad_cats = [
    'wpt_name',
    'subvillage',
    'scheme_name',
    'installer',
    'ward',
    'funder',
]

# Categorical columns that are one-hot encoded.
good_cats = [
    'lga',
    'region_code',
    'region',
    'district_code',
    'extraction_type_group',
    'management',
    'source',
    'scheme_management',
    'extraction_type',
    'basin',
    'water_quality',
    'payment_type',
    'extraction_type_class',
    'waterpoint_type',
    'source_type',
    'payment',
    'waterpoint_type_group',
    'quality_group',
    'quantity',
    'quantity_group',
    'management_group',
    'public_meeting',
    'permit',
    'source_class',
]

# Every cleaned feature except the row id, plus the target labels.
X = train1.drop(columns='id')
y_true = train_labels['status_group']

# Numerical columns are whatever pandas reports with a number dtype.
numerical_cols = X.select_dtypes(include='number').columns.tolist()

# One (columns, transformer) entry per column, grouped by treatment:
# standardize the numerical columns, one-hot encode good_cats and
# binary-encode bad_cats.
scaled_features = [([col], StandardScaler()) for col in numerical_cols]
onehot_features = [([col], OneHotEncoder(categories='auto')) for col in good_cats]
binary_features = [([col], BinaryEncoder()) for col in bad_cats]
mapper = DataFrameMapper(scaled_features + onehot_features + binary_features)

# Stage 1: scale/encode the features, then reduce them with PCA
# (n_components=0.99).
pipe1 = make_pipeline(mapper, PCA(n_components=0.99))

# Stage 2: one-vs-rest logistic regression on the stage-1 output.
logistic_regression = LogisticRegression(solver='lbfgs', multi_class='ovr',
                                         max_iter=500)
pipe2 = make_pipeline(logistic_regression)

# Grid search over pipe2 with 2-fold CV; the parameter grid is empty, so
# there are no hyper-parameter alternatives to compare yet.
param_grid = {}
gs = GridSearchCV(
    pipe2,
    cv=2,
    param_grid=param_grid,
    scoring='accuracy',
    verbose=10,
)

In [None]:
%%time
# Fit pipe1 on the training features and keep its transformed output
# as the input for pipe2.
X_transformed = pipe1.fit_transform(X,y_true)

In [None]:
%%time
# Fit pipe2 (the logistic regression) on the pipe1 output.
pipe2.fit(X_transformed,y_true)

In [None]:
# Predict on the same rows pipe2 was fit on, so this is the
# training-set accuracy (not a held-out score).
y_pred = pipe2.predict(X_transformed)
accuracy_score(y_true, y_pred)

## Adding polynomial features

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn_pandas import DataFrameMapper
from sklearn.decomposition import PCA
from category_encoders import BinaryEncoder

# Categorical columns that are binary-encoded.
bad_cats = ['wpt_name',
            'subvillage',
            'scheme_name',
            'installer',
            'ward',
            'funder',
            ]

# Categorical columns that are one-hot encoded.
good_cats = ['lga',
             'region_code',
             'region',
             'district_code',
             'extraction_type_group',
             'management',
             'source',
             'scheme_management',
             'extraction_type',
             'basin',
             'water_quality',
             'payment_type',
             'extraction_type_class',
             'waterpoint_type',
             'source_type',
             'payment',
             'waterpoint_type_group',
             'quality_group',
             'quantity',
             'quantity_group',
             'management_group',
             'public_meeting',
             'permit',
             'source_class']

# Every cleaned feature except the row id, plus the target labels.
X = train1.drop(columns='id')
y_true = train_labels['status_group']

# Get a list of numerical columns
numerical_cols = X.select_dtypes(include='number').columns.tolist()

# Use a mapper to apply transformations selectively: standardize the
# numerical columns, one-hot encode good_cats, binary-encode bad_cats.
scaler_encoder = DataFrameMapper(
    [([col], StandardScaler()) for col in numerical_cols] +
    [([col], OneHotEncoder(categories='auto')) for col in good_cats] +
    [([col], BinaryEncoder()) for col in bad_cats]
)

# Polynomial expansion of each numerical column on its own.
# The column list previously ended in a dangling `+`, which is a
# SyntaxError that stops this whole cell from running; only the numerical
# columns are expanded here.
# TODO: poly_maker is not used by pipe1/pipe2 below yet -- decide which
# further columns (if any) to expand and wire it into the pipeline.
poly_maker = DataFrameMapper(
    [([col], PolynomialFeatures()) for col in numerical_cols]
)

# Stage 1: scale/encode the features, then reduce them with PCA.
pipe1 = make_pipeline(
    scaler_encoder,
    PCA(n_components=0.99)
)

# Stage 2: one-vs-rest logistic regression on the stage-1 output.
pipe2 = make_pipeline(
    LogisticRegression(solver='lbfgs', multi_class='ovr',
                       max_iter=500))

# Grid search over pipe2 with 2-fold CV; the parameter grid is empty, so
# there are no hyper-parameter alternatives to compare yet.
param_grid = {}

gs = GridSearchCV(pipe2, cv=2, param_grid=param_grid,
                  scoring='accuracy',
                  verbose=10)

In [None]:
# Fit the mapper on the training features and keep the encoded matrix.
# NOTE(review): no `mapper` is defined in this section (the cell above
# builds `scaler_encoder`), so this presumably reuses the `mapper` from the
# previous section -- confirm whether `scaler_encoder` was intended.
X_expanded = mapper.fit_transform(X,y_true)

In [None]:
# Shape of the mapper output.
X_expanded.shape

In [None]:
# Preview the first rows of the mapper output, labelled with the column
# names the mapper generated.
pd.DataFrame(X_expanded, columns=mapper.transformed_names_).head()

In [None]:
%%time
# Fit pipe1 on the training features and keep its transformed output
# as the input for pipe2.
X_transformed = pipe1.fit_transform(X,y_true)

In [None]:
%%time
# Fit pipe2 (the logistic regression) on the pipe1 output.
pipe2.fit(X_transformed,y_true)

In [None]:
# Predict on the same rows pipe2 was fit on, so this is the
# training-set accuracy (not a held-out score).
y_pred = pipe2.predict(X_transformed)
accuracy_score(y_true, y_pred)

In [None]:
# Shape of the pipe1 output.
X_transformed.shape