In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)  # Unlimited columns

# Data inspection
First, let's take a look at the dataset

In [None]:
# Read the four CSV files from original_data/.  The training data is
# split into a feature table and a label table.
train_features = pd.read_csv('original_data/train_features.csv')
train_labels = pd.read_csv('original_data/train_labels.csv')
test_features = pd.read_csv('original_data/test_features.csv')
sample_submission = pd.read_csv('original_data/sample_submission.csv')

In [None]:
# (rows, columns) of each table, in the order:
# sample_submission, test_features, train_features, train_labels
tuple(frame.shape for frame in (sample_submission, test_features,
                                train_features, train_labels))

In [57]:
# Preview the first rows of the training features
train_features.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,Roman,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,,GeoData Consultants Ltd,Other,,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250,True,GeoData Consultants Ltd,VWC,Nyumba ya mungu pipe scheme,True,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58,True,GeoData Consultants Ltd,VWC,,True,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,0,True,GeoData Consultants Ltd,,,True,0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [None]:
# Number of missing values in each column of the training features
train_features.isna().sum()

# Data cleanup
I don't want to start out by using all the features available, as several of them are probably useless and would require a lot of cleanup. Instead, I'll write several cleanup functions, each extracting a different subset of the features. Writing them as functions lets me apply exactly the same steps to the training and testing data, and it means I don't have to commit to a particular set of features ahead of time.

In [None]:
def cleanup1(X):
    """
    Ensures that all the features are good to go for the first 
    logistic regression.  The input frame is left untouched; all
    changes are made on a copy.
    
    >> Input
    X: Full-featured dataset (DataFrame).  Must contain date_recorded,
       region_code, district_code, recorded_by and every column listed
       in type_dict below.
    
    >> Output
    X2: Cleaned dataset, where
        - NANs are replaced by the string 'unknown'
        - date_recorded is the number of (365-day) years since 2000-01-01
        - region_code and district_code are strings
        - recorded_by is dropped
        - the numerical features are float64, and public_meeting and
          permit are strings
    """
    
    # Looking at all the features with missing values, it looks like those
    # features are all categorical variables where 'unknown' would be a
    # category we can work with.  I'll replace the NANs accordingly.
    # (fillna returns a new frame, so X itself is not modified.)
    X2 = X.fillna('unknown')
    
    # Regression on dates won't work.  Instead, I'll turn the 
    # date_recorded column into the number of years since 2000
    # (the earliest date in the training data is from 2000, and the
    # latest from 2013.)  The conversion is vectorized through the
    # .dt accessor rather than looping over every row in Python.
    elapsed = pd.to_datetime(X2['date_recorded']) - pd.to_datetime('2000-01-01')
    X2['date_recorded'] = elapsed.dt.days / 365
    
    # region_code and district_code are int64, but they should really be
    # treated as categories (and there's only 20-30 classes in each).
    # I'll cast them as strings instead.
    X2['region_code'] = X2['region_code'].astype('str')
    X2['district_code'] = X2['district_code'].astype('str')
    
    # recorded_by has only one value everywhere, and is therefore useless
    X2 = X2.drop(columns='recorded_by')
    
    # To prevent data conversion warnings, I'll turn all the numerical
    # features (except id) into float64.
    
    # Also, some columns contained bool values and NANs.  
    # (e.g., public_meeting, permit)
    # I replaced the NANs with strings, so I'll cast the whole series 
    # as strings to prevent future problems with data type heterogeneity.
    type_dict = {'amount_tsh':'float64',
                 'date_recorded':'float64',
                 'gps_height':'float64',
                 'longitude':'float64',
                 'latitude':'float64',
                 'num_private':'float64',
                 'population':'float64',
                 'construction_year':'float64',
                 'public_meeting':'str',
                 'permit':'str'}
    
    return X2.astype(dtype = type_dict)

In [None]:
# Apply the first cleanup to the training features and check the
# shape of the result
train1 = cleanup1(train_features)
train1.shape

In [None]:
# Preview the first rows of the cleaned training features
train1.head()

# Baseline prediction
Always start with a stupid model, no exceptions. In this case, the stupid model predicts the majority class for every row.

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# The true labels, and the label that occurs most often among them
y_true = train_labels['status_group']
majority_class = train_labels['status_group'].mode()[0]

# Baseline: every row is predicted to belong to the majority class
y_pred = np.full(len(train_labels), majority_class)

# Accuracy of the baseline
accuracy_score(y_true, y_pred)

In [None]:
# Number of unique values in each non-numerical column, largest first
train1.select_dtypes(exclude='number').nunique().sort_values(ascending=False)

# OHE + Logistic Regression
I'll start by one-hot encoding all the categorical variables and running a simple logistic regression

In [31]:
# Categorical variables with too many unique values (plus id) are left
# out, so that regression doesn't take forever.  This list is kept for
# reference only; the selection below is made with cols_to_keep.
cols_to_drop = [
    'id',
    'wpt_name',
    'subvillage',
    'scheme_name',
    'installer',
    'ward',
    'funder',
]

# Categorical variables that go into the model
cols_to_keep = [
    'lga',
    'region_code',
    'region',
    'district_code',
    'extraction_type_group',
    'management',
    'source',
    'scheme_management',
    'extraction_type',
    'basin',
    'water_quality',
    'payment_type',
    'extraction_type_class',
    'waterpoint_type',
    'source_type',
    'payment',
    'waterpoint_type_group',
    'quality_group',
    'quantity',
    'quantity_group',
    'management_group',
    'public_meeting',
    'permit',
    'source_class',
]

# Feature matrix and target for the first model
X = train1[cols_to_keep]
y_true = train_labels['status_group']

In [32]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn_pandas import DataFrameMapper

# Get a list of numerical and categorical columns
numerical_cols = X.select_dtypes(include='number').columns.tolist()
categorical_cols = X.select_dtypes(exclude='number').columns.tolist()

# Use a mapper to apply transformations selectively: numerical columns
# are standardized, categorical columns are one-hot encoded.
# handle_unknown='ignore' makes the encoder emit an all-zero row for a
# category that was not seen during fit, instead of raising.  Without
# it, predicting on data that contains a new category would fail.
mapper = DataFrameMapper(
  [([col], StandardScaler()) for col in numerical_cols] +
  [([col], OneHotEncoder(categories='auto', handle_unknown='ignore'))
   for col in categorical_cols]
)

# Full model: selective preprocessing followed by a one-vs-rest
# logistic regression
pipe = make_pipeline(
    mapper, 
    LogisticRegression(solver='lbfgs', multi_class='ovr',
                      max_iter=500))


In [33]:
%%time
# Fit the preprocessing + logistic-regression pipeline on the training data
pipe.fit(X,y_true)

CPU times: user 55.1 s, sys: 412 ms, total: 55.5 s
Wall time: 56 s


Pipeline(memory=None,
     steps=[('dataframemapper', DataFrameMapper(default=False, df_out=False,
        features=[(['lga'], OneHotEncoder(categorical_features=None, categories='auto',
       dtype=<class 'numpy.float64'>, handle_unknown='error',
       n_values=None, sparse=True)), (['region_code'], OneHotEncoder(categoric... penalty='l2', random_state=None, solver='sag',
          tol=0.0001, verbose=0, warm_start=False))])

In [55]:
# Accuracy on the same rows the pipeline was fit on (no held-out data),
# so this is a training-set score.
y_pred = pipe.predict(X)
accuracy_score(y_true, y_pred)

0.749983164983165

Alright, that was the score (measured on the training data itself) using all the categorical features except for those that have thousands of possible values, which would make the final dataframe way too big.

# Make a submission file

In [56]:
# Put the test features through the same cleanup as the training data,
# and keep the same columns the pipeline was trained on
test1 = cleanup1(test_features)
X_test = test1[cols_to_keep]

# Predict with the pipeline that was fit to the training data
y_pred = pipe.predict(X_test)

# One row per test id, with its predicted status_group
y_submit = test_features[['id']].assign(status_group=y_pred)

# Write the submission CSV file
y_submit.to_csv('DMAn.csv', index=False)

# Everything needed after a kernel restart

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)  # Unlimited columns
from sklearn.metrics import accuracy_score, classification_report

In [2]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV

In [3]:
# Read the four CSV files from original_data/.  The training data is
# split into a feature table and a label table.
train_features = pd.read_csv('original_data/train_features.csv')
train_labels = pd.read_csv('original_data/train_labels.csv')
test_features = pd.read_csv('original_data/test_features.csv')
sample_submission = pd.read_csv('original_data/sample_submission.csv')

In [4]:
def cleanup1(X):
    """
    Ensures that all the features are good to go for the first 
    logistic regression.  The input frame is left untouched; all
    changes are made on a copy.
    
    >> Input
    X: Full-featured dataset (DataFrame).  Must contain date_recorded,
       region_code, district_code, recorded_by and every column listed
       in type_dict below.
    
    >> Output
    X2: Cleaned dataset, where
        - NANs are replaced by the string 'unknown'
        - date_recorded is the number of (365-day) years since 2000-01-01
        - region_code and district_code are strings
        - recorded_by is dropped
        - the numerical features are float64, and public_meeting and
          permit are strings
    """
    
    # Looking at all the features with missing values, it looks like those
    # features are all categorical variables where 'unknown' would be a
    # category we can work with.  I'll replace the NANs accordingly.
    # (fillna returns a new frame, so X itself is not modified.)
    X2 = X.fillna('unknown')
    
    # Regression on dates won't work.  Instead, I'll turn the 
    # date_recorded column into the number of years since 2000
    # (the earliest date in the training data is from 2000, and the
    # latest from 2013.)  The conversion is vectorized through the
    # .dt accessor rather than looping over every row in Python.
    elapsed = pd.to_datetime(X2['date_recorded']) - pd.to_datetime('2000-01-01')
    X2['date_recorded'] = elapsed.dt.days / 365
    
    # region_code and district_code are int64, but they should really be
    # treated as categories (and there's only 20-30 classes in each).
    # I'll cast them as strings instead.
    X2['region_code'] = X2['region_code'].astype('str')
    X2['district_code'] = X2['district_code'].astype('str')
    
    # recorded_by has only one value everywhere, and is therefore useless
    X2 = X2.drop(columns='recorded_by')
    
    # To prevent data conversion warnings, I'll turn all the numerical
    # features (except id) into float64.
    
    # Also, some columns contained bool values and NANs.  
    # (e.g., public_meeting, permit)
    # I replaced the NANs with strings, so I'll cast the whole series 
    # as strings to prevent future problems with data type heterogeneity.
    type_dict = {'amount_tsh':'float64',
                 'date_recorded':'float64',
                 'gps_height':'float64',
                 'longitude':'float64',
                 'latitude':'float64',
                 'num_private':'float64',
                 'population':'float64',
                 'construction_year':'float64',
                 'public_meeting':'str',
                 'permit':'str'}
    
    return X2.astype(dtype = type_dict)

In [5]:
# Apply the first cleanup to the training features and check the
# shape of the result
train1 = cleanup1(train_features)
train1.shape

(59400, 39)

# LogReg with OHE and Binary Encoding

In [51]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn_pandas import DataFrameMapper
from sklearn.decomposition import PCA
from category_encoders import BinaryEncoder

# Categorical features that get binary encoding in the mapper below
bad_cats = [
    'wpt_name',
    'subvillage',
    'scheme_name',
    'installer',
    'ward',
    'funder',
]

# Categorical features that get one-hot encoding in the mapper below
good_cats = [
    'lga',
    'region_code',
    'region',
    'district_code',
    'extraction_type_group',
    'management',
    'source',
    'scheme_management',
    'extraction_type',
    'basin',
    'water_quality',
    'payment_type',
    'extraction_type_class',
    'waterpoint_type',
    'source_type',
    'payment',
    'waterpoint_type_group',
    'quality_group',
    'quantity',
    'quantity_group',
    'management_group',
    'public_meeting',
    'permit',
    'source_class',
]

# Every cleaned feature except the row identifier, plus the target
X = train1.drop(columns='id')
y_true = train_labels['status_group']

# Get a list of numerical columns
numerical_cols = X.select_dtypes(include='number').columns.tolist()

# One (columns, transformer) entry per column, grouped by treatment
scaled_features = [([col], StandardScaler()) for col in numerical_cols]
onehot_features = [([col], OneHotEncoder(categories='auto'))
                   for col in good_cats]
binary_features = [([col], BinaryEncoder()) for col in bad_cats]

# Use a mapper to apply the transformations selectively
mapper = DataFrameMapper(
    scaled_features + onehot_features + binary_features
)

# Stage 1: scale / encode, then PCA with n_components=0.99
pipe1 = make_pipeline(
    mapper,
    PCA(n_components=0.99)
)

# Stage 2: the classifier, fit on the output of stage 1
pipe2 = make_pipeline(
    LogisticRegression(solver = 'lbfgs', multi_class='ovr',
                      max_iter=500))

# Empty grid: the search below only cross-validates pipe2 as it is
param_grid = {}

gs = GridSearchCV(pipe2, cv=2, param_grid=param_grid,
                  scoring='accuracy', 
                  verbose=10)

In [52]:
%%time
# Run the scaling / encoding / PCA stage on the training features
X_transformed = pipe1.fit_transform(X,y_true)

CPU times: user 10.1 s, sys: 1.27 s, total: 11.3 s
Wall time: 7.95 s


In [53]:
%%time
# Fit the logistic regression on the transformed training features
pipe2.fit(X_transformed,y_true)

CPU times: user 12.6 s, sys: 121 ms, total: 12.7 s
Wall time: 6.61 s


Pipeline(memory=None,
     steps=[('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=500, multi_class='ovr',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False))])

In [54]:
# Accuracy on the same (transformed) rows pipe2 was fit on, so this is
# a training-set score.
y_pred = pipe2.predict(X_transformed)
accuracy_score(y_true, y_pred)

0.7461111111111111

## Adding polynomial features

In [44]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn_pandas import DataFrameMapper
from sklearn.decomposition import PCA
from category_encoders import BinaryEncoder

# Categorical columns that are binary-encoded (BinaryEncoder) by the mapper
# defined further down in this cell.
bad_cats = [
    'wpt_name',
    'subvillage',
    'scheme_name',
    'installer',
    'ward',
    'funder',
]

# Categorical columns that are one-hot encoded (OneHotEncoder) by the mapper
# defined further down in this cell. Order is preserved because it determines
# the order of the generated columns.
good_cats = [
    'lga',
    'region_code',
    'region',
    'district_code',
    'extraction_type_group',
    'management',
    'source',
    'scheme_management',
    'extraction_type',
    'basin',
    'water_quality',
    'payment_type',
    'extraction_type_class',
    'waterpoint_type',
    'source_type',
    'payment',
    'waterpoint_type_group',
    'quality_group',
    'quantity',
    'quantity_group',
    'management_group',
    'public_meeting',
    'permit',
    'source_class',
]

# Features are every column of train1 except the row identifier; the target
# is the pump status label.
X = train1.drop(columns='id')
y_true = train_labels['status_group']

# Get a list of numerical columns
numerical_cols = X.select_dtypes(include='number').columns.tolist()

# Use a mapper to apply transformations selectively: standardise numeric
# columns, one-hot encode `good_cats`, binary-encode `bad_cats`.
scaler_encoder = DataFrameMapper(
    [([col], StandardScaler()) for col in numerical_cols] +
    [([col], OneHotEncoder(categories='auto')) for col in good_cats] +
    [([col], BinaryEncoder()) for col in bad_cats]
)

# Expand each numeric column independently with the default
# PolynomialFeatures (degree 2, bias term included), i.e. 1, x and x^2 per
# column with no cross-column interaction terms.
poly_maker = DataFrameMapper(
    [([col], PolynomialFeatures()) for col in numerical_cols]
)
# NOTE(review): poly_maker is defined but not part of pipe1 below, so the
# polynomial features do not reach the model yet — confirm where it should
# be wired in.

# Stage 1: preprocessing followed by PCA (n_components=0.99 keeps enough
# components to explain 99% of the variance).
pipe1 = make_pipeline(
    scaler_encoder,
    PCA(n_components=0.99)
)

# Stage 2: one-vs-rest logistic regression on the stage-1 output.
pipe2 = make_pipeline(
    LogisticRegression(solver = 'lbfgs', multi_class='ovr',
                      max_iter=500))

# Empty grid: the search would only cross-validate the default settings.
param_grid = {}

# The search object is only constructed here; it is not fitted in this cell.
gs = GridSearchCV(pipe2, cv=2, param_grid=param_grid,
                  scoring='accuracy',
                  verbose=10)

In [45]:
# NOTE(review): the cell above defines `scaler_encoder` and `poly_maker`, not
# `mapper`; this resolves to whichever `mapper` is already bound in the
# kernel — confirm that is the intended transformer.
X_expanded = mapper.fit_transform(X,y_true)

In [48]:
# (rows, columns) of the mapper output
X_expanded.shape

(59400, 463)

In [50]:
# Preview the transformed matrix, labelled with the column names the mapper
# generated during fit_transform.
pd.DataFrame(X_expanded, columns=mapper.transformed_names_).head()

Unnamed: 0,amount_tsh,date_recorded,gps_height,longitude,latitude,num_private,population,construction_year,lga_x0_Arusha Rural,lga_x0_Arusha Urban,lga_x0_Babati,lga_x0_Bagamoyo,lga_x0_Bahi,lga_x0_Bariadi,lga_x0_Biharamulo,lga_x0_Bukoba Rural,lga_x0_Bukoba Urban,lga_x0_Bukombe,lga_x0_Bunda,lga_x0_Chamwino,lga_x0_Chato,lga_x0_Chunya,lga_x0_Dodoma Urban,lga_x0_Geita,lga_x0_Hai,lga_x0_Hanang,lga_x0_Handeni,lga_x0_Igunga,lga_x0_Ilala,lga_x0_Ileje,lga_x0_Ilemela,lga_x0_Iramba,lga_x0_Iringa Rural,lga_x0_Kahama,lga_x0_Karagwe,lga_x0_Karatu,lga_x0_Kasulu,lga_x0_Kibaha,lga_x0_Kibondo,lga_x0_Kigoma Rural,lga_x0_Kigoma Urban,lga_x0_Kilindi,lga_x0_Kilolo,lga_x0_Kilombero,lga_x0_Kilosa,lga_x0_Kilwa,lga_x0_Kinondoni,lga_x0_Kisarawe,lga_x0_Kishapu,lga_x0_Kiteto,lga_x0_Kondoa,lga_x0_Kongwa,lga_x0_Korogwe,lga_x0_Kwimba,lga_x0_Kyela,lga_x0_Lindi Rural,lga_x0_Lindi Urban,lga_x0_Liwale,lga_x0_Longido,lga_x0_Ludewa,lga_x0_Lushoto,lga_x0_Mafia,lga_x0_Magu,lga_x0_Makete,lga_x0_Manyoni,lga_x0_Masasi,lga_x0_Maswa,lga_x0_Mbarali,lga_x0_Mbeya Rural,lga_x0_Mbinga,lga_x0_Mbozi,lga_x0_Mbulu,lga_x0_Meatu,lga_x0_Meru,lga_x0_Misenyi,lga_x0_Missungwi,lga_x0_Mkinga,lga_x0_Mkuranga,lga_x0_Monduli,lga_x0_Morogoro Rural,lga_x0_Morogoro Urban,lga_x0_Moshi Rural,lga_x0_Moshi Urban,lga_x0_Mpanda,lga_x0_Mpwapwa,lga_x0_Mtwara Rural,lga_x0_Mtwara Urban,lga_x0_Mufindi,lga_x0_Muheza,lga_x0_Muleba,lga_x0_Musoma Rural,lga_x0_Mvomero,lga_x0_Mwanga,lga_x0_Nachingwea,lga_x0_Namtumbo,lga_x0_Nanyumbu,lga_x0_Newala,lga_x0_Ngara,lga_x0_Ngorongoro,lga_x0_Njombe,lga_x0_Nkasi,lga_x0_Nyamagana,lga_x0_Nzega,lga_x0_Pangani,lga_x0_Rombo,lga_x0_Rorya,lga_x0_Ruangwa,lga_x0_Rufiji,lga_x0_Rungwe,lga_x0_Same,lga_x0_Sengerema,lga_x0_Serengeti,lga_x0_Shinyanga Rural,lga_x0_Shinyanga Urban,lga_x0_Siha,lga_x0_Sikonge,lga_x0_Simanjiro,lga_x0_Singida Rural,lga_x0_Singida Urban,lga_x0_Songea Rural,lga_x0_Songea Urban,lga_x0_Sumbawanga Rural,lga_x0_Sumbawanga Urban,lga_x0_Tabora 
Urban,lga_x0_Tandahimba,lga_x0_Tanga,lga_x0_Tarime,lga_x0_Temeke,lga_x0_Tunduru,lga_x0_Ukerewe,lga_x0_Ulanga,lga_x0_Urambo,lga_x0_Uyui,region_code_x0_1,region_code_x0_10,region_code_x0_11,region_code_x0_12,region_code_x0_13,region_code_x0_14,region_code_x0_15,region_code_x0_16,region_code_x0_17,region_code_x0_18,region_code_x0_19,region_code_x0_2,region_code_x0_20,region_code_x0_21,region_code_x0_24,region_code_x0_3,region_code_x0_4,region_code_x0_40,region_code_x0_5,region_code_x0_6,region_code_x0_60,region_code_x0_7,region_code_x0_8,region_code_x0_80,region_code_x0_9,region_code_x0_90,region_code_x0_99,region_x0_Arusha,region_x0_Dar es Salaam,region_x0_Dodoma,region_x0_Iringa,region_x0_Kagera,region_x0_Kigoma,region_x0_Kilimanjaro,region_x0_Lindi,region_x0_Manyara,region_x0_Mara,region_x0_Mbeya,region_x0_Morogoro,region_x0_Mtwara,region_x0_Mwanza,region_x0_Pwani,region_x0_Rukwa,region_x0_Ruvuma,region_x0_Shinyanga,region_x0_Singida,region_x0_Tabora,region_x0_Tanga,district_code_x0_0,district_code_x0_1,district_code_x0_13,district_code_x0_2,district_code_x0_23,district_code_x0_3,district_code_x0_30,district_code_x0_33,district_code_x0_4,district_code_x0_43,district_code_x0_5,district_code_x0_53,district_code_x0_6,district_code_x0_60,district_code_x0_62,district_code_x0_63,district_code_x0_67,district_code_x0_7,district_code_x0_8,district_code_x0_80,extraction_type_group_x0_afridev,extraction_type_group_x0_gravity,extraction_type_group_x0_india mark ii,extraction_type_group_x0_india mark iii,extraction_type_group_x0_mono,extraction_type_group_x0_nira/tanira,extraction_type_group_x0_other,extraction_type_group_x0_other handpump,extraction_type_group_x0_other motorpump,extraction_type_group_x0_rope pump,extraction_type_group_x0_submersible,extraction_type_group_x0_swn 80,extraction_type_group_x0_wind-powered,management_x0_company,management_x0_other,management_x0_other - school,management_x0_parastatal,management_x0_private 
operator,management_x0_trust,management_x0_unknown,management_x0_vwc,management_x0_water authority,management_x0_water board,management_x0_wua,management_x0_wug,source_x0_dam,source_x0_hand dtw,source_x0_lake,source_x0_machine dbh,source_x0_other,source_x0_rainwater harvesting,source_x0_river,source_x0_shallow well,source_x0_spring,source_x0_unknown,scheme_management_x0_Company,scheme_management_x0_None,scheme_management_x0_Other,scheme_management_x0_Parastatal,scheme_management_x0_Private operator,scheme_management_x0_SWC,scheme_management_x0_Trust,scheme_management_x0_VWC,scheme_management_x0_WUA,scheme_management_x0_WUG,scheme_management_x0_Water Board,scheme_management_x0_Water authority,scheme_management_x0_unknown,extraction_type_x0_afridev,extraction_type_x0_cemo,extraction_type_x0_climax,extraction_type_x0_gravity,extraction_type_x0_india mark ii,extraction_type_x0_india mark iii,extraction_type_x0_ksb,extraction_type_x0_mono,extraction_type_x0_nira/tanira,extraction_type_x0_other,extraction_type_x0_other - mkulima/shinyanga,extraction_type_x0_other - play pump,extraction_type_x0_other - rope pump,extraction_type_x0_other - swn 81,extraction_type_x0_submersible,extraction_type_x0_swn 80,extraction_type_x0_walimi,extraction_type_x0_windmill,basin_x0_Internal,basin_x0_Lake Nyasa,basin_x0_Lake Rukwa,basin_x0_Lake Tanganyika,basin_x0_Lake Victoria,basin_x0_Pangani,basin_x0_Rufiji,basin_x0_Ruvuma / Southern Coast,basin_x0_Wami / Ruvu,water_quality_x0_coloured,water_quality_x0_fluoride,water_quality_x0_fluoride abandoned,water_quality_x0_milky,water_quality_x0_salty,water_quality_x0_salty abandoned,water_quality_x0_soft,water_quality_x0_unknown,payment_type_x0_annually,payment_type_x0_monthly,payment_type_x0_never pay,payment_type_x0_on failure,payment_type_x0_other,payment_type_x0_per 
bucket,payment_type_x0_unknown,extraction_type_class_x0_gravity,extraction_type_class_x0_handpump,extraction_type_class_x0_motorpump,extraction_type_class_x0_other,extraction_type_class_x0_rope pump,extraction_type_class_x0_submersible,extraction_type_class_x0_wind-powered,waterpoint_type_x0_cattle trough,waterpoint_type_x0_communal standpipe,waterpoint_type_x0_communal standpipe multiple,waterpoint_type_x0_dam,waterpoint_type_x0_hand pump,waterpoint_type_x0_improved spring,waterpoint_type_x0_other,source_type_x0_borehole,source_type_x0_dam,source_type_x0_other,source_type_x0_rainwater harvesting,source_type_x0_river/lake,source_type_x0_shallow well,source_type_x0_spring,payment_x0_never pay,payment_x0_other,payment_x0_pay annually,payment_x0_pay monthly,payment_x0_pay per bucket,payment_x0_pay when scheme fails,payment_x0_unknown,waterpoint_type_group_x0_cattle trough,waterpoint_type_group_x0_communal standpipe,waterpoint_type_group_x0_dam,waterpoint_type_group_x0_hand pump,waterpoint_type_group_x0_improved 
spring,waterpoint_type_group_x0_other,quality_group_x0_colored,quality_group_x0_fluoride,quality_group_x0_good,quality_group_x0_milky,quality_group_x0_salty,quality_group_x0_unknown,quantity_x0_dry,quantity_x0_enough,quantity_x0_insufficient,quantity_x0_seasonal,quantity_x0_unknown,quantity_group_x0_dry,quantity_group_x0_enough,quantity_group_x0_insufficient,quantity_group_x0_seasonal,quantity_group_x0_unknown,management_group_x0_commercial,management_group_x0_other,management_group_x0_parastatal,management_group_x0_unknown,management_group_x0_user-group,public_meeting_x0_False,public_meeting_x0_True,public_meeting_x0_unknown,permit_x0_False,permit_x0_True,permit_x0_unknown,source_class_x0_groundwater,source_class_x0_surface,source_class_x0_unknown,wpt_name_0,wpt_name_1,wpt_name_2,wpt_name_3,wpt_name_4,wpt_name_5,wpt_name_6,wpt_name_7,wpt_name_8,wpt_name_9,wpt_name_10,wpt_name_11,wpt_name_12,wpt_name_13,wpt_name_14,wpt_name_15,wpt_name_16,subvillage_0,subvillage_1,subvillage_2,subvillage_3,subvillage_4,subvillage_5,subvillage_6,subvillage_7,subvillage_8,subvillage_9,subvillage_10,subvillage_11,subvillage_12,subvillage_13,subvillage_14,subvillage_15,scheme_name_0,scheme_name_1,scheme_name_2,scheme_name_3,scheme_name_4,scheme_name_5,scheme_name_6,scheme_name_7,scheme_name_8,scheme_name_9,scheme_name_10,scheme_name_11,scheme_name_12,installer_0,installer_1,installer_2,installer_3,installer_4,installer_5,installer_6,installer_7,installer_8,installer_9,installer_10,installer_11,installer_12,ward_0,ward_1,ward_2,ward_3,ward_4,ward_5,ward_6,ward_7,ward_8,ward_9,ward_10,ward_11,ward_12,funder_0,funder_1,funder_2,funder_3,funder_4,funder_5,funder_6,funder_7,funder_8,funder_9,funder_10,funder_11,amount_tsh_1,amount_tsh_x0,amount_tsh_x0^2,date_recorded_1,date_recorded_x0,date_recorded_x0^2,gps_height_1,gps_height_x0,gps_height_x0^2,longitude_1,longitude_x0,longitude_x0^2,latitude_1,latitude_x0,latitude_x0^2,num_private_1,num_private_x0,num_private_x0^2,population_1,population_
x0,population_x0^2,construction_year_1,construction_year_x0,construction_year_x0^2
0,1.895665,-1.141136,1.041252,0.131052,-1.408791,-0.038749,-0.150399,0.733857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,6000.0,36000000.0,1.0,11.205479,125.56277,1.0,1390.0,1932100.0,1.0,34.938093,1220.670325,1.0,-9.856322,97.147079,1.0,0.0,0.0,1.0,109.0,11881.0,1.0,1999.0,3996001.0
1,-0.10597,1.022152,1.054237,0.09461,1.207934,-0.038749,0.21229,0.745416,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,13.186301,173.878544,1.0,1399.0,1957201.0,1.0,34.698766,1204.004369,1.0,-2.147466,4.611609,1.0,0.0,0.0,1.0,280.0,78400.0,1.0,2010.0,4040100.0
2,-0.09763,0.995223,0.025541,0.515158,0.639751,-0.038749,0.14866,0.744365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,25.0,625.0,1.0,13.161644,173.228868,1.0,686.0,470596.0,1.0,37.460664,1403.301382,1.0,-3.821329,14.602552,1.0,0.0,0.0,1.0,250.0,62500.0,1.0,2009.0,4036081.0
3,-0.10597,0.911444,-0.584751,0.671308,-1.84972,-0.038749,-0.25857,0.720196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,13.084932,171.215433,1.0,263.0,69169.0,1.0,38.486161,1481.184579,1.0,-11.155298,124.440667,1.0,0.0,0.0,1.0,58.0,3364.0,1.0,1986.0,3944196.0
4,-0.10597,-0.779092,-0.9642,-0.448669,1.317271,-0.038749,-0.381587,-1.366788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,11.536986,133.102053,1.0,0.0,0.0,1.0,31.130847,969.129617,1.0,-1.825359,3.331935,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [40]:
%%time
# Fit the preprocessing pipeline (pipe1, ending in PCA) and project X in one step
X_transformed = pipe1.fit_transform(X,y_true)

CPU times: user 9.18 s, sys: 951 ms, total: 10.1 s
Wall time: 6.71 s


In [41]:
%%time
# Fit the logistic-regression pipeline on the PCA-reduced features
pipe2.fit(X_transformed,y_true)

CPU times: user 12.4 s, sys: 113 ms, total: 12.5 s
Wall time: 6.4 s


Pipeline(memory=None,
     steps=[('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=500, multi_class='ovr',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False))])

In [42]:
# Predictions are made on the same rows pipe2 was fitted on, so the score
# below is training accuracy, not an estimate of performance on unseen data.
y_pred = pipe2.predict(X_transformed)
accuracy_score(y_true, y_pred)

0.7461111111111111

In [43]:
# (rows, retained PCA components) of the pipe1 output
X_transformed.shape

(59400, 194)