## Make necessary imports

In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statistics

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.linear_model import LinearRegression, Lasso

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor


# Show up to 200 columns when a DataFrame is displayed (the frames loaded below are wide)
pd.options.display.max_columns = 200

In [61]:
# Training and test data that still contain NaN values
train_df, test_df = map(pd.read_csv, ('train_complete.csv', 'test_complete.csv'))

In [60]:
# Training and test data in which missing values were replaced by the mean
train_df_mean, test_df_mean = map(pd.read_csv, ('train_df_mean.csv', 'test_df_mean.csv'))

In [59]:
# Training and test data in which missing values were replaced by zeros
train_df_zero, test_df_zero = map(pd.read_csv, ('train_df_zero.csv', 'test_df_zero.csv'))

In [24]:
# Column labels of the training frame after restricting it to the test frame's columns
train_df_mean[test_df_mean.columns].columns

Index(['ID', 'State_Name', 'Year', 'Age',
       'Arithmetic_Mean_mean_Lead_(TSP)_STP',
       'Arithmetic_Mean_min_Lead_(TSP)_STP',
       'Arithmetic_Mean_max_Lead_(TSP)_STP',
       'Arithmetic_Mean_mean_Lead_(TSP)_STP_l2',
       'Arithmetic_Mean_min_Lead_(TSP)_STP_l2',
       'Arithmetic_Mean_max_Lead_(TSP)_STP_l2',
       ...
       'X1st_Max_Value_max_Benzene_l8', 'X1st_Max_Value_max_Ethylbenzene_l2',
       'X1st_Max_Value_max_Ethylbenzene_l5',
       'X1st_Max_Value_max_Ethylbenzene_l8', 'X1st_Max_Value_max_Toluene_l2',
       'X1st_Max_Value_max_Toluene_l5', 'X1st_Max_Value_max_Toluene_l8',
       'X1st_Max_Value_max_o-Xylene_l2', 'X1st_Max_Value_max_o-Xylene_l5',
       'X1st_Max_Value_max_o-Xylene_l8'],
      dtype='object', length=199)

In [20]:
# Column labels of the mean-imputed test frame, for comparison with the training columns
test_df_mean.columns

Index(['ID', 'State_Name', 'Year', 'Age',
       'Arithmetic_Mean_mean_Lead_(TSP)_STP',
       'Arithmetic_Mean_min_Lead_(TSP)_STP',
       'Arithmetic_Mean_max_Lead_(TSP)_STP',
       'Arithmetic_Mean_mean_Lead_(TSP)_STP_l2',
       'Arithmetic_Mean_min_Lead_(TSP)_STP_l2',
       'Arithmetic_Mean_max_Lead_(TSP)_STP_l2',
       ...
       'X1st_Max_Value_max_Benzene_l8', 'X1st_Max_Value_max_Ethylbenzene_l2',
       'X1st_Max_Value_max_Ethylbenzene_l5',
       'X1st_Max_Value_max_Ethylbenzene_l8', 'X1st_Max_Value_max_Toluene_l2',
       'X1st_Max_Value_max_Toluene_l5', 'X1st_Max_Value_max_Toluene_l8',
       'X1st_Max_Value_max_o-Xylene_l2', 'X1st_Max_Value_max_o-Xylene_l5',
       'X1st_Max_Value_max_o-Xylene_l8'],
      dtype='object', length=199)

In [12]:
# Preview the first rows of the mean-imputed training data
train_df_mean.head()

Unnamed: 0,ID,State_Name,Year,Age,Incidence,Arithmetic_Mean_mean_Lead_(TSP)_STP,Arithmetic_Mean_min_Lead_(TSP)_STP,Arithmetic_Mean_max_Lead_(TSP)_STP,Arithmetic_Mean_mean_Lead_(TSP)_STP_l2,Arithmetic_Mean_min_Lead_(TSP)_STP_l2,Arithmetic_Mean_max_Lead_(TSP)_STP_l2,Arithmetic_Mean_mean_Lead_(TSP)_STP_l5,Arithmetic_Mean_min_Lead_(TSP)_STP_l5,Arithmetic_Mean_max_Lead_(TSP)_STP_l5,Arithmetic_Mean_mean_Lead_(TSP)_STP_l8,Arithmetic_Mean_min_Lead_(TSP)_STP_l8,Arithmetic_Mean_max_Lead_(TSP)_STP_l8,Arithmetic_Mean_mean_Arsenic_PM2.5_LC,Arithmetic_Mean_mean_Chromium_PM2.5_LC,Arithmetic_Mean_mean_Lead_PM2.5_LC,Arithmetic_Mean_mean_Manganese_PM2.5_LC,Arithmetic_Mean_mean_Nickel_PM2.5_LC,Arithmetic_Mean_min_Arsenic_PM2.5_LC,Arithmetic_Mean_min_Chromium_PM2.5_LC,Arithmetic_Mean_min_Lead_PM2.5_LC,Arithmetic_Mean_min_Manganese_PM2.5_LC,Arithmetic_Mean_min_Nickel_PM2.5_LC,Arithmetic_Mean_max_Arsenic_PM2.5_LC,Arithmetic_Mean_max_Chromium_PM2.5_LC,Arithmetic_Mean_max_Lead_PM2.5_LC,Arithmetic_Mean_max_Manganese_PM2.5_LC,Arithmetic_Mean_max_Nickel_PM2.5_LC,X1st_Max_Value_mean_Arsenic_PM2.5_LC,X1st_Max_Value_mean_Chromium_PM2.5_LC,X1st_Max_Value_mean_Lead_PM2.5_LC,X1st_Max_Value_mean_Manganese_PM2.5_LC,X1st_Max_Value_mean_Nickel_PM2.5_LC,X1st_Max_Value_min_Arsenic_PM2.5_LC,X1st_Max_Value_min_Chromium_PM2.5_LC,X1st_Max_Value_min_Lead_PM2.5_LC,X1st_Max_Value_min_Manganese_PM2.5_LC,X1st_Max_Value_min_Nickel_PM2.5_LC,X1st_Max_Value_max_Arsenic_PM2.5_LC,X1st_Max_Value_max_Chromium_PM2.5_LC,X1st_Max_Value_max_Lead_PM2.5_LC,X1st_Max_Value_max_Manganese_PM2.5_LC,X1st_Max_Value_max_Nickel_PM2.5_LC,Arithmetic_Mean_mean_Arsenic_PM2.5_LC_l2,Arithmetic_Mean_mean_Chromium_PM2.5_LC_l2,Arithmetic_Mean_mean_Lead_PM2.5_LC_l2,Arithmetic_Mean_mean_Arsenic_PM2.5_LC_l5,Arithmetic_Mean_mean_Chromium_PM2.5_LC_l5,Arithmetic_Mean_mean_Lead_PM2.5_LC_l5,Arithmetic_Mean_mean_Arsenic_PM2.5_LC_l8,Arithmetic_Mean_mean_Chromium_PM2.5_LC_l8,Arithmetic_Mean_mean_Lead_PM2.5_LC_l8,Arithmetic_Mean_mean_Nitric_oxide_(NO),Arit
hmetic_Mean_mean_Oxides_of_nitrogen_(NOx),Arithmetic_Mean_min_Nitric_oxide_(NO),Arithmetic_Mean_min_Oxides_of_nitrogen_(NOx),Arithmetic_Mean_max_Nitric_oxide_(NO),Arithmetic_Mean_max_Oxides_of_nitrogen_(NOx),X1st_Max_Value_mean_Nitric_oxide_(NO),X1st_Max_Value_mean_Oxides_of_nitrogen_(NOx),X1st_Max_Value_min_Nitric_oxide_(NO),X1st_Max_Value_min_Oxides_of_nitrogen_(NOx),X1st_Max_Value_max_Nitric_oxide_(NO),X1st_Max_Value_max_Oxides_of_nitrogen_(NOx),Arithmetic_Mean_mean_Nitric_oxide_(NO)_l2,Arithmetic_Mean_mean_Nitric_oxide_(NO)_l5,Arithmetic_Mean_mean_Nitric_oxide_(NO)_l8,Arithmetic_Mean_mean_Oxides_of_nitrogen_(NOx)_l2,Arithmetic_Mean_mean_Oxides_of_nitrogen_(NOx)_l5,Arithmetic_Mean_mean_Oxides_of_nitrogen_(NOx)_l8,Arithmetic_Mean_min_Nitric_oxide_(NO)_l2,Arithmetic_Mean_min_Nitric_oxide_(NO)_l5,Arithmetic_Mean_min_Nitric_oxide_(NO)_l8,Arithmetic_Mean_min_Oxides_of_nitrogen_(NOx)_l2,Arithmetic_Mean_min_Oxides_of_nitrogen_(NOx)_l5,Arithmetic_Mean_min_Oxides_of_nitrogen_(NOx)_l8,Arithmetic_Mean_max_Nitric_oxide_(NO)_l2,Arithmetic_Mean_max_Nitric_oxide_(NO)_l5,Arithmetic_Mean_max_Nitric_oxide_(NO)_l8,Arithmetic_Mean_max_Oxides_of_nitrogen_(NOx)_l2,Arithmetic_Mean_max_Oxides_of_nitrogen_(NOx)_l5,Arithmetic_Mean_max_Oxides_of_nitrogen_(NOx)_l8,X1st_Max_Value_mean_Nitric_oxide_(NO)_l2,X1st_Max_Value_mean_Nitric_oxide_(NO)_l5,X1st_Max_Value_mean_Nitric_oxide_(NO)_l8,X1st_Max_Value_mean_Oxides_of_nitrogen_(NOx)_l2,X1st_Max_Value_mean_Oxides_of_nitrogen_(NOx)_l5,X1st_Max_Value_mean_Oxides_of_nitrogen_(NOx)_l8,X1st_Max_Value_min_Nitric_oxide_(NO)_l2,X1st_Max_Value_min_Nitric_oxide_(NO)_l5,X1st_Max_Value_min_Nitric_oxide_(NO)_l8,X1st_Max_Value_min_Oxides_of_nitrogen_(NOx)_l2,X1st_Max_Value_min_Oxides_of_nitrogen_(NOx)_l5,X1st_Max_Value_min_Oxides_of_nitrogen_(NOx)_l8,X1st_Max_Value_max_Nitric_oxide_(NO)_l2,X1st_Max_Value_max_Nitric_oxide_(NO)_l5,X1st_Max_Value_max_Nitric_oxide_(NO)_l8,X1st_Max_Value_max_Oxides_of_nitrogen_(NOx)_l2,X1st_Max_Value_max_Oxides_of_nitrogen_(NOx)_l
5,X1st_Max_Value_max_Oxides_of_nitrogen_(NOx)_l8,Arithmetic_Mean_mean_Benzene,Arithmetic_Mean_mean_Ethylbenzene,Arithmetic_Mean_mean_Toluene,Arithmetic_Mean_mean_o-Xylene,Arithmetic_Mean_min_Benzene,Arithmetic_Mean_min_Ethylbenzene,Arithmetic_Mean_min_Toluene,Arithmetic_Mean_min_o-Xylene,Arithmetic_Mean_max_Benzene,Arithmetic_Mean_max_Ethylbenzene,Arithmetic_Mean_max_Toluene,Arithmetic_Mean_max_o-Xylene,X1st_Max_Value_mean_Benzene,X1st_Max_Value_mean_Ethylbenzene,X1st_Max_Value_mean_Toluene,X1st_Max_Value_mean_o-Xylene,X1st_Max_Value_min_Benzene,X1st_Max_Value_min_Ethylbenzene,X1st_Max_Value_min_Toluene,X1st_Max_Value_min_o-Xylene,X1st_Max_Value_max_Benzene,X1st_Max_Value_max_Ethylbenzene,X1st_Max_Value_max_Toluene,X1st_Max_Value_max_o-Xylene,Arithmetic_Mean_mean_Benzene_l2,Arithmetic_Mean_mean_Benzene_l5,Arithmetic_Mean_mean_Benzene_l8,Arithmetic_Mean_mean_Ethylbenzene_l2,Arithmetic_Mean_mean_Ethylbenzene_l5,Arithmetic_Mean_mean_Ethylbenzene_l8,Arithmetic_Mean_mean_Toluene_l2,Arithmetic_Mean_mean_Toluene_l5,Arithmetic_Mean_mean_Toluene_l8,Arithmetic_Mean_mean_o-Xylene_l2,Arithmetic_Mean_mean_o-Xylene_l5,Arithmetic_Mean_mean_o-Xylene_l8,Arithmetic_Mean_min_Benzene_l2,Arithmetic_Mean_min_Benzene_l5,Arithmetic_Mean_min_Benzene_l8,Arithmetic_Mean_min_Ethylbenzene_l2,Arithmetic_Mean_min_Ethylbenzene_l5,Arithmetic_Mean_min_Ethylbenzene_l8,Arithmetic_Mean_min_Toluene_l2,Arithmetic_Mean_min_Toluene_l5,Arithmetic_Mean_min_Toluene_l8,Arithmetic_Mean_min_o-Xylene_l2,Arithmetic_Mean_min_o-Xylene_l5,Arithmetic_Mean_min_o-Xylene_l8,Arithmetic_Mean_max_Benzene_l2,Arithmetic_Mean_max_Benzene_l5,Arithmetic_Mean_max_Benzene_l8,Arithmetic_Mean_max_Ethylbenzene_l2,Arithmetic_Mean_max_Ethylbenzene_l5,Arithmetic_Mean_max_Ethylbenzene_l8,Arithmetic_Mean_max_Toluene_l2,Arithmetic_Mean_max_Toluene_l5,Arithmetic_Mean_max_Toluene_l8,Arithmetic_Mean_max_o-Xylene_l2,Arithmetic_Mean_max_o-Xylene_l5,Arithmetic_Mean_max_o-Xylene_l8,X1st_Max_Value_mean_Benzene_l2,X1st_Max_Value_mean_Benzene_l5,X1s
t_Max_Value_mean_Benzene_l8,X1st_Max_Value_mean_Ethylbenzene_l2,X1st_Max_Value_mean_Ethylbenzene_l5,X1st_Max_Value_mean_Ethylbenzene_l8,X1st_Max_Value_mean_Toluene_l2,X1st_Max_Value_mean_Toluene_l5,X1st_Max_Value_mean_Toluene_l8,X1st_Max_Value_mean_o-Xylene_l2,X1st_Max_Value_mean_o-Xylene_l5,X1st_Max_Value_mean_o-Xylene_l8,X1st_Max_Value_min_Benzene_l2,X1st_Max_Value_min_Benzene_l5,X1st_Max_Value_min_Benzene_l8,X1st_Max_Value_min_Ethylbenzene_l2,X1st_Max_Value_min_Ethylbenzene_l5,X1st_Max_Value_min_Ethylbenzene_l8,X1st_Max_Value_min_Toluene_l2,X1st_Max_Value_min_Toluene_l5,X1st_Max_Value_min_Toluene_l8,X1st_Max_Value_min_o-Xylene_l2,X1st_Max_Value_min_o-Xylene_l5,X1st_Max_Value_min_o-Xylene_l8,X1st_Max_Value_max_Benzene_l2,X1st_Max_Value_max_Benzene_l5,X1st_Max_Value_max_Benzene_l8,X1st_Max_Value_max_Ethylbenzene_l2,X1st_Max_Value_max_Ethylbenzene_l5,X1st_Max_Value_max_Ethylbenzene_l8,X1st_Max_Value_max_Toluene_l2,X1st_Max_Value_max_Toluene_l5,X1st_Max_Value_max_Toluene_l8,X1st_Max_Value_max_o-Xylene_l2,X1st_Max_Value_max_o-Xylene_l5,X1st_Max_Value_max_o-Xylene_l8
0,f8312a4,Alabama,1990,65-69,4685.284313,0.664543,0.0,7.96,1.071302,0.0,28.17,0.691675,0.0,16.03,0.834853,0.016,18.58,0.000399,0.001116,0.002005,0.002033,0.000479,-1.2e-05,-0.000445,-0.001638,-0.000727,-0.000171,0.009005,0.082452,0.047502,0.073049,0.034792,0.000399,0.001116,0.002005,0.002033,0.000479,-1.2e-05,-0.000445,-0.001638,-0.000727,-0.000171,0.009005,0.082452,0.047502,0.073049,0.034792,0.000418,0.001056,0.0021,0.000442,0.000973,0.002173,0.000457,0.000847,0.002289,9.083971,20.05713,-0.058621,1.050109,134.842762,159.746244,32.944139,52.451887,0.280583,2.370423,366.155663,390.156338,10.160472,11.308066,12.497826,20.605571,21.701361,23.331338,0.102843,0.220613,0.335293,1.173778,1.355,1.531234,147.364181,177.502719,190.76271,164.78773,175.179862,185.224439,36.348465,40.377393,44.348384,54.101111,57.231046,61.487186,0.532343,0.652465,0.648571,2.514,2.768713,3.012035,399.708251,446.703521,474.372571,401.712364,421.851485,444.706127,2.712121,1.207758,7.193124,1.414438,0.281283,0.104499,0.501686,0.105842,56.164412,41.010646,266.990348,48.981439,3.297701,1.482747,8.679266,1.717965,0.303909,0.11467,0.548623,0.116637,87.145355,53.888221,321.425416,60.195836,3.082646,3.46141,3.814644,1.856836,2.071154,2.3512,8.258543,9.466964,10.439874,1.703468,1.963858,2.217179,0.308668,0.320681,0.352772,0.105631,0.11639,0.134298,0.607514,0.675899,0.744957,0.2035,0.240124,0.284168,58.833808,66.393586,75.779948,46.319705,52.979106,60.064317,280.369296,320.993086,350.15904,53.028796,61.911668,71.568925,3.674089,4.10034,4.518205,2.135001,2.375157,2.685904,9.747054,11.060755,12.170928,2.018855,2.309456,2.600001,0.331969,0.34623,0.381002,0.116342,0.128777,0.14907,0.655801,0.730225,0.806594,0.21501,0.253483,0.300291,88.327448,96.527158,107.137492,59.280761,67.303917,75.316029,335.461202,380.477819,417.452319,64.431452,74.388688,84.915583
1,3effa36,Alabama,1990,70-74,4827.052043,0.664543,0.0,7.96,1.071302,0.0,28.17,0.691675,0.0,16.03,0.834853,0.016,18.58,0.000399,0.001116,0.002005,0.002033,0.000479,-1.2e-05,-0.000445,-0.001638,-0.000727,-0.000171,0.009005,0.082452,0.047502,0.073049,0.034792,0.000399,0.001116,0.002005,0.002033,0.000479,-1.2e-05,-0.000445,-0.001638,-0.000727,-0.000171,0.009005,0.082452,0.047502,0.073049,0.034792,0.000418,0.001056,0.0021,0.000442,0.000973,0.002173,0.000457,0.000847,0.002289,9.083971,20.05713,-0.058621,1.050109,134.842762,159.746244,32.944139,52.451887,0.280583,2.370423,366.155663,390.156338,10.160472,11.308066,12.497826,20.605571,21.701361,23.331338,0.102843,0.220613,0.335293,1.173778,1.355,1.531234,147.364181,177.502719,190.76271,164.78773,175.179862,185.224439,36.348465,40.377393,44.348384,54.101111,57.231046,61.487186,0.532343,0.652465,0.648571,2.514,2.768713,3.012035,399.708251,446.703521,474.372571,401.712364,421.851485,444.706127,2.712121,1.207758,7.193124,1.414438,0.281283,0.104499,0.501686,0.105842,56.164412,41.010646,266.990348,48.981439,3.297701,1.482747,8.679266,1.717965,0.303909,0.11467,0.548623,0.116637,87.145355,53.888221,321.425416,60.195836,3.082646,3.46141,3.814644,1.856836,2.071154,2.3512,8.258543,9.466964,10.439874,1.703468,1.963858,2.217179,0.308668,0.320681,0.352772,0.105631,0.11639,0.134298,0.607514,0.675899,0.744957,0.2035,0.240124,0.284168,58.833808,66.393586,75.779948,46.319705,52.979106,60.064317,280.369296,320.993086,350.15904,53.028796,61.911668,71.568925,3.674089,4.10034,4.518205,2.135001,2.375157,2.685904,9.747054,11.060755,12.170928,2.018855,2.309456,2.600001,0.331969,0.34623,0.381002,0.116342,0.128777,0.14907,0.655801,0.730225,0.806594,0.21501,0.253483,0.300291,88.327448,96.527158,107.137492,59.280761,67.303917,75.316029,335.461202,380.477819,417.452319,64.431452,74.388688,84.915583
2,1e8044b,Alabama,1990,75-79,4377.956914,0.664543,0.0,7.96,1.071302,0.0,28.17,0.691675,0.0,16.03,0.834853,0.016,18.58,0.000399,0.001116,0.002005,0.002033,0.000479,-1.2e-05,-0.000445,-0.001638,-0.000727,-0.000171,0.009005,0.082452,0.047502,0.073049,0.034792,0.000399,0.001116,0.002005,0.002033,0.000479,-1.2e-05,-0.000445,-0.001638,-0.000727,-0.000171,0.009005,0.082452,0.047502,0.073049,0.034792,0.000418,0.001056,0.0021,0.000442,0.000973,0.002173,0.000457,0.000847,0.002289,9.083971,20.05713,-0.058621,1.050109,134.842762,159.746244,32.944139,52.451887,0.280583,2.370423,366.155663,390.156338,10.160472,11.308066,12.497826,20.605571,21.701361,23.331338,0.102843,0.220613,0.335293,1.173778,1.355,1.531234,147.364181,177.502719,190.76271,164.78773,175.179862,185.224439,36.348465,40.377393,44.348384,54.101111,57.231046,61.487186,0.532343,0.652465,0.648571,2.514,2.768713,3.012035,399.708251,446.703521,474.372571,401.712364,421.851485,444.706127,2.712121,1.207758,7.193124,1.414438,0.281283,0.104499,0.501686,0.105842,56.164412,41.010646,266.990348,48.981439,3.297701,1.482747,8.679266,1.717965,0.303909,0.11467,0.548623,0.116637,87.145355,53.888221,321.425416,60.195836,3.082646,3.46141,3.814644,1.856836,2.071154,2.3512,8.258543,9.466964,10.439874,1.703468,1.963858,2.217179,0.308668,0.320681,0.352772,0.105631,0.11639,0.134298,0.607514,0.675899,0.744957,0.2035,0.240124,0.284168,58.833808,66.393586,75.779948,46.319705,52.979106,60.064317,280.369296,320.993086,350.15904,53.028796,61.911668,71.568925,3.674089,4.10034,4.518205,2.135001,2.375157,2.685904,9.747054,11.060755,12.170928,2.018855,2.309456,2.600001,0.331969,0.34623,0.381002,0.116342,0.128777,0.14907,0.655801,0.730225,0.806594,0.21501,0.253483,0.300291,88.327448,96.527158,107.137492,59.280761,67.303917,75.316029,335.461202,380.477819,417.452319,64.431452,74.388688,84.915583
3,d875d65,Alabama,1990,80-84,3822.732993,0.664543,0.0,7.96,1.071302,0.0,28.17,0.691675,0.0,16.03,0.834853,0.016,18.58,0.000399,0.001116,0.002005,0.002033,0.000479,-1.2e-05,-0.000445,-0.001638,-0.000727,-0.000171,0.009005,0.082452,0.047502,0.073049,0.034792,0.000399,0.001116,0.002005,0.002033,0.000479,-1.2e-05,-0.000445,-0.001638,-0.000727,-0.000171,0.009005,0.082452,0.047502,0.073049,0.034792,0.000418,0.001056,0.0021,0.000442,0.000973,0.002173,0.000457,0.000847,0.002289,9.083971,20.05713,-0.058621,1.050109,134.842762,159.746244,32.944139,52.451887,0.280583,2.370423,366.155663,390.156338,10.160472,11.308066,12.497826,20.605571,21.701361,23.331338,0.102843,0.220613,0.335293,1.173778,1.355,1.531234,147.364181,177.502719,190.76271,164.78773,175.179862,185.224439,36.348465,40.377393,44.348384,54.101111,57.231046,61.487186,0.532343,0.652465,0.648571,2.514,2.768713,3.012035,399.708251,446.703521,474.372571,401.712364,421.851485,444.706127,2.712121,1.207758,7.193124,1.414438,0.281283,0.104499,0.501686,0.105842,56.164412,41.010646,266.990348,48.981439,3.297701,1.482747,8.679266,1.717965,0.303909,0.11467,0.548623,0.116637,87.145355,53.888221,321.425416,60.195836,3.082646,3.46141,3.814644,1.856836,2.071154,2.3512,8.258543,9.466964,10.439874,1.703468,1.963858,2.217179,0.308668,0.320681,0.352772,0.105631,0.11639,0.134298,0.607514,0.675899,0.744957,0.2035,0.240124,0.284168,58.833808,66.393586,75.779948,46.319705,52.979106,60.064317,280.369296,320.993086,350.15904,53.028796,61.911668,71.568925,3.674089,4.10034,4.518205,2.135001,2.375157,2.685904,9.747054,11.060755,12.170928,2.018855,2.309456,2.600001,0.331969,0.34623,0.381002,0.116342,0.128777,0.14907,0.655801,0.730225,0.806594,0.21501,0.253483,0.300291,88.327448,96.527158,107.137492,59.280761,67.303917,75.316029,335.461202,380.477819,417.452319,64.431452,74.388688,84.915583
4,46e6695,Alabama,1990,85-89,3470.199503,0.664543,0.0,7.96,1.071302,0.0,28.17,0.691675,0.0,16.03,0.834853,0.016,18.58,0.000399,0.001116,0.002005,0.002033,0.000479,-1.2e-05,-0.000445,-0.001638,-0.000727,-0.000171,0.009005,0.082452,0.047502,0.073049,0.034792,0.000399,0.001116,0.002005,0.002033,0.000479,-1.2e-05,-0.000445,-0.001638,-0.000727,-0.000171,0.009005,0.082452,0.047502,0.073049,0.034792,0.000418,0.001056,0.0021,0.000442,0.000973,0.002173,0.000457,0.000847,0.002289,9.083971,20.05713,-0.058621,1.050109,134.842762,159.746244,32.944139,52.451887,0.280583,2.370423,366.155663,390.156338,10.160472,11.308066,12.497826,20.605571,21.701361,23.331338,0.102843,0.220613,0.335293,1.173778,1.355,1.531234,147.364181,177.502719,190.76271,164.78773,175.179862,185.224439,36.348465,40.377393,44.348384,54.101111,57.231046,61.487186,0.532343,0.652465,0.648571,2.514,2.768713,3.012035,399.708251,446.703521,474.372571,401.712364,421.851485,444.706127,2.712121,1.207758,7.193124,1.414438,0.281283,0.104499,0.501686,0.105842,56.164412,41.010646,266.990348,48.981439,3.297701,1.482747,8.679266,1.717965,0.303909,0.11467,0.548623,0.116637,87.145355,53.888221,321.425416,60.195836,3.082646,3.46141,3.814644,1.856836,2.071154,2.3512,8.258543,9.466964,10.439874,1.703468,1.963858,2.217179,0.308668,0.320681,0.352772,0.105631,0.11639,0.134298,0.607514,0.675899,0.744957,0.2035,0.240124,0.284168,58.833808,66.393586,75.779948,46.319705,52.979106,60.064317,280.369296,320.993086,350.15904,53.028796,61.911668,71.568925,3.674089,4.10034,4.518205,2.135001,2.375157,2.685904,9.747054,11.060755,12.170928,2.018855,2.309456,2.600001,0.331969,0.34623,0.381002,0.116342,0.128777,0.14907,0.655801,0.730225,0.806594,0.21501,0.253483,0.300291,88.327448,96.527158,107.137492,59.280761,67.303917,75.316029,335.461202,380.477819,417.452319,64.431452,74.388688,84.915583


In [57]:
# Mean-imputed data: the features are the columns shared with the test set,
# minus the 'ID' column; the target is 'Incidence'
X = train_df_mean[test_df_mean.columns.drop('ID')]
y = train_df_mean['Incidence']

# Train / hold-out split with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Dtype-based column selectors used by the ColumnTransformer:
# non-numeric columns go to the categorical branch, numeric ones to the numeric branch
cat_columns = make_column_selector(dtype_exclude=np.number)
num_columns = make_column_selector(dtype_include=np.number)

In [58]:
# Zero-imputed data: the features are the columns shared with the test set,
# minus the 'ID' column; the target is 'Incidence'
X_2 = train_df_zero[test_df_zero.columns.drop('ID')]
y_2 = train_df_zero['Incidence']

# Same seed as the mean-imputed split so both splits are repeatable
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_2, y_2, random_state=42)

# Dtype-based column selectors (numeric vs. everything else)
cat_columns_2 = make_column_selector(dtype_exclude=np.number)
num_columns_2 = make_column_selector(dtype_include=np.number)

**Set up the pipeline with a column transformer**

In [None]:
# Impute step: imputation is done by the IterativeImputer inside `numeric_preprocessor`

In [63]:
# Numeric branch: iterative imputation driven by a linear regression
# (at most 500 rounds), followed by standardisation
numeric_steps = [
    ("imputer", IterativeImputer(estimator=LinearRegression(), max_iter=500)),
    ("scaler", StandardScaler()),
]
numeric_preprocessor = Pipeline(numeric_steps)

In [7]:
# Categorical branch: one-hot encode; handle_unknown="ignore" keeps transform
# from raising on categories that were not present when the encoder was fit
one_hot = OneHotEncoder(handle_unknown="ignore")
categorical_preprocessor = Pipeline([("onehot", one_hot)])

In [8]:
# Route each column to its branch: columns picked by `cat_columns` are one-hot
# encoded, columns picked by `num_columns` are imputed and scaled
column_branches = [
    ("categorical", categorical_preprocessor, cat_columns),
    ("numerical", numeric_preprocessor, num_columns),
]
preprocessor = ColumnTransformer(column_branches)

In [9]:
# Full model: column preprocessing followed by a Lasso regressor with default settings.
# There is no separate imputation step; imputation happens inside `preprocessor`.
lasso_steps = [
    ("preprocessor", preprocessor),
    ("lasso", Lasso()),
]
lasso_pipe = Pipeline(lasso_steps)

In [None]:
# KNN counterpart of `lasso_pipe`: same preprocessing, k-nearest-neighbours regressor.
# The final step must be KNeighborsRegressor; with Lasso() here the "knn" pipeline
# was just a second copy of the Lasso model.
knn_pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("knn", KNeighborsRegressor())
])

In [10]:
# Fit the un-tuned Lasso pipeline on the training split of the mean-imputed data
lasso_pipe.fit(X_train, y_train)

In [11]:
# (train score, hold-out score) of the un-tuned pipeline, using the pipeline's default `score`
lasso_pipe.score(X_train, y_train), lasso_pipe.score(X_test, y_test)

(0.8655765383275731, 0.8528402820970804)

In [62]:
# Hyper-parameter grid for the "lasso" step of `lasso_pipe`
# (keys are written as `<step name>__<parameter name>`).
# Each key must appear only once: a repeated key in a dict literal silently
# replaces the earlier value, which previously reduced the alpha search to [1].
lasso_params = {
    'lasso__alpha': [0.75, 1, 1.2],
    'lasso__max_iter': [80000],
    'lasso__tol': [0.0001, 0.00001],
}
# Two independent, identically configured grid searches over `lasso_pipe`
# (3-fold cross-validation, progress messages on): `lasso_gs` is fit on the
# mean-imputed split and `lasso_gs_2` on the zero-imputed split.
lasso_gs, lasso_gs_2 = (
    GridSearchCV(estimator=lasso_pipe, param_grid=lasso_params, cv=3, verbose=1)
    for _ in range(2)
)

In [48]:
# Run the grid search on the training split of the mean-imputed data
lasso_gs.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [49]:
# Run the second grid search on the training split of the zero-imputed data
lasso_gs_2.fit(X_train_2, y_train_2)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [52]:
# Parameter combination selected by the grid search on the mean-imputed data
lasso_gs.best_params_

{'lasso__alpha': 1, 'lasso__max_iter': 100000, 'lasso__tol': 0.0001}

In [54]:
# (train score, hold-out score) for the search fit on the mean-imputed data,
# i.e. the data where NaN was replaced by the column mean
lasso_gs.score(X_train, y_train), lasso_gs.score(X_test, y_test)

(0.8655765383275731, 0.8528402820970804)

In [51]:
# (train score, hold-out score) for the search fit on the zero-imputed data,
# i.e. the data where NaN was replaced by 0
lasso_gs_2.score(X_train_2, y_train_2), lasso_gs_2.score(X_test_2, y_test_2)

(0.8623217490782447, 0.8492659568590457)

In [122]:
# Predict on the mean-imputed test set with the best pipeline found by the grid search.
# 'ID' is dropped first so the frame holds the same feature columns the model was
# trained on (the training features `X` were built without 'ID').
lasso_preds = lasso_gs.best_estimator_.predict(test_df_mean.drop(columns='ID'))

In [123]:
# Wrap the predictions in a pandas Series and display the resulting type
lasso_preds = pd.Series(lasso_preds)
type(lasso_preds)

pandas.core.series.Series

In [120]:
# Pull out the test-set 'ID' column and display its type (a Series, despite the `_df` name)
# NOTE(review): `preds_df` is not referenced by the later cells visible here -- confirm whether it is still needed
preds_df = test_df_mean['ID']
type(preds_df)

pandas.core.series.Series

In [127]:
# Output table: one row per test 'ID' with its predicted 'Incidence'
submission_columns = {
    'ID': test_df_mean['ID'],
    'Incidence': lasso_preds,
}
comp_df = pd.DataFrame(submission_columns)

In [128]:
# Write the predictions to disk without the DataFrame index column
comp_df.to_csv('lasso_preds.csv', index=False)