In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score

## train_complete dataset

In [2]:
# Load the training table and preview its first five rows.
train = pd.read_csv(filepath_or_buffer='../train_complete.csv')
train.head(n=5)

Unnamed: 0,ID,State_Name,Year,Age,Incidence,Arithmetic_Mean_mean_Lead_(TSP)_STP,Arithmetic_Mean_min_Lead_(TSP)_STP,Arithmetic_Mean_max_Lead_(TSP)_STP,Arithmetic_Mean_mean_Lead_(TSP)_STP_l2,Arithmetic_Mean_min_Lead_(TSP)_STP_l2,...,X1st_Max_Value_max_Benzene_l8,X1st_Max_Value_max_Ethylbenzene_l2,X1st_Max_Value_max_Ethylbenzene_l5,X1st_Max_Value_max_Ethylbenzene_l8,X1st_Max_Value_max_Toluene_l2,X1st_Max_Value_max_Toluene_l5,X1st_Max_Value_max_Toluene_l8,X1st_Max_Value_max_o-Xylene_l2,X1st_Max_Value_max_o-Xylene_l5,X1st_Max_Value_max_o-Xylene_l8
0,f8312a4,Alabama,1990,65-69,4685.284313,0.664543,0.0,7.96,1.071302,0.0,...,,,,,,,,,,
1,3effa36,Alabama,1990,70-74,4827.052043,0.664543,0.0,7.96,1.071302,0.0,...,,,,,,,,,,
2,1e8044b,Alabama,1990,75-79,4377.956914,0.664543,0.0,7.96,1.071302,0.0,...,,,,,,,,,,
3,d875d65,Alabama,1990,80-84,3822.732993,0.664543,0.0,7.96,1.071302,0.0,...,,,,,,,,,,
4,46e6695,Alabama,1990,85-89,3470.199503,0.664543,0.0,7.96,1.071302,0.0,...,,,,,,,,,,


In [3]:
# Dimensions of the raw training frame as (rows, columns).
train.shape

(4500, 200)

In [6]:
# Eliminate columns that have more than 20% missing values, i.e. keep only the
# columns whose non-null count reaches 80% of the rows.
# `thresh` is a count of required non-null values, so the cut-off is rounded up
# to an explicit int rather than passed as a float. The result is assigned
# instead of using inplace=True so the cell reads as a plain transformation.

min_non_null = int(np.ceil(train.shape[0] * 0.8))
train = train.dropna(thresh=min_non_null, axis=1)

In [27]:
# Remember which columns survived the filtering above (everything except the
# target) so the test frame can be cut down to the same set of columns.

column_keep = train.columns.drop('Incidence')

In [8]:
# List the remaining columns with their non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4500 entries, 0 to 4499
Data columns (total 50 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   ID                                         4500 non-null   object 
 1   State_Name                                 4500 non-null   object 
 2   Year                                       4500 non-null   int64  
 3   Age                                        4500 non-null   object 
 4   Incidence                                  4500 non-null   float64
 5   Arithmetic_Mean_mean_Arsenic_PM2.5_LC      3960 non-null   float64
 6   Arithmetic_Mean_mean_Chromium_PM2.5_LC     3960 non-null   float64
 7   Arithmetic_Mean_mean_Lead_PM2.5_LC         3960 non-null   float64
 8   Arithmetic_Mean_mean_Manganese_PM2.5_LC    3960 non-null   float64
 9   Arithmetic_Mean_mean_Nickel_PM2.5_LC       3960 non-null   float64
 10  Arithmetic_Mean_min_Arse

In [28]:
# One-hot encode the age bracket. drop_first=True leaves out the first level,
# so the preview shows indicator columns for every bracket except one.
train_dummy = pd.get_dummies(data=train, columns=['Age'], drop_first=True)
train_dummy.head(n=5)

Unnamed: 0,ID,State_Name,Year,Incidence,Arithmetic_Mean_mean_Arsenic_PM2.5_LC,Arithmetic_Mean_mean_Chromium_PM2.5_LC,Arithmetic_Mean_mean_Lead_PM2.5_LC,Arithmetic_Mean_mean_Manganese_PM2.5_LC,Arithmetic_Mean_mean_Nickel_PM2.5_LC,Arithmetic_Mean_min_Arsenic_PM2.5_LC,...,Arithmetic_Mean_min_Nitric_oxide_(NO)_l2,Arithmetic_Mean_max_Nitric_oxide_(NO)_l2,X1st_Max_Value_mean_Nitric_oxide_(NO)_l2,X1st_Max_Value_min_Nitric_oxide_(NO)_l2,X1st_Max_Value_max_Nitric_oxide_(NO)_l2,Age_70-74,Age_75-79,Age_80-84,Age_85-89,Age_90-94
0,f8312a4,Alabama,1990,4685.284313,,,,,,,...,,,,,,0,0,0,0,0
1,3effa36,Alabama,1990,4827.052043,,,,,,,...,,,,,,1,0,0,0,0
2,1e8044b,Alabama,1990,4377.956914,,,,,,,...,,,,,,0,1,0,0,0
3,d875d65,Alabama,1990,3822.732993,,,,,,,...,,,,,,0,0,1,0,0
4,46e6695,Alabama,1990,3470.199503,,,,,,,...,,,,,,0,0,0,1,0


## test_complete dataset

In [19]:
# Load the unlabelled test table and preview its first five rows.
test = pd.read_csv(filepath_or_buffer='../test_complete.csv')
test.head(n=5)

Unnamed: 0,ID,State_Name,Year,Age,Arithmetic_Mean_mean_Lead_(TSP)_STP,Arithmetic_Mean_min_Lead_(TSP)_STP,Arithmetic_Mean_max_Lead_(TSP)_STP,Arithmetic_Mean_mean_Lead_(TSP)_STP_l2,Arithmetic_Mean_min_Lead_(TSP)_STP_l2,Arithmetic_Mean_max_Lead_(TSP)_STP_l2,...,X1st_Max_Value_max_Benzene_l8,X1st_Max_Value_max_Ethylbenzene_l2,X1st_Max_Value_max_Ethylbenzene_l5,X1st_Max_Value_max_Ethylbenzene_l8,X1st_Max_Value_max_Toluene_l2,X1st_Max_Value_max_Toluene_l5,X1st_Max_Value_max_Toluene_l8,X1st_Max_Value_max_o-Xylene_l2,X1st_Max_Value_max_o-Xylene_l5,X1st_Max_Value_max_o-Xylene_l8
0,6c06615,Delaware,1990,65-69,,,,,,,...,,,,,,,,,,
1,e0cf76e,Delaware,1990,70-74,,,,,,,...,,,,,,,,,,
2,e9f95f0,Delaware,1990,75-79,,,,,,,...,,,,,,,,,,
3,5dffe44,Delaware,1990,80-84,,,,,,,...,,,,,,,,,,
4,9f9ea9b,Delaware,1990,85-89,,,,,,,...,,,,,,,,,,


In [20]:
# Dimensions of the raw test frame as (rows, columns).
test.shape

(4680, 199)

In [25]:
# Restrict the test frame to the columns retained for training (no target).
test = test.loc[:, column_keep]

In [26]:
# List the retained test columns with their non-null counts and dtypes.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4680 entries, 0 to 4679
Data columns (total 49 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   ID                                         4680 non-null   object 
 1   State_Name                                 4680 non-null   object 
 2   Year                                       4680 non-null   int64  
 3   Age                                        4680 non-null   object 
 4   Arithmetic_Mean_mean_Arsenic_PM2.5_LC      3852 non-null   float64
 5   Arithmetic_Mean_mean_Chromium_PM2.5_LC     3852 non-null   float64
 6   Arithmetic_Mean_mean_Lead_PM2.5_LC         3852 non-null   float64
 7   Arithmetic_Mean_mean_Manganese_PM2.5_LC    3852 non-null   float64
 8   Arithmetic_Mean_mean_Nickel_PM2.5_LC       3852 non-null   float64
 9   Arithmetic_Mean_min_Arsenic_PM2.5_LC       3852 non-null   float64
 10  Arithmetic_Mean_min_Chro

In [49]:
# One-hot encode the age bracket of the test frame with the same settings
# that were used for the training frame.
test_dummy = pd.get_dummies(data=test, columns=['Age'], drop_first=True)

In [51]:
# Build the numeric test matrix and force it onto exactly the feature columns
# (and column order) of the training frame. Train and test are dummy-encoded
# independently, so an age bracket present in only one of them would otherwise
# give the two matrices different columns. Indicator columns missing from the
# test frame are filled with 0; columns unknown to the training frame are dropped.
feature_columns = train_dummy.columns.drop(['Incidence', 'ID', 'State_Name'])
test_numeric = (
    test_dummy
    .drop(columns=['ID', 'State_Name'])
    .reindex(columns=feature_columns, fill_value=0)
)

In [52]:
# Dimensions of the numeric test matrix as (rows, columns).
test_numeric.shape

(4680, 51)

### Imputing data using Iterative Imputer

In [53]:
# Create the target y and the feature matrix X: the target is the Incidence
# column; the features are everything except the target and the two
# identifier columns.

y = train_dummy['Incidence']
X = train_dummy.drop(columns=['Incidence', 'ID', 'State_Name'])

In [54]:
# Hold out part of the labelled rows for validation; the seed is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [55]:
# Iterative imputer that models the features with linear regression, running
# for at most 100 iterations.
it_imp = IterativeImputer(estimator=LinearRegression(), max_iter=100)

In [56]:
# Fit the imputer on the training split only, then reuse the fitted imputer
# (transform, no refit) for the hold-out split and the test_numeric matrix.
X_train_imp_missing = it_imp.fit_transform(X_train)
X_test_imp_missing = it_imp.transform(X_test)
test_imp_missing = it_imp.transform(test_numeric)

## Modeling

In [58]:
# Fit an ordinary linear regression on the imputed training features.
lr = LinearRegression()
lr.fit(X=X_train_imp_missing, y=y_train)

In [60]:
# Model score on the training split and on the hold-out split, as a pair.
(
    lr.score(X_train_imp_missing, y_train),
    lr.score(X_test_imp_missing, y_test),
)

(0.7691887545836131, 0.75440765345233)

In [61]:
# Predict Incidence for the imputed test matrix.
test_preds = lr.predict(X=test_imp_missing)

In [62]:
# Pair each test row's ID with its predicted Incidence and preview the result.
test_preds_lr = pd.DataFrame({'ID': test['ID'], 'Incidence': test_preds})
test_preds_lr.head(n=5)

Unnamed: 0,ID,Incidence
0,6c06615,2445.184973
1,e0cf76e,3062.225065
2,e9f95f0,3500.914633
3,5dffe44,3326.510797
4,9f9ea9b,2769.857415


In [63]:
# Write the predictions to disk without the row index.
test_preds_lr.to_csv(path_or_buf='../lr_model_3.csv', index=False)